1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107using namespace std::placeholders;
108
109#define SV_NAME "slp-vectorizer"
110#define DEBUG_TYPE "SLP"
111
112STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
113
114DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
115 "Controls which SLP graphs should be vectorized.");
116
117static cl::opt<bool>
118 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
119 cl::desc("Run the SLP vectorization passes"));
120
121static cl::opt<bool>
122 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
123 cl::desc("Enable vectorization for wider vector utilization"));
124
125static cl::opt<int>
127 cl::desc("Only vectorize if you gain more than this "
128 "number "));
129
131 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
132 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
134
135static cl::opt<bool>
136ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
137 cl::desc("Attempt to vectorize horizontal reductions"));
138
140 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
141 cl::desc(
142 "Attempt to vectorize horizontal reductions feeding into a store"));
143
144static cl::opt<int>
146 cl::desc("Attempt to vectorize for this register size in bits"));
147
150 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
151
152/// Limits the size of scheduling regions in a block.
153/// It avoids long compile times for _very_ large blocks where vector
154/// instructions are spread over a wide range.
155/// This limit is way higher than needed by real-world functions.
156static cl::opt<int>
157ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
158 cl::desc("Limit the size of the SLP scheduling region per block"));
159
161 "slp-min-reg-size", cl::init(128), cl::Hidden,
162 cl::desc("Attempt to vectorize for this register size in bits"));
163
165 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
166 cl::desc("Limit the recursion depth when building a vectorizable tree"));
167
169 "slp-min-tree-size", cl::init(3), cl::Hidden,
170 cl::desc("Only vectorize small trees if they are fully vectorizable"));
171
172// The maximum depth that the look-ahead score heuristic will explore.
173// The higher this value, the higher the compilation time overhead.
175 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for operand reordering scores"));
177
178// The maximum depth that the look-ahead score heuristic will explore
179// when probing among candidates for vectorization tree roots.
180// The higher this value, the higher the compilation time overhead, but unlike
181// the similar limit for operand reordering this is used less frequently, so the
182// impact of a higher value is less noticeable.
184 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
185 cl::desc("The maximum look-ahead depth for searching best rooting option"));
186
188 "slp-min-strided-loads", cl::init(2), cl::Hidden,
189 cl::desc("The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
191
193 "slp-max-stride", cl::init(8), cl::Hidden,
194 cl::desc("The maximum stride, considered to be profitable."));
195
196static cl::opt<bool>
197 ViewSLPTree("view-slp-tree", cl::Hidden,
198 cl::desc("Display the SLP trees with Graphviz"));
199
201 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
202 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
203
204// Limit the number of alias checks. The limit is chosen so that
205// it has no negative effect on the llvm benchmarks.
206static const unsigned AliasedCheckLimit = 10;
207
208// Limit of the number of uses for potentially transformed instructions/values,
209// used in checks to avoid compile-time explosion.
210static constexpr int UsesLimit = 64;
211
212// Another limit for the alias checks: The maximum distance between load/store
213// instructions where alias checks are done.
214// This limit is useful for very large basic blocks.
215static const unsigned MaxMemDepDistance = 160;
216
217/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
218/// regions to be handled.
219static const int MinScheduleRegionSize = 16;
220
221/// Maximum allowed number of operands in the PHI nodes.
222static const unsigned MaxPHINumOperands = 128;
223
224/// Predicate for the element types that the SLP vectorizer supports.
225///
226/// The most important things to filter here are types which are invalid in LLVM
227/// vectors. We also filter target specific types which have absolutely no
228/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
229/// avoids spending time checking the cost model and realizing that they will
230/// be inevitably scalarized.
231static bool isValidElementType(Type *Ty) {
232 // TODO: Support ScalableVectorType.
233 if (SLPReVec && isa<FixedVectorType>(Ty))
234 Ty = Ty->getScalarType();
235 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
236 !Ty->isPPC_FP128Ty();
237}
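// Illustrative sketch (added for exposition, not part of the original file;
// Ctx denotes a hypothetical LLVMContext):
//   isValidElementType(Type::getInt32Ty(Ctx));    // true
//   isValidElementType(Type::getFloatTy(Ctx));    // true
//   isValidElementType(Type::getX86_FP80Ty(Ctx)); // false, explicitly filtered
// With -slp-revec enabled, a FixedVectorType such as <4 x i32> is first reduced
// to its scalar type (i32) before the check is applied.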
238
239/// Returns the type of the given value/instruction \p V. If it is a store,
240/// returns the type of its value operand; for a Cmp - the type of the compare
241/// operands; and for an insertelement - the type of the inserted operand.
242/// Otherwise, just the type of the value is returned.
244 if (auto *SI = dyn_cast<StoreInst>(V))
245 return SI->getValueOperand()->getType();
246 if (auto *CI = dyn_cast<CmpInst>(V))
247 return CI->getOperand(0)->getType();
248 if (auto *IE = dyn_cast<InsertElementInst>(V))
249 return IE->getOperand(1)->getType();
250 return V->getType();
251}
252
253/// \returns the number of elements for Ty.
254static unsigned getNumElements(Type *Ty) {
255 assert(!isa<ScalableVectorType>(Ty) &&
256 "ScalableVectorType is not supported.");
257 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
258 return VecTy->getNumElements();
259 return 1;
260}
261
262/// \returns the vector type of ScalarTy based on vectorization factor.
263static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
264 return FixedVectorType::get(ScalarTy->getScalarType(),
265 VF * getNumElements(ScalarTy));
266}
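// Illustrative usage sketch (added for exposition, not part of the original
// file; Ctx denotes a hypothetical LLVMContext):
//   getWidenedType(Type::getInt32Ty(Ctx), 4);  // <4 x i32>
//   // With REVEC, the "scalar" type may itself be a vector:
//   getWidenedType(FixedVectorType::get(Type::getFloatTy(Ctx), 2), 4); // <8 x float>
// because the resulting element count is VF * getNumElements(ScalarTy).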
267
268/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
269/// which forms a type that \p TTI splits into whole vector types during
270/// legalization.
272 Type *Ty, unsigned Sz) {
273 if (!isValidElementType(Ty))
274 return bit_ceil(Sz);
275 // Find the number of elements, which forms full vectors.
276 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
277 if (NumParts == 0 || NumParts >= Sz)
278 return bit_ceil(Sz);
279 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
280}
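// Worked example (illustration only, with a hypothetical NumParts reported by
// TTI): for Ty = i32 and Sz = 6, if TTI.getNumberOfParts(<6 x i32>) were 3,
// the result would be bit_ceil(divideCeil(6, 3)) * 3 = 2 * 3 = 6; if it were
// 0 or >= Sz, the function falls back to bit_ceil(6) = 8.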
281
282/// Returns the number of elements of the given type \p Ty, not greater than \p
283/// Sz, which forms a type that \p TTI splits into whole vector types during
284/// legalization.
285static unsigned
287 unsigned Sz) {
288 if (!isValidElementType(Ty))
289 return bit_floor(Sz);
290 // Find the number of elements, which forms full vectors.
291 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
292 if (NumParts == 0 || NumParts >= Sz)
293 return bit_floor(Sz);
294 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
295 if (RegVF > Sz)
296 return bit_floor(Sz);
297 return (Sz / RegVF) * RegVF;
298}
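// Worked example (illustration only, assuming a target with 128-bit vector
// registers where TTI.getNumberOfParts(<6 x i32>) == 2): for Ty = i32 and
// Sz = 6, RegVF = bit_ceil(divideCeil(6, 2)) = 4, so the result is
// (6 / 4) * 4 = 4, i.e. the largest whole-register multiple not exceeding Sz.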
299
300static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
301 SmallVectorImpl<int> &Mask) {
302 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
303 // But the element has a different meaning for SLP (scalar) and REVEC
304 // (vector). We need to expand Mask into masks which shufflevector can use
305 // directly.
306 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
307 for (unsigned I : seq<unsigned>(Mask.size()))
308 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
309 I * VecTyNumElements, VecTyNumElements)))
310 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
311 : Mask[I] * VecTyNumElements + J;
312 Mask.swap(NewMask);
313}
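// Worked example (illustration only): with VecTyNumElements = 2 and
// Mask = {1, 0}, the expanded mask becomes {2, 3, 0, 1}; a PoisonMaskElem
// entry in Mask expands to PoisonMaskElem in every corresponding slot.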
314
315/// \returns the number of groups of shufflevectors.
316/// A group has the following features:
317/// 1. All values in a group are shufflevectors.
318/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
319/// 3. The masks of the shufflevectors together use all of the elements of the source.
320/// e.g., it is 1 group (%0)
321/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
322/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
323/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
324/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
325/// it is 2 groups (%3 and %4)
326/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
327/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
328/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
329/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
330/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
331/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
332/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
333/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
334/// it is 0 groups
335/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
336/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
337/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
340 if (VL.empty())
341 return 0;
342 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
343 return 0;
344 auto *SV = cast<ShuffleVectorInst>(VL.front());
345 unsigned SVNumElements =
346 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
347 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
348 if (SVNumElements % ShuffleMaskSize != 0)
349 return 0;
350 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
351 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
352 return 0;
353 unsigned NumGroup = 0;
354 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
355 auto *SV = cast<ShuffleVectorInst>(VL[I]);
356 Value *Src = SV->getOperand(0);
357 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
358 SmallBitVector ExpectedIndex(GroupSize);
359 if (!all_of(Group, [&](Value *V) {
360 auto *SV = cast<ShuffleVectorInst>(V);
361 // From the same source.
362 if (SV->getOperand(0) != Src)
363 return false;
364 int Index;
365 if (!SV->isExtractSubvectorMask(Index))
366 return false;
367 ExpectedIndex.set(Index / ShuffleMaskSize);
368 return true;
369 }))
370 return 0;
371 if (!ExpectedIndex.all())
372 return 0;
373 ++NumGroup;
374 }
375 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
376 return NumGroup;
377}
378
379/// \returns a shufflevector mask which is used to vectorize shufflevectors
380/// e.g.,
381/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
382/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
383/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
384/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
385/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
386/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
387/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
388/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
389/// the result is
390/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
392 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
393 auto *SV = cast<ShuffleVectorInst>(VL.front());
394 unsigned SVNumElements =
395 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
396 SmallVector<int> Mask;
397 unsigned AccumulateLength = 0;
398 for (Value *V : VL) {
399 auto *SV = cast<ShuffleVectorInst>(V);
400 for (int M : SV->getShuffleMask())
401 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
402 : AccumulateLength + M);
403 AccumulateLength += SVNumElements;
404 }
405 return Mask;
406}
407
408/// \returns True if the value is a constant (but not globals/constant
409/// expressions).
410static bool isConstant(Value *V) {
411 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
412}
413
414/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
415/// insertelement/extractelement with constant indices for a fixed vector type,
416/// or an extractvalue instruction.
418 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
419 !isa<ExtractValueInst, UndefValue>(V))
420 return false;
421 auto *I = dyn_cast<Instruction>(V);
422 if (!I || isa<ExtractValueInst>(I))
423 return true;
424 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
425 return false;
426 if (isa<ExtractElementInst>(I))
427 return isConstant(I->getOperand(1));
428 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
429 return isConstant(I->getOperand(2));
430}
431
432/// Returns power-of-2 number of elements in a single register (part), given the
433/// total number of elements \p Size and number of registers (parts) \p
434/// NumParts.
435static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
436 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
437}
438
439/// Returns the correct remaining number of elements, considering the total
440/// amount \p Size, the (power-of-2) number of elements in a single register
441/// \p PartNumElems and the current register (part) \p Part.
442static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
443 unsigned Part) {
444 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
445}
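// Worked example (illustration only): for Size = 13 and NumParts = 4,
// getPartNumElems(13, 4) = min(13, bit_ceil(divideCeil(13, 4))) = 4, and the
// last register holds getNumElems(13, 4, /*Part=*/3) = min(4, 13 - 12) = 1
// element.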
446
447#if !defined(NDEBUG)
448/// Print a short descriptor of the instruction bundle suitable for debug output.
449static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
450 std::string Result;
451 raw_string_ostream OS(Result);
452 if (Idx >= 0)
453 OS << "Idx: " << Idx << ", ";
454 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
455 return Result;
456}
457#endif
458
459/// \returns true if all of the instructions in \p VL are in the same block or
460/// false otherwise.
462 auto *It = find_if(VL, IsaPred<Instruction>);
463 if (It == VL.end())
464 return false;
465 Instruction *I0 = cast<Instruction>(*It);
467 return true;
468
469 BasicBlock *BB = I0->getParent();
470 for (Value *V : iterator_range(It, VL.end())) {
471 if (isa<PoisonValue>(V))
472 continue;
473 auto *II = dyn_cast<Instruction>(V);
474 if (!II)
475 return false;
476
477 if (BB != II->getParent())
478 return false;
479 }
480 return true;
481}
482
483/// \returns True if all of the values in \p VL are constants (but not
484/// globals/constant expressions).
486 // Constant expressions and globals can't be vectorized like normal integer/FP
487 // constants.
488 return all_of(VL, isConstant);
489}
490
491/// \returns True if all of the values in \p VL are identical or some of them
492/// are UndefValue.
493static bool isSplat(ArrayRef<Value *> VL) {
494 Value *FirstNonUndef = nullptr;
495 for (Value *V : VL) {
496 if (isa<UndefValue>(V))
497 continue;
498 if (!FirstNonUndef) {
499 FirstNonUndef = V;
500 continue;
501 }
502 if (V != FirstNonUndef)
503 return false;
504 }
505 return FirstNonUndef != nullptr;
506}
507
508/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
510 if (auto *Cmp = dyn_cast<CmpInst>(I))
511 return Cmp->isCommutative();
512 if (auto *BO = dyn_cast<BinaryOperator>(I))
513 return BO->isCommutative() ||
514 (BO->getOpcode() == Instruction::Sub &&
515 !BO->hasNUsesOrMore(UsesLimit) &&
516 all_of(
517 BO->uses(),
518 [](const Use &U) {
519 // Commutative, if icmp eq/ne sub, 0
520 CmpPredicate Pred;
521 if (match(U.getUser(),
522 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
523 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
524 return true;
525 // Commutative, if abs(sub nsw, true) or abs(sub, false).
526 ConstantInt *Flag;
527 return match(U.getUser(),
528 m_Intrinsic<Intrinsic::abs>(
529 m_Specific(U.get()), m_ConstantInt(Flag))) &&
530 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
531 Flag->isOne());
532 })) ||
533 (BO->getOpcode() == Instruction::FSub &&
534 !BO->hasNUsesOrMore(UsesLimit) &&
535 all_of(BO->uses(), [](const Use &U) {
536 return match(U.getUser(),
537 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
538 }));
539 return I->isCommutative();
540}
541
542template <typename T>
543static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
544 unsigned Offset) {
545 static_assert(std::is_same_v<T, InsertElementInst> ||
546 std::is_same_v<T, ExtractElementInst>,
547 "unsupported T");
548 int Index = Offset;
549 if (const auto *IE = dyn_cast<T>(Inst)) {
550 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
551 if (!VT)
552 return std::nullopt;
553 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
554 if (!CI)
555 return std::nullopt;
556 if (CI->getValue().uge(VT->getNumElements()))
557 return std::nullopt;
558 Index *= VT->getNumElements();
559 Index += CI->getZExtValue();
560 return Index;
561 }
562 return std::nullopt;
563}
564
565/// \returns inserting or extracting index of InsertElement, ExtractElement or
566/// InsertValue instruction, using Offset as base offset for index.
567/// \returns std::nullopt if the index is not an immediate.
568static std::optional<unsigned> getElementIndex(const Value *Inst,
569 unsigned Offset = 0) {
570 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
571 return Index;
572 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
573 return Index;
574
575 int Index = Offset;
576
577 const auto *IV = dyn_cast<InsertValueInst>(Inst);
578 if (!IV)
579 return std::nullopt;
580
581 Type *CurrentType = IV->getType();
582 for (unsigned I : IV->indices()) {
583 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
584 Index *= ST->getNumElements();
585 CurrentType = ST->getElementType(I);
586 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
587 Index *= AT->getNumElements();
588 CurrentType = AT->getElementType();
589 } else {
590 return std::nullopt;
591 }
592 Index += I;
593 }
594 return Index;
595}
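// Illustrative examples (added for exposition, not part of the original file):
//   %i = insertelement <4 x i32> %v, i32 %x, i32 2
//     ; getElementIndex(%i) == 2
//   %a = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %x, 1, 0
//     ; getElementIndex(%a) == 2, since the aggregate is flattened and
//     ; indices {1, 0} map to the linear index 1 * 2 + 0.
// A non-constant insertelement index yields std::nullopt.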
596
597namespace {
598/// Specifies the way the mask should be analyzed for undefs/poisonous elements
599/// in the shuffle mask.
600enum class UseMask {
601 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
602 ///< check for the mask elements for the first argument (mask
603 ///< indices are in range [0:VF)).
604 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
605 ///< for the mask elements for the second argument (mask indices
606 ///< are in range [VF:2*VF))
607 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
608 ///< future shuffle elements and mark them as used in the
609 ///< future. Non-undef elements are considered as unused since
610 ///< they're already marked as used in the mask.
611};
612} // namespace
613
614/// Prepares a use bitset for the given mask either for the first argument or
615/// for the second.
617 UseMask MaskArg) {
618 SmallBitVector UseMask(VF, true);
619 for (auto [Idx, Value] : enumerate(Mask)) {
620 if (Value == PoisonMaskElem) {
621 if (MaskArg == UseMask::UndefsAsMask)
622 UseMask.reset(Idx);
623 continue;
624 }
625 if (MaskArg == UseMask::FirstArg && Value < VF)
626 UseMask.reset(Value);
627 else if (MaskArg == UseMask::SecondArg && Value >= VF)
628 UseMask.reset(Value - VF);
629 }
630 return UseMask;
631}
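// Worked example (illustration only): for VF = 4 and Mask = {0, 5, -1, 2},
// buildUseMask(4, Mask, UseMask::FirstArg) starts all-true and clears bits 0
// and 2 (the first-vector lanes referenced by the mask), leaving bits 1 and 3
// set; with UseMask::SecondArg only bit 5 - 4 = 1 is cleared.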
632
633/// Checks if the given value is actually an undefined constant vector.
634/// Also, if the \p UseMask is not empty, tries to check if the non-masked
635/// elements actually mask the insertelement buildvector, if any.
636template <bool IsPoisonOnly = false>
638 const SmallBitVector &UseMask = {}) {
639 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
640 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
641 if (isa<T>(V))
642 return Res;
643 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
644 if (!VecTy)
645 return Res.reset();
646 auto *C = dyn_cast<Constant>(V);
647 if (!C) {
648 if (!UseMask.empty()) {
649 const Value *Base = V;
650 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
651 Base = II->getOperand(0);
652 if (isa<T>(II->getOperand(1)))
653 continue;
654 std::optional<unsigned> Idx = getElementIndex(II);
655 if (!Idx) {
656 Res.reset();
657 return Res;
658 }
659 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
660 Res.reset(*Idx);
661 }
662 // TODO: Add analysis for shuffles here too.
663 if (V == Base) {
664 Res.reset();
665 } else {
666 SmallBitVector SubMask(UseMask.size(), false);
667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
668 }
669 } else {
670 Res.reset();
671 }
672 return Res;
673 }
674 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
675 if (Constant *Elem = C->getAggregateElement(I))
676 if (!isa<T>(Elem) &&
677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
678 Res.reset(I);
679 }
680 return Res;
681}
682
683/// Checks if the vector of instructions can be represented as a shuffle, like:
684/// %x0 = extractelement <4 x i8> %x, i32 0
685/// %x3 = extractelement <4 x i8> %x, i32 3
686/// %y1 = extractelement <4 x i8> %y, i32 1
687/// %y2 = extractelement <4 x i8> %y, i32 2
688/// %x0x0 = mul i8 %x0, %x0
689/// %x3x3 = mul i8 %x3, %x3
690/// %y1y1 = mul i8 %y1, %y1
691/// %y2y2 = mul i8 %y2, %y2
692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
696/// ret <4 x i8> %ins4
697/// can be transformed into:
698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
699/// i32 6>
700/// %2 = mul <4 x i8> %1, %1
701/// ret <4 x i8> %2
702/// Mask will return the Shuffle Mask equivalent to the extracted elements.
703/// TODO: Can we split off and reuse the shuffle mask detection from
704/// ShuffleVectorInst/getShuffleCost?
705static std::optional<TargetTransformInfo::ShuffleKind>
707 AssumptionCache *AC) {
708 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
709 if (It == VL.end())
710 return std::nullopt;
711 unsigned Size =
712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
714 if (!EI)
715 return S;
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
717 if (!VTy)
718 return S;
719 return std::max(S, VTy->getNumElements());
720 });
721
722 Value *Vec1 = nullptr;
723 Value *Vec2 = nullptr;
724 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
725 auto *EE = dyn_cast<ExtractElementInst>(V);
726 if (!EE)
727 return false;
728 Value *Vec = EE->getVectorOperand();
729 if (isa<UndefValue>(Vec))
730 return false;
731 return isGuaranteedNotToBePoison(Vec, AC);
732 });
733 enum ShuffleMode { Unknown, Select, Permute };
734 ShuffleMode CommonShuffleMode = Unknown;
735 Mask.assign(VL.size(), PoisonMaskElem);
736 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
737 // Undef can be represented as an undef element in a vector.
738 if (isa<UndefValue>(VL[I]))
739 continue;
740 auto *EI = cast<ExtractElementInst>(VL[I]);
741 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742 return std::nullopt;
743 auto *Vec = EI->getVectorOperand();
744 // We can extractelement from undef or poison vector.
745 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
746 continue;
747 // All vector operands must have the same number of vector elements.
748 if (isa<UndefValue>(Vec)) {
749 Mask[I] = I;
750 } else {
751 if (isa<UndefValue>(EI->getIndexOperand()))
752 continue;
753 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
754 if (!Idx)
755 return std::nullopt;
756 // Undefined behavior if Idx is negative or >= Size.
757 if (Idx->getValue().uge(Size))
758 continue;
759 unsigned IntIdx = Idx->getValue().getZExtValue();
760 Mask[I] = IntIdx;
761 }
762 if (isUndefVector(Vec).all() && HasNonUndefVec)
763 continue;
764 // For correct shuffling we have to have at most 2 different vector operands
765 // in all extractelement instructions.
766 if (!Vec1 || Vec1 == Vec) {
767 Vec1 = Vec;
768 } else if (!Vec2 || Vec2 == Vec) {
769 Vec2 = Vec;
770 Mask[I] += Size;
771 } else {
772 return std::nullopt;
773 }
774 if (CommonShuffleMode == Permute)
775 continue;
776 // If the extract index is not the same as the operation number, it is a
777 // permutation.
778 if (Mask[I] % Size != I) {
779 CommonShuffleMode = Permute;
780 continue;
781 }
782 CommonShuffleMode = Select;
783 }
784 // If we're not crossing lanes in different vectors, consider it as blending.
785 if (CommonShuffleMode == Select && Vec2)
787 // If Vec2 was never used, we have a permutation of a single vector, otherwise
788 // we have permutation of 2 vectors.
791}
792
793/// \returns True if Extract{Value,Element} instruction extracts element Idx.
794static std::optional<unsigned> getExtractIndex(Instruction *E) {
795 unsigned Opcode = E->getOpcode();
796 assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798 "Expected extractelement or extractvalue instruction.");
799 if (Opcode == Instruction::ExtractElement) {
800 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
801 if (!CI)
802 return std::nullopt;
803 return CI->getZExtValue();
804 }
805 auto *EI = cast<ExtractValueInst>(E);
806 if (EI->getNumIndices() != 1)
807 return std::nullopt;
808 return *EI->idx_begin();
809}
810
811namespace {
812
813/// Main data required for vectorization of instructions.
814class InstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0.
816 Instruction *MainOp = nullptr;
817 Instruction *AltOp = nullptr;
818
819public:
820 Instruction *getMainOp() const {
821 assert(valid() && "InstructionsState is invalid.");
822 return MainOp;
823 }
824
825 Instruction *getAltOp() const {
826 assert(valid() && "InstructionsState is invalid.");
827 return AltOp;
828 }
829
830 /// The main/alternate opcodes for the list of instructions.
831 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
832
833 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
834
835 /// Some of the instructions in the list have alternate opcodes.
836 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
837
838 bool isOpcodeOrAlt(Instruction *I) const {
839 unsigned CheckedOpcode = I->getOpcode();
840 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
841 }
842
843 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
844 bool valid() const { return MainOp && AltOp; }
845
846 explicit operator bool() const { return valid(); }
847
848 InstructionsState() = delete;
849 InstructionsState(Instruction *MainOp, Instruction *AltOp)
850 : MainOp(MainOp), AltOp(AltOp) {}
851 static InstructionsState invalid() { return {nullptr, nullptr}; }
852};
853
854} // end anonymous namespace
855
856/// \returns true if \p Opcode is allowed as part of the main/alternate
857/// instruction for SLP vectorization.
858///
859/// Example of unsupported opcode is SDIV that can potentially cause UB if the
860/// "shuffled out" lane would result in division by zero.
861static bool isValidForAlternation(unsigned Opcode) {
862 if (Instruction::isIntDivRem(Opcode))
863 return false;
864
865 return true;
866}
867
868static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
869 const TargetLibraryInfo &TLI);
870
871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
872/// compatible instructions or constants, or just some other regular values.
873static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
874 Value *Op1, const TargetLibraryInfo &TLI) {
875 return (isConstant(BaseOp0) && isConstant(Op0)) ||
876 (isConstant(BaseOp1) && isConstant(Op1)) ||
877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
880 getSameOpcode({BaseOp0, Op0}, TLI) ||
881 getSameOpcode({BaseOp1, Op1}, TLI);
882}
883
884/// \returns true if a compare instruction \p CI has similar "look" and
885/// same predicate as \p BaseCI, "as is" or with its operands and predicate
886/// swapped, false otherwise.
887static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
888 const TargetLibraryInfo &TLI) {
889 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
890 "Assessing comparisons of different types?");
891 CmpInst::Predicate BasePred = BaseCI->getPredicate();
892 CmpInst::Predicate Pred = CI->getPredicate();
894
895 Value *BaseOp0 = BaseCI->getOperand(0);
896 Value *BaseOp1 = BaseCI->getOperand(1);
897 Value *Op0 = CI->getOperand(0);
898 Value *Op1 = CI->getOperand(1);
899
900 return (BasePred == Pred &&
901 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
902 (BasePred == SwappedPred &&
903 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
904}
905
906/// \returns an analysis of the Instructions in \p VL described in
907/// InstructionsState, i.e. the opcode with which we suppose the whole list
908/// could be vectorized even if its structure is diverse.
909static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
910 const TargetLibraryInfo &TLI) {
911 // Make sure these are all Instructions.
912 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
913 return InstructionsState::invalid();
914
915 auto *It = find_if(VL, IsaPred<Instruction>);
916 if (It == VL.end())
917 return InstructionsState::invalid();
918
919 Instruction *MainOp = cast<Instruction>(*It);
920 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
921 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
922 (VL.size() == 2 && InstCnt < 2))
923 return InstructionsState::invalid();
924
925 bool IsCastOp = isa<CastInst>(MainOp);
926 bool IsBinOp = isa<BinaryOperator>(MainOp);
927 bool IsCmpOp = isa<CmpInst>(MainOp);
928 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
930 Instruction *AltOp = MainOp;
931 unsigned Opcode = MainOp->getOpcode();
932 unsigned AltOpcode = Opcode;
933
934 bool SwappedPredsCompatible = IsCmpOp && [&]() {
935 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
936 UniquePreds.insert(BasePred);
937 UniqueNonSwappedPreds.insert(BasePred);
938 for (Value *V : VL) {
939 auto *I = dyn_cast<CmpInst>(V);
940 if (!I)
941 return false;
942 CmpInst::Predicate CurrentPred = I->getPredicate();
943 CmpInst::Predicate SwappedCurrentPred =
944 CmpInst::getSwappedPredicate(CurrentPred);
945 UniqueNonSwappedPreds.insert(CurrentPred);
946 if (!UniquePreds.contains(CurrentPred) &&
947 !UniquePreds.contains(SwappedCurrentPred))
948 UniquePreds.insert(CurrentPred);
949 }
950 // If the total number of predicates is > 2, but only 2 remain after
951 // treating swapped predicates as equal, consider swappable predicates
952 // as compatible opcodes, not alternates.
953 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
954 }();
955 // Check for one alternate opcode from another BinaryOperator.
956 // TODO - generalize to support all operators (types, calls etc.).
957 Intrinsic::ID BaseID = 0;
958 SmallVector<VFInfo> BaseMappings;
959 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
961 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
962 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
963 return InstructionsState::invalid();
964 }
965 bool AnyPoison = InstCnt != VL.size();
966 // Check MainOp too to be sure that it matches the requirements for the
967 // instructions.
968 for (Value *V : iterator_range(It, VL.end())) {
969 auto *I = dyn_cast<Instruction>(V);
970 if (!I)
971 continue;
972
973 // Cannot combine poison and divisions.
974 // TODO: do some smart analysis of the CallInsts to exclude divide-like
975 // intrinsics/functions only.
976 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
977 return InstructionsState::invalid();
978 unsigned InstOpcode = I->getOpcode();
979 if (IsBinOp && isa<BinaryOperator>(I)) {
980 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
981 continue;
982 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
983 isValidForAlternation(Opcode)) {
984 AltOpcode = InstOpcode;
985 AltOp = I;
986 continue;
987 }
988 } else if (IsCastOp && isa<CastInst>(I)) {
989 Value *Op0 = MainOp->getOperand(0);
990 Type *Ty0 = Op0->getType();
991 Value *Op1 = I->getOperand(0);
992 Type *Ty1 = Op1->getType();
993 if (Ty0 == Ty1) {
994 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
995 continue;
996 if (Opcode == AltOpcode) {
998 isValidForAlternation(InstOpcode) &&
999 "Cast isn't safe for alternation, logic needs to be updated!");
1000 AltOpcode = InstOpcode;
1001 AltOp = I;
1002 continue;
1003 }
1004 }
1005 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1006 auto *BaseInst = cast<CmpInst>(MainOp);
1007 Type *Ty0 = BaseInst->getOperand(0)->getType();
1008 Type *Ty1 = Inst->getOperand(0)->getType();
1009 if (Ty0 == Ty1) {
1010 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1011 assert(InstOpcode == AltOpcode &&
1012 "Alternate instructions are only supported by BinaryOperator "
1013 "and CastInst.");
1014 // Check for compatible operands. If the corresponding operands are not
1015 // compatible - need to perform alternate vectorization.
1016 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1017 CmpInst::Predicate SwappedCurrentPred =
1018 CmpInst::getSwappedPredicate(CurrentPred);
1019
1020 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1021 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1022 continue;
1023
1024 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1025 continue;
1026 auto *AltInst = cast<CmpInst>(AltOp);
1027 if (MainOp != AltOp) {
1028 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1029 continue;
1030 } else if (BasePred != CurrentPred) {
1031 assert(
1032 isValidForAlternation(InstOpcode) &&
1033 "CmpInst isn't safe for alternation, logic needs to be updated!");
1034 AltOp = I;
1035 continue;
1036 }
1037 CmpInst::Predicate AltPred = AltInst->getPredicate();
1038 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1039 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1040 continue;
1041 }
1042 } else if (InstOpcode == Opcode) {
1043 assert(InstOpcode == AltOpcode &&
1044 "Alternate instructions are only supported by BinaryOperator and "
1045 "CastInst.");
1046 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1047 if (Gep->getNumOperands() != 2 ||
1048 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1049 return InstructionsState::invalid();
1050 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1052 return InstructionsState::invalid();
1053 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1054 auto *BaseLI = cast<LoadInst>(MainOp);
1055 if (!LI->isSimple() || !BaseLI->isSimple())
1056 return InstructionsState::invalid();
1057 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1058 auto *CallBase = cast<CallInst>(MainOp);
1059 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1060 return InstructionsState::invalid();
1061 if (Call->hasOperandBundles() &&
1063 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1064 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1065 CallBase->op_begin() +
1067 return InstructionsState::invalid();
1069 if (ID != BaseID)
1070 return InstructionsState::invalid();
1071 if (!ID) {
1072 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1073 if (Mappings.size() != BaseMappings.size() ||
1074 Mappings.front().ISA != BaseMappings.front().ISA ||
1075 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1076 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1077 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1078 Mappings.front().Shape.Parameters !=
1079 BaseMappings.front().Shape.Parameters)
1080 return InstructionsState::invalid();
1081 }
1082 }
1083 continue;
1084 }
1085 return InstructionsState::invalid();
1086 }
1087
1088 return InstructionsState(MainOp, AltOp);
1089}
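// Illustrative behaviour (added for exposition, not part of the original file):
// for VL = {add, sub, add, sub} on matching types, the returned
// InstructionsState has MainOp = the first add and AltOp = the first sub, so
// isAltShuffle() is true; for VL = {add, sdiv} the result is invalid(),
// because integer div/rem is rejected by isValidForAlternation().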
1090
1091/// \returns true if all of the values in \p VL have the same type or false
1092/// otherwise.
1094 Type *Ty = VL.front()->getType();
1095 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1096}
1097
1098/// \returns True if an in-tree use also needs an extract. This refers to a
1099/// possible scalar operand in a vectorized instruction.
1100static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1101 TargetLibraryInfo *TLI,
1102 const TargetTransformInfo *TTI) {
1103 if (!UserInst)
1104 return false;
1105 unsigned Opcode = UserInst->getOpcode();
1106 switch (Opcode) {
1107 case Instruction::Load: {
1108 LoadInst *LI = cast<LoadInst>(UserInst);
1109 return (LI->getPointerOperand() == Scalar);
1110 }
1111 case Instruction::Store: {
1112 StoreInst *SI = cast<StoreInst>(UserInst);
1113 return (SI->getPointerOperand() == Scalar);
1114 }
1115 case Instruction::Call: {
1116 CallInst *CI = cast<CallInst>(UserInst);
1118 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1119 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1120 Arg.value().get() == Scalar;
1121 });
1122 }
1123 default:
1124 return false;
1125 }
1126}
1127
1128/// \returns the AA location that is being accessed by the instruction.
1130 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1131 return MemoryLocation::get(SI);
1132 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1133 return MemoryLocation::get(LI);
1134 return MemoryLocation();
1135}
1136
1137/// \returns True if the instruction is not a volatile or atomic load/store.
1138static bool isSimple(Instruction *I) {
1139 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1140 return LI->isSimple();
1141 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1142 return SI->isSimple();
1143 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1144 return !MI->isVolatile();
1145 return true;
1146}
1147
1148/// Shuffles \p Mask in accordance with the given \p SubMask.
1149/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1150/// one but two input vectors.
1151static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1152 bool ExtendingManyInputs = false) {
1153 if (SubMask.empty())
1154 return;
1155 assert(
1156 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1157 // Check if input scalars were extended to match the size of other node.
1158 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1159 "SubMask with many inputs support must be larger than the mask.");
1160 if (Mask.empty()) {
1161 Mask.append(SubMask.begin(), SubMask.end());
1162 return;
1163 }
1164 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1165 int TermValue = std::min(Mask.size(), SubMask.size());
1166 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1167 if (SubMask[I] == PoisonMaskElem ||
1168 (!ExtendingManyInputs &&
1169 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1170 continue;
1171 NewMask[I] = Mask[SubMask[I]];
1172 }
1173 Mask.swap(NewMask);
1174}
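// Worked example (illustration only): with Mask = {3, 2, 1, 0} and
// SubMask = {2, 0, -1, 1}, the combined mask becomes
// {Mask[2], Mask[0], -1, Mask[1]} = {1, 3, -1, 2}.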
1175
1176/// Order may have elements assigned a special value (size) which is out of
1177/// bounds. Such indices only appear in places which correspond to undef values
1178/// (see canReuseExtract for details) and are used to prevent undef values from
1179/// having an effect on operand ordering.
1180/// The first loop below simply finds all unused indices and then the next loop
1181/// nest assigns these indices to the positions of undef values.
1182/// As an example below Order has two undef positions and they have assigned
1183/// values 3 and 7 respectively:
1184/// before: 6 9 5 4 9 2 1 0
1185/// after: 6 3 5 4 7 2 1 0
1187 const unsigned Sz = Order.size();
1188 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1189 SmallBitVector MaskedIndices(Sz);
1190 for (unsigned I = 0; I < Sz; ++I) {
1191 if (Order[I] < Sz)
1192 UnusedIndices.reset(Order[I]);
1193 else
1194 MaskedIndices.set(I);
1195 }
1196 if (MaskedIndices.none())
1197 return;
1198 assert(UnusedIndices.count() == MaskedIndices.count() &&
1199 "Non-synced masked/available indices.");
1200 int Idx = UnusedIndices.find_first();
1201 int MIdx = MaskedIndices.find_first();
1202 while (MIdx >= 0) {
1203 assert(Idx >= 0 && "Indices must be synced.");
1204 Order[MIdx] = Idx;
1205 Idx = UnusedIndices.find_next(Idx);
1206 MIdx = MaskedIndices.find_next(MIdx);
1207 }
1208}
1209
1210/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1211/// Opcode1.
1213 unsigned Opcode1) {
1214 Type *ScalarTy = VL[0]->getType();
1215 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1216 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1217 for (unsigned Lane : seq<unsigned>(VL.size())) {
1218 if (isa<PoisonValue>(VL[Lane]))
1219 continue;
1220 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1221 OpcodeMask.set(Lane * ScalarTyNumElements,
1222 Lane * ScalarTyNumElements + ScalarTyNumElements);
1223 }
1224 return OpcodeMask;
1225}
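// Worked example (illustration only): for VL = {add, sub, add, sub} with a
// scalar (non-vector) element type, Opcode0 = Instruction::Add and
// Opcode1 = Instruction::Sub, the returned bitset has bits 1 and 3 set, i.e.
// the lanes that use the alternate opcode.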
1226
1227namespace llvm {
1228
1230 SmallVectorImpl<int> &Mask) {
1231 Mask.clear();
1232 const unsigned E = Indices.size();
1233 Mask.resize(E, PoisonMaskElem);
1234 for (unsigned I = 0; I < E; ++I)
1235 Mask[Indices[I]] = I;
1236}
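// Worked example (illustration only): for Indices = {2, 0, 1} the resulting
// Mask is {1, 2, 0}, since Mask[Indices[I]] = I for each I.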
1237
1238/// Reorders the list of scalars in accordance with the given \p Mask.
1240 ArrayRef<int> Mask) {
1241 assert(!Mask.empty() && "Expected non-empty mask.");
1242 SmallVector<Value *> Prev(Scalars.size(),
1243 PoisonValue::get(Scalars.front()->getType()));
1244 Prev.swap(Scalars);
1245 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1246 if (Mask[I] != PoisonMaskElem)
1247 Scalars[Mask[I]] = Prev[I];
1248}
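// Worked example (illustration only): for Scalars = {a, b, c} and
// Mask = {2, 0, 1}, the reordered list is {b, c, a}, because each original
// element Prev[I] is moved to position Mask[I].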
1249
1250/// Checks if the provided value does not require scheduling. It does not
1251/// require scheduling if this is not an instruction or it is an instruction
1252/// that does not read/write memory and all operands are either not instructions
1253/// or phi nodes or instructions from different blocks.
1255 auto *I = dyn_cast<Instruction>(V);
1256 if (!I)
1257 return true;
1258 return !mayHaveNonDefUseDependency(*I) &&
1259 all_of(I->operands(), [I](Value *V) {
1260 auto *IO = dyn_cast<Instruction>(V);
1261 if (!IO)
1262 return true;
1263 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1264 });
1265}
1266
1267/// Checks if the provided value does not require scheduling. It does not
1268/// require scheduling if this is not an instruction or it is an instruction
1269/// that does not read/write memory and all users are phi nodes or instructions
1270/// from different blocks.
1271static bool isUsedOutsideBlock(Value *V) {
1272 auto *I = dyn_cast<Instruction>(V);
1273 if (!I)
1274 return true;
1275 // Limits the number of uses to save compile time.
1276 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1277 all_of(I->users(), [I](User *U) {
1278 auto *IU = dyn_cast<Instruction>(U);
1279 if (!IU)
1280 return true;
1281 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1282 });
1283}
1284
1285/// Checks if the specified value does not require scheduling. It does not
1286/// require scheduling if all operands and all users do not need to be scheduled
1287/// in the current basic block.
1290}
1291
1292/// Checks if the specified array of instructions does not require scheduling.
1293/// It is so if either all instructions have operands that do not require
1294/// scheduling, or all their users do not require scheduling since they are
1295/// phis or in other basic blocks.
1297 return !VL.empty() &&
1299}
1300
1301/// Returns true if the widened type of \p Ty elements with size \p Sz represents
1302/// a full vector type, i.e. adding an extra element results in extra parts upon
1303/// type legalization.
1305 unsigned Sz) {
1306 if (Sz <= 1)
1307 return false;
1308 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1309 return false;
1310 if (has_single_bit(Sz))
1311 return true;
1312 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1313 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1314 Sz % NumParts == 0;
1315}
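// Illustrative behaviour (added for exposition; the non-power-of-2 case depends
// on the target): Sz = 8 returns true via the power-of-2 fast path, and Sz = 1
// returns false. For Sz = 6 and i32 on a target where
// TTI.getNumberOfParts(<6 x i32>) == 2, the result is false because
// 6 / 2 = 3 is not a power of two.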
1316
1317/// Returns the number of parts the type \p VecTy will be split into at the
1318/// codegen phase. If the type is going to be scalarized or does not use whole
1319/// registers, returns 1.
1320static unsigned
1322 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1323 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1324 if (NumParts == 0 || NumParts >= Limit)
1325 return 1;
1326 unsigned Sz = getNumElements(VecTy);
1327 if (NumParts >= Sz || Sz % NumParts != 0 ||
1328 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1329 return 1;
1330 return NumParts;
1331}
1332
1333namespace slpvectorizer {
1334
1335/// Bottom Up SLP Vectorizer.
1336class BoUpSLP {
1337 struct TreeEntry;
1338 struct ScheduleData;
1341
1342public:
1343 /// Tracks the state we can represent the loads in the given sequence.
1344 enum class LoadsState {
1345 Gather,
1346 Vectorize,
1349 };
1350
1357
1359 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1362 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1363 AC(AC), DB(DB), DL(DL), ORE(ORE),
1364 Builder(Se->getContext(), TargetFolder(*DL)) {
1365 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1366 // Use the vector register size specified by the target unless overridden
1367 // by a command-line option.
1368 // TODO: It would be better to limit the vectorization factor based on
1369 // data type rather than just register size. For example, x86 AVX has
1370 // 256-bit registers, but it does not support integer operations
1371 // at that width (that requires AVX2).
1372 if (MaxVectorRegSizeOption.getNumOccurrences())
1373 MaxVecRegSize = MaxVectorRegSizeOption;
1374 else
1375 MaxVecRegSize =
1377 .getFixedValue();
1378
1379 if (MinVectorRegSizeOption.getNumOccurrences())
1380 MinVecRegSize = MinVectorRegSizeOption;
1381 else
1382 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1383 }
1384
1385 /// Vectorize the tree that starts with the elements in \p VL.
1386 /// Returns the vectorized root.
1388
1389 /// Vectorize the tree but with the list of externally used values \p
1390 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1391 /// generated extractvalue instructions.
1392 Value *
1393 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1394 Instruction *ReductionRoot = nullptr);
1395
1396 /// \returns the cost incurred by unwanted spills and fills, caused by
1397 /// holding live values over call sites.
1399
1400 /// \returns the vectorization cost of the subtree that starts at \p VL.
1401 /// A negative number means that this is profitable.
1402 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1403
1404 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1405 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1406 void buildTree(ArrayRef<Value *> Roots,
1407 const SmallDenseSet<Value *> &UserIgnoreLst);
1408
1409 /// Construct a vectorizable tree that starts at \p Roots.
1410 void buildTree(ArrayRef<Value *> Roots);
1411
1412 /// Returns whether the root node has in-tree uses.
1414 return !VectorizableTree.empty() &&
1415 !VectorizableTree.front()->UserTreeIndices.empty();
1416 }
1417
1418 /// Return the scalars of the root node.
1420 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1421 return VectorizableTree.front()->Scalars;
1422 }
1423
1424 /// Returns the type/is-signed info for the root node in the graph without
1425 /// casting.
1426 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1427 const TreeEntry &Root = *VectorizableTree.front().get();
1428 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1429 !Root.Scalars.front()->getType()->isIntegerTy())
1430 return std::nullopt;
1431 auto It = MinBWs.find(&Root);
1432 if (It != MinBWs.end())
1433 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1434 It->second.first),
1435 It->second.second);
1436 if (Root.getOpcode() == Instruction::ZExt ||
1437 Root.getOpcode() == Instruction::SExt)
1438 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1439 Root.getOpcode() == Instruction::SExt);
1440 return std::nullopt;
1441 }
1442
1443 /// Checks if the root graph node can be emitted with narrower bitwidth at
1444 /// codegen and returns it signedness, if so.
1446 return MinBWs.at(VectorizableTree.front().get()).second;
1447 }
1448
1449 /// Returns reduction type after minbitdth analysis.
1451 if (ReductionBitWidth == 0 ||
1452 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1453 ReductionBitWidth >=
1454 DL->getTypeSizeInBits(
1455 VectorizableTree.front()->Scalars.front()->getType()))
1456 return getWidenedType(
1457 VectorizableTree.front()->Scalars.front()->getType(),
1458 VectorizableTree.front()->getVectorFactor());
1459 return getWidenedType(
1461 VectorizableTree.front()->Scalars.front()->getContext(),
1462 ReductionBitWidth),
1463 VectorizableTree.front()->getVectorFactor());
1464 }
1465
1466 /// Builds external uses of the vectorized scalars, i.e. the list of
1467 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1468 /// ExternallyUsedValues contains an additional list of external uses to handle
1469 /// vectorization of reductions.
1470 void
1471 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1472
1473 /// Transforms graph nodes to target specific representations, if profitable.
1474 void transformNodes();
1475
1476 /// Clear the internal data structures that are created by 'buildTree'.
1477 void deleteTree() {
1478 VectorizableTree.clear();
1479 ScalarToTreeEntries.clear();
1480 MustGather.clear();
1481 NonScheduledFirst.clear();
1482 EntryToLastInstruction.clear();
1483 LoadEntriesToVectorize.clear();
1484 IsGraphTransformMode = false;
1485 GatheredLoadsEntriesFirst.reset();
1486 ExternalUses.clear();
1487 ExternalUsesAsOriginalScalar.clear();
1488 for (auto &Iter : BlocksSchedules) {
1489 BlockScheduling *BS = Iter.second.get();
1490 BS->clear();
1491 }
1492 MinBWs.clear();
1493 ReductionBitWidth = 0;
1494 BaseGraphSize = 1;
1495 CastMaxMinBWSizes.reset();
1496 ExtraBitWidthNodes.clear();
1497 InstrElementSize.clear();
1498 UserIgnoreList = nullptr;
1499 PostponedGathers.clear();
1500 ValueToGatherNodes.clear();
1501 }
1502
1503 unsigned getTreeSize() const { return VectorizableTree.size(); }
1504
1505 /// Returns the base graph size, before any transformations.
1506 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1507
1508 /// Perform LICM and CSE on the newly generated gather sequences.
1510
1511 /// Does this non-empty order represent an identity order? Identity
1512 /// should be represented as an empty order, so this is used to
1513 /// decide if we can canonicalize a computed order. Undef elements
1514 /// (represented as size) are ignored.
1516 assert(!Order.empty() && "expected non-empty order");
1517 const unsigned Sz = Order.size();
1518 return all_of(enumerate(Order), [&](const auto &P) {
1519 return P.value() == P.index() || P.value() == Sz;
1520 });
1521 }
1522
1523 /// Checks if the specified gather tree entry \p TE can be represented as a
1524 /// shuffled vector entry + (possibly) permutation with other gathers. It
1525 /// implements the checks only for possibly ordered scalars (Loads,
1526 /// ExtractElement, ExtractValue), which can be part of the graph.
1527 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1528
1529 /// Sort loads into increasing pointer offsets to allow greater clustering.
1530 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1531
1532 /// Gets reordering data for the given tree entry. If the entry is vectorized
1533 /// - just return ReorderIndices, otherwise check if the scalars can be
1534 /// reordered and return the most optimal order.
1535 /// \return std::nullopt if ordering is not important, empty order, if
1536 /// identity order is important, or the actual order.
1537 /// \param TopToBottom If true, include the order of vectorized stores and
1538 /// insertelement nodes, otherwise skip them.
1539 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1540 bool TopToBottom);
1541
1542 /// Reorders the current graph to the most profitable order starting from the
1543 /// root node to the leaf nodes. The best order is chosen only from the nodes
1544 /// of the same size (vectorization factor). Smaller nodes are considered
1545 /// parts of a subgraph with a smaller VF and they are reordered independently.
1546 /// We can do this because we still need to extend smaller nodes to the wider
1547 /// VF and we can merge reordering shuffles with the widening shuffles.
1548 void reorderTopToBottom();
1549
1550 /// Reorders the current graph to the most profitable order starting from
1551 /// the leaves to the root. It allows rotating small subgraphs and reducing the
1552 /// number of reshuffles if the leaf nodes use the same order. In this case we
1553 /// can merge the orders and just shuffle the user node instead of shuffling its
1554 /// operands. Plus, even if the leaf nodes have different orders, it allows
1555 /// sinking the reordering in the graph closer to the root node and merging it
1556 /// later during analysis.
1557 void reorderBottomToTop(bool IgnoreReorder = false);
1558
1559 /// \return The vector element size in bits to use when vectorizing the
1560 /// expression tree ending at \p V. If V is a store, the size is the width of
1561 /// the stored value. Otherwise, the size is the width of the largest loaded
1562 /// value reaching V. This method is used by the vectorizer to calculate
1563 /// vectorization factors.
1564 unsigned getVectorElementSize(Value *V);
1565
1566 /// Compute the minimum type sizes required to represent the entries in a
1567 /// vectorizable tree.
1569
1570 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1571 unsigned getMaxVecRegSize() const {
1572 return MaxVecRegSize;
1573 }
1574
1575 // \returns minimum vector register size as set by cl::opt.
1576 unsigned getMinVecRegSize() const {
1577 return MinVecRegSize;
1578 }
1579
1580 unsigned getMinVF(unsigned Sz) const {
1581 return std::max(2U, getMinVecRegSize() / Sz);
1582 }
1583
1584 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1585 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1586 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1587 return MaxVF ? MaxVF : UINT_MAX;
1588 }
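  // Illustrative sketch (not taken from the pass; assumes MinVecRegSize is
  // 128 bits and the elements are 32 bits wide):
  //   unsigned MinVF = getMinVF(/*Sz=*/32);   // std::max(2U, 128 / 32) == 4
  //   unsigned MaxVF = getMaximumVF(32, Instruction::Add);
  //   // MaxVF is UINT_MAX when neither MaxVFOption nor TTI impose a limit.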
1589
1590 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1591 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1592 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1593 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1594 ///
1595 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
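 /// For example (an illustrative sketch, not a normative list): the aggregate
 /// type
 /// \code
 ///   { [4 x i16], [4 x i16] }
 /// \endcode
 /// is isomorphic to <8 x i16>, so 8 would be returned, while a
 /// non-homogeneous aggregate such as { i16, i32 } yields 0.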
1596 unsigned canMapToVector(Type *T) const;
1597
1598 /// \returns True if the VectorizableTree is both tiny and not fully
1599 /// vectorizable. We do not vectorize such trees.
1600 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1601
1602 /// Checks if the graph and all its subgraphs cannot be better vectorized.
 1603 /// It may happen if all gather nodes are loads and they cannot be
1604 /// "clusterized". In this case even subgraphs cannot be vectorized more
1605 /// effectively than the base graph.
1606 bool isTreeNotExtendable() const;
1607
1608 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1609 /// can be load combined in the backend. Load combining may not be allowed in
1610 /// the IR optimizer, so we do not want to alter the pattern. For example,
1611 /// partially transforming a scalar bswap() pattern into vector code is
1612 /// effectively impossible for the backend to undo.
1613 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1614 /// may not be necessary.
1615 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1616
1617 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1618 /// can be load combined in the backend. Load combining may not be allowed in
1619 /// the IR optimizer, so we do not want to alter the pattern. For example,
1620 /// partially transforming a scalar bswap() pattern into vector code is
1621 /// effectively impossible for the backend to undo.
1622 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1623 /// may not be necessary.
1624 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1625
 1626 /// Checks if the given array of loads can be represented as vectorized loads,
 1627 /// a scatter, or just a simple gather.
1628 /// \param VL list of loads.
1629 /// \param VL0 main load value.
1630 /// \param Order returned order of load instructions.
1631 /// \param PointerOps returned list of pointer operands.
 1632 /// \param BestVF returns the best vector factor if the recursive check found
 1633 /// better vectorization sequences than a masked gather.
 1634 /// \param TryRecursiveCheck used to check if a long masked gather can be
 1635 /// represented as a series of loads/insert subvector, if profitable.
 1636  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
 1637                               SmallVectorImpl<unsigned> &Order,
 1638                               SmallVectorImpl<Value *> &PointerOps,
1639 unsigned *BestVF = nullptr,
1640 bool TryRecursiveCheck = true) const;
1641
 1642 /// Registers a non-vectorizable sequence of loads.
1643 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1644 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1645 }
1646
 1647 /// Checks if the given sequence of loads is already known to be not vectorizable.
1648 template <typename T>
1650 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1651 }
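  // A minimal usage sketch (illustrative; `Loads` is a hypothetical
  // SmallVector<LoadInst *> for a bundle that failed to vectorize, and `R` is
  // a BoUpSLP instance):
  //   R.registerNonVectorizableLoads(ArrayRef<LoadInst *>(Loads));
  //   if (R.areKnownNonVectorizableLoads(ArrayRef<LoadInst *>(Loads)))
  //     return; // skip re-analyzing the same bundle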
1652
1654
1655 /// This structure holds any data we need about the edges being traversed
1656 /// during buildTree_rec(). We keep track of:
1657 /// (i) the user TreeEntry index, and
1658 /// (ii) the index of the edge.
1659 struct EdgeInfo {
1660 EdgeInfo() = default;
 1661    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
 1662        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1663 /// The user TreeEntry.
1664 TreeEntry *UserTE = nullptr;
1665 /// The operand index of the use.
1666 unsigned EdgeIdx = UINT_MAX;
1667#ifndef NDEBUG
 1668    friend inline raw_ostream &operator<<(raw_ostream &OS,
 1669                                          const BoUpSLP::EdgeInfo &EI) {
1670 EI.dump(OS);
1671 return OS;
1672 }
1673 /// Debug print.
1674 void dump(raw_ostream &OS) const {
1675 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1676 << " EdgeIdx:" << EdgeIdx << "}";
1677 }
1678 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1679#endif
1680 bool operator == (const EdgeInfo &Other) const {
1681 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1682 }
1683 };
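  // A minimal usage sketch (illustrative only; `UserTE` is a hypothetical
  // TreeEntry pointer): an edge naming the second operand of a user entry is
  // built and compared as
  //   EdgeInfo EI(UserTE, /*EdgeIdx=*/1);
  //   bool SameEdge = EI == EdgeInfo(UserTE, 1); // true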
1684
1685 /// A helper class used for scoring candidates for two consecutive lanes.
 1686  class LookAheadHeuristics {
 1687    const TargetLibraryInfo &TLI;
1688 const DataLayout &DL;
1689 ScalarEvolution &SE;
1690 const BoUpSLP &R;
1691 int NumLanes; // Total number of lanes (aka vectorization factor).
1692 int MaxLevel; // The maximum recursion depth for accumulating score.
1693
1694 public:
 1695    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
 1696                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1697 int MaxLevel)
1698 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1699 MaxLevel(MaxLevel) {}
1700
 1701    // The hard-coded scores listed here are not very important, though they
 1702    // should be higher for better matches to improve the resulting cost. When
 1703    // computing the scores of matching one sub-tree with another, we are
 1704    // basically counting the number of values that are matching. So even if all
 1705    // scores are set to 1, we would still get a decent matching result.
 1706    // However, sometimes we have to break ties. For example, we may have to
 1707    // choose between matching loads vs matching opcodes. This is what these
 1708    // scores are helping us with: they provide the order of preference. Also,
 1709    // this is important if the scalar is externally used or used in another
 1710    // tree entry node in a different lane.
1711
1712 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1713 static const int ScoreConsecutiveLoads = 4;
 1714    /// The same load multiple times. This should have a better score than
 1715    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
 1716    /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
 1717    /// a vector load plus 1.0 for a broadcast.
1718 static const int ScoreSplatLoads = 3;
1719 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1720 static const int ScoreReversedLoads = 3;
1721 /// A load candidate for masked gather.
1722 static const int ScoreMaskedGatherCandidate = 1;
1723 /// ExtractElementInst from same vector and consecutive indexes.
1724 static const int ScoreConsecutiveExtracts = 4;
1725 /// ExtractElementInst from same vector and reversed indices.
1726 static const int ScoreReversedExtracts = 3;
1727 /// Constants.
1728 static const int ScoreConstants = 2;
1729 /// Instructions with the same opcode.
1730 static const int ScoreSameOpcode = 2;
1731 /// Instructions with alt opcodes (e.g, add + sub).
1732 static const int ScoreAltOpcodes = 1;
1733 /// Identical instructions (a.k.a. splat or broadcast).
1734 static const int ScoreSplat = 1;
1735 /// Matching with an undef is preferable to failing.
1736 static const int ScoreUndef = 1;
1737 /// Score for failing to find a decent match.
1738 static const int ScoreFail = 0;
1739 /// Score if all users are vectorized.
1740 static const int ScoreAllUserVectorized = 1;
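    // Illustrative pairings under the scores above (a sketch, not generated by
    // the pass): load(A[i]) with load(A[i+1]) scores ScoreConsecutiveLoads,
    // load(A[i+1]) with load(A[i]) scores ScoreReversedLoads, two adds score
    // ScoreSameOpcode, an add paired with a sub in an alternate bundle scores
    // ScoreAltOpcodes, and a pair with no usable relation falls back to
    // ScoreFail.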
1741
1742 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1743 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1744 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1745 /// MainAltOps.
 1746    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
 1747                        ArrayRef<Value *> MainAltOps) const {
1748 if (!isValidElementType(V1->getType()) ||
1749 !isValidElementType(V2->getType()))
1751
1752 if (V1 == V2) {
1753 if (isa<LoadInst>(V1)) {
 1754          // Returns true if the users of V1 and V2 won't need to be extracted.
1755 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1756 // Bail out if we have too many uses to save compilation time.
1757 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1758 return false;
1759
1760 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1761 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1762 return U == U1 || U == U2 || R.isVectorized(U);
1763 });
1764 };
1765 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1766 };
1767 // A broadcast of a load can be cheaper on some targets.
1768 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1769 ElementCount::getFixed(NumLanes)) &&
1770 ((int)V1->getNumUses() == NumLanes ||
1771 AllUsersAreInternal(V1, V2)))
1773 }
1775 }
1776
1777 auto CheckSameEntryOrFail = [&]() {
1778 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
1779 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
1780 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
1781 !TEs2.empty() &&
1782 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
1784 }
1786 };
1787
1788 auto *LI1 = dyn_cast<LoadInst>(V1);
1789 auto *LI2 = dyn_cast<LoadInst>(V2);
1790 if (LI1 && LI2) {
1791 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1792 !LI2->isSimple())
1793 return CheckSameEntryOrFail();
1794
1795 std::optional<int> Dist = getPointersDiff(
1796 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1797 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1798 if (!Dist || *Dist == 0) {
1799 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1800 getUnderlyingObject(LI2->getPointerOperand()) &&
1801 R.TTI->isLegalMaskedGather(
1802 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1804 return CheckSameEntryOrFail();
1805 }
1806 // The distance is too large - still may be profitable to use masked
1807 // loads/gathers.
1808 if (std::abs(*Dist) > NumLanes / 2)
 1810        // This will still detect consecutive loads, but we might have "holes"
 1811        // in some cases. It is ok for non-power-of-2 vectorization and may produce
 1812        // better results. It should not affect current vectorization.
1815 }
1816
1817 auto *C1 = dyn_cast<Constant>(V1);
1818 auto *C2 = dyn_cast<Constant>(V2);
1819 if (C1 && C2)
1821
1822 // Extracts from consecutive indexes of the same vector better score as
1823 // the extracts could be optimized away.
1824 Value *EV1;
1825 ConstantInt *Ex1Idx;
1826 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1827 // Undefs are always profitable for extractelements.
1828 // Compiler can easily combine poison and extractelement <non-poison> or
1829 // undef and extractelement <poison>. But combining undef +
1830 // extractelement <non-poison-but-may-produce-poison> requires some
1831 // extra operations.
1832 if (isa<UndefValue>(V2))
1833 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1836 Value *EV2 = nullptr;
1837 ConstantInt *Ex2Idx = nullptr;
1838 if (match(V2,
1840 m_Undef())))) {
1841 // Undefs are always profitable for extractelements.
1842 if (!Ex2Idx)
1844 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1846 if (EV2 == EV1) {
1847 int Idx1 = Ex1Idx->getZExtValue();
1848 int Idx2 = Ex2Idx->getZExtValue();
1849 int Dist = Idx2 - Idx1;
1850 // The distance is too large - still may be profitable to use
1851 // shuffles.
1852 if (std::abs(Dist) == 0)
1854 if (std::abs(Dist) > NumLanes / 2)
1858 }
1860 }
1861 return CheckSameEntryOrFail();
1862 }
1863
1864 auto *I1 = dyn_cast<Instruction>(V1);
1865 auto *I2 = dyn_cast<Instruction>(V2);
1866 if (I1 && I2) {
1867 if (I1->getParent() != I2->getParent())
1868 return CheckSameEntryOrFail();
1869 SmallVector<Value *, 4> Ops(MainAltOps);
1870 Ops.push_back(I1);
1871 Ops.push_back(I2);
1872 InstructionsState S = getSameOpcode(Ops, TLI);
1873 // Note: Only consider instructions with <= 2 operands to avoid
1874 // complexity explosion.
1875 if (S &&
1876 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1877 !S.isAltShuffle()) &&
1878 all_of(Ops, [&S](Value *V) {
1879 return isa<PoisonValue>(V) ||
1880 cast<Instruction>(V)->getNumOperands() ==
1881 S.getMainOp()->getNumOperands();
1882 }))
1883 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1885 }
1886
1887 if (I1 && isa<PoisonValue>(V2))
1889
1890 if (isa<UndefValue>(V2))
1892
1893 return CheckSameEntryOrFail();
1894 }
1895
1896 /// Go through the operands of \p LHS and \p RHS recursively until
 1897    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1898 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1899 /// of \p U1 and \p U2), except at the beginning of the recursion where
1900 /// these are set to nullptr.
1901 ///
1902 /// For example:
1903 /// \verbatim
1904 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1905 /// \ / \ / \ / \ /
1906 /// + + + +
1907 /// G1 G2 G3 G4
1908 /// \endverbatim
1909 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1910 /// each level recursively, accumulating the score. It starts from matching
1911 /// the additions at level 0, then moves on to the loads (level 1). The
1912 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1913 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1914 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1915 /// Please note that the order of the operands does not matter, as we
1916 /// evaluate the score of all profitable combinations of operands. In
1917 /// other words the score of G1 and G4 is the same as G1 and G2. This
1918 /// heuristic is based on ideas described in:
1919 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1920 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1921 /// Luís F. W. Góes
 1922    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
 1923                           Instruction *U2, int CurrLevel,
1924 ArrayRef<Value *> MainAltOps) const {
1925
1926 // Get the shallow score of V1 and V2.
1927 int ShallowScoreAtThisLevel =
1928 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1929
1930 // If reached MaxLevel,
1931 // or if V1 and V2 are not instructions,
1932 // or if they are SPLAT,
1933 // or if they are not consecutive,
1934 // or if profitable to vectorize loads or extractelements, early return
1935 // the current cost.
1936 auto *I1 = dyn_cast<Instruction>(LHS);
1937 auto *I2 = dyn_cast<Instruction>(RHS);
1938 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1939 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1940 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1941 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1942 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1943 ShallowScoreAtThisLevel))
1944 return ShallowScoreAtThisLevel;
1945 assert(I1 && I2 && "Should have early exited.");
1946
1947 // Contains the I2 operand indexes that got matched with I1 operands.
1948 SmallSet<unsigned, 4> Op2Used;
1949
1950 // Recursion towards the operands of I1 and I2. We are trying all possible
1951 // operand pairs, and keeping track of the best score.
1952 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1953 OpIdx1 != NumOperands1; ++OpIdx1) {
 1954        // Try to pair the operand of I1 at OpIdx1 with the best operand of I2.
1955 int MaxTmpScore = 0;
1956 unsigned MaxOpIdx2 = 0;
1957 bool FoundBest = false;
1958 // If I2 is commutative try all combinations.
1959 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1960 unsigned ToIdx = isCommutative(I2)
1961 ? I2->getNumOperands()
1962 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1963 assert(FromIdx <= ToIdx && "Bad index");
1964 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1965 // Skip operands already paired with OpIdx1.
1966 if (Op2Used.count(OpIdx2))
1967 continue;
1968 // Recursively calculate the cost at each level
1969 int TmpScore =
1970 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1971 I1, I2, CurrLevel + 1, {});
1972 // Look for the best score.
1973 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1974 TmpScore > MaxTmpScore) {
1975 MaxTmpScore = TmpScore;
1976 MaxOpIdx2 = OpIdx2;
1977 FoundBest = true;
1978 }
1979 }
1980 if (FoundBest) {
1981 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1982 Op2Used.insert(MaxOpIdx2);
1983 ShallowScoreAtThisLevel += MaxTmpScore;
1984 }
1985 }
1986 return ShallowScoreAtThisLevel;
1987 }
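    // Worked example (illustrative, assuming MaxLevel >= 2, distinct arrays
    // A/B/C/D and none of the loads already vectorized): for G1 = A[0] + B[0]
    // and G2 = A[1] + B[1] from the diagram in the comment above,
    //   level 1: the two adds match              -> ScoreSameOpcode       (2)
    //   level 2: {A[0], A[1]} consecutive loads  -> ScoreConsecutiveLoads (+4)
    //            {B[0], B[1]} consecutive loads  -> ScoreConsecutiveLoads (+4)
    // so getScoreAtLevelRec(G1, G2, nullptr, nullptr, 1, {}) accumulates 10,
    // while pairing G1 with G3 keeps only the opcode score of 2 because its
    // operand loads score ScoreFail.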
1988 };
1989 /// A helper data structure to hold the operands of a vector of instructions.
1990 /// This supports a fixed vector length for all operand vectors.
 1991  class VLOperands {
 1992    /// For each operand we need (i) the value, and (ii) the opcode that it
1993 /// would be attached to if the expression was in a left-linearized form.
1994 /// This is required to avoid illegal operand reordering.
1995 /// For example:
1996 /// \verbatim
1997 /// 0 Op1
1998 /// |/
1999 /// Op1 Op2 Linearized + Op2
2000 /// \ / ----------> |/
2001 /// - -
2002 ///
2003 /// Op1 - Op2 (0 + Op1) - Op2
2004 /// \endverbatim
2005 ///
2006 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2007 ///
2008 /// Another way to think of this is to track all the operations across the
2009 /// path from the operand all the way to the root of the tree and to
2010 /// calculate the operation that corresponds to this path. For example, the
2011 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2012 /// corresponding operation is a '-' (which matches the one in the
2013 /// linearized tree, as shown above).
2014 ///
2015 /// For lack of a better term, we refer to this operation as Accumulated
2016 /// Path Operation (APO).
2017 struct OperandData {
2018 OperandData() = default;
2019 OperandData(Value *V, bool APO, bool IsUsed)
2020 : V(V), APO(APO), IsUsed(IsUsed) {}
2021 /// The operand value.
2022 Value *V = nullptr;
2023 /// TreeEntries only allow a single opcode, or an alternate sequence of
2024 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2025 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2026 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2027 /// (e.g., Add/Mul)
2028 bool APO = false;
2029 /// Helper data for the reordering function.
2030 bool IsUsed = false;
2031 };
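    // An illustrative sketch (assumed scalar input, not taken from the pass):
    // for the two-lane bundle
    //   s0 = a0 - b0;
    //   s1 = a1 - b1;
    // operand 0 holds {a0, a1} with APO == false (attached to '+' in the
    // left-linearized form) and operand 1 holds {b0, b1} with APO == true
    // (attached to '-'), so reordering may only exchange operands that share
    // the same APO value.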
2032
2033 /// During operand reordering, we are trying to select the operand at lane
2034 /// that matches best with the operand at the neighboring lane. Our
2035 /// selection is based on the type of value we are looking for. For example,
2036 /// if the neighboring lane has a load, we need to look for a load that is
2037 /// accessing a consecutive address. These strategies are summarized in the
2038 /// 'ReorderingMode' enumerator.
2039 enum class ReorderingMode {
2040 Load, ///< Matching loads to consecutive memory addresses
2041 Opcode, ///< Matching instructions based on opcode (same or alternate)
2042 Constant, ///< Matching constants
2043 Splat, ///< Matching the same instruction multiple times (broadcast)
2044 Failed, ///< We failed to create a vectorizable group
2045 };
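    // A hedged example of how the modes are typically chosen (illustrative
    // only; the actual decision is made in reorder() below): for the bundle
    //   x0 = load(A[0]) + 1;
    //   x1 = load(A[1]) + 2;
    // operand 0 would normally be driven by ReorderingMode::Load (consecutive
    // loads are sought) and operand 1 by ReorderingMode::Constant.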
2046
 2047    using OperandDataVec = SmallVector<OperandData, 2>;
 2048
 2049    /// A vector of operand vectors.
 2050    SmallVector<OperandDataVec, 4> OpsVec;
2051 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2052 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2053 unsigned ArgSize = 0;
2054
2055 const TargetLibraryInfo &TLI;
2056 const DataLayout &DL;
2057 ScalarEvolution &SE;
2058 const BoUpSLP &R;
2059 const Loop *L = nullptr;
2060
2061 /// \returns the operand data at \p OpIdx and \p Lane.
2062 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2063 return OpsVec[OpIdx][Lane];
2064 }
2065
2066 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2067 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2068 return OpsVec[OpIdx][Lane];
2069 }
2070
2071 /// Clears the used flag for all entries.
2072 void clearUsed() {
2073 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2074 OpIdx != NumOperands; ++OpIdx)
2075 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2076 ++Lane)
2077 OpsVec[OpIdx][Lane].IsUsed = false;
2078 }
2079
2080 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2081 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2082 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2083 }
2084
2085 /// \param Lane lane of the operands under analysis.
 2086    /// \param OpIdx operand index in lane \p Lane for which we're looking for
 2087    /// the best candidate.
2088 /// \param Idx operand index of the current candidate value.
2089 /// \returns The additional score due to possible broadcasting of the
 2090    /// elements in the lane. It is more profitable to have a power-of-2 number
 2091    /// of unique elements in the lane, as they will be vectorized with higher
 2092    /// probability after removing duplicates. Currently the SLP vectorizer
 2093    /// supports only vectorization of a power-of-2 number of unique scalars.
2094 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2095 const SmallBitVector &UsedLanes) const {
2096 Value *IdxLaneV = getData(Idx, Lane).V;
2097 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2098 isa<ExtractElementInst>(IdxLaneV))
2099 return 0;
2101 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2102 if (Ln == Lane)
2103 continue;
2104 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2105 if (!isa<Instruction>(OpIdxLnV))
2106 return 0;
2107 Uniques.try_emplace(OpIdxLnV, Ln);
2108 }
2109 unsigned UniquesCount = Uniques.size();
2110 auto IdxIt = Uniques.find(IdxLaneV);
2111 unsigned UniquesCntWithIdxLaneV =
2112 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2113 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2114 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2115 unsigned UniquesCntWithOpIdxLaneV =
2116 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2117 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2118 return 0;
2119 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2120 UniquesCntWithOpIdxLaneV,
2121 UniquesCntWithOpIdxLaneV -
2122 bit_floor(UniquesCntWithOpIdxLaneV)) -
2123 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2124 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2125 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2126 }
2127
2128 /// \param Lane lane of the operands under analysis.
 2129    /// \param OpIdx operand index in lane \p Lane for which we're looking for
 2130    /// the best candidate.
2131 /// \param Idx operand index of the current candidate value.
 2132    /// \returns The additional score for the scalar whose users are all
2133 /// vectorized.
2134 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2135 Value *IdxLaneV = getData(Idx, Lane).V;
2136 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2137 // Do not care about number of uses for vector-like instructions
2138 // (extractelement/extractvalue with constant indices), they are extracts
2139 // themselves and already externally used. Vectorization of such
2140 // instructions does not add extra extractelement instruction, just may
2141 // remove it.
2142 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2143 isVectorLikeInstWithConstOps(OpIdxLaneV))
2145 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2146 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2147 return 0;
2148 return R.areAllUsersVectorized(IdxLaneI)
2150 : 0;
2151 }
2152
 2153    /// Score scaling factor for fully compatible instructions but with a
 2154    /// different number of external uses. Allows better selection of the
 2155    /// instructions with fewer external uses.
2156 static const int ScoreScaleFactor = 10;
2157
2158 /// \Returns the look-ahead score, which tells us how much the sub-trees
2159 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2160 /// score. This helps break ties in an informed way when we cannot decide on
2161 /// the order of the operands by just considering the immediate
2162 /// predecessors.
2163 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2164 int Lane, unsigned OpIdx, unsigned Idx,
2165 bool &IsUsed, const SmallBitVector &UsedLanes) {
2166 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2168 // Keep track of the instruction stack as we recurse into the operands
2169 // during the look-ahead score exploration.
2170 int Score =
2171 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2172 /*CurrLevel=*/1, MainAltOps);
2173 if (Score) {
2174 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2175 if (Score <= -SplatScore) {
2176 // Failed score.
2177 Score = 0;
2178 } else {
2179 Score += SplatScore;
2180 // Scale score to see the difference between different operands
2181 // and similar operands but all vectorized/not all vectorized
2182 // uses. It does not affect actual selection of the best
2183 // compatible operand in general, just allows to select the
2184 // operand with all vectorized uses.
2185 Score *= ScoreScaleFactor;
2186 Score += getExternalUseScore(Lane, OpIdx, Idx);
2187 IsUsed = true;
2188 }
2189 }
2190 return Score;
2191 }
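    // Worked example (illustrative, using the constants defined above): if the
    // recursive look-ahead score is 10, the splat adjustment is 0 and all
    // users of the candidate are vectorized, the final value is
    //   (10 + 0) * ScoreScaleFactor + ScoreAllUserVectorized == 101,
    // so a fully-internal candidate wins over an otherwise identical one whose
    // scalar still has external users (score 100).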
2192
2193 /// Best defined scores per lanes between the passes. Used to choose the
2194 /// best operand (with the highest score) between the passes.
2195 /// The key - {Operand Index, Lane}.
2196 /// The value - the best score between the passes for the lane and the
2197 /// operand.
2199 BestScoresPerLanes;
2200
2201 // Search all operands in Ops[*][Lane] for the one that matches best
 2202    // Ops[OpIdx][LastLane] and return its operand index.
2203 // If no good match can be found, return std::nullopt.
2204 std::optional<unsigned>
2205 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2206 ArrayRef<ReorderingMode> ReorderingModes,
2207 ArrayRef<Value *> MainAltOps,
2208 const SmallBitVector &UsedLanes) {
2209 unsigned NumOperands = getNumOperands();
2210
2211 // The operand of the previous lane at OpIdx.
2212 Value *OpLastLane = getData(OpIdx, LastLane).V;
2213
2214 // Our strategy mode for OpIdx.
2215 ReorderingMode RMode = ReorderingModes[OpIdx];
2216 if (RMode == ReorderingMode::Failed)
2217 return std::nullopt;
2218
2219 // The linearized opcode of the operand at OpIdx, Lane.
2220 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2221
2222 // The best operand index and its score.
2223 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2224 // are using the score to differentiate between the two.
2225 struct BestOpData {
2226 std::optional<unsigned> Idx;
2227 unsigned Score = 0;
2228 } BestOp;
2229 BestOp.Score =
2230 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2231 .first->second;
2232
2233 // Track if the operand must be marked as used. If the operand is set to
 2234      // Score 1 explicitly (because of a non-power-of-2 number of unique scalars,
 2235      // we may want to re-estimate the operands again on the following iterations).
2236 bool IsUsed = RMode == ReorderingMode::Splat ||
2237 RMode == ReorderingMode::Constant ||
2238 RMode == ReorderingMode::Load;
2239 // Iterate through all unused operands and look for the best.
2240 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2241 // Get the operand at Idx and Lane.
2242 OperandData &OpData = getData(Idx, Lane);
2243 Value *Op = OpData.V;
2244 bool OpAPO = OpData.APO;
2245
2246 // Skip already selected operands.
2247 if (OpData.IsUsed)
2248 continue;
2249
2250 // Skip if we are trying to move the operand to a position with a
2251 // different opcode in the linearized tree form. This would break the
2252 // semantics.
2253 if (OpAPO != OpIdxAPO)
2254 continue;
2255
2256 // Look for an operand that matches the current mode.
2257 switch (RMode) {
2258 case ReorderingMode::Load:
2259 case ReorderingMode::Opcode: {
2260 bool LeftToRight = Lane > LastLane;
2261 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2262 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2263 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2264 OpIdx, Idx, IsUsed, UsedLanes);
2265 if (Score > static_cast<int>(BestOp.Score) ||
2266 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2267 Idx == OpIdx)) {
2268 BestOp.Idx = Idx;
2269 BestOp.Score = Score;
2270 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2271 }
2272 break;
2273 }
2274 case ReorderingMode::Constant:
2275 if (isa<Constant>(Op) ||
2276 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2277 BestOp.Idx = Idx;
2278 if (isa<Constant>(Op)) {
2280 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2282 }
2283 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2284 IsUsed = false;
2285 }
2286 break;
2287 case ReorderingMode::Splat:
2288 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2289 IsUsed = Op == OpLastLane;
2290 if (Op == OpLastLane) {
2291 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2292 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2294 }
2295 BestOp.Idx = Idx;
2296 }
2297 break;
2298 case ReorderingMode::Failed:
2299 llvm_unreachable("Not expected Failed reordering mode.");
2300 }
2301 }
2302
2303 if (BestOp.Idx) {
2304 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2305 return BestOp.Idx;
2306 }
2307 // If we could not find a good match return std::nullopt.
2308 return std::nullopt;
2309 }
2310
2311 /// Helper for reorderOperandVecs.
2312 /// \returns the lane that we should start reordering from. This is the one
 2313    /// which has the least number of operands that can freely move about, or
 2314    /// is less profitable because it already has the most optimal set of operands.
2315 unsigned getBestLaneToStartReordering() const {
2316 unsigned Min = UINT_MAX;
2317 unsigned SameOpNumber = 0;
2318 // std::pair<unsigned, unsigned> is used to implement a simple voting
2319 // algorithm and choose the lane with the least number of operands that
 2320      // can freely move about or is less profitable because it already has the
2321 // most optimal set of operands. The first unsigned is a counter for
2322 // voting, the second unsigned is the counter of lanes with instructions
2323 // with same/alternate opcodes and same parent basic block.
2325 // Try to be closer to the original results, if we have multiple lanes
2326 // with same cost. If 2 lanes have the same cost, use the one with the
2327 // highest index.
2328 for (int I = getNumLanes(); I > 0; --I) {
2329 unsigned Lane = I - 1;
2330 OperandsOrderData NumFreeOpsHash =
2331 getMaxNumOperandsThatCanBeReordered(Lane);
2332 // Compare the number of operands that can move and choose the one with
2333 // the least number.
2334 if (NumFreeOpsHash.NumOfAPOs < Min) {
2335 Min = NumFreeOpsHash.NumOfAPOs;
2336 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2337 HashMap.clear();
2338 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2339 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2340 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2341 // Select the most optimal lane in terms of number of operands that
2342 // should be moved around.
2343 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2344 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2345 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2346 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2347 auto [It, Inserted] =
2348 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2349 if (!Inserted)
2350 ++It->second.first;
2351 }
2352 }
2353 // Select the lane with the minimum counter.
2354 unsigned BestLane = 0;
2355 unsigned CntMin = UINT_MAX;
2356 for (const auto &Data : reverse(HashMap)) {
2357 if (Data.second.first < CntMin) {
2358 CntMin = Data.second.first;
2359 BestLane = Data.second.second;
2360 }
2361 }
2362 return BestLane;
2363 }
2364
2365 /// Data structure that helps to reorder operands.
2366 struct OperandsOrderData {
2367 /// The best number of operands with the same APOs, which can be
2368 /// reordered.
2369 unsigned NumOfAPOs = UINT_MAX;
2370 /// Number of operands with the same/alternate instruction opcode and
2371 /// parent.
2372 unsigned NumOpsWithSameOpcodeParent = 0;
2373 /// Hash for the actual operands ordering.
2374 /// Used to count operands, actually their position id and opcode
2375 /// value. It is used in the voting mechanism to find the lane with the
 2376      /// least number of operands that can freely move about or is less profitable
2377 /// because it already has the most optimal set of operands. Can be
2378 /// replaced with SmallVector<unsigned> instead but hash code is faster
2379 /// and requires less memory.
2380 unsigned Hash = 0;
2381 };
2382 /// \returns the maximum number of operands that are allowed to be reordered
 2383    /// for \p Lane and the number of compatible instructions (with the same
2384 /// parent/opcode). This is used as a heuristic for selecting the first lane
2385 /// to start operand reordering.
2386 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2387 unsigned CntTrue = 0;
2388 unsigned NumOperands = getNumOperands();
2389 // Operands with the same APO can be reordered. We therefore need to count
2390 // how many of them we have for each APO, like this: Cnt[APO] = x.
2391 // Since we only have two APOs, namely true and false, we can avoid using
2392 // a map. Instead we can simply count the number of operands that
2393 // correspond to one of them (in this case the 'true' APO), and calculate
2394 // the other by subtracting it from the total number of operands.
2395 // Operands with the same instruction opcode and parent are more
2396 // profitable since we don't need to move them in many cases, with a high
2397 // probability such lane already can be vectorized effectively.
2398 bool AllUndefs = true;
2399 unsigned NumOpsWithSameOpcodeParent = 0;
2400 Instruction *OpcodeI = nullptr;
2401 BasicBlock *Parent = nullptr;
2402 unsigned Hash = 0;
2403 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2404 const OperandData &OpData = getData(OpIdx, Lane);
2405 if (OpData.APO)
2406 ++CntTrue;
2407 // Use Boyer-Moore majority voting for finding the majority opcode and
2408 // the number of times it occurs.
2409 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2410 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
2411 I->getParent() != Parent) {
2412 if (NumOpsWithSameOpcodeParent == 0) {
2413 NumOpsWithSameOpcodeParent = 1;
2414 OpcodeI = I;
2415 Parent = I->getParent();
2416 } else {
2417 --NumOpsWithSameOpcodeParent;
2418 }
2419 } else {
2420 ++NumOpsWithSameOpcodeParent;
2421 }
2422 }
2423 Hash = hash_combine(
2424 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2425 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2426 }
2427 if (AllUndefs)
2428 return {};
2429 OperandsOrderData Data;
2430 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2431 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2432 Data.Hash = Hash;
2433 return Data;
2434 }
2435
2436 /// Go through the instructions in VL and append their operands.
2437 void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
2438 assert(!VL.empty() && "Bad VL");
2439 assert((empty() || VL.size() == getNumLanes()) &&
2440 "Expected same number of lanes");
2441 assert(S.valid() && "InstructionsState is invalid.");
2442 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2443 // arguments to the intrinsic produces the same result.
2444 constexpr unsigned IntrinsicNumOperands = 2;
2445 Instruction *MainOp = S.getMainOp();
2446 unsigned NumOperands = MainOp->getNumOperands();
2447 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2448 OpsVec.resize(NumOperands);
2449 unsigned NumLanes = VL.size();
2450 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2451 OpsVec[OpIdx].resize(NumLanes);
2452 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2453 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2454 "Expected instruction or poison value");
2455 // Our tree has just 3 nodes: the root and two operands.
2456 // It is therefore trivial to get the APO. We only need to check the
2457 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2458 // RHS operand. The LHS operand of both add and sub is never attached
 2459          // to an inverse operation in the linearized form, therefore its APO
2460 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2461
2462 // Since operand reordering is performed on groups of commutative
2463 // operations or alternating sequences (e.g., +, -), we can safely
2464 // tell the inverse operations by checking commutativity.
2465 if (isa<PoisonValue>(VL[Lane])) {
2466 if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2467 if (OpIdx == 0) {
2468 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
2469 continue;
2470 }
2471 } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2472 if (OpIdx == 0) {
2473 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
2474 continue;
2475 }
2476 }
2477 OpsVec[OpIdx][Lane] = {
2478 PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
2479 false};
2480 continue;
2481 }
2482 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2483 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2484 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2485 APO, false};
2486 }
2487 }
2488 }
2489
2490 /// \returns the number of operands.
2491 unsigned getNumOperands() const { return ArgSize; }
2492
2493 /// \returns the number of lanes.
2494 unsigned getNumLanes() const { return OpsVec[0].size(); }
2495
2496 /// \returns the operand value at \p OpIdx and \p Lane.
2497 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2498 return getData(OpIdx, Lane).V;
2499 }
2500
2501 /// \returns true if the data structure is empty.
2502 bool empty() const { return OpsVec.empty(); }
2503
2504 /// Clears the data.
2505 void clear() { OpsVec.clear(); }
2506
2507 /// \Returns true if there are enough operands identical to \p Op to fill
 2508    /// the whole vector (possibly mixed with constants or loop-invariant values).
2509 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2510 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2511 assert(Op == getValue(OpIdx, Lane) &&
2512 "Op is expected to be getValue(OpIdx, Lane).");
2513 // Small number of loads - try load matching.
2514 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2515 return false;
2516 bool OpAPO = getData(OpIdx, Lane).APO;
2517 bool IsInvariant = L && L->isLoopInvariant(Op);
2518 unsigned Cnt = 0;
2519 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2520 if (Ln == Lane)
2521 continue;
2522 // This is set to true if we found a candidate for broadcast at Lane.
2523 bool FoundCandidate = false;
2524 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2525 OperandData &Data = getData(OpI, Ln);
2526 if (Data.APO != OpAPO || Data.IsUsed)
2527 continue;
2528 Value *OpILane = getValue(OpI, Lane);
2529 bool IsConstantOp = isa<Constant>(OpILane);
2530 // Consider the broadcast candidate if:
2531 // 1. Same value is found in one of the operands.
2532 if (Data.V == Op ||
2533 // 2. The operand in the given lane is not constant but there is a
2534 // constant operand in another lane (which can be moved to the
2535 // given lane). In this case we can represent it as a simple
2536 // permutation of constant and broadcast.
2537 (!IsConstantOp &&
2538 ((Lns > 2 && isa<Constant>(Data.V)) ||
2539 // 2.1. If we have only 2 lanes, need to check that value in the
2540 // next lane does not build same opcode sequence.
2541 (Lns == 2 &&
2542 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2543 isa<Constant>(Data.V)))) ||
2544 // 3. The operand in the current lane is loop invariant (can be
2545 // hoisted out) and another operand is also a loop invariant
2546 // (though not a constant). In this case the whole vector can be
2547 // hoisted out.
2548 // FIXME: need to teach the cost model about this case for better
2549 // estimation.
2550 (IsInvariant && !isa<Constant>(Data.V) &&
2551 !getSameOpcode({Op, Data.V}, TLI) &&
2552 L->isLoopInvariant(Data.V))) {
2553 FoundCandidate = true;
2554 Data.IsUsed = Data.V == Op;
2555 if (Data.V == Op)
2556 ++Cnt;
2557 break;
2558 }
2559 }
2560 if (!FoundCandidate)
2561 return false;
2562 }
2563 return getNumLanes() == 2 || Cnt > 1;
2564 }
2565
 2566    /// Checks if there is at least a single compatible operand in lanes other
 2567    /// than \p Lane that is compatible with the operand \p Op.
2568 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2569 assert(Op == getValue(OpIdx, Lane) &&
2570 "Op is expected to be getValue(OpIdx, Lane).");
2571 bool OpAPO = getData(OpIdx, Lane).APO;
2572 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2573 if (Ln == Lane)
2574 continue;
2575 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2576 const OperandData &Data = getData(OpI, Ln);
2577 if (Data.APO != OpAPO || Data.IsUsed)
2578 return true;
2579 Value *OpILn = getValue(OpI, Ln);
2580 return (L && L->isLoopInvariant(OpILn)) ||
2581 (getSameOpcode({Op, OpILn}, TLI) &&
2582 allSameBlock({Op, OpILn}));
2583 }))
2584 return true;
2585 }
2586 return false;
2587 }
2588
2589 public:
2590 /// Initialize with all the operands of the instruction vector \p RootVL.
2591 VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
2592 const BoUpSLP &R)
2593 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2594 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
2595 // Append all the operands of RootVL.
2596 appendOperandsOfVL(RootVL, S);
2597 }
2598
2599 /// \Returns a value vector with the operands across all lanes for the
 2600    /// operand at \p OpIdx.
2601 ValueList getVL(unsigned OpIdx) const {
2602 ValueList OpVL(OpsVec[OpIdx].size());
2603 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2604 "Expected same num of lanes across all operands");
2605 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2606 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2607 return OpVL;
2608 }
2609
2610 // Performs operand reordering for 2 or more operands.
2611 // The original operands are in OrigOps[OpIdx][Lane].
2612 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2613 void reorder() {
2614 unsigned NumOperands = getNumOperands();
2615 unsigned NumLanes = getNumLanes();
2616 // Each operand has its own mode. We are using this mode to help us select
2617 // the instructions for each lane, so that they match best with the ones
2618 // we have selected so far.
2619 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2620
2621 // This is a greedy single-pass algorithm. We are going over each lane
2622 // once and deciding on the best order right away with no back-tracking.
2623 // However, in order to increase its effectiveness, we start with the lane
2624 // that has operands that can move the least. For example, given the
2625 // following lanes:
2626 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2627 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2628 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2629 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2630 // we will start at Lane 1, since the operands of the subtraction cannot
2631 // be reordered. Then we will visit the rest of the lanes in a circular
2632 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2633
2634 // Find the first lane that we will start our search from.
2635 unsigned FirstLane = getBestLaneToStartReordering();
2636
2637 // Initialize the modes.
2638 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2639 Value *OpLane0 = getValue(OpIdx, FirstLane);
2640 // Keep track if we have instructions with all the same opcode on one
2641 // side.
2642 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2643 // Check if OpLane0 should be broadcast.
2644 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2645 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2646 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2647 else if (isa<LoadInst>(OpILane0))
2648 ReorderingModes[OpIdx] = ReorderingMode::Load;
2649 else
2650 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2651 } else if (isa<Constant>(OpLane0)) {
2652 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2653 } else if (isa<Argument>(OpLane0)) {
2654 // Our best hope is a Splat. It may save some cost in some cases.
2655 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2656 } else {
2657 llvm_unreachable("Unexpected value kind.");
2658 }
2659 }
2660
2661 // Check that we don't have same operands. No need to reorder if operands
 2662      // are just a perfect diamond or shuffled diamond match. The only exceptions
 2663      // (just for now) are possible broadcasts or a non-power-of-2 number of
 2664      // scalars.
2665 auto &&SkipReordering = [this]() {
2666 SmallPtrSet<Value *, 4> UniqueValues;
2667 ArrayRef<OperandData> Op0 = OpsVec.front();
2668 for (const OperandData &Data : Op0)
2669 UniqueValues.insert(Data.V);
2671 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2672 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2673 return !UniqueValues.contains(Data.V);
2674 }))
2675 return false;
2676 }
2677 // TODO: Check if we can remove a check for non-power-2 number of
2678 // scalars after full support of non-power-2 vectorization.
2679 return UniqueValues.size() != 2 &&
2680 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2681 UniqueValues.size());
2682 };
2683
2684 // If the initial strategy fails for any of the operand indexes, then we
2685 // perform reordering again in a second pass. This helps avoid assigning
2686 // high priority to the failed strategy, and should improve reordering for
2687 // the non-failed operand indexes.
2688 for (int Pass = 0; Pass != 2; ++Pass) {
 2689        // Check if there is no need to reorder operands since they are a perfect or
2690 // shuffled diamond match.
2691 // Need to do it to avoid extra external use cost counting for
2692 // shuffled matches, which may cause regressions.
2693 if (SkipReordering())
2694 break;
2695 // Skip the second pass if the first pass did not fail.
2696 bool StrategyFailed = false;
2697 // Mark all operand data as free to use.
2698 clearUsed();
2699 // We keep the original operand order for the FirstLane, so reorder the
2700 // rest of the lanes. We are visiting the nodes in a circular fashion,
2701 // using FirstLane as the center point and increasing the radius
2702 // distance.
2703 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2704 for (unsigned I = 0; I < NumOperands; ++I)
2705 MainAltOps[I].push_back(getData(I, FirstLane).V);
2706
2707 SmallBitVector UsedLanes(NumLanes);
2708 UsedLanes.set(FirstLane);
2709 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2710 // Visit the lane on the right and then the lane on the left.
2711 for (int Direction : {+1, -1}) {
2712 int Lane = FirstLane + Direction * Distance;
2713 if (Lane < 0 || Lane >= (int)NumLanes)
2714 continue;
2715 UsedLanes.set(Lane);
2716 int LastLane = Lane - Direction;
2717 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2718 "Out of bounds");
2719 // Look for a good match for each operand.
2720 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2721 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2722 std::optional<unsigned> BestIdx =
2723 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2724 MainAltOps[OpIdx], UsedLanes);
2725 // By not selecting a value, we allow the operands that follow to
2726 // select a better matching value. We will get a non-null value in
2727 // the next run of getBestOperand().
2728 if (BestIdx) {
2729 // Swap the current operand with the one returned by
2730 // getBestOperand().
2731 swap(OpIdx, *BestIdx, Lane);
2732 } else {
2733 // Enable the second pass.
2734 StrategyFailed = true;
2735 }
2736 // Try to get the alternate opcode and follow it during analysis.
2737 if (MainAltOps[OpIdx].size() != 2) {
2738 OperandData &AltOp = getData(OpIdx, Lane);
2739 InstructionsState OpS =
2740 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2741 if (OpS && OpS.isAltShuffle())
2742 MainAltOps[OpIdx].push_back(AltOp.V);
2743 }
2744 }
2745 }
2746 }
2747 // Skip second pass if the strategy did not fail.
2748 if (!StrategyFailed)
2749 break;
2750 }
2751 }
2752
2753#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2754 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2755 switch (RMode) {
2756 case ReorderingMode::Load:
2757 return "Load";
2758 case ReorderingMode::Opcode:
2759 return "Opcode";
2760 case ReorderingMode::Constant:
2761 return "Constant";
2762 case ReorderingMode::Splat:
2763 return "Splat";
2764 case ReorderingMode::Failed:
2765 return "Failed";
2766 }
2767 llvm_unreachable("Unimplemented Reordering Type");
2768 }
2769
2770 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2771 raw_ostream &OS) {
2772 return OS << getModeStr(RMode);
2773 }
2774
2775 /// Debug print.
2776 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2777 printMode(RMode, dbgs());
2778 }
2779
2780 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2781 return printMode(RMode, OS);
2782 }
2783
2785 const unsigned Indent = 2;
2786 unsigned Cnt = 0;
2787 for (const OperandDataVec &OpDataVec : OpsVec) {
2788 OS << "Operand " << Cnt++ << "\n";
2789 for (const OperandData &OpData : OpDataVec) {
2790 OS.indent(Indent) << "{";
2791 if (Value *V = OpData.V)
2792 OS << *V;
2793 else
2794 OS << "null";
2795 OS << ", APO:" << OpData.APO << "}\n";
2796 }
2797 OS << "\n";
2798 }
2799 return OS;
2800 }
2801
2802 /// Debug print.
2803 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2804#endif
2805 };
2806
 2807  /// Evaluate each pair in \p Candidates and return the index into \p Candidates
 2808  /// of the pair with the highest score, deemed to have the best chance to form
 2809  /// the root of a profitable tree to vectorize. Return std::nullopt if no
 2810  /// candidate scored above LookAheadHeuristics::ScoreFail.
 2811  /// \param Limit Lower limit of the cost, considered to be a good enough score.
2812 std::optional<int>
2813 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2814 int Limit = LookAheadHeuristics::ScoreFail) const {
2815 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2817 int BestScore = Limit;
2818 std::optional<int> Index;
2819 for (int I : seq<int>(0, Candidates.size())) {
2820 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2821 Candidates[I].second,
2822 /*U1=*/nullptr, /*U2=*/nullptr,
2823 /*CurrLevel=*/1, {});
2824 if (Score > BestScore) {
2825 BestScore = Score;
2826 Index = I;
2827 }
2828 }
2829 return Index;
2830 }
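  // A minimal usage sketch (hypothetical values, for illustration only; `R` is
  // a BoUpSLP instance and V0..V2 are candidate root scalars):
  //   SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  //   Candidates.emplace_back(V0, V1);
  //   Candidates.emplace_back(V0, V2);
  //   if (std::optional<int> Best = R.findBestRootPair(Candidates))
  //     ; // Candidates[*Best] is the most promising pair to seed a tree.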
2831
2832 /// Checks if the instruction is marked for deletion.
2833 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2834
2835 /// Removes an instruction from its block and eventually deletes it.
2836 /// It's like Instruction::eraseFromParent() except that the actual deletion
2837 /// is delayed until BoUpSLP is destructed.
2839 DeletedInstructions.insert(I);
2840 }
2841
2842 /// Remove instructions from the parent function and clear the operands of \p
 2843  /// DeadVals instructions, marking trivially dead operands for deletion.
2844 template <typename T>
2847 for (T *V : DeadVals) {
2848 auto *I = cast<Instruction>(V);
2849 DeletedInstructions.insert(I);
2850 }
2851 DenseSet<Value *> Processed;
2852 for (T *V : DeadVals) {
2853 if (!V || !Processed.insert(V).second)
2854 continue;
2855 auto *I = cast<Instruction>(V);
2857 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
2858 for (Use &U : I->operands()) {
2859 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2860 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2862 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2863 return Entry->VectorizedValue == OpI;
2864 })))
2865 DeadInsts.push_back(OpI);
2866 }
2867 I->dropAllReferences();
2868 }
2869 for (T *V : DeadVals) {
2870 auto *I = cast<Instruction>(V);
2871 if (!I->getParent())
2872 continue;
2873 assert((I->use_empty() || all_of(I->uses(),
2874 [&](Use &U) {
2875 return isDeleted(
2876 cast<Instruction>(U.getUser()));
2877 })) &&
2878 "trying to erase instruction with users.");
2879 I->removeFromParent();
2880 SE->forgetValue(I);
2881 }
2882 // Process the dead instruction list until empty.
2883 while (!DeadInsts.empty()) {
2884 Value *V = DeadInsts.pop_back_val();
2885 Instruction *VI = cast_or_null<Instruction>(V);
2886 if (!VI || !VI->getParent())
2887 continue;
2889 "Live instruction found in dead worklist!");
2890 assert(VI->use_empty() && "Instructions with uses are not dead.");
2891
2892 // Don't lose the debug info while deleting the instructions.
2893 salvageDebugInfo(*VI);
2894
2895 // Null out all of the instruction's operands to see if any operand
2896 // becomes dead as we go.
2897 for (Use &OpU : VI->operands()) {
2898 Value *OpV = OpU.get();
2899 if (!OpV)
2900 continue;
2901 OpU.set(nullptr);
2902
2903 if (!OpV->use_empty())
2904 continue;
2905
2906 // If the operand is an instruction that became dead as we nulled out
2907 // the operand, and if it is 'trivially' dead, delete it in a future
2908 // loop iteration.
2909 if (auto *OpI = dyn_cast<Instruction>(OpV))
2910 if (!DeletedInstructions.contains(OpI) &&
2912 DeadInsts.push_back(OpI);
2913 }
2914
2915 VI->removeFromParent();
2916 DeletedInstructions.insert(VI);
2917 SE->forgetValue(VI);
2918 }
2919 }
2920
 2921  /// Checks if the instruction was already analyzed for being a possible
 2922  /// reduction root.
2924 return AnalyzedReductionsRoots.count(I);
2925 }
 2926  /// Registers the given instruction as already analyzed for being a possible
 2927  /// reduction root.
2929 AnalyzedReductionsRoots.insert(I);
2930 }
2931 /// Checks if the provided list of reduced values was checked already for
2932 /// vectorization.
2934 return AnalyzedReductionVals.contains(hash_value(VL));
2935 }
 2936  /// Adds the list of reduced values to the list of values already checked for
 2937  /// vectorization.
2939 AnalyzedReductionVals.insert(hash_value(VL));
2940 }
2941 /// Clear the list of the analyzed reduction root instructions.
2943 AnalyzedReductionsRoots.clear();
2944 AnalyzedReductionVals.clear();
2945 AnalyzedMinBWVals.clear();
2946 }
2947 /// Checks if the given value is gathered in one of the nodes.
2948 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2949 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2950 }
2951 /// Checks if the given value is gathered in one of the nodes.
2952 bool isGathered(const Value *V) const {
2953 return MustGather.contains(V);
2954 }
 2955  /// Checks if the specified value was not scheduled.
2956 bool isNotScheduled(const Value *V) const {
2957 return NonScheduledFirst.contains(V);
2958 }
2959
2960 /// Check if the value is vectorized in the tree.
2961 bool isVectorized(Value *V) const {
2962 assert(V && "V cannot be nullptr.");
2963 return ScalarToTreeEntries.contains(V);
2964 }
2965
2966 ~BoUpSLP();
2967
2968private:
 2969  /// Determine if a node \p E can be demoted to a smaller type with a
2970 /// truncation. We collect the entries that will be demoted in ToDemote.
2971 /// \param E Node for analysis
2972 /// \param ToDemote indices of the nodes to be demoted.
2973 bool collectValuesToDemote(
2974 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2976 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2977 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2978
 2979  /// Check if the operands on the edges \p Edges of the \p UserTE allow
 2980  /// reordering (i.e. the operands can be reordered because they have only one
 2981  /// user and are reorderable).
 2982  /// \param ReorderableGathers List of all gather nodes that require reordering
 2983  /// (e.g., gathers of extractelements or partially vectorizable loads).
2984 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2985 /// reordering, subset of \p NonVectorized.
2986 bool
2987 canReorderOperands(TreeEntry *UserTE,
2988 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2989 ArrayRef<TreeEntry *> ReorderableGathers,
2990 SmallVectorImpl<TreeEntry *> &GatherOps);
2991
2992 /// Checks if the given \p TE is a gather node with clustered reused scalars
2993 /// and reorders it per given \p Mask.
2994 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2995
2996 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2997 /// if any. If it is not vectorized (gather node), returns nullptr.
2998 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2999 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
3000 TreeEntry *TE = nullptr;
3001 const auto *It = find_if(VL, [&](Value *V) {
3002 for (TreeEntry *E : getTreeEntries(V)) {
3003 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
3004 TE = E;
3005 return true;
3006 }
3007 }
3008 return false;
3009 });
3010 if (It != VL.end()) {
3011 assert(TE->isSame(VL) && "Expected same scalars.");
3012 return TE;
3013 }
3014 return nullptr;
3015 }
3016
3017 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
3018 /// if any. If it is not vectorized (gather node), returns nullptr.
3019 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
3020 unsigned OpIdx) const {
3021 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
3022 const_cast<TreeEntry *>(UserTE), OpIdx);
3023 }
3024
3025 /// Checks if all users of \p I are part of the vectorization tree.
3026 bool areAllUsersVectorized(
3027 Instruction *I,
3028 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3029
3030 /// Return information about the vector formed for the specified index
3031 /// of a vector of (the same) instruction.
3033
3034 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3035 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3036
3037 /// Gets the root instruction for the given node. If the node is a strided
3038 /// load/store node with the reverse order, the root instruction is the last
3039 /// one.
3040 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3041
3042 /// \returns Cast context for the given graph node.
3043 TargetTransformInfo::CastContextHint
3044 getCastContextHint(const TreeEntry &TE) const;
3045
3046 /// \returns the cost of the vectorizable entry.
3047 InstructionCost getEntryCost(const TreeEntry *E,
3048 ArrayRef<Value *> VectorizedVals,
3049 SmallPtrSetImpl<Value *> &CheckedExtracts);
3050
3051 /// This is the recursive part of buildTree.
3052 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3053 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3054
3055 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3056 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3057 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3058 /// returns false, setting \p CurrentOrder to either an empty vector or a
3059 /// non-identity permutation that allows reusing the extract instructions.
3060 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3061 /// extract order.
3062 bool canReuseExtract(ArrayRef<Value *> VL,
3063 SmallVectorImpl<unsigned> &CurrentOrder,
3064 bool ResizeAllowed = false) const;
3065
3066 /// Vectorize a single entry in the tree.
3067 /// \param PostponedPHIs true if the emission of phi nodes needs to be
3068 /// postponed to avoid issues with def-use order.
3069 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3070
3071 /// Returns the vectorized operand node that matches the order of the scalars
3072 /// for operand number \p NodeIdx of entry \p E.
3073 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3074 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3075 unsigned NodeIdx) const {
3076 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3077 }
3078
3079 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3080 /// \p E.
3081 /// \param PostponedPHIs true if the emission of phi nodes needs to be
3082 /// postponed to avoid issues with def-use order.
3083 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3084
3085 /// Create a new vector from a list of scalar values. Produces a sequence
3086 /// which exploits values reused across lanes, and arranges the inserts
3087 /// for ease of later optimization.
3088 template <typename BVTy, typename ResTy, typename... Args>
3089 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3090
3091 /// Create a new vector from a list of scalar values. Produces a sequence
3092 /// which exploits values reused across lanes, and arranges the inserts
3093 /// for ease of later optimization.
3094 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3095 bool PostponedPHIs);
3096
3097 /// Returns the instruction in the bundle, which can be used as a base point
3098 /// for scheduling. Usually it is the last instruction in the bundle, except
3099 /// for the case when all operands are external (in this case, it is the first
3100 /// instruction in the list).
3101 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3102
3103 /// Tries to find extractelement instructions with constant indices from a
3104 /// fixed vector type and gathers such instructions into a bunch, which is
3105 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If the
3106 /// attempt was successful, the matched scalars are replaced by poison values
3107 /// in \p VL for future analysis.
3108 std::optional<TargetTransformInfo::ShuffleKind>
3109 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3110 SmallVectorImpl<int> &Mask) const;
3111
3112 /// Tries to find extractelement instructions with constant indices from a
3113 /// fixed vector type and gathers such instructions into a bunch, which is
3114 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If the
3115 /// attempt was successful, the matched scalars are replaced by poison values
3116 /// in \p VL for future analysis.
3118 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3120 unsigned NumParts) const;
3121
3122 /// Checks if the gathered \p VL can be represented as a single register
3123 /// shuffle(s) of previous tree entries.
3124 /// \param TE Tree entry checked for permutation.
3125 /// \param VL List of scalars (a subset of the TE scalars), checked for
3126 /// permutations. Must form a single-register vector.
3127 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3128 /// requests building the mask using the original vector values, without
3129 /// relying on the potential reordering.
3130 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3131 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3132 std::optional<TargetTransformInfo::ShuffleKind>
3133 isGatherShuffledSingleRegisterEntry(
3134 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3135 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3136 bool ForOrder);
3137
3138 /// Checks if the gathered \p VL can be represented as multi-register
3139 /// shuffle(s) of previous tree entries.
3140 /// \param TE Tree entry checked for permutation.
3141 /// \param VL List of scalars (a subset of the TE scalars), checked for
3142 /// permutations.
3143 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3144 /// requests building the mask using the original vector values, without
3145 /// relying on the potential reordering.
3146 /// \returns per-register series of ShuffleKind, if gathered values can be
3147 /// represented as shuffles of previous tree entries. \p Mask is filled with
3148 /// the shuffle mask (also on a per-register basis).
3150 isGatherShuffledEntry(
3151 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3153 unsigned NumParts, bool ForOrder = false);
3154
3155 /// \returns the cost of gathering (inserting) the values in \p VL into a
3156 /// vector.
3157 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3158 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3159 Type *ScalarTy) const;
3160
3161 /// Set the Builder insert point to one after the last instruction in
3162 /// the bundle.
3163 void setInsertPointAfterBundle(const TreeEntry *E);
3164
3165 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3166 /// specified, the starting vector value is poison.
3167 Value *
3168 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3169 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3170
3171 /// \returns whether the VectorizableTree is fully vectorizable and will
3172 /// be beneficial even if the tree height is tiny.
3173 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3174
3175 /// Run through the list of all gathered loads in the graph and try to find
3176 /// vector loads/masked gathers instead of regular gathers. Later these loads
3177 /// are reshuffled to build the final gathered nodes.
3178 void tryToVectorizeGatheredLoads(
3179 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3180 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3181 8> &GatheredLoads);
3182
3183 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3184 /// users of \p TE and collects the stores. It returns the map from the store
3185 /// pointers to the collected stores.
3187 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3188
3189 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3190 /// stores in \p StoresVec can form a vector instruction. If so it returns
3191 /// true and populates \p ReorderIndices with the shuffle indices of the
3192 /// stores when compared to the sorted vector.
3193 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3194 OrdersType &ReorderIndices) const;
3195
3196 /// Iterates through the users of \p TE, looking for scalar stores that can be
3197 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3198 /// their order and builds an order index vector for each store bundle. It
3199 /// returns all these order vectors found.
3200 /// We run this after the tree has formed, otherwise we may come across user
3201 /// instructions that are not yet in the tree.
3203 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3204
3205 /// Tries to reorder the gathering node for better vectorization
3206 /// opportunities.
3207 void reorderGatherNode(TreeEntry &TE);
3208
3209 struct TreeEntry {
3210 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3211 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3212
3213 /// \returns Common mask for reorder indices and reused scalars.
3214 SmallVector<int> getCommonMask() const {
3215 SmallVector<int> Mask;
3216 inversePermutation(ReorderIndices, Mask);
3217 ::addMask(Mask, ReuseShuffleIndices);
3218 return Mask;
3219 }
3220
3221 /// \returns true if the scalars in VL are equal to this entry.
3222 bool isSame(ArrayRef<Value *> VL) const {
3223 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3224 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3225 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3226 return VL.size() == Mask.size() &&
3227 std::equal(VL.begin(), VL.end(), Mask.begin(),
3228 [Scalars](Value *V, int Idx) {
3229 return (isa<UndefValue>(V) &&
3230 Idx == PoisonMaskElem) ||
3231 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3232 });
3233 };
3234 if (!ReorderIndices.empty()) {
3235 // TODO: implement matching if the nodes are just reordered, still can
3236 // treat the vector as the same if the list of scalars matches VL
3237 // directly, without reordering.
3238 SmallVector<int> Mask;
3239 inversePermutation(ReorderIndices, Mask);
3240 if (VL.size() == Scalars.size())
3241 return IsSame(Scalars, Mask);
3242 if (VL.size() == ReuseShuffleIndices.size()) {
3243 ::addMask(Mask, ReuseShuffleIndices);
3244 return IsSame(Scalars, Mask);
3245 }
3246 return false;
3247 }
3248 return IsSame(Scalars, ReuseShuffleIndices);
3249 }
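  // Worked example (illustrative comment only): with Scalars = {A, B},
  // empty ReorderIndices and ReuseShuffleIndices = {0, 1, 0, 1}, the call
  // isSame({A, B, A, B}) returns true because every VL[i] equals
  // Scalars[Mask[i]], while isSame({A, B, B, A}) returns false since
  // VL[2] == B but Scalars[Mask[2]] == Scalars[0] == A.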
3250
3251 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3252 return isGather() && !UserTreeIndices.empty() &&
3253 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3254 UserTreeIndices.front().UserTE == UserEI.UserTE;
3255 }
3256
3257 /// \returns true if current entry has same operands as \p TE.
3258 bool hasEqualOperands(const TreeEntry &TE) const {
3259 if (TE.getNumOperands() != getNumOperands())
3260 return false;
3261 SmallBitVector Used(getNumOperands());
3262 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3263 unsigned PrevCount = Used.count();
3264 for (unsigned K = 0; K < E; ++K) {
3265 if (Used.test(K))
3266 continue;
3267 if (getOperand(K) == TE.getOperand(I)) {
3268 Used.set(K);
3269 break;
3270 }
3271 }
3272 // Check if we actually found the matching operand.
3273 if (PrevCount == Used.count())
3274 return false;
3275 }
3276 return true;
3277 }
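  // Worked example (illustrative comment only): if this entry has operands
  // {Op0, Op1} and \p TE has the same operand lists in swapped order
  // {Op1, Op0}, hasEqualOperands(TE) still returns true: every operand of TE
  // is greedily matched against a distinct, not-yet-used operand of this
  // entry, so the comparison is order-insensitive.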
3278
3279 /// \return Final vectorization factor for the node. Defined by the total
3280 /// number of vectorized scalars, including those used several times in the
3281 /// entry and counted in \a ReuseShuffleIndices, if any.
3282 unsigned getVectorFactor() const {
3283 if (!ReuseShuffleIndices.empty())
3284 return ReuseShuffleIndices.size();
3285 return Scalars.size();
3286 };
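  // Worked example (illustrative comment only): with Scalars = {A, B} and
  // ReuseShuffleIndices = {0, 1, 0, 1}, getVectorFactor() returns 4; with no
  // reuse indices it simply returns Scalars.size() == 2.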
3287
3288 /// Checks if the current node is a gather node.
3289 bool isGather() const { return State == NeedToGather; }
3290
3291 /// A vector of scalars.
3292 ValueList Scalars;
3293
3294 /// The Scalars are vectorized into this value. It is initialized to Null.
3295 WeakTrackingVH VectorizedValue = nullptr;
3296
3297 /// New vector phi instructions emitted for the vectorized phi nodes.
3298 PHINode *PHI = nullptr;
3299
3300 /// Do we need to gather this sequence or vectorize it
3301 /// (either with vector instruction or with scatter/gather
3302 /// intrinsics for store/load)?
3303 enum EntryState {
3304 Vectorize, ///< The node is regularly vectorized.
3305 ScatterVectorize, ///< Masked scatter/gather node.
3306 StridedVectorize, ///< Strided loads (and stores)
3307 NeedToGather, ///< Gather/buildvector node.
3308 CombinedVectorize, ///< Vectorized node, combined with its user into more
3309 ///< complex node like select/cmp to minmax, mul/add to
3310 ///< fma, etc. Must be used for the following nodes in
3311 ///< the pattern, not the very first one.
3312 };
3313 EntryState State;
3314
3315 /// List of combined opcodes supported by the vectorizer.
3316 enum CombinedOpcode {
3317 NotCombinedOp = -1,
3318 MinMax = Instruction::OtherOpsEnd + 1,
3319 };
3320 CombinedOpcode CombinedOp = NotCombinedOp;
3321
3322 /// Does this sequence require some shuffling?
3323 SmallVector<int, 4> ReuseShuffleIndices;
3324
3325 /// Does this entry require reordering?
3326 SmallVector<unsigned, 4> ReorderIndices;
3327
3328 /// Points back to the VectorizableTree.
3329 ///
3330 /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
3331 /// to be a pointer and needs to be able to initialize the child iterator.
3332 /// Thus we need a reference back to the container to translate the indices
3333 /// to entries.
3334 VecTreeTy &Container;
3335
3336 /// The TreeEntry index containing the user of this entry. We can actually
3337 /// have multiple users so the data structure is not truly a tree.
3338 SmallVector<EdgeInfo, 1> UserTreeIndices;
3339
3340 /// The index of this treeEntry in VectorizableTree.
3341 unsigned Idx = 0;
3342
3343 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3344 /// other nodes as a series of insertvector instructions.
3345 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3346
3347 private:
3348 /// The operands of each instruction in each lane Operands[op_index][lane].
3349 /// Note: This helps avoid the replication of the code that performs the
3350 /// reordering of operands during buildTree_rec() and vectorizeTree().
3351 SmallVector<ValueList, 2> Operands;
3352
3353 /// MainOp and AltOp are recorded inside. S should be obtained from
3354 /// newTreeEntry.
3355 InstructionsState S = InstructionsState::invalid();
3356
3357 /// Interleaving factor for interleaved loads Vectorize nodes.
3358 unsigned InterleaveFactor = 0;
3359
3360 public:
3361 /// Returns interleave factor for interleave nodes.
3362 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3363 /// Sets interleaving factor for the interleaving nodes.
3364 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3365
3366 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3367 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3368 if (Operands.size() < OpIdx + 1)
3369 Operands.resize(OpIdx + 1);
3370 assert(Operands[OpIdx].empty() && "Already resized?");
3371 assert(OpVL.size() <= Scalars.size() &&
3372 "Number of operands is greater than the number of scalars.");
3373 Operands[OpIdx].resize(OpVL.size());
3374 copy(OpVL, Operands[OpIdx].begin());
3375 }
3376
3377 /// Set this bundle's operand from Scalars.
3378 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3379 VLOperands Ops(Scalars, S, R);
3380 if (RequireReorder)
3381 Ops.reorder();
3382 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
3383 setOperand(I, Ops.getVL(I));
3384 }
3385
3386 /// Reorders operands of the node to the given mask \p Mask.
3387 void reorderOperands(ArrayRef<int> Mask) {
3388 for (ValueList &Operand : Operands)
3389 reorderScalars(Operand, Mask);
3390 }
3391
3392 /// \returns the \p OpIdx operand of this TreeEntry.
3393 ValueList &getOperand(unsigned OpIdx) {
3394 assert(OpIdx < Operands.size() && "Off bounds");
3395 return Operands[OpIdx];
3396 }
3397
3398 /// \returns the \p OpIdx operand of this TreeEntry.
3399 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3400 assert(OpIdx < Operands.size() && "Off bounds");
3401 return Operands[OpIdx];
3402 }
3403
3404 /// \returns the number of operands.
3405 unsigned getNumOperands() const { return Operands.size(); }
3406
3407 /// \return the single \p OpIdx operand.
3408 Value *getSingleOperand(unsigned OpIdx) const {
3409 assert(OpIdx < Operands.size() && "Off bounds");
3410 assert(!Operands[OpIdx].empty() && "No operand available");
3411 return Operands[OpIdx][0];
3412 }
3413
3414 /// Some of the instructions in the list have alternate opcodes.
3415 bool isAltShuffle() const { return S.isAltShuffle(); }
3416
3417 bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
3418
3419 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3420 /// alternate) opcode as the main operation of the bundle, the key is \p Op.
3421 /// Otherwise the key is the main operation.
3422 Value *isOneOf(Value *Op) const {
3423 auto *I = dyn_cast<Instruction>(Op);
3424 if (I && isOpcodeOrAlt(I))
3425 return Op;
3426 return S.getMainOp();
3427 }
3428
3429 void setOperations(const InstructionsState &S) {
3430 assert(S && "InstructionsState is invalid.");
3431 this->S = S;
3432 }
3433
3434 Instruction *getMainOp() const { return S.getMainOp(); }
3435
3436 Instruction *getAltOp() const { return S.getAltOp(); }
3437
3438 /// The main/alternate opcodes for the list of instructions.
3439 unsigned getOpcode() const { return S.getOpcode(); }
3440
3441 unsigned getAltOpcode() const { return S.getAltOpcode(); }
3442
3443 bool hasState() const { return S.valid(); }
3444
3445 /// When ReuseShuffleIndices is empty it just returns the position of \p V
3446 /// within the vector of Scalars. Otherwise, it remaps \p V through its reuse index.
3447 int findLaneForValue(Value *V) const {
3448 unsigned FoundLane = getVectorFactor();
3449 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3450 std::advance(It, 1)) {
3451 if (*It != V)
3452 continue;
3453 FoundLane = std::distance(Scalars.begin(), It);
3454 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3455 if (!ReorderIndices.empty())
3456 FoundLane = ReorderIndices[FoundLane];
3457 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3458 if (ReuseShuffleIndices.empty())
3459 break;
3460 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3461 RIt != ReuseShuffleIndices.end()) {
3462 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3463 break;
3464 }
3465 }
3466 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3467 return FoundLane;
3468 }
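  // Worked example (illustrative comment only): with Scalars = {A, B, C, D},
  // empty ReorderIndices and ReuseShuffleIndices = {2, 3, 0, 1},
  // findLaneForValue(C) first finds C at position 2 in Scalars and then
  // locates the value 2 inside ReuseShuffleIndices at index 0, so the
  // returned vector lane is 0. With ReorderIndices = {1, 0, 3, 2} and no
  // reuse indices, the same query returns ReorderIndices[2] == 3.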
3469
3470 /// Build a shuffle mask for graph entry which represents a merge of main
3471 /// and alternate operations.
3472 void
3473 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3474 SmallVectorImpl<int> &Mask,
3475 SmallVectorImpl<Value *> *OpScalars = nullptr,
3476 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3477
3478 /// Return true if this is a non-power-of-2 node.
3479 bool isNonPowOf2Vec() const {
3480 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3481 return IsNonPowerOf2;
3482 }
3483
3484 /// Return true if this node neither vectorizes a power-of-2 number of
3485 /// elements nor forms whole vector registers.
3486 bool
3487 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3488 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3489 TTI, getValueType(Scalars.front()), Scalars.size());
3490 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3491 "Reshuffling not supported with non-power-of-2 vectors yet.");
3492 return IsNonPowerOf2;
3493 }
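  // Illustrative example (the 128-bit register width below is an assumption,
  // not taken from this file): with i32 scalars and 4 elements per vector
  // register, a node with 8 scalars returns false here (8 is a power of 2),
  // while a node with 6 scalars likely returns true, because 6 is neither a
  // power of 2 nor splits evenly into whole 4-element registers, so padding
  // would be required.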
3494
3495 Value *getOrdered(unsigned Idx) const {
3496 assert(isGather() && "Must be used only for buildvectors/gathers.");
3497 if (ReorderIndices.empty())
3498 return Scalars[Idx];
3499 SmallVector<int> Mask;
3500 inversePermutation(ReorderIndices, Mask);
3501 return Scalars[Mask[Idx]];
3502 }
3503
3504#ifndef NDEBUG
3505 /// Debug printer.
3506 LLVM_DUMP_METHOD void dump() const {
3507 dbgs() << Idx << ".\n";
3508 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3509 dbgs() << "Operand " << OpI << ":\n";
3510 for (const Value *V : Operands[OpI])
3511 dbgs().indent(2) << *V << "\n";
3512 }
3513 dbgs() << "Scalars: \n";
3514 for (Value *V : Scalars)
3515 dbgs().indent(2) << *V << "\n";
3516 dbgs() << "State: ";
3517 switch (State) {
3518 case Vectorize:
3519 if (InterleaveFactor > 0) {
3520 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3521 << "\n";
3522 } else {
3523 dbgs() << "Vectorize\n";
3524 }
3525 break;
3526 case ScatterVectorize:
3527 dbgs() << "ScatterVectorize\n";
3528 break;
3529 case StridedVectorize:
3530 dbgs() << "StridedVectorize\n";
3531 break;
3532 case NeedToGather:
3533 dbgs() << "NeedToGather\n";
3534 break;
3535 case CombinedVectorize:
3536 dbgs() << "CombinedVectorize\n";
3537 break;
3538 }
3539 if (S) {
3540 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
3541 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
3542 } else {
3543 dbgs() << "MainOp: NULL\n";
3544 dbgs() << "AltOp: NULL\n";
3545 }
3546 dbgs() << "VectorizedValue: ";
3547 if (VectorizedValue)
3548 dbgs() << *VectorizedValue << "\n";
3549 else
3550 dbgs() << "NULL\n";
3551 dbgs() << "ReuseShuffleIndices: ";
3552 if (ReuseShuffleIndices.empty())
3553 dbgs() << "Empty";
3554 else
3555 for (int ReuseIdx : ReuseShuffleIndices)
3556 dbgs() << ReuseIdx << ", ";
3557 dbgs() << "\n";
3558 dbgs() << "ReorderIndices: ";
3559 for (unsigned ReorderIdx : ReorderIndices)
3560 dbgs() << ReorderIdx << ", ";
3561 dbgs() << "\n";
3562 dbgs() << "UserTreeIndices: ";
3563 for (const auto &EInfo : UserTreeIndices)
3564 dbgs() << EInfo << ", ";
3565 dbgs() << "\n";
3566 if (!CombinedEntriesWithIndices.empty()) {
3567 dbgs() << "Combined entries: ";
3568 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3569 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3570 });
3571 dbgs() << "\n";
3572 }
3573 }
3574#endif
3575 };
3576
3577#ifndef NDEBUG
3578 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3579 InstructionCost VecCost, InstructionCost ScalarCost,
3580 StringRef Banner) const {
3581 dbgs() << "SLP: " << Banner << ":\n";
3582 E->dump();
3583 dbgs() << "SLP: Costs:\n";
3584 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3585 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3586 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3587 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3588 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3589 }
3590#endif
3591
3592 /// Create a new VectorizableTree entry.
3593 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3594 std::optional<ScheduleData *> Bundle,
3595 const InstructionsState &S,
3596 const EdgeInfo &UserTreeIdx,
3597 ArrayRef<int> ReuseShuffleIndices = {},
3598 ArrayRef<unsigned> ReorderIndices = {},
3599 unsigned InterleaveFactor = 0) {
3600 TreeEntry::EntryState EntryState =
3601 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3602 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3603 ReuseShuffleIndices, ReorderIndices);
3604 if (E && InterleaveFactor > 0)
3605 E->setInterleave(InterleaveFactor);
3606 return E;
3607 }
3608
3609 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3610 TreeEntry::EntryState EntryState,
3611 std::optional<ScheduleData *> Bundle,
3612 const InstructionsState &S,
3613 const EdgeInfo &UserTreeIdx,
3614 ArrayRef<int> ReuseShuffleIndices = {},
3615 ArrayRef<unsigned> ReorderIndices = {}) {
3616 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3617 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3618 "Need to vectorize gather entry?");
3619 // Gathered loads still gathered? Do not create entry, use the original one.
3620 if (GatheredLoadsEntriesFirst.has_value() &&
3621 EntryState == TreeEntry::NeedToGather && S &&
3622 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3623 !UserTreeIdx.UserTE)
3624 return nullptr;
3625 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3626 TreeEntry *Last = VectorizableTree.back().get();
3627 Last->Idx = VectorizableTree.size() - 1;
3628 Last->State = EntryState;
3629 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3630 // for non-power-of-two vectors.
3631 assert(
3632 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3633 ReuseShuffleIndices.empty()) &&
3634 "Reshuffling scalars not yet supported for nodes with padding");
3635 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3636 ReuseShuffleIndices.end());
3637 if (ReorderIndices.empty()) {
3638 Last->Scalars.assign(VL.begin(), VL.end());
3639 if (S)
3640 Last->setOperations(S);
3641 } else {
3642 // Reorder scalars and build final mask.
3643 Last->Scalars.assign(VL.size(), nullptr);
3644 transform(ReorderIndices, Last->Scalars.begin(),
3645 [VL](unsigned Idx) -> Value * {
3646 if (Idx >= VL.size())
3647 return UndefValue::get(VL.front()->getType());
3648 return VL[Idx];
3649 });
3650 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3651 if (S)
3652 Last->setOperations(S);
3653 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3654 }
3655 if (!Last->isGather()) {
3656 SmallPtrSet<Value *, 4> Processed;
3657 for (Value *V : VL) {
3658 if (isa<PoisonValue>(V))
3659 continue;
3660 auto It = ScalarToTreeEntries.find(V);
3661 assert(
3662 (It == ScalarToTreeEntries.end() ||
3663 (It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
3665 "Scalar already in tree!");
3666 if (It == ScalarToTreeEntries.end()) {
3667 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
3668 (void)Processed.insert(V);
3669 } else if (Processed.insert(V).second) {
3670 assert(!is_contained(It->getSecond(), Last) &&
3671 "Value already associated with the node.");
3672 It->getSecond().push_back(Last);
3673 }
3674 }
3675 // Update the scheduler bundle to point to this TreeEntry.
3676 ScheduleData *BundleMember = *Bundle;
3677 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3678 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3679 doesNotNeedToSchedule(VL)) &&
3680 "Bundle and VL out of sync");
3681 if (BundleMember) {
3682 for (Value *V : VL) {
3683 if (doesNotNeedToBeScheduled(V))
3684 continue;
3685 if (!BundleMember)
3686 continue;
3687 BundleMember->TE = Last;
3688 BundleMember = BundleMember->NextInBundle;
3689 }
3690 }
3691 assert(!BundleMember && "Bundle and VL out of sync");
3692 } else {
3693 // Build a map for gathered scalars to the nodes where they are used.
3694 bool AllConstsOrCasts = true;
3695 for (Value *V : VL)
3696 if (!isConstant(V)) {
3697 auto *I = dyn_cast<CastInst>(V);
3698 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3699 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3700 !UserTreeIdx.UserTE->isGather())
3701 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3702 }
3703 if (AllConstsOrCasts)
3704 CastMaxMinBWSizes =
3705 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3706 MustGather.insert(VL.begin(), VL.end());
3707 }
3708
3709 if (UserTreeIdx.UserTE)
3710 Last->UserTreeIndices.push_back(UserTreeIdx);
3711 return Last;
3712 }
3713
3714 /// -- Vectorization State --
3715 /// Holds all of the tree entries.
3716 TreeEntry::VecTreeTy VectorizableTree;
3717
3718#ifndef NDEBUG
3719 /// Debug printer.
3720 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3721 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3722 VectorizableTree[Id]->dump();
3723 dbgs() << "\n";
3724 }
3725 }
3726#endif
3727
3728 /// Get list of vector entries, associated with the value \p V.
3729 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
3730 assert(V && "V cannot be nullptr.");
3731 auto It = ScalarToTreeEntries.find(V);
3732 if (It == ScalarToTreeEntries.end())
3733 return {};
3734 return It->getSecond();
3735 }
3736
3737 /// Returns first vector node for value \p V, matching values \p VL.
3738 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
3739 bool SameVF = false) const {
3740 assert(V && "V cannot be nullptr.");
3741 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
3742 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
3743 return TE;
3744 return nullptr;
3745 }
3746
3747 /// Check that the operand node of an alternate node does not generate a
3748 /// buildvector sequence. If it does, it is probably not worth building an
3749 /// alternate shuffle when the number of buildvector operands plus the
3750 /// alternate instruction exceeds the number of buildvector instructions.
3751 /// \param S the instructions state of the analyzed values.
3752 /// \param VL list of the instructions with alternate opcodes.
3753 bool areAltOperandsProfitable(const InstructionsState &S,
3754 ArrayRef<Value *> VL) const;
3755
3756 /// Checks if the specified list of the instructions/values can be vectorized
3757 /// and fills required data before actual scheduling of the instructions.
3758 TreeEntry::EntryState
3759 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3760 bool IsScatterVectorizeUserTE,
3761 OrdersType &CurrentOrder,
3762 SmallVectorImpl<Value *> &PointerOps);
3763
3764 /// Maps a specific scalar to its tree entry(ies).
3766
3767 /// Maps a value to the proposed vectorizable size.
3768 SmallDenseMap<Value *, unsigned> InstrElementSize;
3769
3770 /// A list of scalars that we found that we need to keep as scalars.
3771 ValueSet MustGather;
3772
3773 /// A set of first non-schedulable values.
3774 ValueSet NonScheduledFirst;
3775
3776 /// A map between the vectorized entries and the last instructions in the
3777 /// bundles. The bundles are built in use order, not in the def order of the
3778 /// instructions. So, we cannot rely directly on the last instruction in the
3779 /// bundle being the last instruction in program order during the
3780 /// vectorization process, since the basic blocks are modified; the last
3781 /// instructions need to be pre-gathered beforehand.
3782 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3783
3784 /// List of gather nodes, depending on other gather/vector nodes, which should
3785 /// be emitted after the vector instruction emission process to correctly
3786 /// handle order of the vector instructions and shuffles.
3787 SetVector<const TreeEntry *> PostponedGathers;
3788
3789 using ValueToGatherNodesMap =
3791 ValueToGatherNodesMap ValueToGatherNodes;
3792
3793 /// A list of the load entries (node indices), which can be vectorized using
3794 /// strided or masked gather approach, but are attempted to be represented as
3795 /// contiguous loads.
3796 SetVector<unsigned> LoadEntriesToVectorize;
3797
3798 /// true if graph nodes transforming mode is on.
3799 bool IsGraphTransformMode = false;
3800
3801 /// The index of the first gathered load entry in the VectorizeTree.
3802 std::optional<unsigned> GatheredLoadsEntriesFirst;
3803
3804 /// This POD struct describes one external user in the vectorized tree.
3805 struct ExternalUser {
3806 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
3807 : Scalar(S), User(U), E(E), Lane(L) {}
3808
3809 /// Which scalar in our function.
3810 Value *Scalar = nullptr;
3811
3812 /// Which user that uses the scalar.
3813 llvm::User *User = nullptr;
3814
3815 /// Vector node, the value is part of.
3816 const TreeEntry &E;
3817
3818 /// Which lane does the scalar belong to.
3819 int Lane;
3820 };
3821 using UserList = SmallVector<ExternalUser, 16>;
3822
3823 /// Checks if two instructions may access the same memory.
3824 ///
3825 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3826 /// is invariant in the calling loop.
3827 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3828 Instruction *Inst2) {
3829 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3830 return true;
3831 // First check if the result is already in the cache.
3832 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3833 auto It = AliasCache.find(Key);
3834 if (It != AliasCache.end())
3835 return It->second;
3836 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3837 // Store the result in the cache.
3838 AliasCache.try_emplace(Key, Aliased);
3839 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3840 return Aliased;
3841 }
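  // Note (illustrative comment only): non-simple accesses (volatile or
  // atomic) and queries without a known pointer conservatively report
  // aliasing. A single BatchAA query for the pair (Inst1, Inst2) is cached
  // under both key orders, so a later isAliased(Loc2, Inst2, Inst1) call
  // reuses the same result instead of re-querying alias analysis.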
3842
3843 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3844
3845 /// Cache for alias results.
3846 /// TODO: consider moving this to the AliasAnalysis itself.
3848
3849 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3850 // globally through SLP because we don't perform any action which
3851 // invalidates capture results.
3852 BatchAAResults BatchAA;
3853
3854 /// Temporary store for deleted instructions. Instructions will be deleted
3855 /// eventually when the BoUpSLP is destructed. The deferral is required to
3856 /// ensure that there are no incorrect collisions in the AliasCache, which
3857 /// can happen if a new instruction is allocated at the same address as a
3858 /// previously deleted instruction.
3859 DenseSet<Instruction *> DeletedInstructions;
3860
3861 /// Set of the instruction, being analyzed already for reductions.
3862 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3863
3864 /// Set of hashes for the list of reduction values already being analyzed.
3865 DenseSet<size_t> AnalyzedReductionVals;
3866
3867 /// Values already analyzed for minimal bitwidth and found to be
3868 /// non-profitable.
3869 DenseSet<Value *> AnalyzedMinBWVals;
3870
3871 /// A list of values that need to be extracted out of the tree.
3872 /// This list holds pairs of (Internal Scalar : External User). External User
3873 /// can be nullptr, it means that this Internal Scalar will be used later,
3874 /// after vectorization.
3875 UserList ExternalUses;
3876
3877 /// A list of GEPs which can be replaced by scalar GEPs instead of
3878 /// extractelement instructions.
3879 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3880
3881 /// Values used only by @llvm.assume calls.
3883
3884 /// Holds all of the instructions that we gathered, shuffle instructions and
3885 /// extractelements.
3886 SetVector<Instruction *> GatherShuffleExtractSeq;
3887
3888 /// A list of blocks that we are going to CSE.
3889 DenseSet<BasicBlock *> CSEBlocks;
3890
3891 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3892 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3893
3894 /// Contains all scheduling relevant data for an instruction.
3895 /// A ScheduleData either represents a single instruction or a member of an
3896 /// instruction bundle (= a group of instructions which is combined into a
3897 /// vector instruction).
3898 struct ScheduleData {
3899 // The initial value for the dependency counters. It means that the
3900 // dependencies are not calculated yet.
3901 enum { InvalidDeps = -1 };
3902
3903 ScheduleData() = default;
3904
3905 void init(int BlockSchedulingRegionID, Instruction *I) {
3906 FirstInBundle = this;
3907 NextInBundle = nullptr;
3908 NextLoadStore = nullptr;
3909 IsScheduled = false;
3910 SchedulingRegionID = BlockSchedulingRegionID;
3911 clearDependencies();
3912 Inst = I;
3913 TE = nullptr;
3914 }
3915
3916 /// Verify basic self consistency properties
3917 void verify() {
3918 if (hasValidDependencies()) {
3919 assert(UnscheduledDeps <= Dependencies && "invariant");
3920 } else {
3921 assert(UnscheduledDeps == Dependencies && "invariant");
3922 }
3923
3924 if (IsScheduled) {
3925 assert(isSchedulingEntity() &&
3926 "unexpected scheduled state");
3927 for (const ScheduleData *BundleMember = this; BundleMember;
3928 BundleMember = BundleMember->NextInBundle) {
3929 assert(BundleMember->hasValidDependencies() &&
3930 BundleMember->UnscheduledDeps == 0 &&
3931 "unexpected scheduled state");
3932 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3933 "only bundle is marked scheduled");
3934 }
3935 }
3936
3937 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3938 "all bundle members must be in same basic block");
3939 }
3940
3941 /// Returns true if the dependency information has been calculated.
3942 /// Note that dependency validity can vary between instructions within
3943 /// a single bundle.
3944 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3945
3946 /// Returns true for single instructions and for bundle representatives
3947 /// (= the head of a bundle).
3948 bool isSchedulingEntity() const { return FirstInBundle == this; }
3949
3950 /// Returns true if it represents an instruction bundle and not only a
3951 /// single instruction.
3952 bool isPartOfBundle() const {
3953 return NextInBundle != nullptr || FirstInBundle != this || TE;
3954 }
3955
3956 /// Returns true if it is ready for scheduling, i.e. it has no more
3957 /// unscheduled depending instructions/bundles.
3958 bool isReady() const {
3959 assert(isSchedulingEntity() &&
3960 "can't consider non-scheduling entity for ready list");
3961 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3962 }
3963
3964 /// Modifies the number of unscheduled dependencies for this instruction,
3965 /// and returns the number of remaining dependencies for the containing
3966 /// bundle.
3967 int incrementUnscheduledDeps(int Incr) {
3968 assert(hasValidDependencies() &&
3969 "increment of unscheduled deps would be meaningless");
3970 UnscheduledDeps += Incr;
3971 return FirstInBundle->unscheduledDepsInBundle();
3972 }
3973
3974 /// Sets the number of unscheduled dependencies to the number of
3975 /// dependencies.
3976 void resetUnscheduledDeps() {
3977 UnscheduledDeps = Dependencies;
3978 }
3979
3980 /// Clears all dependency information.
3981 void clearDependencies() {
3982 Dependencies = InvalidDeps;
3983 resetUnscheduledDeps();
3984 MemoryDependencies.clear();
3985 ControlDependencies.clear();
3986 }
3987
3988 int unscheduledDepsInBundle() const {
3989 assert(isSchedulingEntity() && "only meaningful on the bundle");
3990 int Sum = 0;
3991 for (const ScheduleData *BundleMember = this; BundleMember;
3992 BundleMember = BundleMember->NextInBundle) {
3993 if (BundleMember->UnscheduledDeps == InvalidDeps)
3994 return InvalidDeps;
3995 Sum += BundleMember->UnscheduledDeps;
3996 }
3997 return Sum;
3998 }
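  // Worked example (illustrative comment only): for a two-member bundle whose
  // members currently have UnscheduledDeps of 2 and 1,
  // unscheduledDepsInBundle() on the bundle head returns 3. As scheduling
  // progresses and incrementUnscheduledDeps(-1) is applied to the members,
  // the sum eventually reaches 0 and, provided the bundle itself has not been
  // scheduled yet, isReady() becomes true. If any member still has
  // InvalidDeps, the sum is reported as InvalidDeps instead.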
3999
4000 void dump(raw_ostream &os) const {
4001 if (!isSchedulingEntity()) {
4002 os << "/ " << *Inst;
4003 } else if (NextInBundle) {
4004 os << '[' << *Inst;
4005 ScheduleData *SD = NextInBundle;
4006 while (SD) {
4007 os << ';' << *SD->Inst;
4008 SD = SD->NextInBundle;
4009 }
4010 os << ']';
4011 } else {
4012 os << *Inst;
4013 }
4014 }
4015
4016 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
4017
4018 Instruction *Inst = nullptr;
4019
4020 /// The TreeEntry that this instruction corresponds to.
4021 TreeEntry *TE = nullptr;
4022
4023 /// Points to the head in an instruction bundle (and always to this for
4024 /// single instructions).
4025 ScheduleData *FirstInBundle = nullptr;
4026
4027 /// Single linked list of all instructions in a bundle. Null if it is a
4028 /// single instruction.
4029 ScheduleData *NextInBundle = nullptr;
4030
4031 /// Single linked list of all memory instructions (e.g. load, store, call)
4032 /// in the block - until the end of the scheduling region.
4033 ScheduleData *NextLoadStore = nullptr;
4034
4035 /// The dependent memory instructions.
4036 /// This list is derived on demand in calculateDependencies().
4037 SmallVector<ScheduleData *, 4> MemoryDependencies;
4038
4039 /// List of instructions which this instruction could be control dependent
4040 /// on. Allowing such nodes to be scheduled below this one could introduce
4041 /// a runtime fault which didn't exist in the original program.
4042 /// ex: this is a load or udiv following a readonly call which inf loops
4043 SmallVector<ScheduleData *, 4> ControlDependencies;
4044
4045 /// This ScheduleData is in the current scheduling region if this matches
4046 /// the current SchedulingRegionID of BlockScheduling.
4047 int SchedulingRegionID = 0;
4048
4049 /// Used for getting a "good" final ordering of instructions.
4050 int SchedulingPriority = 0;
4051
4052 /// The number of dependencies. Consists of the number of users of the
4053 /// instruction plus the number of dependent memory instructions (if any).
4054 /// This value is calculated on demand.
4055 /// If InvalidDeps, the number of dependencies is not calculated yet.
4056 int Dependencies = InvalidDeps;
4057
4058 /// The number of dependencies minus the number of dependencies of scheduled
4059 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4060 /// for scheduling.
4061 /// Note that this is negative as long as Dependencies is not calculated.
4062 int UnscheduledDeps = InvalidDeps;
4063
4064 /// True if this instruction is scheduled (or considered as scheduled in the
4065 /// dry-run).
4066 bool IsScheduled = false;
4067 };
4068
4069#ifndef NDEBUG
4070 friend inline raw_ostream &operator<<(raw_ostream &os,
4071 const BoUpSLP::ScheduleData &SD) {
4072 SD.dump(os);
4073 return os;
4074 }
4075#endif
4076
4077 friend struct GraphTraits<BoUpSLP *>;
4078 friend struct DOTGraphTraits<BoUpSLP *>;
4079
4080 /// Contains all scheduling data for a basic block.
4081 /// It does not schedule instructions which are not memory read/write
4082 /// instructions and whose operands are either constants, arguments, phis,
4083 /// or instructions from other blocks, or whose users are phis or in other
4084 /// blocks. The resulting vector instructions can be placed at the
4085 /// beginning of the basic block without scheduling (if the operands do not
4086 /// need to be scheduled) or at the end of the block (if the users are
4087 /// outside of the block). This saves some compile time and memory used by
4088 /// the compiler.
4089 /// ScheduleData is assigned for each instruction in between the boundaries of
4090 /// the tree entry, even for those which are not part of the graph. It is
4091 /// required to correctly follow the dependencies between the instructions
4092 /// and to schedule them correctly. ScheduleData is not allocated for
4093 /// instructions which do not require scheduling, like phis, nodes with only
4094 /// extractelements/insertelements, or nodes whose instructions have
4095 /// uses/operands outside of the block.
4096 struct BlockScheduling {
4097 BlockScheduling(BasicBlock *BB)
4098 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4099
4100 void clear() {
4101 ReadyInsts.clear();
4102 ScheduleStart = nullptr;
4103 ScheduleEnd = nullptr;
4104 FirstLoadStoreInRegion = nullptr;
4105 LastLoadStoreInRegion = nullptr;
4106 RegionHasStackSave = false;
4107
4108 // Reduce the maximum schedule region size by the size of the
4109 // previous scheduling run.
4110 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4111 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4112 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4113 ScheduleRegionSize = 0;
4114
4115 // Make a new scheduling region, i.e. all existing ScheduleData is not
4116 // in the new region yet.
4117 ++SchedulingRegionID;
4118 }
4119
4120 ScheduleData *getScheduleData(Instruction *I) {
4121 if (BB != I->getParent())
4122 // Avoid lookup if can't possibly be in map.
4123 return nullptr;
4124 ScheduleData *SD = ScheduleDataMap.lookup(I);
4125 if (SD && isInSchedulingRegion(SD))
4126 return SD;
4127 return nullptr;
4128 }
4129
4130 ScheduleData *getScheduleData(Value *V) {
4131 if (auto *I = dyn_cast<Instruction>(V))
4132 return getScheduleData(I);
4133 return nullptr;
4134 }
4135
4136 bool isInSchedulingRegion(ScheduleData *SD) const {
4137 return SD->SchedulingRegionID == SchedulingRegionID;
4138 }
4139
4140 /// Marks an instruction as scheduled and puts all dependent ready
4141 /// instructions into the ready-list.
4142 template <typename ReadyListType>
4143 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4144 SD->IsScheduled = true;
4145 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4146
4147 for (ScheduleData *BundleMember = SD; BundleMember;
4148 BundleMember = BundleMember->NextInBundle) {
4149
4150 // Handle the def-use chain dependencies.
4151
4152 // Decrement the unscheduled counter and insert to ready list if ready.
4153 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4154 ScheduleData *OpDef = getScheduleData(I);
4155 if (OpDef && OpDef->hasValidDependencies() &&
4156 OpDef->incrementUnscheduledDeps(-1) == 0) {
4157 // There are no more unscheduled dependencies after
4158 // decrementing, so we can put the dependent instruction
4159 // into the ready list.
4160 ScheduleData *DepBundle = OpDef->FirstInBundle;
4161 assert(!DepBundle->IsScheduled &&
4162 "already scheduled bundle gets ready");
4163 ReadyList.insert(DepBundle);
4165 << "SLP: gets ready (def): " << *DepBundle << "\n");
4166 }
4167 };
4168
4169 // If BundleMember is a vector bundle, its operands may have been
4170 // reordered during buildTree(). We therefore need to get its operands
4171 // through the TreeEntry.
4172 if (TreeEntry *TE = BundleMember->TE) {
4173 // Need to search for the lane since the tree entry can be reordered.
4174 auto *In = BundleMember->Inst;
4175 int Lane = std::distance(TE->Scalars.begin(),
4176 find(TE->Scalars, In));
4177 assert(Lane >= 0 && "Lane not set");
4178
4179 // Since vectorization tree is being built recursively this assertion
4180 // ensures that the tree entry has all operands set before reaching
4181 // this code. Couple of exceptions known at the moment are extracts
4182 // where their second (immediate) operand is not added. Since
4183 // immediates do not affect scheduler behavior this is considered
4184 // okay.
4185 assert(
4186 In &&
4187 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4188 In->getNumOperands() == TE->getNumOperands()) &&
4189 "Missed TreeEntry operands?");
4190
4191 for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
4192 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4193 DecrUnsched(I);
4194 } else {
4195 // If BundleMember is a stand-alone instruction, no operand reordering
4196 // has taken place, so we directly access its operands.
4197 for (Use &U : BundleMember->Inst->operands())
4198 if (auto *I = dyn_cast<Instruction>(U.get()))
4199 DecrUnsched(I);
4200 }
4201 // Handle the memory dependencies.
4202 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4203 if (MemoryDepSD->hasValidDependencies() &&
4204 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4205 // There are no more unscheduled dependencies after decrementing,
4206 // so we can put the dependent instruction into the ready list.
4207 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4208 assert(!DepBundle->IsScheduled &&
4209 "already scheduled bundle gets ready");
4210 ReadyList.insert(DepBundle);
4212 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4213 }
4214 }
4215 // Handle the control dependencies.
4216 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4217 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4218 // There are no more unscheduled dependencies after decrementing,
4219 // so we can put the dependent instruction into the ready list.
4220 ScheduleData *DepBundle = DepSD->FirstInBundle;
4221 assert(!DepBundle->IsScheduled &&
4222 "already scheduled bundle gets ready");
4223 ReadyList.insert(DepBundle);
4225 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4226 }
4227 }
4228 }
4229 }
4230
4231 /// Verify basic self consistency properties of the data structure.
4232 void verify() {
4233 if (!ScheduleStart)
4234 return;
4235
4236 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4237 ScheduleStart->comesBefore(ScheduleEnd) &&
4238 "Not a valid scheduling region?");
4239
4240 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4241 auto *SD = getScheduleData(I);
4242 if (!SD)
4243 continue;
4244 assert(isInSchedulingRegion(SD) &&
4245 "primary schedule data not in window?");
4246 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4247 "entire bundle in window!");
4248 SD->verify();
4249 }
4250
4251 for (auto *SD : ReadyInsts) {
4252 assert(SD->isSchedulingEntity() && SD->isReady() &&
4253 "item in ready list not ready?");
4254 (void)SD;
4255 }
4256 }
4257
4258 /// Put all instructions into the ReadyList which are ready for scheduling.
4259 template <typename ReadyListType>
4260 void initialFillReadyList(ReadyListType &ReadyList) {
4261 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4262 ScheduleData *SD = getScheduleData(I);
4263 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4264 SD->isReady()) {
4265 ReadyList.insert(SD);
4267 << "SLP: initially in ready list: " << *SD << "\n");
4268 }
4269 }
4270 }
4271
4272 /// Build a bundle from the ScheduleData nodes corresponding to the
4273 /// scalar instruction for each lane.
4274 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4275
4276 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4277 /// cyclic dependencies. This is only a dry-run, no instructions are
4278 /// actually moved at this stage.
4279 /// \returns the scheduling bundle. The returned Optional value is not
4280 /// std::nullopt if \p VL is allowed to be scheduled.
4281 std::optional<ScheduleData *>
4282 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4283 const InstructionsState &S);
4284
4285 /// Un-bundles a group of instructions.
4286 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4287
4288 /// Allocates schedule data chunk.
4289 ScheduleData *allocateScheduleDataChunks();
4290
4291 /// Extends the scheduling region so that V is inside the region.
4292 /// \returns true if the region size is within the limit.
4293 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4294
4295 /// Initialize the ScheduleData structures for new instructions in the
4296 /// scheduling region.
4297 void initScheduleData(Instruction *FromI, Instruction *ToI,
4298 ScheduleData *PrevLoadStore,
4299 ScheduleData *NextLoadStore);
4300
4301 /// Updates the dependency information of a bundle and of all instructions/
4302 /// bundles which depend on the original bundle.
4303 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4304 BoUpSLP *SLP);
4305
4306 /// Sets all instructions in the scheduling region to un-scheduled.
4307 void resetSchedule();
4308
4309 BasicBlock *BB;
4310
4311 /// Simple memory allocation for ScheduleData.
4313
4314 /// The size of a ScheduleData array in ScheduleDataChunks.
4315 int ChunkSize;
4316
4317 /// The allocator position in the current chunk, which is the last entry
4318 /// of ScheduleDataChunks.
4319 int ChunkPos;
4320
4321 /// Attaches ScheduleData to Instruction.
4322 /// Note that the mapping survives during all vectorization iterations, i.e.
4323 /// ScheduleData structures are recycled.
4324 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4325
4326 /// The ready-list for scheduling (only used for the dry-run).
4327 SetVector<ScheduleData *> ReadyInsts;
4328
4329 /// The first instruction of the scheduling region.
4330 Instruction *ScheduleStart = nullptr;
4331
4332 /// The first instruction _after_ the scheduling region.
4333 Instruction *ScheduleEnd = nullptr;
4334
4335 /// The first memory accessing instruction in the scheduling region
4336 /// (can be null).
4337 ScheduleData *FirstLoadStoreInRegion = nullptr;
4338
4339 /// The last memory accessing instruction in the scheduling region
4340 /// (can be null).
4341 ScheduleData *LastLoadStoreInRegion = nullptr;
4342
4343 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4344 /// region? Used to optimize the dependence calculation for the
4345 /// common case where there isn't.
4346 bool RegionHasStackSave = false;
4347
4348 /// The current size of the scheduling region.
4349 int ScheduleRegionSize = 0;
4350
4351 /// The maximum size allowed for the scheduling region.
4352 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4353
4354 /// The ID of the scheduling region. For a new vectorization iteration this
4355 /// is incremented which "removes" all ScheduleData from the region.
4356 /// Make sure that the initial SchedulingRegionID is greater than the
4357 /// initial SchedulingRegionID in ScheduleData (which is 0).
4358 int SchedulingRegionID = 1;
4359 };
4360
4361 /// Attaches the BlockScheduling structures to basic blocks.
4363
4364 /// Performs the "real" scheduling. Done before vectorization is actually
4365 /// performed in a basic block.
4366 void scheduleBlock(BlockScheduling *BS);
4367
4368 /// List of users to ignore during scheduling and that don't need extracting.
4369 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4370
4371 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4372 /// sorted SmallVectors of unsigned.
4373 struct OrdersTypeDenseMapInfo {
4374 static OrdersType getEmptyKey() {
4375 OrdersType V;
4376 V.push_back(~1U);
4377 return V;
4378 }
4379
4380 static OrdersType getTombstoneKey() {
4381 OrdersType V;
4382 V.push_back(~2U);
4383 return V;
4384 }
4385
4386 static unsigned getHashValue(const OrdersType &V) {
4387 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4388 }
4389
4390 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4391 return LHS == RHS;
4392 }
4393 };
4394
4395 // Analysis and block reference.
4396 Function *F;
4397 ScalarEvolution *SE;
4398 TargetTransformInfo *TTI;
4399 TargetLibraryInfo *TLI;
4400 LoopInfo *LI;
4401 DominatorTree *DT;
4402 AssumptionCache *AC;
4403 DemandedBits *DB;
4404 const DataLayout *DL;
4405 OptimizationRemarkEmitter *ORE;
4406
4407 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4408 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4409
4410 /// Instruction builder to construct the vectorized tree.
4411 IRBuilder<TargetFolder> Builder;
4412
4413 /// A map of scalar integer values to the smallest bit width with which they
4414 /// can legally be represented. The values map to (width, signed) pairs,
4415 /// where "width" indicates the minimum bit width and "signed" is True if the
4416 /// value must be signed-extended, rather than zero-extended, back to its
4417 /// original width.
4419
4420 /// Final size of the reduced vector, if the current graph represents the
4421 /// input for the reduction and it was possible to narrow the size of the
4422 /// reduction.
4423 unsigned ReductionBitWidth = 0;
4424
4425 /// Canonical graph size before the transformations.
4426 unsigned BaseGraphSize = 1;
4427
4428 /// If the tree contains any zext/sext/trunc nodes, this holds the max-min
4429 /// pair of type sizes used in the tree.
4430 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4431
4432 /// Indices of the vectorized nodes, which are supposed to be the roots of the
4433 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
4434 DenseSet<unsigned> ExtraBitWidthNodes;
4435};
4436
4437} // end namespace slpvectorizer
4438
4439template <> struct GraphTraits<BoUpSLP *> {
4440 using TreeEntry = BoUpSLP::TreeEntry;
4441
4442 /// NodeRef has to be a pointer per the GraphWriter.
4443 using NodeRef = TreeEntry *;
4444
4445 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4446
4447 /// Add the VectorizableTree to the index iterator to be able to return
4448 /// TreeEntry pointers.
4449 struct ChildIteratorType
4450 : public iterator_adaptor_base<
4451 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4452 ContainerTy &VectorizableTree;
4453
4454 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4455 ContainerTy &VT)
4456 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4457
4458 NodeRef operator*() { return I->UserTE; }
4459 };
4460
4461 static NodeRef getEntryNode(BoUpSLP &R) {
4462 return R.VectorizableTree[0].get();
4463 }
4464
4465 static ChildIteratorType child_begin(NodeRef N) {
4466 return {N->UserTreeIndices.begin(), N->Container};
4467 }
4468
4469 static ChildIteratorType child_end(NodeRef N) {
4470 return {N->UserTreeIndices.end(), N->Container};
4471 }
4472
4473 /// For the node iterator we just need to turn the TreeEntry iterator into a
4474 /// TreeEntry* iterator so that it dereferences to NodeRef.
4475 class nodes_iterator {
4477 ItTy It;
4478
4479 public:
4480 nodes_iterator(const ItTy &It2) : It(It2) {}
4481 NodeRef operator*() { return It->get(); }
4482 nodes_iterator operator++() {
4483 ++It;
4484 return *this;
4485 }
4486 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4487 };
4488
4489 static nodes_iterator nodes_begin(BoUpSLP *R) {
4490 return nodes_iterator(R->VectorizableTree.begin());
4491 }
4492
4493 static nodes_iterator nodes_end(BoUpSLP *R) {
4494 return nodes_iterator(R->VectorizableTree.end());
4495 }
4496
4497 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4498};
4499
4500template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4501 using TreeEntry = BoUpSLP::TreeEntry;
4502
4503 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4504
4505 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4506 std::string Str;
4507 raw_string_ostream OS(Str);
4508 OS << Entry->Idx << ".\n";
4509 if (isSplat(Entry->Scalars))
4510 OS << "<splat> ";
4511 for (auto *V : Entry->Scalars) {
4512 OS << *V;
4513 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4514 return EU.Scalar == V;
4515 }))
4516 OS << " <extract>";
4517 OS << "\n";
4518 }
4519 return Str;
4520 }
4521
4522 static std::string getNodeAttributes(const TreeEntry *Entry,
4523 const BoUpSLP *) {
4524 if (Entry->isGather())
4525 return "color=red";
4526 if (Entry->State == TreeEntry::ScatterVectorize ||
4527 Entry->State == TreeEntry::StridedVectorize)
4528 return "color=blue";
4529 return "";
4530 }
4531};
4532
4533} // end namespace llvm
4534
4535BoUpSLP::~BoUpSLP() {
4536 SmallVector<WeakTrackingVH> DeadInsts;
4537 for (auto *I : DeletedInstructions) {
4538 if (!I->getParent()) {
4539 // Temporarily insert the instruction back so it can be erased from its
4540 // parent and from memory later.
4541 if (isa<PHINode>(I))
4542 // Phi nodes must be the very first instructions in the block.
4543 I->insertBefore(F->getEntryBlock(),
4544 F->getEntryBlock().getFirstNonPHIIt());
4545 else
4546 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
4547 continue;
4548 }
4549 for (Use &U : I->operands()) {
4550 auto *Op = dyn_cast<Instruction>(U.get());
4551 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4552 wouldInstructionBeTriviallyDead(Op, TLI))
4553 DeadInsts.emplace_back(Op);
4554 }
4555 I->dropAllReferences();
4556 }
4557 for (auto *I : DeletedInstructions) {
4558 assert(I->use_empty() &&
4559 "trying to erase instruction with users.");
4560 I->eraseFromParent();
4561 }
4562
4563 // Cleanup any dead scalar code feeding the vectorized instructions.
4564 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4565
4566#ifdef EXPENSIVE_CHECKS
4567 // If we could guarantee that this call is not extremely slow, we could
4568 // remove the ifdef limitation (see PR47712).
4569 assert(!verifyFunction(*F, &dbgs()));
4570#endif
4571}
4572
4573/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4574/// contains the original mask for the scalars reused in the node. The procedure
4575/// transforms this mask in accordance with the given \p Mask.
4576static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4577 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4578 "Expected non-empty mask.");
4579 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4580 Prev.swap(Reuses);
4581 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4582 if (Mask[I] != PoisonMaskElem)
4583 Reuses[Mask[I]] = Prev[I];
4584}
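//
// [Editorial illustration - not part of SLPVectorizer.cpp] A minimal,
// self-contained sketch of what reorderReuses() above computes: each defined
// lane I of the old reuse mask moves to position Mask[I]. It uses plain
// std::vector instead of the LLVM ADT types and assumes PoisonMaskElem == -1,
// as defined in llvm/IR/Instructions.h.
#include <cassert>
#include <vector>

static void reorderReusesSketch(std::vector<int> &Reuses,
                                const std::vector<int> &Mask) {
  std::vector<int> Prev = Reuses; // Keep the original reuse mask.
  for (size_t I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != -1)           // Skip poison lanes.
      Reuses[Mask[I]] = Prev[I]; // Lane I of the old mask moves to Mask[I].
}

int main() {
  std::vector<int> Reuses = {1, 0, 3, 2};
  reorderReusesSketch(Reuses, /*Mask=*/{2, 3, 0, 1});
  assert((Reuses == std::vector<int>{3, 2, 1, 0}));
  return 0;
}
// [End of editorial illustration]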
4585
4586/// Reorders the given \p Order according to the given \p Mask. \p Order is
4587/// the original order of the scalars. The procedure transforms the provided
4588/// order in accordance with the given \p Mask. If the resulting \p Order is
4589/// just an identity order, \p Order is cleared.
4590static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4591 bool BottomOrder = false) {
4592 assert(!Mask.empty() && "Expected non-empty mask.");
4593 unsigned Sz = Mask.size();
4594 if (BottomOrder) {
4595 SmallVector<unsigned> PrevOrder;
4596 if (Order.empty()) {
4597 PrevOrder.resize(Sz);
4598 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4599 } else {
4600 PrevOrder.swap(Order);
4601 }
4602 Order.assign(Sz, Sz);
4603 for (unsigned I = 0; I < Sz; ++I)
4604 if (Mask[I] != PoisonMaskElem)
4605 Order[I] = PrevOrder[Mask[I]];
4606 if (all_of(enumerate(Order), [&](const auto &Data) {
4607 return Data.value() == Sz || Data.index() == Data.value();
4608 })) {
4609 Order.clear();
4610 return;
4611 }
4612 fixupOrderingIndices(Order);
4613 return;
4614 }
4615 SmallVector<int> MaskOrder;
4616 if (Order.empty()) {
4617 MaskOrder.resize(Sz);
4618 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4619 } else {
4620 inversePermutation(Order, MaskOrder);
4621 }
4622 reorderReuses(MaskOrder, Mask);
4623 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4624 Order.clear();
4625 return;
4626 }
4627 Order.assign(Sz, Sz);
4628 for (unsigned I = 0; I < Sz; ++I)
4629 if (MaskOrder[I] != PoisonMaskElem)
4630 Order[MaskOrder[I]] = I;
4631 fixupOrderingIndices(Order);
4632}
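//
// [Editorial illustration - not part of SLPVectorizer.cpp] The order/mask
// duality used by reorderOrder() above, sketched with std::vector. The helper
// orderToMask() is hypothetical and only mirrors what this file's
// inversePermutation() does: Mask[Order[I]] = I, i.e. the mask is the inverse
// permutation of the order. -1 models PoisonMaskElem.
#include <cassert>
#include <vector>

static std::vector<int> orderToMask(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (size_t I = 0; I < Order.size(); ++I)
    Mask[Order[I]] = static_cast<int>(I);
  return Mask;
}

int main() {
  // Order {2, 0, 1} is realized by the shuffle mask {1, 2, 0}, its inverse.
  assert((orderToMask({2, 0, 1}) == std::vector<int>{1, 2, 0}));
  // An identity order yields an identity mask; reorderOrder() clears such
  // orders because no shuffle is needed at all.
  assert((orderToMask({0, 1, 2}) == std::vector<int>{0, 1, 2}));
  return 0;
}
// [End of editorial illustration]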
4633
4634std::optional<BoUpSLP::OrdersType>
4635BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4636 assert(TE.isGather() && "Expected gather node only.");
4637 // Try to find subvector extract/insert patterns and reorder only such
4638 // patterns.
4639 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4640 Type *ScalarTy = GatheredScalars.front()->getType();
4641 int NumScalars = GatheredScalars.size();
4642 if (!isValidElementType(ScalarTy))
4643 return std::nullopt;
4644 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4645 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
4646 SmallVector<int> ExtractMask;
4647 SmallVector<int> Mask;
4650 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4652 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4653 /*ForOrder=*/true);
4654 // No shuffled operands - ignore.
4655 if (GatherShuffles.empty() && ExtractShuffles.empty())
4656 return std::nullopt;
4657 OrdersType CurrentOrder(NumScalars, NumScalars);
4658 if (GatherShuffles.size() == 1 &&
4659 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4660 Entries.front().front()->isSame(TE.Scalars)) {
4661 // Perfect match in the graph, will reuse the previously vectorized
4662 // node. Cost is 0.
4663 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4664 return CurrentOrder;
4665 }
4666 auto IsSplatMask = [](ArrayRef<int> Mask) {
4667 int SingleElt = PoisonMaskElem;
4668 return all_of(Mask, [&](int I) {
4669 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4670 SingleElt = I;
4671 return I == PoisonMaskElem || I == SingleElt;
4672 });
4673 };
4674 // Exclusive broadcast mask - ignore.
4675 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4676 (Entries.size() != 1 ||
4677 Entries.front().front()->ReorderIndices.empty())) ||
4678 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4679 return std::nullopt;
4680 SmallBitVector ShuffledSubMasks(NumParts);
4681 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4682 ArrayRef<int> Mask, int PartSz, int NumParts,
4683 function_ref<unsigned(unsigned)> GetVF) {
4684 for (int I : seq<int>(0, NumParts)) {
4685 if (ShuffledSubMasks.test(I))
4686 continue;
4687 const int VF = GetVF(I);
4688 if (VF == 0)
4689 continue;
4690 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4691 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4692 // Shuffle of at least 2 vectors - ignore.
4693 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4694 std::fill(Slice.begin(), Slice.end(), NumScalars);
4695 ShuffledSubMasks.set(I);
4696 continue;
4697 }
4698 // Try to include as many elements from the mask as possible.
4699 int FirstMin = INT_MAX;
4700 bool SecondVecFound = false;
4701 for (int K : seq<int>(Limit)) {
4702 int Idx = Mask[I * PartSz + K];
4703 if (Idx == PoisonMaskElem) {
4704 Value *V = GatheredScalars[I * PartSz + K];
4705 if (isConstant(V) && !isa<PoisonValue>(V)) {
4706 SecondVecFound = true;
4707 break;
4708 }
4709 continue;
4710 }
4711 if (Idx < VF) {
4712 if (FirstMin > Idx)
4713 FirstMin = Idx;
4714 } else {
4715 SecondVecFound = true;
4716 break;
4717 }
4718 }
4719 FirstMin = (FirstMin / PartSz) * PartSz;
4720 // Shuffle of at least 2 vectors - ignore.
4721 if (SecondVecFound) {
4722 std::fill(Slice.begin(), Slice.end(), NumScalars);
4723 ShuffledSubMasks.set(I);
4724 continue;
4725 }
4726 for (int K : seq<int>(Limit)) {
4727 int Idx = Mask[I * PartSz + K];
4728 if (Idx == PoisonMaskElem)
4729 continue;
4730 Idx -= FirstMin;
4731 if (Idx >= PartSz) {
4732 SecondVecFound = true;
4733 break;
4734 }
4735 if (CurrentOrder[I * PartSz + Idx] >
4736 static_cast<unsigned>(I * PartSz + K) &&
4737 CurrentOrder[I * PartSz + Idx] !=
4738 static_cast<unsigned>(I * PartSz + Idx))
4739 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4740 }
4741 // Shuffle of at least 2 vectors - ignore.
4742 if (SecondVecFound) {
4743 std::fill(Slice.begin(), Slice.end(), NumScalars);
4744 ShuffledSubMasks.set(I);
4745 continue;
4746 }
4747 }
4748 };
4749 int PartSz = getPartNumElems(NumScalars, NumParts);
4750 if (!ExtractShuffles.empty())
4751 TransformMaskToOrder(
4752 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4753 if (!ExtractShuffles[I])
4754 return 0U;
4755 unsigned VF = 0;
4756 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4757 for (unsigned Idx : seq<unsigned>(Sz)) {
4758 int K = I * PartSz + Idx;
4759 if (ExtractMask[K] == PoisonMaskElem)
4760 continue;
4761 if (!TE.ReuseShuffleIndices.empty())
4762 K = TE.ReuseShuffleIndices[K];
4763 if (K == PoisonMaskElem)
4764 continue;
4765 if (!TE.ReorderIndices.empty())
4766 K = std::distance(TE.ReorderIndices.begin(),
4767 find(TE.ReorderIndices, K));
4768 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4769 if (!EI)
4770 continue;
4771 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4772 ->getElementCount()
4773 .getKnownMinValue());
4774 }
4775 return VF;
4776 });
4777 // Check special corner case - single shuffle of the same entry.
4778 if (GatherShuffles.size() == 1 && NumParts != 1) {
4779 if (ShuffledSubMasks.any())
4780 return std::nullopt;
4781 PartSz = NumScalars;
4782 NumParts = 1;
4783 }
4784 if (!Entries.empty())
4785 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4786 if (!GatherShuffles[I])
4787 return 0U;
4788 return std::max(Entries[I].front()->getVectorFactor(),
4789 Entries[I].back()->getVectorFactor());
4790 });
4791 int NumUndefs =
4792 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4793 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4794 return std::nullopt;
4795 return std::move(CurrentOrder);
4796}
4797
4798static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4799 const TargetLibraryInfo &TLI,
4800 bool CompareOpcodes = true) {
4803 return false;
4804 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4805 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4806 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4807 (!GEP2 || GEP2->getNumOperands() == 2) &&
4808 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4809 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4810 !CompareOpcodes ||
4811 (GEP1 && GEP2 &&
4812 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4813}
4814
4815/// Calculates minimal alignment as a common alignment.
4816template <typename T>
4817static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4818 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4819 for (Value *V : VL.drop_front())
4820 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4821 return CommonAlignment;
4822}
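//
// [Editorial illustration - not part of SLPVectorizer.cpp] The "common
// alignment" of a bundle is simply the minimum alignment over all its members,
// so the widened access never claims more alignment than its weakest scalar.
// A sketch over plain byte values (the real helper works on llvm::Align):
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t commonAlignmentSketch(const std::vector<uint64_t> &Aligns) {
  uint64_t Common = Aligns.front();
  for (size_t I = 1; I < Aligns.size(); ++I)
    Common = std::min(Common, Aligns[I]);
  return Common;
}

int main() {
  // Loads aligned to 16, 4 and 8 bytes -> the vector load may only assume 4.
  assert(commonAlignmentSketch({16, 4, 8}) == 4);
  return 0;
}
// [End of editorial illustration]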
4823
4824/// Check if \p Order represents reverse order.
4825static bool isReverseOrder(ArrayRef<unsigned> Order) {
4826 assert(!Order.empty() &&
4827 "Order is empty. Please check it before using isReverseOrder.");
4828 unsigned Sz = Order.size();
4829 return all_of(enumerate(Order), [&](const auto &Pair) {
4830 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4831 });
4832}
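//
// [Editorial illustration - not part of SLPVectorizer.cpp] What isReverseOrder()
// accepts: every defined entry must satisfy Order[I] == Sz - I - 1, while
// entries equal to Sz act as "undefined" placeholders and are tolerated.
#include <cassert>
#include <vector>

static bool isReverseOrderSketch(const std::vector<unsigned> &Order) {
  unsigned Sz = static_cast<unsigned>(Order.size());
  for (unsigned I = 0; I < Sz; ++I)
    if (Order[I] != Sz && Order[I] != Sz - I - 1)
      return false;
  return true;
}

int main() {
  assert(isReverseOrderSketch({3, 2, 1, 0}));
  assert(isReverseOrderSketch({4, 2, 1, 0})); // Lane 0 is undefined (== Sz).
  assert(!isReverseOrderSketch({0, 1, 2, 3}));
  return 0;
}
// [End of editorial illustration]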
4833
4834/// Checks if the provided list of pointers \p PointerOps represents strided
4835/// pointers for the type ElemTy. If they do not, std::nullopt is returned.
4836/// Otherwise, if \p Inst is not specified, an engaged optional value is
4837/// returned to show that the pointers represent strided pointers. If \p Inst is
4838/// specified, the runtime stride is materialized before the given \p Inst.
4839/// \returns std::nullopt if the pointers do not have a runtime stride;
4840/// otherwise nullptr or the actual stride value.
4841static std::optional<Value *>
4842calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4843 const DataLayout &DL, ScalarEvolution &SE,
4844 SmallVectorImpl<unsigned> &SortedIndices,
4845 Instruction *Inst = nullptr) {
4846 SmallVector<const SCEV *> SCEVs;
4847 const SCEV *PtrSCEVLowest = nullptr;
4848 const SCEV *PtrSCEVHighest = nullptr;
4849 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4850 // addresses).
4851 for (Value *Ptr : PointerOps) {
4852 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4853 if (!PtrSCEV)
4854 return std::nullopt;
4855 SCEVs.push_back(PtrSCEV);
4856 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4857 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4858 continue;
4859 }
4860 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4861 if (isa<SCEVCouldNotCompute>(Diff))
4862 return std::nullopt;
4863 if (Diff->isNonConstantNegative()) {
4864 PtrSCEVLowest = PtrSCEV;
4865 continue;
4866 }
4867 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4868 if (isa<SCEVCouldNotCompute>(Diff1))
4869 return std::nullopt;
4870 if (Diff1->isNonConstantNegative()) {
4871 PtrSCEVHighest = PtrSCEV;
4872 continue;
4873 }
4874 }
4875 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4876 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4877 if (isa<SCEVCouldNotCompute>(Dist))
4878 return std::nullopt;
4879 int Size = DL.getTypeStoreSize(ElemTy);
4880 auto TryGetStride = [&](const SCEV *Dist,
4881 const SCEV *Multiplier) -> const SCEV * {
4882 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4883 if (M->getOperand(0) == Multiplier)
4884 return M->getOperand(1);
4885 if (M->getOperand(1) == Multiplier)
4886 return M->getOperand(0);
4887 return nullptr;
4888 }
4889 if (Multiplier == Dist)
4890 return SE.getConstant(Dist->getType(), 1);
4891 return SE.getUDivExactExpr(Dist, Multiplier);
4892 };
4893 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4894 const SCEV *Stride = nullptr;
4895 if (Size != 1 || SCEVs.size() > 2) {
4896 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4897 Stride = TryGetStride(Dist, Sz);
4898 if (!Stride)
4899 return std::nullopt;
4900 }
4901 if (!Stride || isa<SCEVConstant>(Stride))
4902 return std::nullopt;
4903 // Iterate through all pointers and check that each distance is a unique
4904 // multiple of Stride.
4905 using DistOrdPair = std::pair<int64_t, int>;
4906 auto Compare = llvm::less_first();
4907 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4908 int Cnt = 0;
4909 bool IsConsecutive = true;
4910 for (const SCEV *PtrSCEV : SCEVs) {
4911 unsigned Dist = 0;
4912 if (PtrSCEV != PtrSCEVLowest) {
4913 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4914 const SCEV *Coeff = TryGetStride(Diff, Stride);
4915 if (!Coeff)
4916 return std::nullopt;
4917 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4918 if (!SC || isa<SCEVCouldNotCompute>(SC))
4919 return std::nullopt;
4920 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4921 SE.getMulExpr(Stride, SC)))
4922 ->isZero())
4923 return std::nullopt;
4924 Dist = SC->getAPInt().getZExtValue();
4925 }
4926 // If the strides are not the same or repeated, we can't vectorize.
4927 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4928 return std::nullopt;
4929 auto Res = Offsets.emplace(Dist, Cnt);
4930 if (!Res.second)
4931 return std::nullopt;
4932 // Consecutive order if the inserted element is the last one.
4933 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4934 ++Cnt;
4935 }
4936 if (Offsets.size() != SCEVs.size())
4937 return std::nullopt;
4938 SortedIndices.clear();
4939 if (!IsConsecutive) {
4940 // Fill SortedIndices array only if it is non-consecutive.
4941 SortedIndices.resize(PointerOps.size());
4942 Cnt = 0;
4943 for (const std::pair<int64_t, int> &Pair : Offsets) {
4944 SortedIndices[Cnt] = Pair.second;
4945 ++Cnt;
4946 }
4947 }
4948 if (!Inst)
4949 return nullptr;
4950 SCEVExpander Expander(SE, DL, "strided-load-vec");
4951 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4952}
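//
// [Editorial illustration - not part of SLPVectorizer.cpp] The integer
// analogue of the SCEV reasoning in calculateRtStride(): given the byte
// offsets of the pointers from the lowest one, the bundle is "strided" when
// the offsets are a permutation of {0, S, 2*S, ...} for a single stride S,
// and SortedIndices records that permutation. All names here are illustrative.
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

static bool stridedOffsetsSketch(const std::vector<int64_t> &Offsets,
                                 int64_t Stride,
                                 std::vector<unsigned> &SortedIndices) {
  std::map<int64_t, unsigned> ByOffset; // offset -> original index
  for (unsigned I = 0; I < Offsets.size(); ++I) {
    if (Offsets[I] % Stride != 0)
      return false; // Not a multiple of the stride.
    int64_t Step = Offsets[I] / Stride;
    if (Step < 0 || Step >= static_cast<int64_t>(Offsets.size()))
      return false; // Outside the expected {0 .. N-1} range.
    if (!ByOffset.emplace(Offsets[I], I).second)
      return false; // Repeated offset -> not a strided bundle.
  }
  SortedIndices.clear();
  for (const auto &Entry : ByOffset)
    SortedIndices.push_back(Entry.second);
  return true;
}

int main() {
  std::vector<unsigned> Order;
  // Offsets 0, 2S, S, 3S with S == 100: strided, sorted order {0, 2, 1, 3}.
  assert(stridedOffsetsSketch({0, 200, 100, 300}, /*Stride=*/100, Order));
  assert((Order == std::vector<unsigned>{0, 2, 1, 3}));
  return 0;
}
// [End of editorial illustration]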
4953
4954static std::pair<InstructionCost, InstructionCost>
4956 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4957 Type *ScalarTy, VectorType *VecTy);
4958
4959/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4960/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
4961/// subvector pattern.
4962static InstructionCost
4963getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4964 VectorType *Tp, ArrayRef<int> Mask = {},
4966 int Index = 0, VectorType *SubTp = nullptr,
4968 if (Kind != TTI::SK_PermuteTwoSrc)
4969 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4970 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4971 int NumSubElts;
4973 Mask, NumSrcElts, NumSubElts, Index)) {
4974 if (Index + NumSubElts > NumSrcElts &&
4975 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4976 return TTI.getShuffleCost(
4978 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4980 }
4981 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4982}
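//
// [Editorial illustration - not part of SLPVectorizer.cpp] The kind of
// two-source mask the wrapper above tries to re-model: inserting a 2-wide
// subvector (lanes of the second source) at index 2 of a 4-wide vector is the
// mask {0, 1, 4, 5}, which targets usually price as SK_InsertSubvector rather
// than as a generic SK_PermuteTwoSrc. The helper below is illustrative only.
#include <cassert>
#include <vector>

static std::vector<int> insertSubvectorMask(unsigned NumSrcElts,
                                            unsigned NumSubElts,
                                            unsigned Index) {
  std::vector<int> Mask(NumSrcElts);
  for (unsigned I = 0; I < NumSrcElts; ++I)
    Mask[I] = I; // Identity: keep the lanes of the first source.
  for (unsigned I = 0; I < NumSubElts; ++I)
    Mask[Index + I] = NumSrcElts + I; // Overwrite with second-source lanes.
  return Mask;
}

int main() {
  assert((insertSubvectorMask(4, 2, 2) == std::vector<int>{0, 1, 4, 5}));
  return 0;
}
// [End of editorial illustration]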
4983
4984/// Correctly creates insert_subvector, checking that the index is a multiple of
4985/// the subvector's length. Otherwise, generates a shuffle using \p Generator or
4986/// using the default shuffle.
4987static Value *createInsertVector(
4988 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
4989 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
4990 const unsigned SubVecVF = getNumElements(V->getType());
4991 if (Index % SubVecVF == 0) {
4992 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4993 Builder.getInt64(Index));
4994 } else {
4995 // Create a shuffle; insert_vector requires that the index is a multiple of
4996 // the subvector length.
4997 const unsigned VecVF = getNumElements(Vec->getType());
4999 std::iota(Mask.begin(), Mask.end(), 0);
5000 for (unsigned I : seq<unsigned>(SubVecVF))
5001 Mask[I + Index] = I + VecVF;
5002 if (Generator) {
5003 Vec = Generator(Vec, V, Mask);
5004 } else {
5005 // 1. Resize V to the size of Vec.
5006 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
5007 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
5008 V = Builder.CreateShuffleVector(V, ResizeMask);
5009 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
5010 }
5011 }
5012 return Vec;
5013}
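//
// [Editorial illustration - not part of SLPVectorizer.cpp] The two masks built
// on the fallback path of createInsertVector() above, for VecVF = 8,
// SubVecVF = 2 and Index = 3 (3 is not a multiple of 2, so llvm.vector.insert
// cannot be used directly). -1 models PoisonMaskElem.
#include <cassert>
#include <vector>

int main() {
  const unsigned VecVF = 8, SubVecVF = 2, Index = 3;
  // 1. Resize V to the size of Vec: keep its lanes, pad with poison.
  std::vector<int> ResizeMask(VecVF, -1);
  for (unsigned I = 0; I < SubVecVF; ++I)
    ResizeMask[I] = I;
  assert((ResizeMask == std::vector<int>{0, 1, -1, -1, -1, -1, -1, -1}));
  // 2. Blend the widened V into Vec at lanes [Index, Index + SubVecVF).
  std::vector<int> Mask(VecVF);
  for (unsigned I = 0; I < VecVF; ++I)
    Mask[I] = I;
  for (unsigned I = 0; I < SubVecVF; ++I)
    Mask[I + Index] = I + VecVF;
  assert((Mask == std::vector<int>{0, 1, 2, 8, 9, 5, 6, 7}));
  return 0;
}
// [End of editorial illustration]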
5014
5015/// Correctly creates extract_subvector, checking that the index is a multiple
5016/// of the subvector's length. Otherwise, a single-source shuffle is generated
5017/// to extract the subvector.
5018static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
5019 unsigned SubVecVF, unsigned Index) {
5020 if (Index % SubVecVF == 0) {
5021 VectorType *SubVecTy =
5022 getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
5023 return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
5024 }
5025 // Create a shuffle; extract_subvector requires that the index is a multiple
5026 // of the subvector length.
5027 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
5028 std::iota(Mask.begin(), Mask.end(), Index);
5029 return Builder.CreateShuffleVector(Vec, Mask);
5030}
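//
// [Editorial illustration - not part of SLPVectorizer.cpp] The fallback mask of
// createExtractVector() above: extracting a 2-wide subvector at the unaligned
// index 3 becomes a single-source shuffle with mask {3, 4}.
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  const unsigned SubVecVF = 2, Index = 3;
  std::vector<int> Mask(SubVecVF);
  std::iota(Mask.begin(), Mask.end(), Index); // {Index, Index + 1, ...}
  assert((Mask == std::vector<int>{3, 4}));
  return 0;
}
// [End of editorial illustration]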
5031
5035 SmallVectorImpl<Value *> &PointerOps,
5036 unsigned *BestVF, bool TryRecursiveCheck) const {
5037 // Check that a vectorized load would load the same memory as a scalar
5038 // load. For example, we don't want to vectorize loads that are smaller
5039 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5040 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
5041 // from such a struct, we read/write packed bits disagreeing with the
5042 // unvectorized version.
5043 if (BestVF)
5044 *BestVF = 0;
5046 return LoadsState::Gather;
5047 Type *ScalarTy = VL0->getType();
5048
5049 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
5050 return LoadsState::Gather;
5051
5052 // Make sure all loads in the bundle are simple - we can't vectorize
5053 // atomic or volatile loads.
5054 PointerOps.clear();
5055 const unsigned Sz = VL.size();
5056 PointerOps.resize(Sz);
5057 auto *POIter = PointerOps.begin();
5058 for (Value *V : VL) {
5059 auto *L = dyn_cast<LoadInst>(V);
5060 if (!L || !L->isSimple())
5061 return LoadsState::Gather;
5062 *POIter = L->getPointerOperand();
5063 ++POIter;
5064 }
5065
5066 Order.clear();
5067 // Check the order of pointer operands or that all pointers are the same.
5068 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5069
5070 auto *VecTy = getWidenedType(ScalarTy, Sz);
5071 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5072 if (!IsSorted) {
5073 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
5074 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5075 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5076 return LoadsState::StridedVectorize;
5077 }
5078
5079 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5080 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5081 return LoadsState::Gather;
5082
5083 if (!all_of(PointerOps, [&](Value *P) {
5084 return arePointersCompatible(P, PointerOps.front(), *TLI);
5085 }))
5086 return LoadsState::Gather;
5087
5088 } else {
5089 Value *Ptr0;
5090 Value *PtrN;
5091 if (Order.empty()) {
5092 Ptr0 = PointerOps.front();
5093 PtrN = PointerOps.back();
5094 } else {
5095 Ptr0 = PointerOps[Order.front()];
5096 PtrN = PointerOps[Order.back()];
5097 }
5098 std::optional<int> Diff =
5099 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5100 // Check that the sorted loads are consecutive.
5101 if (static_cast<unsigned>(*Diff) == Sz - 1)
5102 return LoadsState::Vectorize;
5103 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5104 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5105 return LoadsState::Gather;
5106 // Simple check if not a strided access - clear order.
5107 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5108 // Try to generate strided load node if:
5109 // 1. Target with strided load support is detected.
5110 // 2. The number of loads is greater than MinProfitableStridedLoads,
5111 // or the potential stride <= MaxProfitableLoadStride and the
5112 // potential stride is power-of-2 (to avoid perf regressions for the very
5113 // small number of loads) and max distance > number of loads, or potential
5114 // stride is -1.
5115 // 3. The loads are ordered, or number of unordered loads <=
5116 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5117 // (this check is to avoid extra costs for very expensive shuffles).
5118 // 4. Any pointer operand is an instruction with the users outside of the
5119 // current graph (for masked gathers extra extractelement instructions
5120 // might be required).
5121 auto IsAnyPointerUsedOutGraph =
5122 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5123 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5124 return !isVectorized(U) && !MustGather.contains(U);
5125 });
5126 });
5127 const unsigned AbsoluteDiff = std::abs(*Diff);
5128 if (IsPossibleStrided &&
5129 (IsAnyPointerUsedOutGraph ||
5130 (AbsoluteDiff > Sz &&
5132 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5133 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
5134 *Diff == -(static_cast<int>(Sz) - 1))) {
5135 int Stride = *Diff / static_cast<int>(Sz - 1);
5136 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5137 Align Alignment =
5138 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5139 ->getAlign();
5140 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5141 // Iterate through all pointers and check that each distance is a unique
5142 // multiple of Stride.
5143 SmallSet<int, 4> Dists;
5144 for (Value *Ptr : PointerOps) {
5145 int Dist = 0;
5146 if (Ptr == PtrN)
5147 Dist = *Diff;
5148 else if (Ptr != Ptr0)
5149 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5150 // If the strides are not the same or repeated, we can't
5151 // vectorize.
5152 if (((Dist / Stride) * Stride) != Dist ||
5153 !Dists.insert(Dist).second)
5154 break;
5155 }
5156 if (Dists.size() == Sz)
5157 return LoadsState::StridedVectorize;
5158 }
5159 }
5160 }
5161 }
5162 // Compare the cost of vectorized loads + shuffles with the cost of
5163 // strided/masked gather loads. Returns true if the vectorized + shuffles
5164 // representation is better than just gather.
5165 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5166 unsigned *BestVF,
5167 bool ProfitableGatherPointers) {
5168 if (BestVF)
5169 *BestVF = 0;
5170 // Compare masked gather cost and loads + insert subvector costs.
5172 auto [ScalarGEPCost, VectorGEPCost] =
5173 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5174 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5175 // Estimate the cost of masked gather GEP. If not a splat, roughly
5176 // estimate as a buildvector, otherwise estimate as splat.
5177 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5178 VectorType *PtrVecTy =
5179 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5180 VecTy->getNumElements());
5181 if (static_cast<unsigned>(count_if(
5182 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5183 any_of(PointerOps, [&](Value *V) {
5184 return getUnderlyingObject(V) !=
5185 getUnderlyingObject(PointerOps.front());
5186 }))
5187 VectorGEPCost += TTI.getScalarizationOverhead(
5188 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5189 else
5190 VectorGEPCost +=
5192 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5193 /*Insert=*/true, /*Extract=*/false, CostKind) +
5194 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
5195 // The cost of scalar loads.
5196 InstructionCost ScalarLoadsCost =
5197 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5198 [&](InstructionCost C, Value *V) {
5199 return C + TTI.getInstructionCost(
5200 cast<Instruction>(V), CostKind);
5201 }) +
5202 ScalarGEPCost;
5203 // The cost of masked gather.
5204 InstructionCost MaskedGatherCost =
5205 TTI.getGatherScatterOpCost(
5206 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5207 /*VariableMask=*/false, CommonAlignment, CostKind) +
5208 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5209 InstructionCost GatherCost =
5210 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5211 /*Extract=*/false, CostKind) +
5212 ScalarLoadsCost;
5213 // The list of loads is small, or we are already performing a partial check -
5214 // directly compare the masked gather cost and the gather cost.
5215 constexpr unsigned ListLimit = 4;
5216 if (!TryRecursiveCheck || VL.size() < ListLimit)
5217 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5218
5219 // FIXME: The following code has not been updated for non-power-of-2
5220 // vectors (and not whole registers). The splitting logic here does not
5221 // cover the original vector if the vector factor is not a power of two.
5222 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
5223 return false;
5224
5225 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5226 unsigned MinVF = getMinVF(2 * Sz);
5227 DemandedElts.clearAllBits();
5228 // Iterate through possible vectorization factors and check if vectorized +
5229 // shuffles is better than just gather.
5230 for (unsigned VF =
5231 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5232 VF >= MinVF;
5233 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
5235 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5236 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5238 SmallVector<Value *> PointerOps;
5239 LoadsState LS =
5240 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5241 /*TryRecursiveCheck=*/false);
5242 // Check that the sorted loads are consecutive.
5243 if (LS == LoadsState::Gather) {
5244 if (BestVF) {
5245 DemandedElts.setAllBits();
5246 break;
5247 }
5248 DemandedElts.setBits(Cnt, Cnt + VF);
5249 continue;
5250 }
5251 // If reordering is needed - consider it a high-cost masked gather for now.
5252 if ((LS == LoadsState::Vectorize ||
5253 LS == LoadsState::StridedVectorize) &&
5254 !Order.empty() && !isReverseOrder(Order))
5255 return false;
5256 States.push_back(LS);
5257 }
5258 if (DemandedElts.isAllOnes())
5259 // All loads gathered - try smaller VF.
5260 continue;
5261 // Can be vectorized later as a series of loads/insertelements.
5262 InstructionCost VecLdCost = 0;
5263 if (!DemandedElts.isZero()) {
5264 VecLdCost =
5265 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5266 /*Extract=*/false, CostKind) +
5267 ScalarGEPCost;
5268 for (unsigned Idx : seq<unsigned>(VL.size()))
5269 if (DemandedElts[Idx])
5270 VecLdCost +=
5271 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5272 }
5273 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5274 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5275 for (auto [I, LS] : enumerate(States)) {
5276 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5277 InstructionCost VectorGEPCost =
5278 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5279 ? 0
5280 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5281 LI0->getPointerOperand(),
5282 Instruction::GetElementPtr, CostKind, ScalarTy,
5283 SubVecTy)
5284 .second;
5285 if (LS == LoadsState::ScatterVectorize) {
5286 if (static_cast<unsigned>(
5287 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5288 PointerOps.size() - 1 ||
5289 any_of(PointerOps, [&](Value *V) {
5290 return getUnderlyingObject(V) !=
5291 getUnderlyingObject(PointerOps.front());
5292 }))
5293 VectorGEPCost += TTI.getScalarizationOverhead(
5294 SubVecTy, APInt::getAllOnes(VF),
5295 /*Insert=*/true, /*Extract=*/false, CostKind);
5296 else
5297 VectorGEPCost +=
5299 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5300 /*Insert=*/true, /*Extract=*/false, CostKind) +
5301 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5302 CostKind);
5303 }
5304 switch (LS) {
5305 case LoadsState::Vectorize:
5306 VecLdCost +=
5307 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5308 LI0->getPointerAddressSpace(), CostKind,
5310 VectorGEPCost;
5311 break;
5312 case LoadsState::StridedVectorize:
5313 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5314 LI0->getPointerOperand(),
5315 /*VariableMask=*/false,
5316 CommonAlignment, CostKind) +
5317 VectorGEPCost;
5318 break;
5319 case LoadsState::ScatterVectorize:
5320 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5321 LI0->getPointerOperand(),
5322 /*VariableMask=*/false,
5323 CommonAlignment, CostKind) +
5324 VectorGEPCost;
5325 break;
5326 case LoadsState::Gather:
5327 // Gathers are already calculated - ignore.
5328 continue;
5329 }
5330 SmallVector<int> ShuffleMask(VL.size());
5331 for (int Idx : seq<int>(0, VL.size()))
5332 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5333 if (I > 0)
5334 VecLdCost +=
5335 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5336 CostKind, I * VF, SubVecTy);
5337 }
5338 // If masked gather cost is higher - better to vectorize, so
5339 // consider it as a gather node. It will be better estimated
5340 // later.
5341 if (MaskedGatherCost >= VecLdCost &&
5342 VecLdCost - GatherCost < -SLPCostThreshold) {
5343 if (BestVF)
5344 *BestVF = VF;
5345 return true;
5346 }
5347 }
5348 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5349 };
5350 // TODO: need to improve analysis of the pointers, if not all of them are
5351 // GEPs or have > 2 operands, we end up with a gather node, which just
5352 // increases the cost.
5353 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5354 bool ProfitableGatherPointers =
5355 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5356 return L->isLoopInvariant(V);
5357 })) <= Sz / 2;
5358 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5359 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5360 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5361 (GEP && GEP->getNumOperands() == 2 &&
5362 isa<Constant, Instruction>(GEP->getOperand(1)));
5363 })) {
5364 // Check if potential masked gather can be represented as series
5365 // of loads + insertsubvectors.
5366 // If masked gather cost is higher - better to vectorize, so
5367 // consider it as a gather node. It will be better estimated
5368 // later.
5369 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5370 ProfitableGatherPointers))
5371 return LoadsState::ScatterVectorize;
5372 }
5373
5374 return LoadsState::Gather;
5375}
5376
5377static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
5378 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5379 const DataLayout &DL, ScalarEvolution &SE,
5380 SmallVectorImpl<unsigned> &SortedIndices) {
5381 assert(
5382 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5383 "Expected list of pointer operands.");
5384 // Map from bases to a vector of (Ptr, Offset, OrigIdx). Each Ptr is added to
5385 // the vector of its base; the vectors are then sorted so that pointers with
5386 // adjacent offsets end up next to one another, and the sorted indices returned.
5389 Bases;
5390 Bases
5391 .try_emplace(std::make_pair(
5393 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5394
5395 SortedIndices.clear();
5396 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5397 auto Key = std::make_pair(BBs[Cnt + 1],
5399 bool Found = any_of(Bases.try_emplace(Key).first->second,
5400 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5401 std::optional<int> Diff = getPointersDiff(
5402 ElemTy, std::get<0>(Base.front()), ElemTy,
5403 Ptr, DL, SE,
5404 /*StrictCheck=*/true);
5405 if (!Diff)
5406 return false;
5407
5408 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5409 return true;
5410 });
5411
5412 if (!Found) {
5413 // If we haven't found enough to usefully cluster, return early.
5414 if (Bases.size() > VL.size() / 2 - 1)
5415 return false;
5416
5417 // Not found already - add a new Base
5418 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5419 }
5420 }
5421
5422 if (Bases.size() == VL.size())
5423 return false;
5424
5425 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5426 Bases.front().second.size() == VL.size()))
5427 return false;
5428
5429 // For each of the bases, sort the pointers by Offset and check that the
5430 // pointers within each cluster become consecutively allocated.
5431 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5432 SmallPtrSet<Value *, 13> FirstPointers;
5433 SmallPtrSet<Value *, 13> SecondPointers;
5434 Value *P1 = Ptr1;
5435 Value *P2 = Ptr2;
5436 unsigned Depth = 0;
5437 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5438 if (P1 == P2 || Depth > RecursionMaxDepth)
5439 return false;
5440 FirstPointers.insert(P1);
5441 SecondPointers.insert(P2);
5442 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5443 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5444 ++Depth;
5445 }
5446 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5447 "Unable to find matching root.");
5448 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5449 };
5450 for (auto &Base : Bases) {
5451 for (auto &Vec : Base.second) {
5452 if (Vec.size() > 1) {
5453 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5454 const std::tuple<Value *, int, unsigned> &Y) {
5455 return std::get<1>(X) < std::get<1>(Y);
5456 });
5457 int InitialOffset = std::get<1>(Vec[0]);
5458 bool AnyConsecutive =
5459 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5460 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5461 });
5462 // Fill SortedIndices array only if it looks worth-while to sort the
5463 // ptrs.
5464 if (!AnyConsecutive)
5465 return false;
5466 }
5467 }
5468 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5469 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5470 });
5471 }
5472
5473 for (auto &T : Bases)
5474 for (const auto &Vec : T.second)
5475 for (const auto &P : Vec)
5476 SortedIndices.push_back(std::get<2>(P));
5477
5478 assert(SortedIndices.size() == VL.size() &&
5479 "Expected SortedIndices to be the size of VL");
5480 return true;
5481}
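//
// [Editorial illustration - not part of SLPVectorizer.cpp] The clustering idea
// behind clusterSortPtrAccesses() on plain integers: pointers are grouped by
// base, sorted by offset within a group, and the original indices are emitted
// in that order so that accesses adjacent in memory end up next to one
// another. The real function additionally rejects unprofitable cases.
#include <algorithm>
#include <cassert>
#include <vector>

struct PtrInfo {
  int Base;         // Stand-in for the (basic block, underlying object) key.
  int Offset;       // Element offset from the base.
  unsigned OrigIdx; // Position in the original pointer list.
};

static std::vector<unsigned> clusterSketch(std::vector<PtrInfo> Ptrs) {
  std::stable_sort(Ptrs.begin(), Ptrs.end(),
                   [](const PtrInfo &A, const PtrInfo &B) {
                     return A.Base != B.Base ? A.Base < B.Base
                                             : A.Offset < B.Offset;
                   });
  std::vector<unsigned> SortedIndices;
  for (const PtrInfo &P : Ptrs)
    SortedIndices.push_back(P.OrigIdx);
  return SortedIndices;
}

int main() {
  // Two interleaved bases A and B; sorting clusters A[0..1] and B[0..1].
  std::vector<PtrInfo> Ptrs = {
      {/*A*/ 0, 1, 0}, {/*B*/ 1, 0, 1}, {/*A*/ 0, 0, 2}, {/*B*/ 1, 1, 3}};
  assert((clusterSketch(Ptrs) == std::vector<unsigned>{2, 0, 1, 3}));
  return 0;
}
// [End of editorial illustration]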
5482
5483std::optional<BoUpSLP::OrdersType>
5484BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5485 assert(TE.isGather() && "Expected gather node only.");
5486 Type *ScalarTy = TE.Scalars[0]->getType();
5487
5489 Ptrs.reserve(TE.Scalars.size());
5491 BBs.reserve(TE.Scalars.size());
5492 for (Value *V : TE.Scalars) {
5493 auto *L = dyn_cast<LoadInst>(V);
5494 if (!L || !L->isSimple())
5495 return std::nullopt;
5496 Ptrs.push_back(L->getPointerOperand());
5497 BBs.push_back(L->getParent());
5498 }
5499
5500 BoUpSLP::OrdersType Order;
5501 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5502 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5503 return std::move(Order);
5504 return std::nullopt;
5505}
5506
5507/// Check if two insertelement instructions are from the same buildvector.
5508static bool areTwoInsertFromSameBuildVector(
5509 InsertElementInst *VU, InsertElementInst *V,
5510 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5511 // Instructions must be from the same basic block.
5512 if (VU->getParent() != V->getParent())
5513 return false;
5514 // Checks if 2 insertelements are from the same buildvector.
5515 if (VU->getType() != V->getType())
5516 return false;
5517 // Multiple used inserts are separate nodes.
5518 if (!VU->hasOneUse() && !V->hasOneUse())
5519 return false;
5520 auto *IE1 = VU;
5521 auto *IE2 = V;
5522 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5523 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5524 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5525 return false;
5526 // Go through the vector operand of insertelement instructions trying to find
5527 // either VU as the original vector for IE2 or V as the original vector for
5528 // IE1.
5529 SmallBitVector ReusedIdx(
5530 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5531 bool IsReusedIdx = false;
5532 do {
5533 if (IE2 == VU && !IE1)
5534 return VU->hasOneUse();
5535 if (IE1 == V && !IE2)
5536 return V->hasOneUse();
5537 if (IE1 && IE1 != V) {
5538 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5539 IsReusedIdx |= ReusedIdx.test(Idx1);
5540 ReusedIdx.set(Idx1);
5541 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5542 IE1 = nullptr;
5543 else
5544 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5545 }
5546 if (IE2 && IE2 != VU) {
5547 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5548 IsReusedIdx |= ReusedIdx.test(Idx2);
5549 ReusedIdx.set(Idx2);
5550 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5551 IE2 = nullptr;
5552 else
5553 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5554 }
5555 } while (!IsReusedIdx && (IE1 || IE2));
5556 return false;
5557}
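//
// [Editorial illustration - not part of SLPVectorizer.cpp] A deliberately
// simplified model of "same buildvector": each insertelement remembers the
// vector operand it builds on, and two inserts belong to one buildvector when
// walking the operand chain of one reaches the other. The real function also
// enforces the one-use and reused-index checks shown above, which this sketch
// omits.
#include <cassert>

struct InsertSketch {
  InsertSketch *VecOperand; // The insertelement (or null) this one builds on.
};

static bool fromSameBuildVectorSketch(InsertSketch *VU, InsertSketch *V) {
  for (InsertSketch *I = VU; I; I = I->VecOperand)
    if (I == V)
      return true;
  for (InsertSketch *I = V; I; I = I->VecOperand)
    if (I == VU)
      return true;
  return false;
}

int main() {
  // %i0 = insertelement poison, ...; %i1 = insertelement %i0, ...;
  // %i2 = insertelement %i1, ... -> %i2 and %i0 form one buildvector.
  InsertSketch I0{nullptr}, I1{&I0}, I2{&I1}, Other{nullptr};
  assert(fromSameBuildVectorSketch(&I2, &I0));
  assert(!fromSameBuildVectorSketch(&I2, &Other));
  return 0;
}
// [End of editorial illustration]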
5558
5559std::optional<BoUpSLP::OrdersType>
5560BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5561 // No need to reorder if we need to shuffle reuses - the node still has to
5562 // be shuffled anyway.
5563 if (!TE.ReuseShuffleIndices.empty()) {
5564 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5565 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5566 "Reshuffling scalars not yet supported for nodes with padding");
5567
5568 if (isSplat(TE.Scalars))
5569 return std::nullopt;
5570 // Check if reuse shuffle indices can be improved by reordering.
5571 // For this, check that the reuse mask is "clustered", i.e. each scalar value
5572 // is used once in each submask of size <number_of_scalars>.
5573 // Example: 4 scalar values.
5574 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5575 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5576 // element 3 is used twice in the second submask.
5577 unsigned Sz = TE.Scalars.size();
5578 if (TE.isGather()) {
5579 if (std::optional<OrdersType> CurrentOrder =
5580 findReusedOrderedScalars(TE)) {
5581 SmallVector<int> Mask;
5582 fixupOrderingIndices(*CurrentOrder);
5583 inversePermutation(*CurrentOrder, Mask);
5584 ::addMask(Mask, TE.ReuseShuffleIndices);
5585 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5586 unsigned Sz = TE.Scalars.size();
5587 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5588 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5589 if (Idx != PoisonMaskElem)
5590 Res[Idx + K * Sz] = I + K * Sz;
5591 }
5592 return std::move(Res);
5593 }
5594 }
5595 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5596 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
5597 2 * TE.getVectorFactor())) == 1)
5598 return std::nullopt;
5599 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5600 Sz)) {
5601 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5602 if (TE.ReorderIndices.empty())
5603 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5604 else
5605 inversePermutation(TE.ReorderIndices, ReorderMask);
5606 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5607 unsigned VF = ReorderMask.size();
5608 OrdersType ResOrder(VF, VF);
5609 unsigned NumParts = divideCeil(VF, Sz);
5610 SmallBitVector UsedVals(NumParts);
5611 for (unsigned I = 0; I < VF; I += Sz) {
5612 int Val = PoisonMaskElem;
5613 unsigned UndefCnt = 0;
5614 unsigned Limit = std::min(Sz, VF - I);
5615 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5616 [&](int Idx) {
5617 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5618 Val = Idx;
5619 if (Idx == PoisonMaskElem)
5620 ++UndefCnt;
5621 return Idx != PoisonMaskElem && Idx != Val;
5622 }) ||
5623 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5624 UndefCnt > Sz / 2)
5625 return std::nullopt;
5626 UsedVals.set(Val);
5627 for (unsigned K = 0; K < NumParts; ++K) {
5628 unsigned Idx = Val + Sz * K;
5629 if (Idx < VF)
5630 ResOrder[Idx] = I + K;
5631 }
5632 }
5633 return std::move(ResOrder);
5634 }
5635 unsigned VF = TE.getVectorFactor();
5636 // Try to build the correct order for extractelement instructions.
5637 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5638 TE.ReuseShuffleIndices.end());
5639 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5640 all_of(TE.Scalars, [Sz](Value *V) {
5641 if (isa<PoisonValue>(V))
5642 return true;
5643 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5644 return Idx && *Idx < Sz;
5645 })) {
5646 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5647 "by BinaryOperator and CastInst.");
5648 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5649 if (TE.ReorderIndices.empty())
5650 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5651 else
5652 inversePermutation(TE.ReorderIndices, ReorderMask);
5653 for (unsigned I = 0; I < VF; ++I) {
5654 int &Idx = ReusedMask[I];
5655 if (Idx == PoisonMaskElem)
5656 continue;
5657 Value *V = TE.Scalars[ReorderMask[Idx]];
5658 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5659 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5660 }
5661 }
5662 // Build the order of VF size; the reuse shuffles need to be reordered, as
5663 // they are always of VF size.
5664 OrdersType ResOrder(VF);
5665 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5666 auto *It = ResOrder.begin();
5667 for (unsigned K = 0; K < VF; K += Sz) {
5668 OrdersType CurrentOrder(TE.ReorderIndices);
5669 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5670 if (SubMask.front() == PoisonMaskElem)
5671 std::iota(SubMask.begin(), SubMask.end(), 0);
5672 reorderOrder(CurrentOrder, SubMask);
5673 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5674 std::advance(It, Sz);
5675 }
5676 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5677 return Data.index() == Data.value();
5678 }))
5679 return std::nullopt; // No need to reorder.
5680 return std::move(ResOrder);
5681 }
5682 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5683 any_of(TE.UserTreeIndices,
5684 [](const EdgeInfo &EI) {
5685 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5686 }) &&
5687 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5688 return std::nullopt;
5689 if ((TE.State == TreeEntry::Vectorize ||
5690 TE.State == TreeEntry::StridedVectorize) &&
5691 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5692 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5693 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5694 "BinaryOperator and CastInst.");
5695 return TE.ReorderIndices;
5696 }
5697 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5698 if (!TE.ReorderIndices.empty())
5699 return TE.ReorderIndices;
5700
5701 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5702 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5703 if (!V->hasNUsesOrMore(1))
5704 continue;
5705 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5706 if (!II)
5707 continue;
5708 Instruction *BVHead = nullptr;
5709 BasicBlock *BB = II->getParent();
5710 while (II && II->hasOneUse() && II->getParent() == BB) {
5711 BVHead = II;
5712 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5713 }
5714 I = BVHead;
5715 }
5716
5717 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5718 assert(BB1 != BB2 && "Expected different basic blocks.");
5719 auto *NodeA = DT->getNode(BB1);
5720 auto *NodeB = DT->getNode(BB2);
5721 assert(NodeA && "Should only process reachable instructions");
5722 assert(NodeB && "Should only process reachable instructions");
5723 assert((NodeA == NodeB) ==
5724 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5725 "Different nodes should have different DFS numbers");
5726 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5727 };
5728 auto PHICompare = [&](unsigned I1, unsigned I2) {
5729 Value *V1 = TE.Scalars[I1];
5730 Value *V2 = TE.Scalars[I2];
5731 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5732 return false;
5733 if (isa<PoisonValue>(V1))
5734 return true;
5735 if (isa<PoisonValue>(V2))
5736 return false;
5737 if (V1->getNumUses() < V2->getNumUses())
5738 return true;
5739 if (V1->getNumUses() > V2->getNumUses())
5740 return false;
5741 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5742 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5743 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5744 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5745 FirstUserOfPhi2->getParent());
5746 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5747 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5748 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5749 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5750 if (IE1 && !IE2)
5751 return true;
5752 if (!IE1 && IE2)
5753 return false;
5754 if (IE1 && IE2) {
5755 if (UserBVHead[I1] && !UserBVHead[I2])
5756 return true;
5757 if (!UserBVHead[I1])
5758 return false;
5759 if (UserBVHead[I1] == UserBVHead[I2])
5760 return getElementIndex(IE1) < getElementIndex(IE2);
5761 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5762 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5763 UserBVHead[I2]->getParent());
5764 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5765 }
5766 if (EE1 && !EE2)
5767 return true;
5768 if (!EE1 && EE2)
5769 return false;
5770 if (EE1 && EE2) {
5771 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5772 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5773 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5774 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5775 if (!Inst2 && !P2)
5776 return Inst1 || P1;
5777 if (EE1->getOperand(0) == EE2->getOperand(0))
5778 return getElementIndex(EE1) < getElementIndex(EE2);
5779 if (!Inst1 && Inst2)
5780 return false;
5781 if (Inst1 && Inst2) {
5782 if (Inst1->getParent() != Inst2->getParent())
5783 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5784 return Inst1->comesBefore(Inst2);
5785 }
5786 if (!P1 && P2)
5787 return false;
5788 assert(P1 && P2 &&
5789 "Expected either instructions or arguments vector operands.");
5790 return P1->getArgNo() < P2->getArgNo();
5791 }
5792 return false;
5793 };
5794 OrdersType Phis(TE.Scalars.size());
5795 std::iota(Phis.begin(), Phis.end(), 0);
5796 stable_sort(Phis, PHICompare);
5797 if (isIdentityOrder(Phis))
5798 return std::nullopt; // No need to reorder.
5799 return std::move(Phis);
5800 }
5801 if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5802 allSameType(TE.Scalars)) {
5803 // TODO: add analysis of other gather nodes with extractelement
5804 // instructions and other values/instructions, not only undefs.
5805 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5806 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5807 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5808 all_of(TE.Scalars, [](Value *V) {
5809 auto *EE = dyn_cast<ExtractElementInst>(V);
5810 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5811 })) {
5812 // Check that gather of extractelements can be represented as
5813 // just a shuffle of a single vector.
5814 OrdersType CurrentOrder;
5815 bool Reuse =
5816 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
5817 if (Reuse || !CurrentOrder.empty())
5818 return std::move(CurrentOrder);
5819 }
5820 // If the gather node is <undef, v, .., poison> and
5821 // insertelement poison, v, 0 [+ permute]
5822 // is cheaper than
5823 // insertelement poison, v, n - try to reorder.
5824 // If rotating the whole graph, exclude the permute cost, the whole graph
5825 // might be transformed.
5826 int Sz = TE.Scalars.size();
5827 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5828 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5829 const auto *It =
5830 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5831 if (It == TE.Scalars.begin())
5832 return OrdersType();
5833 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5834 if (It != TE.Scalars.end()) {
5835 OrdersType Order(Sz, Sz);
5836 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5837 Order[Idx] = 0;
5838 fixupOrderingIndices(Order);
5839 SmallVector<int> Mask;
5840 inversePermutation(Order, Mask);
5841 InstructionCost PermuteCost =
5842 TopToBottom
5843 ? 0
5844 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5845 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5846 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5847 PoisonValue::get(Ty), *It);
5848 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5849 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5850 PoisonValue::get(Ty), *It);
5851 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5852 OrdersType Order(Sz, Sz);
5853 Order[Idx] = 0;
5854 return std::move(Order);
5855 }
5856 }
5857 }
5858 if (isSplat(TE.Scalars))
5859 return std::nullopt;
5860 if (TE.Scalars.size() >= 3)
5861 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5862 return Order;
5863 // Check if we can include the order of vectorized loads. For masked gathers,
5864 // extra analysis is done later, so such nodes are added to a special list.
5865 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5866 SmallVector<Value *> PointerOps;
5867 OrdersType CurrentOrder;
5868 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5869 CurrentOrder, PointerOps);
5870 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
5871 return std::move(CurrentOrder);
5872 }
5873 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5874 // has been audited for correctness with non-power-of-two vectors.
5875 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5876 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5877 return CurrentOrder;
5878 }
5879 return std::nullopt;
5880}
5881
5882/// Checks if the given mask is a "clustered" mask with the same clusters of
5883/// size \p Sz, which are not identity submasks.
5884static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5885 unsigned Sz) {
5886 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5887 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5888 return false;
5889 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5890 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5891 if (Cluster != FirstCluster)
5892 return false;
5893 }
5894 return true;
5895}
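//
// [Editorial illustration - not part of SLPVectorizer.cpp] What a "clustered"
// reuse mask looks like: the same non-identity submask of size Sz repeated for
// every cluster. For brevity this sketch ignores poison lanes, which the real
// identity check (ShuffleVectorInst::isIdentityMask) tolerates.
#include <cassert>
#include <vector>

static bool isRepeatedNonIdentityClusteredSketch(const std::vector<int> &Mask,
                                                 unsigned Sz) {
  auto IsIdentity = [&] {
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != static_cast<int>(I))
        return false;
    return true;
  };
  if (IsIdentity()) // Identity clusters are not interesting here.
    return false;
  for (unsigned I = Sz; I < Mask.size(); I += Sz)
    for (unsigned J = 0; J < Sz; ++J)
      if (Mask[I + J] != Mask[J])
        return false;
  return true;
}

int main() {
  assert(isRepeatedNonIdentityClusteredSketch({1, 0, 3, 2, 1, 0, 3, 2}, 4));
  assert(!isRepeatedNonIdentityClusteredSketch({0, 1, 2, 3, 0, 1, 2, 3}, 4));
  assert(!isRepeatedNonIdentityClusteredSketch({1, 0, 3, 2, 3, 2, 1, 0}, 4));
  return 0;
}
// [End of editorial illustration]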
5896
5897void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5898 // Reorder reuses mask.
5899 reorderReuses(TE.ReuseShuffleIndices, Mask);
5900 const unsigned Sz = TE.Scalars.size();
5901 // For vectorized nodes and non-clustered reuses there is nothing else to do.
5902 if (!TE.isGather() ||
5903 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5904 Sz) ||
5905 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5906 return;
5907 SmallVector<int> NewMask;
5908 inversePermutation(TE.ReorderIndices, NewMask);
5909 addMask(NewMask, TE.ReuseShuffleIndices);
5910 // Clear reorder since it is going to be applied to the new mask.
5911 TE.ReorderIndices.clear();
5912 // Try to improve gathered nodes with clustered reuses, if possible.
5913 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5914 SmallVector<unsigned> NewOrder(Slice);
5915 inversePermutation(NewOrder, NewMask);
5916 reorderScalars(TE.Scalars, NewMask);
5917 // Fill the reuses mask with the identity submasks.
5918 for (auto *It = TE.ReuseShuffleIndices.begin(),
5919 *End = TE.ReuseShuffleIndices.end();
5920 It != End; std::advance(It, Sz))
5921 std::iota(It, std::next(It, Sz), 0);
5922}
5923
5924static void combineOrders(MutableArrayRef<unsigned> Order,
5925 ArrayRef<unsigned> SecondaryOrder) {
5926 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5927 "Expected same size of orders");
5928 unsigned Sz = Order.size();
5929 SmallBitVector UsedIndices(Sz);
5930 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5931 if (Order[Idx] != Sz)
5932 UsedIndices.set(Order[Idx]);
5933 }
5934 if (SecondaryOrder.empty()) {
5935 for (unsigned Idx : seq<unsigned>(0, Sz))
5936 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5937 Order[Idx] = Idx;
5938 } else {
5939 for (unsigned Idx : seq<unsigned>(0, Sz))
5940 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5941 !UsedIndices.test(SecondaryOrder[Idx]))
5942 Order[Idx] = SecondaryOrder[Idx];
5943 }
5944}
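//
// [Editorial illustration - not part of SLPVectorizer.cpp] How the
// order-combining helper above fills the "undefined" slots (value == Sz) of
// the primary order from a secondary order, never reusing a destination that
// is already taken. The fallback-to-identity branch is omitted here.
#include <cassert>
#include <vector>

int main() {
  const unsigned Sz = 4;
  // The primary order fixes lanes 0 and 1; lanes 2 and 3 are undefined.
  std::vector<unsigned> Order = {1, 0, Sz, Sz};
  std::vector<unsigned> Secondary = {1, 0, 3, 2};
  std::vector<bool> Used(Sz, false);
  for (unsigned Idx = 0; Idx < Sz; ++Idx)
    if (Order[Idx] != Sz)
      Used[Order[Idx]] = true;
  for (unsigned Idx = 0; Idx < Sz; ++Idx)
    if (Secondary[Idx] != Sz && Order[Idx] == Sz && !Used[Secondary[Idx]])
      Order[Idx] = Secondary[Idx];
  assert((Order == std::vector<unsigned>{1, 0, 3, 2}));
  return 0;
}
// [End of editorial illustration]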
5945
5946void BoUpSLP::reorderTopToBottom() {
5947 // Maps VF to the graph nodes.
5948 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5949 // ExtractElement gather nodes which can be vectorized and need to handle
5950 // their ordering.
5951 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5952
5953 // Phi nodes can have preferred ordering based on their result users.
5954 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5955
5956 // AltShuffles can also have a preferred ordering that leads to fewer
5957 // instructions, e.g., the addsub instruction in x86.
5958 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5959
5960 // Maps a TreeEntry to the reorder indices of external users.
5962 ExternalUserReorderMap;
5963 // Find all reorderable nodes with the given VF.
5964 // Currently these are vectorized stores, loads, extracts + some gathering
5965 // of extracts.
5966 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5967 const std::unique_ptr<TreeEntry> &TE) {
5968 // Look for external users that will probably be vectorized.
5969 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5970 findExternalStoreUsersReorderIndices(TE.get());
5971 if (!ExternalUserReorderIndices.empty()) {
5972 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5973 ExternalUserReorderMap.try_emplace(TE.get(),
5974 std::move(ExternalUserReorderIndices));
5975 }
5976
5977 // Patterns like [fadd,fsub] can be combined into a single instruction in
5978 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5979 // to take into account their order when looking for the most used order.
5980 if (TE->hasState() && TE->isAltShuffle()) {
5981 VectorType *VecTy =
5982 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5983 unsigned Opcode0 = TE->getOpcode();
5984 unsigned Opcode1 = TE->getAltOpcode();
5985 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5986 // If this pattern is supported by the target then we consider the order.
5987 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5988 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5989 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5990 }
5991 // TODO: Check the reverse order too.
5992 }
5993
5994 if (std::optional<OrdersType> CurrentOrder =
5995 getReorderingData(*TE, /*TopToBottom=*/true)) {
5996 // Do not include ordering for nodes used in the alt opcode vectorization,
5997 // it is better to reorder them during the bottom-to-top stage. If we follow
5998 // the order here, it causes reordering of the whole graph, though actually it
5999 // is profitable just to reorder the subgraph that starts from the alternate
6000 // opcode vectorization node. Such nodes already end up with a shuffle
6001 // instruction and it is enough to change this shuffle rather than
6002 // rotate the scalars for the whole graph.
6003 unsigned Cnt = 0;
6004 const TreeEntry *UserTE = TE.get();
6005 while (UserTE && Cnt < RecursionMaxDepth) {
6006 if (UserTE->UserTreeIndices.size() != 1)
6007 break;
6008 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
6009 return EI.UserTE->State == TreeEntry::Vectorize &&
6010 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
6011 }))
6012 return;
6013 UserTE = UserTE->UserTreeIndices.back().UserTE;
6014 ++Cnt;
6015 }
6016 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
6017 if (!(TE->State == TreeEntry::Vectorize ||
6018 TE->State == TreeEntry::StridedVectorize) ||
6019 !TE->ReuseShuffleIndices.empty())
6020 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
6021 if (TE->State == TreeEntry::Vectorize &&
6022 TE->getOpcode() == Instruction::PHI)
6023 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
6024 }
6025 });
6026
6027 // Reorder the graph nodes according to their vectorization factor.
6028 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
6029 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6030 auto It = VFToOrderedEntries.find(VF);
6031 if (It == VFToOrderedEntries.end())
6032 continue;
6033 // Try to find the most profitable order. We are just looking for the most
6034 // used order and reorder the scalar elements in the nodes according to this
6035 // most used order.
6036 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
6037 // Delete VF entry upon exit.
6038 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
6039
6040 // All operands are reordered and used only in this node - propagate the
6041 // most used order to the user node.
6044 OrdersUses;
6046 for (const TreeEntry *OpTE : OrderedEntries) {
6047 // No need to reorder these nodes; we still need to extend and use a
6048 // shuffle, just merge the reordering shuffle and the reuse shuffle.
6049 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6050 continue;
6051 // Count number of orders uses.
6052 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6053 &PhisToOrders]() -> const OrdersType & {
6054 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6055 auto It = GathersToOrders.find(OpTE);
6056 if (It != GathersToOrders.end())
6057 return It->second;
6058 }
6059 if (OpTE->hasState() && OpTE->isAltShuffle()) {
6060 auto It = AltShufflesToOrders.find(OpTE);
6061 if (It != AltShufflesToOrders.end())
6062 return It->second;
6063 }
6064 if (OpTE->State == TreeEntry::Vectorize &&
6065 OpTE->getOpcode() == Instruction::PHI) {
6066 auto It = PhisToOrders.find(OpTE);
6067 if (It != PhisToOrders.end())
6068 return It->second;
6069 }
6070 return OpTE->ReorderIndices;
6071 }();
6072 // First consider the order of the external scalar users.
6073 auto It = ExternalUserReorderMap.find(OpTE);
6074 if (It != ExternalUserReorderMap.end()) {
6075 const auto &ExternalUserReorderIndices = It->second;
6076 // If the OpTE vector factor != number of scalars - use natural order,
6077 // it is an attempt to reorder node with reused scalars but with
6078 // external uses.
6079 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6080 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6081 ExternalUserReorderIndices.size();
6082 } else {
6083 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
6084 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6085 }
6086 // No other useful reorder data in this entry.
6087 if (Order.empty())
6088 continue;
6089 }
6090 // Stores actually store the mask, not the order, need to invert.
6091 if (OpTE->State == TreeEntry::Vectorize &&
6092 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6093 assert(!OpTE->isAltShuffle() &&
6094 "Alternate instructions are only supported by BinaryOperator "
6095 "and CastInst.");
6096 SmallVector<int> Mask;
6097 inversePermutation(Order, Mask);
6098 unsigned E = Order.size();
6099 OrdersType CurrentOrder(E, E);
6100 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6101 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6102 });
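 // Illustrative example (assuming the usual inversePermutation convention
 // Mask[Order[I]] == I): a stored mask Order == {1, 2, 0} yields
 // Mask == {2, 0, 1}, so CurrentOrder becomes {2, 0, 1} before
 // fixupOrderingIndices runs.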
6103 fixupOrderingIndices(CurrentOrder);
6104 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6105 } else {
6106 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6107 }
6108 }
6109 if (OrdersUses.empty())
6110 continue;
6111 // Choose the most used order.
6112 unsigned IdentityCnt = 0;
6113 unsigned FilledIdentityCnt = 0;
6114 OrdersType IdentityOrder(VF, VF);
6115 for (auto &Pair : OrdersUses) {
6116 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6117 if (!Pair.first.empty())
6118 FilledIdentityCnt += Pair.second;
6119 IdentityCnt += Pair.second;
6120 combineOrders(IdentityOrder, Pair.first);
6121 }
6122 }
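 // Note (illustrative): with VF == 4, both an empty order and the explicit
 // identity {0, 1, 2, 3} contribute to IdentityCnt, but only the latter
 // contributes to FilledIdentityCnt; the distinction allows the loop below
 // to pick a non-identity candidate on a tie.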
6123 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6124 unsigned Cnt = IdentityCnt;
6125 for (auto &Pair : OrdersUses) {
6126 // Prefer the identity order. But if a filled identity (a non-empty order)
6127 // was found with the same number of uses as the new candidate order, we
6128 // can choose this candidate order instead.
6129 if (Cnt < Pair.second ||
6130 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6131 Cnt == Pair.second && !BestOrder.empty() &&
6132 isIdentityOrder(BestOrder))) {
6133 combineOrders(Pair.first, BestOrder);
6134 BestOrder = Pair.first;
6135 Cnt = Pair.second;
6136 } else {
6137 combineOrders(BestOrder, Pair.first);
6138 }
6139 }
6140 // Set order of the user node.
6141 if (isIdentityOrder(BestOrder))
6142 continue;
6143 fixupOrderingIndices(BestOrder);
6144 SmallVector<int> Mask;
6145 inversePermutation(BestOrder, Mask);
6146 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6147 unsigned E = BestOrder.size();
6148 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6149 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6150 });
6151 // Do an actual reordering, if profitable.
6152 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6153 // Just do the reordering for the nodes with the given VF.
6154 if (TE->Scalars.size() != VF) {
6155 if (TE->ReuseShuffleIndices.size() == VF) {
6156 // Need to reorder the reuses masks of the operands with smaller VF to
6157 // be able to find the match between the graph nodes and scalar
6158 // operands of the given node during vectorization/cost estimation.
6159 assert(all_of(TE->UserTreeIndices,
6160 [VF, &TE](const EdgeInfo &EI) {
6161 return EI.UserTE->Scalars.size() == VF ||
6162 EI.UserTE->Scalars.size() ==
6163 TE->Scalars.size();
6164 }) &&
6165 "All users must be of VF size.");
6166 if (SLPReVec) {
6167 assert(SLPReVec && "Only supported by REVEC.");
6168 // ShuffleVectorInst does not do reorderOperands (and it should not
6169 // because ShuffleVectorInst supports only a limited set of
6170 // patterns). Only do reorderNodeWithReuses if all of the users are
6171 // not ShuffleVectorInst.
6172 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6173 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6174 }))
6175 continue;
6176 assert(none_of(TE->UserTreeIndices,
6177 [&](const EdgeInfo &EI) {
6178 return isa<ShuffleVectorInst>(
6179 EI.UserTE->getMainOp());
6180 }) &&
6181 "Does not know how to reorder.");
6182 }
6183 // Update the ordering of the operands with a smaller VF than the given
6184 // one.
6185 reorderNodeWithReuses(*TE, Mask);
6186 }
6187 continue;
6188 }
6189 if ((TE->State == TreeEntry::Vectorize ||
6190 TE->State == TreeEntry::StridedVectorize) &&
6191 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
6192 InsertElementInst>(TE->getMainOp()) ||
6193 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6194 assert(!TE->isAltShuffle() &&
6195 "Alternate instructions are only supported by BinaryOperator "
6196 "and CastInst.");
6197 // Build correct orders for extract{element,value}, loads and
6198 // stores.
6199 reorderOrder(TE->ReorderIndices, Mask);
6200 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6201 TE->reorderOperands(Mask);
6202 } else {
6203 // Reorder the node and its operands.
6204 TE->reorderOperands(Mask);
6205 assert(TE->ReorderIndices.empty() &&
6206 "Expected empty reorder sequence.");
6207 reorderScalars(TE->Scalars, Mask);
6208 }
6209 if (!TE->ReuseShuffleIndices.empty()) {
6210 // Apply reversed order to keep the original ordering of the reused
6211 // elements to avoid extra reorder indices shuffling.
6212 OrdersType CurrentOrder;
6213 reorderOrder(CurrentOrder, MaskOrder);
6214 SmallVector<int> NewReuses;
6215 inversePermutation(CurrentOrder, NewReuses);
6216 addMask(NewReuses, TE->ReuseShuffleIndices);
6217 TE->ReuseShuffleIndices.swap(NewReuses);
6218 }
6219 }
6220 }
6221}
6222
6223bool BoUpSLP::canReorderOperands(
6224 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6225 ArrayRef<TreeEntry *> ReorderableGathers,
6226 SmallVectorImpl<TreeEntry *> &GatherOps) {
6227 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6228 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6229 return OpData.first == I &&
6230 (OpData.second->State == TreeEntry::Vectorize ||
6231 OpData.second->State == TreeEntry::StridedVectorize);
6232 }))
6233 continue;
6234 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6235 // Do not reorder if operand node is used by many user nodes.
6236 if (any_of(TE->UserTreeIndices,
6237 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6238 return false;
6239 // Add the node to the list of the ordered nodes with the identity
6240 // order.
6241 Edges.emplace_back(I, TE);
6242 // Add ScatterVectorize nodes to the list of operands, where just
6243 // reordering of the scalars is required. Similar to the gathers, so
6244 // simply add to the list of gathered ops.
6245 // If there are reused scalars, process this node as a regular vectorize
6246 // node, just reorder reuses mask.
6247 if (TE->State != TreeEntry::Vectorize &&
6248 TE->State != TreeEntry::StridedVectorize &&
6249 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6250 GatherOps.push_back(TE);
6251 continue;
6252 }
6253 TreeEntry *Gather = nullptr;
6254 if (count_if(ReorderableGathers,
6255 [&Gather, UserTE, I](TreeEntry *TE) {
6256 assert(TE->State != TreeEntry::Vectorize &&
6257 TE->State != TreeEntry::StridedVectorize &&
6258 "Only non-vectorized nodes are expected.");
6259 if (any_of(TE->UserTreeIndices,
6260 [UserTE, I](const EdgeInfo &EI) {
6261 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6262 })) {
6263 assert(TE->isSame(UserTE->getOperand(I)) &&
6264 "Operand entry does not match operands.");
6265 Gather = TE;
6266 return true;
6267 }
6268 return false;
6269 }) > 1 &&
6270 !allConstant(UserTE->getOperand(I)))
6271 return false;
6272 if (Gather)
6273 GatherOps.push_back(Gather);
6274 }
6275 return true;
6276}
6277
6278void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6279 SetVector<TreeEntry *> OrderedEntries;
6280 DenseSet<const TreeEntry *> GathersToOrders;
6281 // Find all reorderable leaf nodes with the given VF.
6282 // Currently these are vectorized loads and extracts without alternate
6283 // operands, plus some gathering of extracts.
6284 SmallVector<TreeEntry *> NonVectorized;
6285 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6286 if (TE->State != TreeEntry::Vectorize &&
6287 TE->State != TreeEntry::StridedVectorize)
6288 NonVectorized.push_back(TE.get());
6289 if (std::optional<OrdersType> CurrentOrder =
6290 getReorderingData(*TE, /*TopToBottom=*/false)) {
6291 OrderedEntries.insert(TE.get());
6292 if (!(TE->State == TreeEntry::Vectorize ||
6293 TE->State == TreeEntry::StridedVectorize) ||
6294 !TE->ReuseShuffleIndices.empty())
6295 GathersToOrders.insert(TE.get());
6296 }
6297 }
6298
6299 // 1. Propagate order to the graph nodes, which use only reordered nodes.
6300 // I.e., if the node has operands that are reordered, try to make at least
6301 // one operand keep the natural order, reorder the others, and reorder the
6302 // user node itself.
6304 while (!OrderedEntries.empty()) {
6305 // 1. Filter out only reordered nodes.
6306 // 2. If the entry has multiple uses - skip it and jump to the next node.
6308 SmallVector<TreeEntry *> Filtered;
6309 for (TreeEntry *TE : OrderedEntries) {
6310 if (!(TE->State == TreeEntry::Vectorize ||
6311 TE->State == TreeEntry::StridedVectorize ||
6312 (TE->isGather() && GathersToOrders.contains(TE))) ||
6313 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6314 !all_of(drop_begin(TE->UserTreeIndices),
6315 [TE](const EdgeInfo &EI) {
6316 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6317 }) ||
6318 !Visited.insert(TE).second) {
6319 Filtered.push_back(TE);
6320 continue;
6321 }
6322 // Build a map between user nodes and their operand order to speed up the
6323 // search. The graph currently does not provide this dependency directly.
6324 for (EdgeInfo &EI : TE->UserTreeIndices)
6325 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6326 }
6327 // Erase filtered entries.
6328 for (TreeEntry *TE : Filtered)
6329 OrderedEntries.remove(TE);
6331 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6332 UsersVec(Users.begin(), Users.end());
6333 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6334 return Data1.first->Idx > Data2.first->Idx;
6335 });
6336 for (auto &Data : UsersVec) {
6337 // Check that operands are used only in the User node.
6338 SmallVector<TreeEntry *> GatherOps;
6339 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6340 GatherOps)) {
6341 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6342 OrderedEntries.remove(Op.second);
6343 continue;
6344 }
6345 // All operands are reordered and used only in this node - propagate the
6346 // most used order to the user node.
6349 OrdersUses;
6350 // Do the analysis for each tree entry only once, otherwise the order of
6351 // the same node may be considered several times, though it might not be
6352 // profitable.
6355 for (const auto &Op : Data.second) {
6356 TreeEntry *OpTE = Op.second;
6357 if (!VisitedOps.insert(OpTE).second)
6358 continue;
6359 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6360 continue;
6361 const auto Order = [&]() -> const OrdersType {
6362 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6363 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6364 .value_or(OrdersType(1));
6365 return OpTE->ReorderIndices;
6366 }();
6367 // The order is partially ordered, skip it in favor of fully non-ordered
6368 // orders.
6369 if (Order.size() == 1)
6370 continue;
6371 unsigned NumOps = count_if(
6372 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6373 return P.second == OpTE;
6374 });
6375 // Stores actually store the mask, not the order, need to invert.
6376 if (OpTE->State == TreeEntry::Vectorize &&
6377 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6378 assert(!OpTE->isAltShuffle() &&
6379 "Alternate instructions are only supported by BinaryOperator "
6380 "and CastInst.");
6381 SmallVector<int> Mask;
6382 inversePermutation(Order, Mask);
6383 unsigned E = Order.size();
6384 OrdersType CurrentOrder(E, E);
6385 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6386 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6387 });
6388 fixupOrderingIndices(CurrentOrder);
6389 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6390 NumOps;
6391 } else {
6392 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6393 }
6394 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6395 const auto AllowsReordering = [&](const TreeEntry *TE) {
6396 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6397 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6398 (IgnoreReorder && TE->Idx == 0))
6399 return true;
6400 if (TE->isGather()) {
6401 if (GathersToOrders.contains(TE))
6402 return !getReorderingData(*TE, /*TopToBottom=*/false)
6403 .value_or(OrdersType(1))
6404 .empty();
6405 return true;
6406 }
6407 return false;
6408 };
6409 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6410 TreeEntry *UserTE = EI.UserTE;
6411 if (!VisitedUsers.insert(UserTE).second)
6412 continue;
6413 // May reorder user node if it requires reordering, has reused
6414 // scalars, is an alternate op vectorize node or its op nodes require
6415 // reordering.
6416 if (AllowsReordering(UserTE))
6417 continue;
6418 // Check if users allow reordering.
6419 // Currently look up just 1 level of operands to avoid increase of
6420 // the compile time.
6421 // Profitable to reorder if definitely more operands allow
6422 // reordering rather than those with natural order.
6424 if (static_cast<unsigned>(count_if(
6425 Ops, [UserTE, &AllowsReordering](
6426 const std::pair<unsigned, TreeEntry *> &Op) {
6427 return AllowsReordering(Op.second) &&
6428 all_of(Op.second->UserTreeIndices,
6429 [UserTE](const EdgeInfo &EI) {
6430 return EI.UserTE == UserTE;
6431 });
6432 })) <= Ops.size() / 2)
6433 ++Res.first->second;
6434 }
6435 }
6436 if (OrdersUses.empty()) {
6437 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6438 OrderedEntries.remove(Op.second);
6439 continue;
6440 }
6441 // Choose the most used order.
6442 unsigned IdentityCnt = 0;
6443 unsigned VF = Data.second.front().second->getVectorFactor();
6444 OrdersType IdentityOrder(VF, VF);
6445 for (auto &Pair : OrdersUses) {
6446 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6447 IdentityCnt += Pair.second;
6448 combineOrders(IdentityOrder, Pair.first);
6449 }
6450 }
6451 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6452 unsigned Cnt = IdentityCnt;
6453 for (auto &Pair : OrdersUses) {
6454 // Prefer the identity order. But if a filled identity (a non-empty
6455 // order) was found with the same number of uses as the new candidate
6456 // order, we can choose this candidate order instead.
6457 if (Cnt < Pair.second) {
6458 combineOrders(Pair.first, BestOrder);
6459 BestOrder = Pair.first;
6460 Cnt = Pair.second;
6461 } else {
6462 combineOrders(BestOrder, Pair.first);
6463 }
6464 }
6465 // Set order of the user node.
6466 if (isIdentityOrder(BestOrder)) {
6467 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6468 OrderedEntries.remove(Op.second);
6469 continue;
6470 }
6471 fixupOrderingIndices(BestOrder);
6472 // Erase operands from OrderedEntries list and adjust their orders.
6473 VisitedOps.clear();
6474 SmallVector<int> Mask;
6475 inversePermutation(BestOrder, Mask);
6476 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6477 unsigned E = BestOrder.size();
6478 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6479 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6480 });
6481 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6482 TreeEntry *TE = Op.second;
6483 OrderedEntries.remove(TE);
6484 if (!VisitedOps.insert(TE).second)
6485 continue;
6486 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6487 reorderNodeWithReuses(*TE, Mask);
6488 continue;
6489 }
6490 // Gathers are processed separately.
6491 if (TE->State != TreeEntry::Vectorize &&
6492 TE->State != TreeEntry::StridedVectorize &&
6493 (TE->State != TreeEntry::ScatterVectorize ||
6494 TE->ReorderIndices.empty()))
6495 continue;
6496 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6497 TE->ReorderIndices.empty()) &&
6498 "Non-matching sizes of user/operand entries.");
6499 reorderOrder(TE->ReorderIndices, Mask);
6500 if (IgnoreReorder && TE == VectorizableTree.front().get())
6501 IgnoreReorder = false;
6502 }
6503 // For gathers just need to reorder its scalars.
6504 for (TreeEntry *Gather : GatherOps) {
6505 assert(Gather->ReorderIndices.empty() &&
6506 "Unexpected reordering of gathers.");
6507 if (!Gather->ReuseShuffleIndices.empty()) {
6508 // Just reorder reuses indices.
6509 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6510 continue;
6511 }
6512 reorderScalars(Gather->Scalars, Mask);
6513 OrderedEntries.remove(Gather);
6514 }
6515 // Reorder operands of the user node and set the ordering for the user
6516 // node itself.
6517 if (Data.first->State != TreeEntry::Vectorize ||
6518 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6519 Data.first->getMainOp()) ||
6520 Data.first->isAltShuffle())
6521 Data.first->reorderOperands(Mask);
6522 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6523 Data.first->isAltShuffle() ||
6524 Data.first->State == TreeEntry::StridedVectorize) {
6525 reorderScalars(Data.first->Scalars, Mask);
6526 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6527 /*BottomOrder=*/true);
6528 if (Data.first->ReuseShuffleIndices.empty() &&
6529 !Data.first->ReorderIndices.empty() &&
6530 !Data.first->isAltShuffle()) {
6531 // Insert user node to the list to try to sink reordering deeper in
6532 // the graph.
6533 OrderedEntries.insert(Data.first);
6534 }
6535 } else {
6536 reorderOrder(Data.first->ReorderIndices, Mask);
6537 }
6538 }
6539 }
6540 // If the reordering is unnecessary, just remove the reorder.
6541 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6542 VectorizableTree.front()->ReuseShuffleIndices.empty())
6543 VectorizableTree.front()->ReorderIndices.clear();
6544}
6545
6546Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6547 if ((Entry.getOpcode() == Instruction::Store ||
6548 Entry.getOpcode() == Instruction::Load) &&
6549 Entry.State == TreeEntry::StridedVectorize &&
6550 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6551 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6552 return dyn_cast<Instruction>(Entry.Scalars.front());
6553}
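// Illustrative example: for a reversed strided load entry with
// ReorderIndices == {3, 2, 1, 0}, the function above returns the instruction
// in Scalars[3] rather than Scalars[0].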
6554
6555void BoUpSLP::buildExternalUses(
6556 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6557 DenseMap<Value *, unsigned> ScalarToExtUses;
6558 // Collect the values that we need to extract from the tree.
6559 for (auto &TEPtr : VectorizableTree) {
6560 TreeEntry *Entry = TEPtr.get();
6561
6562 // No need to handle users of gathered values.
6563 if (Entry->isGather())
6564 continue;
6565
6566 // For each lane:
6567 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6568 Value *Scalar = Entry->Scalars[Lane];
6569 if (!isa<Instruction>(Scalar))
6570 continue;
6571 // All uses must be replaced already? No need to do it again.
6572 auto It = ScalarToExtUses.find(Scalar);
6573 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6574 continue;
6575
6576 // Check if the scalar is externally used as an extra arg.
6577 const auto ExtI = ExternallyUsedValues.find(Scalar);
6578 if (ExtI != ExternallyUsedValues.end()) {
6579 int FoundLane = Entry->findLaneForValue(Scalar);
6580 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6581 << FoundLane << " from " << *Scalar << ".\n");
6582 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6583 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
6584 continue;
6585 }
6586 for (User *U : Scalar->users()) {
6587 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6588
6589 Instruction *UserInst = dyn_cast<Instruction>(U);
6590 if (!UserInst || isDeleted(UserInst))
6591 continue;
6592
6593 // Ignore users in the user ignore list.
6594 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6595 continue;
6596
6597 // Skip in-tree scalars that become vectors
6598 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
6599 !UseEntries.empty()) {
6600 // Some in-tree scalars will remain as scalar in vectorized
6601 // instructions. If that is the case, the one in FoundLane will
6602 // be used.
6603 if (any_of(UseEntries, [&](TreeEntry *UseEntry) {
6604 return UseEntry->State == TreeEntry::ScatterVectorize ||
6605 !doesInTreeUserNeedToExtract(
6606 Scalar, getRootEntryInstruction(*UseEntry), TLI,
6607 TTI);
6608 })) {
6609 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6610 << ".\n");
6611 assert(none_of(UseEntries,
6612 [](TreeEntry *UseEntry) {
6613 return UseEntry->isGather();
6614 }) &&
6615 "Bad state");
6616 continue;
6617 }
6618 U = nullptr;
6619 if (It != ScalarToExtUses.end()) {
6620 ExternalUses[It->second].User = nullptr;
6621 break;
6622 }
6623 }
6624
6625 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6626 U = nullptr;
6627 int FoundLane = Entry->findLaneForValue(Scalar);
6628 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6629 << " from lane " << FoundLane << " from " << *Scalar
6630 << ".\n");
6631 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6632 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
6633 if (!U)
6634 break;
6635 }
6636 }
6637 }
6638}
6639
6640SmallVector<SmallVector<StoreInst *>>
6641BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6644 PtrToStoresMap;
6645 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6646 Value *V = TE->Scalars[Lane];
6647 // Don't iterate over the users of constant data.
6648 if (!isa<Instruction>(V))
6649 continue;
6650 // To save compilation time we don't visit if we have too many users.
6651 if (V->hasNUsesOrMore(UsesLimit))
6652 break;
6653
6654 // Collect stores per pointer object.
6655 for (User *U : V->users()) {
6656 auto *SI = dyn_cast<StoreInst>(U);
6657 // Test whether we can handle the store. V might be a global, which could
6658 // be used in a different function.
6659 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6660 !isValidElementType(SI->getValueOperand()->getType()))
6661 continue;
6662 // Skip entry if already vectorized.
6663 if (isVectorized(U))
6664 continue;
6665
6666 Value *Ptr =
6667 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6668 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6669 SI->getValueOperand()->getType(), Ptr}];
6670 // For now just keep one store per pointer object per lane.
6671 // TODO: Extend this to support multiple stores per pointer per lane
6672 if (StoresVec.size() > Lane)
6673 continue;
6674 if (!StoresVec.empty()) {
6675 std::optional<int> Diff = getPointersDiff(
6676 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6677 SI->getValueOperand()->getType(),
6678 StoresVec.front()->getPointerOperand(), *DL, *SE,
6679 /*StrictCheck=*/true);
6680 // We failed to compare the pointers so just abandon this store.
6681 if (!Diff)
6682 continue;
6683 }
6684 StoresVec.push_back(SI);
6685 }
6686 }
6687 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6688 unsigned I = 0;
6689 for (auto &P : PtrToStoresMap) {
6690 Res[I].swap(P.second);
6691 ++I;
6692 }
6693 return Res;
6694}
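// Note (illustrative): stores are grouped by the key {parent block, stored
// value type, underlying pointer object}, so two simple stores into the same
// array from the same block land in the same candidate vector, at most one
// store per lane.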
6695
6696bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6697 OrdersType &ReorderIndices) const {
6698 // We check whether the stores in StoresVec can form a vector by sorting
6699 // them and checking whether they are consecutive.
6700
6701 // To avoid calling getPointersDiff() while sorting we create a vector of
6702 // pairs {store, offset from first} and sort this instead.
6704 StoreInst *S0 = StoresVec[0];
6705 StoreOffsetVec.emplace_back(0, 0);
6706 Type *S0Ty = S0->getValueOperand()->getType();
6707 Value *S0Ptr = S0->getPointerOperand();
6708 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6709 StoreInst *SI = StoresVec[Idx];
6710 std::optional<int> Diff =
6711 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6712 SI->getPointerOperand(), *DL, *SE,
6713 /*StrictCheck=*/true);
6714 StoreOffsetVec.emplace_back(*Diff, Idx);
6715 }
6716
6717 // Check if the stores are consecutive by checking if their difference is 1.
6718 if (StoreOffsetVec.size() != StoresVec.size())
6719 return false;
6720 sort(StoreOffsetVec,
6721 [](const std::pair<int, unsigned> &L,
6722 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6723 unsigned Idx = 0;
6724 int PrevDist = 0;
6725 for (const auto &P : StoreOffsetVec) {
6726 if (Idx > 0 && P.first != PrevDist + 1)
6727 return false;
6728 PrevDist = P.first;
6729 ++Idx;
6730 }
6731
6732 // Calculate the shuffle indices according to their offset against the sorted
6733 // StoreOffsetVec.
6734 ReorderIndices.assign(StoresVec.size(), 0);
6735 bool IsIdentity = true;
6736 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6737 ReorderIndices[P.second] = I;
6738 IsIdentity &= P.second == I;
6739 }
6740 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6741 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6742 // same convention here.
6743 if (IsIdentity)
6744 ReorderIndices.clear();
6745
6746 return true;
6747}
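// Worked example (illustrative): for three stores whose offsets from the
// first store are {0, 2, 1}, the sorted offsets {0, 1, 2} are consecutive
// and ReorderIndices becomes {0, 2, 1} (the store at index 1 goes to
// position 2 and the store at index 2 to position 1); an already-ordered
// sequence clears ReorderIndices to follow the identity convention.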
6748
6749#ifndef NDEBUG
6751 for (unsigned Idx : Order)
6752 dbgs() << Idx << ", ";
6753 dbgs() << "\n";
6754}
6755#endif
6756
6757SmallVector<BoUpSLP::OrdersType, 1>
6758BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6759 unsigned NumLanes = TE->Scalars.size();
6760
6761 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6762
6763 // Holds the reorder indices for each candidate store vector that is a user of
6764 // the current TreeEntry.
6765 SmallVector<OrdersType, 1> ExternalReorderIndices;
6766
6767 // Now inspect the stores collected per pointer and look for vectorization
6768 // candidates. For each candidate calculate the reorder index vector and push
6769 // it into `ExternalReorderIndices`
6770 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6771 // If we have fewer than NumLanes stores, then we can't form a vector.
6772 if (StoresVec.size() != NumLanes)
6773 continue;
6774
6775 // If the stores are not consecutive then abandon this StoresVec.
6776 OrdersType ReorderIndices;
6777 if (!canFormVector(StoresVec, ReorderIndices))
6778 continue;
6779
6780 // We now know that the scalars in StoresVec can form a vector instruction,
6781 // so set the reorder indices.
6782 ExternalReorderIndices.push_back(ReorderIndices);
6783 }
6784 return ExternalReorderIndices;
6785}
6786
6787void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6788 const SmallDenseSet<Value *> &UserIgnoreLst) {
6789 deleteTree();
6790 UserIgnoreList = &UserIgnoreLst;
6791 if (!allSameType(Roots))
6792 return;
6793 buildTree_rec(Roots, 0, EdgeInfo());
6794}
6795
6796void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6797 deleteTree();
6798 if (!allSameType(Roots))
6799 return;
6800 buildTree_rec(Roots, 0, EdgeInfo());
6801}
6802
6803/// Tries to find a subvector of loads and builds a new vector of only loads
6804/// if it can be profitable.
6805static void gatherPossiblyVectorizableLoads(
6806 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6807 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6808 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6809 bool AddNew = true) {
6810 if (VL.empty())
6811 return;
6812 Type *ScalarTy = getValueType(VL.front());
6813 if (!isValidElementType(ScalarTy))
6814 return;
6815 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6816 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6817 for (Value *V : VL) {
6818 auto *LI = dyn_cast<LoadInst>(V);
6819 if (!LI)
6820 continue;
6821 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6822 continue;
6823 bool IsFound = false;
6824 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6825 assert(LI->getParent() == Data.front().first->getParent() &&
6826 LI->getType() == Data.front().first->getType() &&
6827 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6828 getUnderlyingObject(Data.front().first->getPointerOperand(),
6830 "Expected loads with the same type, same parent and same "
6831 "underlying pointer.");
6832 std::optional<int> Dist = getPointersDiff(
6833 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6834 Data.front().first->getPointerOperand(), DL, SE,
6835 /*StrictCheck=*/true);
6836 if (!Dist)
6837 continue;
6838 auto It = Map.find(*Dist);
6839 if (It != Map.end() && It->second != LI)
6840 continue;
6841 if (It == Map.end()) {
6842 Data.emplace_back(LI, *Dist);
6843 Map.try_emplace(*Dist, LI);
6844 }
6845 IsFound = true;
6846 break;
6847 }
6848 if (!IsFound) {
6849 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6850 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6851 }
6852 }
6853 auto FindMatchingLoads =
6856 &GatheredLoads,
6857 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6858 int &Offset, unsigned &Start) {
6859 if (Loads.empty())
6860 return GatheredLoads.end();
6862 LoadInst *LI = Loads.front().first;
6863 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6864 if (Idx < Start)
6865 continue;
6866 ToAdd.clear();
6867 if (LI->getParent() != Data.front().first->getParent() ||
6868 LI->getType() != Data.front().first->getType())
6869 continue;
6870 std::optional<int> Dist =
6872 Data.front().first->getType(),
6873 Data.front().first->getPointerOperand(), DL, SE,
6874 /*StrictCheck=*/true);
6875 if (!Dist)
6876 continue;
6877 SmallSet<int, 4> DataDists;
6879 for (std::pair<LoadInst *, int> P : Data) {
6880 DataDists.insert(P.second);
6881 DataLoads.insert(P.first);
6882 }
6883 // Found matching gathered loads - check if all loads are unique or
6884 // can be effectively vectorized.
6885 unsigned NumUniques = 0;
6886 for (auto [Cnt, Pair] : enumerate(Loads)) {
6887 bool Used = DataLoads.contains(Pair.first);
6888 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6889 ++NumUniques;
6890 ToAdd.insert(Cnt);
6891 } else if (Used) {
6892 Repeated.insert(Cnt);
6893 }
6894 }
6895 if (NumUniques > 0 &&
6896 (Loads.size() == NumUniques ||
6897 (Loads.size() - NumUniques >= 2 &&
6898 Loads.size() - NumUniques >= Loads.size() / 2 &&
6899 (has_single_bit(Data.size() + NumUniques) ||
6900 bit_ceil(Data.size()) <
6901 bit_ceil(Data.size() + NumUniques))))) {
6902 Offset = *Dist;
6903 Start = Idx + 1;
6904 return std::next(GatheredLoads.begin(), Idx);
6905 }
6906 }
6907 ToAdd.clear();
6908 return GatheredLoads.end();
6909 };
6910 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6911 unsigned Start = 0;
6912 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6913 int Offset = 0;
6914 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6915 Offset, Start);
6916 while (It != GatheredLoads.end()) {
6917 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6918 for (unsigned Idx : LocalToAdd)
6919 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6920 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6921 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6922 Start);
6923 }
6924 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6925 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6926 })) {
6927 auto AddNewLoads =
6929 for (unsigned Idx : seq<unsigned>(Data.size())) {
6930 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6931 continue;
6932 Loads.push_back(Data[Idx]);
6933 }
6934 };
6935 if (!AddNew) {
6936 LoadInst *LI = Data.front().first;
6937 It = find_if(
6938 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6939 return PD.front().first->getParent() == LI->getParent() &&
6940 PD.front().first->getType() == LI->getType();
6941 });
6942 while (It != GatheredLoads.end()) {
6943 AddNewLoads(*It);
6944 It = std::find_if(
6945 std::next(It), GatheredLoads.end(),
6946 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6947 return PD.front().first->getParent() == LI->getParent() &&
6948 PD.front().first->getType() == LI->getType();
6949 });
6950 }
6951 }
6952 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6953 AddNewLoads(GatheredLoads.emplace_back());
6954 }
6955 }
6956}
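// Illustrative example: loads of A[0], A[4] and A[1] (same block, same type,
// same underlying object) form one cluster with distances {0, 4, 1} relative
// to the first load; a load whose pointer distance cannot be computed starts
// a new cluster instead.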
6957
6958void BoUpSLP::tryToVectorizeGatheredLoads(
6959 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6960 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6961 8> &GatheredLoads) {
6962 GatheredLoadsEntriesFirst = VectorizableTree.size();
6963
6964 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6965 LoadEntriesToVectorize.size());
6966 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6967 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6968 VectorizableTree[Idx]->Scalars.end());
6969
6970 // Sort loads by distance.
6971 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6972 const std::pair<LoadInst *, int> &L2) {
6973 return L1.second > L2.second;
6974 };
6975
6976 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
6977 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6978 Loads.size());
6979 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6980 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6981 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6982 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6983 };
6984
6985 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6986 BoUpSLP::ValueSet &VectorizedLoads,
6987 SmallVectorImpl<LoadInst *> &NonVectorized,
6988 bool Final, unsigned MaxVF) {
6990 unsigned StartIdx = 0;
6991 SmallVector<int> CandidateVFs;
6992 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6993 CandidateVFs.push_back(MaxVF);
6994 for (int NumElts = getFloorFullVectorNumberOfElements(
6995 *TTI, Loads.front()->getType(), MaxVF);
6996 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6997 *TTI, Loads.front()->getType(), NumElts - 1)) {
6998 CandidateVFs.push_back(NumElts);
6999 if (VectorizeNonPowerOf2 && NumElts > 2)
7000 CandidateVFs.push_back(NumElts - 1);
7001 }
7002
7003 if (Final && CandidateVFs.empty())
7004 return Results;
7005
7006 unsigned BestVF = Final ? CandidateVFs.back() : 0;
7007 for (unsigned NumElts : CandidateVFs) {
7008 if (Final && NumElts > BestVF)
7009 continue;
7010 SmallVector<unsigned> MaskedGatherVectorized;
7011 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
7012 ++Cnt) {
7013 ArrayRef<LoadInst *> Slice =
7014 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
7015 if (VectorizedLoads.count(Slice.front()) ||
7016 VectorizedLoads.count(Slice.back()) ||
7018 continue;
7019 // Check if it is profitable to try vectorizing gathered loads. It is
7020 // profitable if we have more than 3 consecutive loads or if we have
7021 // fewer, but all their users are vectorized or deleted.
7022 bool AllowToVectorize = false;
7023 // Check if it is profitable to vectorize 2-elements loads.
7024 if (NumElts == 2) {
7025 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
7026 Slice.front()->getType(), ElementCount::getFixed(NumElts));
7027 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
7028 for (LoadInst *LI : Slice) {
7029 // If single use/user - allow to vectorize.
7030 if (LI->hasOneUse())
7031 continue;
7032 // 1. Check if number of uses equals number of users.
7033 // 2. All users are deleted.
7034 // 3. The load broadcasts are not allowed or the load is not
7035 // broadcasted.
7036 if (static_cast<unsigned int>(std::distance(
7037 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7038 return false;
7039 if (!IsLegalBroadcastLoad)
7040 continue;
7041 if (LI->hasNUsesOrMore(UsesLimit))
7042 return false;
7043 for (User *U : LI->users()) {
7044 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
7045 continue;
7046 for (const TreeEntry *UTE : getTreeEntries(U)) {
7047 for (int I : seq<int>(UTE->getNumOperands())) {
7048 if (all_of(UTE->getOperand(I), [LI](Value *V) {
7049 return V == LI || isa<PoisonValue>(V);
7050 }))
7051 // Found legal broadcast - do not vectorize.
7052 return false;
7053 }
7054 }
7055 }
7056 }
7057 return true;
7058 };
7059 AllowToVectorize = CheckIfAllowed(Slice);
7060 } else {
7061 AllowToVectorize =
7062 (NumElts >= 3 ||
7063 any_of(ValueToGatherNodes.at(Slice.front()),
7064 [=](const TreeEntry *TE) {
7065 return TE->Scalars.size() == 2 &&
7066 ((TE->Scalars.front() == Slice.front() &&
7067 TE->Scalars.back() == Slice.back()) ||
7068 (TE->Scalars.front() == Slice.back() &&
7069 TE->Scalars.back() == Slice.front()));
7070 })) &&
7071 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7072 Slice.size());
7073 }
7074 if (AllowToVectorize) {
7075 SmallVector<Value *> PointerOps;
7076 OrdersType CurrentOrder;
7077 // Try to build vector load.
7078 ArrayRef<Value *> Values(
7079 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7080 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7081 PointerOps, &BestVF);
7082 if (LS != LoadsState::Gather ||
7083 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7084 if (LS == LoadsState::ScatterVectorize) {
7085 if (MaskedGatherVectorized.empty() ||
7086 Cnt >= MaskedGatherVectorized.back() + NumElts)
7087 MaskedGatherVectorized.push_back(Cnt);
7088 continue;
7089 }
7090 if (LS != LoadsState::Gather) {
7091 Results.emplace_back(Values, LS);
7092 VectorizedLoads.insert(Slice.begin(), Slice.end());
7093 // If we vectorized initial block, no need to try to vectorize it
7094 // again.
7095 if (Cnt == StartIdx)
7096 StartIdx += NumElts;
7097 }
7098 // Check if the whole array was vectorized already - exit.
7099 if (StartIdx >= Loads.size())
7100 break;
7101 // Erase last masked gather candidate, if another candidate within
7102 // the range is found to be better.
7103 if (!MaskedGatherVectorized.empty() &&
7104 Cnt < MaskedGatherVectorized.back() + NumElts)
7105 MaskedGatherVectorized.pop_back();
7106 Cnt += NumElts - 1;
7107 continue;
7108 }
7109 }
7110 if (!AllowToVectorize || BestVF == 0)
7112 }
7113 // Mark masked gathers candidates as vectorized, if any.
7114 for (unsigned Cnt : MaskedGatherVectorized) {
7115 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7116 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7117 ArrayRef<Value *> Values(
7118 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7119 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7120 VectorizedLoads.insert(Slice.begin(), Slice.end());
7121 // If we vectorized initial block, no need to try to vectorize it again.
7122 if (Cnt == StartIdx)
7123 StartIdx += NumElts;
7124 }
7125 }
7126 for (LoadInst *LI : Loads) {
7127 if (!VectorizedLoads.contains(LI))
7128 NonVectorized.push_back(LI);
7129 }
7130 return Results;
7131 };
7132 auto ProcessGatheredLoads =
7133 [&, &TTI = *TTI](
7135 bool Final = false) {
7136 SmallVector<LoadInst *> NonVectorized;
7137 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7138 if (LoadsDists.size() <= 1) {
7139 NonVectorized.push_back(LoadsDists.back().first);
7140 continue;
7141 }
7142 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7143 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7144 transform(LoadsDists, OriginalLoads.begin(),
7145 [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
7146 return L.first;
7147 });
7148 stable_sort(LocalLoadsDists, LoadSorter);
7150 unsigned MaxConsecutiveDistance = 0;
7151 unsigned CurrentConsecutiveDist = 1;
7152 int LastDist = LocalLoadsDists.front().second;
7153 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7154 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7155 if (isVectorized(L.first))
7156 continue;
7157 assert(LastDist >= L.second &&
7158 "Expected first distance always not less than second");
7159 if (static_cast<unsigned>(LastDist - L.second) ==
7160 CurrentConsecutiveDist) {
7161 ++CurrentConsecutiveDist;
7162 MaxConsecutiveDistance =
7163 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7164 Loads.push_back(L.first);
7165 continue;
7166 }
7167 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7168 !Loads.empty())
7169 Loads.pop_back();
7170 CurrentConsecutiveDist = 1;
7171 LastDist = L.second;
7172 Loads.push_back(L.first);
7173 }
7174 if (Loads.size() <= 1)
7175 continue;
7176 if (AllowMaskedGather)
7177 MaxConsecutiveDistance = Loads.size();
7178 else if (MaxConsecutiveDistance < 2)
7179 continue;
7180 BoUpSLP::ValueSet VectorizedLoads;
7181 SmallVector<LoadInst *> SortedNonVectorized;
7183 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7184 Final, MaxConsecutiveDistance);
7185 if (!Results.empty() && !SortedNonVectorized.empty() &&
7186 OriginalLoads.size() == Loads.size() &&
7187 MaxConsecutiveDistance == Loads.size() &&
7189 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7190 return P.second == LoadsState::ScatterVectorize;
7191 })) {
7192 VectorizedLoads.clear();
7193 SmallVector<LoadInst *> UnsortedNonVectorized;
7195 UnsortedResults =
7196 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7197 UnsortedNonVectorized, Final,
7198 OriginalLoads.size());
7199 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7200 SortedNonVectorized.swap(UnsortedNonVectorized);
7201 Results.swap(UnsortedResults);
7202 }
7203 }
7204 for (auto [Slice, _] : Results) {
7205 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7206 << Slice.size() << ")\n");
7207 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
7208 for (Value *L : Slice)
7209 if (!isVectorized(L))
7210 SortedNonVectorized.push_back(cast<LoadInst>(L));
7211 continue;
7212 }
7213
7214 // Select the maximum VF as the maximum over the user gathered nodes
7215 // and the distance between scalar loads in these nodes.
7216 unsigned MaxVF = Slice.size();
7217 unsigned UserMaxVF = 0;
7218 unsigned InterleaveFactor = 0;
7219 if (MaxVF == 2) {
7220 UserMaxVF = MaxVF;
7221 } else {
7222 // Find the distance between segments of the interleaved loads.
7223 std::optional<unsigned> InterleavedLoadsDistance = 0;
7224 unsigned Order = 0;
7225 std::optional<unsigned> CommonVF = 0;
7227 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7228 for (auto [Idx, V] : enumerate(Slice)) {
7229 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7230 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7231 unsigned Pos =
7232 EntryToPosition.try_emplace(E, Idx).first->second;
7233 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7234 if (CommonVF) {
7235 if (*CommonVF == 0) {
7236 CommonVF = E->Scalars.size();
7237 continue;
7238 }
7239 if (*CommonVF != E->Scalars.size())
7240 CommonVF.reset();
7241 }
7242 // Check if the load is the part of the interleaved load.
7243 if (Pos != Idx && InterleavedLoadsDistance) {
7244 if (!DeinterleavedNodes.contains(E) &&
7245 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7246 if (isa<Constant>(V))
7247 return false;
7248 if (isVectorized(V))
7249 return true;
7250 const auto &Nodes = ValueToGatherNodes.at(V);
7251 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7252 !is_contained(Slice, V);
7253 })) {
7254 InterleavedLoadsDistance.reset();
7255 continue;
7256 }
7257 DeinterleavedNodes.insert(E);
7258 if (*InterleavedLoadsDistance == 0) {
7259 InterleavedLoadsDistance = Idx - Pos;
7260 continue;
7261 }
7262 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7263 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7264 InterleavedLoadsDistance.reset();
7265 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7266 }
7267 }
7268 }
7269 DeinterleavedNodes.clear();
7270 // Check if the large load represents an interleaved load operation.
7271 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7272 CommonVF.value_or(0) != 0) {
7273 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7274 unsigned VF = *CommonVF;
7275 OrdersType Order;
7276 SmallVector<Value *> PointerOps;
7277 // Segmented load detected - vectorize at maximum vector factor.
7278 if (InterleaveFactor <= Slice.size() &&
7280 getWidenedType(Slice.front()->getType(), VF),
7281 InterleaveFactor,
7282 cast<LoadInst>(Slice.front())->getAlign(),
7283 cast<LoadInst>(Slice.front())
7285 canVectorizeLoads(Slice, Slice.front(), Order,
7286 PointerOps) == LoadsState::Vectorize) {
7287 UserMaxVF = InterleaveFactor * VF;
7288 } else {
7289 InterleaveFactor = 0;
7290 }
7291 }
7292 // Cannot represent the loads as consecutive vectorizable nodes -
7293 // just exit.
7294 unsigned ConsecutiveNodesSize = 0;
7295 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7296 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7297 [&, Slice = Slice](const auto &P) {
7298 const auto *It = find_if(Slice, [&](Value *V) {
7299 return std::get<1>(P).contains(V);
7300 });
7301 if (It == Slice.end())
7302 return false;
7304 VectorizableTree[std::get<0>(P)]->Scalars;
7305 ConsecutiveNodesSize += VL.size();
7306 unsigned Start = std::distance(Slice.begin(), It);
7307 unsigned Sz = Slice.size() - Start;
7308 return Sz < VL.size() ||
7309 Slice.slice(std::distance(Slice.begin(), It),
7310 VL.size()) != VL;
7311 }))
7312 continue;
7313 // Try to build long masked gather loads.
7314 UserMaxVF = bit_ceil(UserMaxVF);
7315 if (InterleaveFactor == 0 &&
7316 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7317 [&, Slice = Slice](unsigned Idx) {
7318 OrdersType Order;
7319 SmallVector<Value *> PointerOps;
7320 return canVectorizeLoads(
7321 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7322 Slice[Idx * UserMaxVF], Order,
7323 PointerOps) ==
7324 LoadsState::ScatterVectorize;
7325 }))
7326 UserMaxVF = MaxVF;
7327 if (Slice.size() != ConsecutiveNodesSize)
7328 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7329 }
7330 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7331 bool IsVectorized = true;
7332 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7333 ArrayRef<Value *> SubSlice =
7334 Slice.slice(I, std::min(VF, E - I));
7335 if (isVectorized(SubSlice.front()))
7336 continue;
7337 // Check if the subslice is a to-be-vectorized entry that is not
7338 // equal to the entry.
7339 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7340 [&](const auto &P) {
7341 return !SubSlice.equals(
7342 VectorizableTree[std::get<0>(P)]
7343 ->Scalars) &&
7344 set_is_subset(SubSlice, std::get<1>(P));
7345 }))
7346 continue;
7347 unsigned Sz = VectorizableTree.size();
7348 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7349 if (Sz == VectorizableTree.size()) {
7350 IsVectorized = false;
7351 // Try non-interleaved vectorization with smaller vector
7352 // factor.
7353 if (InterleaveFactor > 0) {
7354 VF = 2 * (MaxVF / InterleaveFactor);
7355 InterleaveFactor = 0;
7356 }
7357 continue;
7358 }
7359 }
7360 if (IsVectorized)
7361 break;
7362 }
7363 }
7364 NonVectorized.append(SortedNonVectorized);
7365 }
7366 return NonVectorized;
7367 };
7368 for (const auto &GLs : GatheredLoads) {
7369 const auto &Ref = GLs.second;
7370 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7371 if (!Ref.empty() && !NonVectorized.empty() &&
7372 std::accumulate(
7373 Ref.begin(), Ref.end(), 0u,
7374 [](unsigned S,
7375 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
7376 return S + LoadsDists.size();
7377 }) != NonVectorized.size() &&
7378 IsMaskedGatherSupported(NonVectorized)) {
7380 for (LoadInst *LI : NonVectorized) {
7381 // Reinsert non-vectorized loads into another list of loads with the
7382 // same base pointers.
7383 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7384 FinalGatheredLoads,
7385 /*AddNew=*/false);
7386 }
7387 // Final attempt to vectorize non-vectorized loads.
7388 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7389 }
7390 }
7391 // Try to vectorize postponed load entries, previously marked as gathered.
7392 for (unsigned Idx : LoadEntriesToVectorize) {
7393 const TreeEntry &E = *VectorizableTree[Idx];
7394 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7395 // Avoid reordering, if possible.
7396 if (!E.ReorderIndices.empty()) {
7397 // Build a mask out of the reorder indices and reorder scalars per this
7398 // mask.
7399 SmallVector<int> ReorderMask;
7400 inversePermutation(E.ReorderIndices, ReorderMask);
7401 reorderScalars(GatheredScalars, ReorderMask);
7402 }
7403 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7404 }
7405 // If no new entries were created, consider that no gathered load entries
7406 // need to be handled.
7407 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7408 VectorizableTree.size())
7409 GatheredLoadsEntriesFirst.reset();
7410}
7411
7412/// \return true if the specified list of values has only one instruction that
7413/// requires scheduling, false otherwise.
7414#ifndef NDEBUG
7416 Value *NeedsScheduling = nullptr;
7417 for (Value *V : VL) {
7419 continue;
7420 if (!NeedsScheduling) {
7421 NeedsScheduling = V;
7422 continue;
7423 }
7424 return false;
7425 }
7426 return NeedsScheduling;
7427}
7428#endif
7429
7430/// Generates a key/subkey pair for the given value to provide effective
7431/// sorting of the values and better detection of vectorizable value
7432/// sequences. The keys/subkeys can be used for better sorting of the values
7433/// themselves (keys) and within value subgroups (subkeys).
7434static std::pair<size_t, size_t> generateKeySubkey(
7435 Value *V, const TargetLibraryInfo *TLI,
7436 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7437 bool AllowAlternate) {
7438 hash_code Key = hash_value(V->getValueID() + 2);
7439 hash_code SubKey = hash_value(0);
7440 // Sort the loads by the distance between the pointers.
7441 if (auto *LI = dyn_cast<LoadInst>(V)) {
7442 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7443 if (LI->isSimple())
7444 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7445 else
7446 Key = SubKey = hash_value(LI);
7447 } else if (isVectorLikeInstWithConstOps(V)) {
7448 // Sort extracts by the vector operands.
7449 if (isa<ExtractElementInst, UndefValue>(V))
7450 Key = hash_value(Value::UndefValueVal + 1);
7451 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7452 if (!isUndefVector(EI->getVectorOperand()).all() &&
7453 !isa<UndefValue>(EI->getIndexOperand()))
7454 SubKey = hash_value(EI->getVectorOperand());
7455 }
7456 } else if (auto *I = dyn_cast<Instruction>(V)) {
7457 // Sort other instructions just by the opcodes except for CMPInst.
7458 // For CMP also sort by the predicate kind.
7459 if ((isa<BinaryOperator, CastInst>(I)) &&
7460 isValidForAlternation(I->getOpcode())) {
7461 if (AllowAlternate)
7462 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7463 else
7464 Key = hash_combine(hash_value(I->getOpcode()), Key);
7465 SubKey = hash_combine(
7466 hash_value(I->getOpcode()), hash_value(I->getType()),
7467 hash_value(isa<BinaryOperator>(I)
7468 ? I->getType()
7469 : cast<CastInst>(I)->getOperand(0)->getType()));
7470 // For casts, look through the only operand to improve compile time.
7471 if (isa<CastInst>(I)) {
7472 std::pair<size_t, size_t> OpVals =
7473 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7474 /*AllowAlternate=*/true);
7475 Key = hash_combine(OpVals.first, Key);
7476 SubKey = hash_combine(OpVals.first, SubKey);
7477 }
7478 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7479 CmpInst::Predicate Pred = CI->getPredicate();
7480 if (CI->isCommutative())
7481 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7483 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7484 hash_value(SwapPred),
7485 hash_value(CI->getOperand(0)->getType()));
7486 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7489 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7490 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7491 SubKey = hash_combine(hash_value(I->getOpcode()),
7492 hash_value(Call->getCalledFunction()));
7493 } else {
7494 Key = hash_combine(hash_value(Call), Key);
7495 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7496 }
7497 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7498 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7499 hash_value(Op.Tag), SubKey);
7500 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7501 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7502 SubKey = hash_value(Gep->getPointerOperand());
7503 else
7504 SubKey = hash_value(Gep);
7505 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7506 !isa<ConstantInt>(I->getOperand(1))) {
7507 // Do not try to vectorize instructions with potentially high cost.
7508 SubKey = hash_value(I);
7509 } else {
7510 SubKey = hash_value(I->getOpcode());
7511 }
7512 Key = hash_combine(hash_value(I->getParent()), Key);
7513 }
7514 return std::make_pair(Key, SubKey);
7515}
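// Note (illustrative): e.g. an integer division/remainder whose divisor is
// not a constant gets a subkey derived from the instruction itself (see the
// isIntDivRem branch above), so it effectively forms its own subgroup and is
// not paired with other divisions.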
7516
7517/// Checks if the specified instruction \p I is an alternate operation for
7518/// the given \p MainOp and \p AltOp instructions.
7519static bool isAlternateInstruction(const Instruction *I,
7520 const Instruction *MainOp,
7521 const Instruction *AltOp,
7522 const TargetLibraryInfo &TLI);
7523
7524bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7525 ArrayRef<Value *> VL) const {
7526 unsigned Opcode0 = S.getOpcode();
7527 unsigned Opcode1 = S.getAltOpcode();
7528 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7529 // If this pattern is supported by the target then consider it profitable.
7530 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7531 Opcode0, Opcode1, OpcodeMask))
7532 return true;
7534 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7535 Operands.emplace_back();
7536 // Prepare the operand vector.
7537 for (Value *V : VL) {
7538 if (isa<PoisonValue>(V)) {
7539 Operands.back().push_back(
7540 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7541 continue;
7542 }
7543 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7544 }
7545 }
7546 if (Operands.size() == 2) {
7547 // Try to find the best operand candidates.
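    // Candidates[0] keeps the current pairing of lanes I and I+1, while
    // Candidates[1] and Candidates[2] model swapping the operands of lane I+1
    // or lane I, respectively; the look-ahead score below picks the cheapest
    // variant.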
7548 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7549 SmallVector<std::pair<Value *, Value *>> Candidates(3);
7550 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7551 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7552 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7553 std::optional<int> Res = findBestRootPair(Candidates);
7554 switch (Res.value_or(0)) {
7555 case 0:
7556 break;
7557 case 1:
7558 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7559 break;
7560 case 2:
7561 std::swap(Operands[0][I], Operands[1][I]);
7562 break;
7563 default:
7564 llvm_unreachable("Unexpected index.");
7565 }
7566 }
7567 }
7568 DenseSet<unsigned> UniqueOpcodes;
7569 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7570 unsigned NonInstCnt = 0;
7571 // Estimate number of instructions, required for the vectorized node and for
7572 // the buildvector node.
7573 unsigned UndefCnt = 0;
7574 // Count the number of extra shuffles, required for vector nodes.
7575 unsigned ExtraShuffleInsts = 0;
7576 // Check that the operands do not contain the same values, and form either a
7577 // perfect diamond match or a shuffled match.
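  // E.g. a perfect diamond is VL = { a+b, a-b }, where both lanes use the same
  // operand lists, so the duplicated operand list is only counted once; if the
  // first operand list merely reappears (possibly permuted) in the second one,
  // it is also counted once but costs one extra shuffle.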
7578 if (Operands.size() == 2) {
7579 // Do not count same operands twice.
7580 if (Operands.front() == Operands.back()) {
7581 Operands.erase(Operands.begin());
7582 } else if (!allConstant(Operands.front()) &&
7583 all_of(Operands.front(), [&](Value *V) {
7584 return is_contained(Operands.back(), V);
7585 })) {
7586 Operands.erase(Operands.begin());
7587 ++ExtraShuffleInsts;
7588 }
7589 }
7590 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7591 // Vectorize the node if:
7592 // 1. At least one operand is constant or a splat.
7593 // 2. The operands contain many loop-invariant values (while the
7594 //    instructions themselves are not loop invariant).
7595 // 3. At least one unique operand is supposed to be vectorized.
7596 return none_of(Operands,
7597 [&](ArrayRef<Value *> Op) {
7598 if (allConstant(Op) ||
7599 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7600 getSameOpcode(Op, *TLI)))
7601 return false;
7602 DenseMap<Value *, unsigned> Uniques;
7603 for (Value *V : Op) {
7604 if (isa<Constant, ExtractElementInst>(V) ||
7605 isVectorized(V) || (L && L->isLoopInvariant(V))) {
7606 if (isa<UndefValue>(V))
7607 ++UndefCnt;
7608 continue;
7609 }
7610 auto Res = Uniques.try_emplace(V, 0);
7611 // Found first duplicate - need to add shuffle.
7612 if (!Res.second && Res.first->second == 1)
7613 ++ExtraShuffleInsts;
7614 ++Res.first->getSecond();
7615 if (auto *I = dyn_cast<Instruction>(V))
7616 UniqueOpcodes.insert(I->getOpcode());
7617 else if (Res.second)
7618 ++NonInstCnt;
7619 }
7620 return none_of(Uniques, [&](const auto &P) {
7621 return P.first->hasNUsesOrMore(P.second + 1) &&
7622 none_of(P.first->users(), [&](User *U) {
7623 return isVectorized(U) || Uniques.contains(U);
7624 });
7625 });
7626 }) ||
7627 // Do not vectorize node, if estimated number of vector instructions is
7628 // more than estimated number of buildvector instructions. Number of
7629 // vector operands is number of vector instructions + number of vector
7630 // instructions for operands (buildvectors). Number of buildvector
7631 // instructions is just number_of_operands * number_of_scalars.
7632 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7633 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7634 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7635}
7636
7637BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7638 const InstructionsState &S, ArrayRef<Value *> VL,
7639 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7640 SmallVectorImpl<Value *> &PointerOps) {
7641 assert(S.getMainOp() &&
7642 "Expected instructions with same/alternate opcodes only.");
7643
7644 unsigned ShuffleOrOp =
7645 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7646 Instruction *VL0 = S.getMainOp();
7647 switch (ShuffleOrOp) {
7648 case Instruction::PHI: {
7649 // Too many operands - gather, most probably won't be vectorized.
7650 if (VL0->getNumOperands() > MaxPHINumOperands)
7651 return TreeEntry::NeedToGather;
7652 // Check for terminator values (e.g. invoke).
7653 for (Value *V : VL) {
7654 auto *PHI = dyn_cast<PHINode>(V);
7655 if (!PHI)
7656 continue;
7657 for (Value *Incoming : PHI->incoming_values()) {
7658 Instruction *Term = dyn_cast<Instruction>(Incoming);
7659 if (Term && Term->isTerminator()) {
7660 LLVM_DEBUG(dbgs()
7661 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7662 return TreeEntry::NeedToGather;
7663 }
7664 }
7665 }
7666
7667 return TreeEntry::Vectorize;
7668 }
7669 case Instruction::ExtractValue:
7670 case Instruction::ExtractElement: {
7671 bool Reuse = canReuseExtract(VL, CurrentOrder);
7672 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7673 // non-full registers).
7674 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
7675 return TreeEntry::NeedToGather;
7676 if (Reuse || !CurrentOrder.empty())
7677 return TreeEntry::Vectorize;
7678 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7679 return TreeEntry::NeedToGather;
7680 }
7681 case Instruction::InsertElement: {
7682 // Check that we have a buildvector and not a shuffle of 2 or more
7683 // different vectors.
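      // E.g. an accepted buildvector chain looks like
      //   %v0 = insertelement <4 x float> poison, float %a, i32 0
      //   %v1 = insertelement <4 x float> %v0, float %b, i32 1
      //   %v2 = insertelement <4 x float> %v1, float %c, i32 2
      //   %v3 = insertelement <4 x float> %v2, float %d, i32 3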
7684 ValueSet SourceVectors;
7685 for (Value *V : VL) {
7686 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7687 assert(getElementIndex(V) != std::nullopt &&
7688 "Non-constant or undef index?");
7689 }
7690
7691 if (count_if(VL, [&SourceVectors](Value *V) {
7692 return !SourceVectors.contains(V);
7693 }) >= 2) {
7694 // Found 2nd source vector - cancel.
7695 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7696 "different source vectors.\n");
7697 return TreeEntry::NeedToGather;
7698 }
7699
7700 if (any_of(VL, [&SourceVectors](Value *V) {
7701 // The last InsertElement can have multiple uses.
7702 return SourceVectors.contains(V) && !V->hasOneUse();
7703 })) {
7704 assert(SLPReVec && "Only supported by REVEC.");
7705 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7706 "multiple uses.\n");
7707 return TreeEntry::NeedToGather;
7708 }
7709
7710 return TreeEntry::Vectorize;
7711 }
7712 case Instruction::Load: {
7713 // Check that a vectorized load would load the same memory as a scalar
7714 // load. For example, we don't want to vectorize loads that are smaller
7715 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7716 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7717 // from such a struct, we read/write packed bits disagreeing with the
7718 // unvectorized version.
7719 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7720 case LoadsState::Vectorize:
7721 return TreeEntry::Vectorize;
7722 case LoadsState::ScatterVectorize:
7723 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7724 // Delay slow vectorized nodes for better vectorization attempts.
7725 LoadEntriesToVectorize.insert(VectorizableTree.size());
7726 return TreeEntry::NeedToGather;
7727 }
7728 return TreeEntry::ScatterVectorize;
7729 case LoadsState::StridedVectorize:
7730 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7731 // Delay slow vectorized nodes for better vectorization attempts.
7732 LoadEntriesToVectorize.insert(VectorizableTree.size());
7733 return TreeEntry::NeedToGather;
7734 }
7735 return TreeEntry::StridedVectorize;
7736 case LoadsState::Gather:
7737#ifndef NDEBUG
7738 Type *ScalarTy = VL0->getType();
7739 if (DL->getTypeSizeInBits(ScalarTy) !=
7740 DL->getTypeAllocSizeInBits(ScalarTy))
7741 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7742 else if (any_of(VL, [](Value *V) {
7743 auto *LI = dyn_cast<LoadInst>(V);
7744 return !LI || !LI->isSimple();
7745 }))
7746 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7747 else
7748 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7749#endif // NDEBUG
7750 registerNonVectorizableLoads(VL);
7751 return TreeEntry::NeedToGather;
7752 }
7753 llvm_unreachable("Unexpected state of loads");
7754 }
7755 case Instruction::ZExt:
7756 case Instruction::SExt:
7757 case Instruction::FPToUI:
7758 case Instruction::FPToSI:
7759 case Instruction::FPExt:
7760 case Instruction::PtrToInt:
7761 case Instruction::IntToPtr:
7762 case Instruction::SIToFP:
7763 case Instruction::UIToFP:
7764 case Instruction::Trunc:
7765 case Instruction::FPTrunc:
7766 case Instruction::BitCast: {
7767 Type *SrcTy = VL0->getOperand(0)->getType();
7768 for (Value *V : VL) {
7769 if (isa<PoisonValue>(V))
7770 continue;
7771 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7772 if (Ty != SrcTy || !isValidElementType(Ty)) {
7773 LLVM_DEBUG(
7774 dbgs() << "SLP: Gathering casts with different src types.\n");
7775 return TreeEntry::NeedToGather;
7776 }
7777 }
7778 return TreeEntry::Vectorize;
7779 }
7780 case Instruction::ICmp:
7781 case Instruction::FCmp: {
7782 // Check that all of the compares have the same predicate.
7783 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7784 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7785 Type *ComparedTy = VL0->getOperand(0)->getType();
7786 for (Value *V : VL) {
7787 if (isa<PoisonValue>(V))
7788 continue;
7789 auto *Cmp = cast<CmpInst>(V);
7790 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7791 Cmp->getOperand(0)->getType() != ComparedTy) {
7792 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7793 return TreeEntry::NeedToGather;
7794 }
7795 }
7796 return TreeEntry::Vectorize;
7797 }
7798 case Instruction::Select:
7799 case Instruction::FNeg:
7800 case Instruction::Add:
7801 case Instruction::FAdd:
7802 case Instruction::Sub:
7803 case Instruction::FSub:
7804 case Instruction::Mul:
7805 case Instruction::FMul:
7806 case Instruction::UDiv:
7807 case Instruction::SDiv:
7808 case Instruction::FDiv:
7809 case Instruction::URem:
7810 case Instruction::SRem:
7811 case Instruction::FRem:
7812 case Instruction::Shl:
7813 case Instruction::LShr:
7814 case Instruction::AShr:
7815 case Instruction::And:
7816 case Instruction::Or:
7817 case Instruction::Xor:
7818 case Instruction::Freeze:
7819 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7820 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7821 auto *I = dyn_cast<Instruction>(V);
7822 return I && I->isBinaryOp() && !I->isFast();
7823 }))
7824 return TreeEntry::NeedToGather;
7825 return TreeEntry::Vectorize;
7826 case Instruction::GetElementPtr: {
7827 // We don't combine GEPs with complicated (nested) indexing.
7828 for (Value *V : VL) {
7829 auto *I = dyn_cast<GetElementPtrInst>(V);
7830 if (!I)
7831 continue;
7832 if (I->getNumOperands() != 2) {
7833 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7834 return TreeEntry::NeedToGather;
7835 }
7836 }
7837
7838 // We can't combine several GEPs into one vector if they operate on
7839 // different types.
7840 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7841 for (Value *V : VL) {
7842 auto *GEP = dyn_cast<GEPOperator>(V);
7843 if (!GEP)
7844 continue;
7845 Type *CurTy = GEP->getSourceElementType();
7846 if (Ty0 != CurTy) {
7847 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7848 return TreeEntry::NeedToGather;
7849 }
7850 }
7851
7852 // We don't combine GEPs with non-constant indexes.
7853 Type *Ty1 = VL0->getOperand(1)->getType();
7854 for (Value *V : VL) {
7855 auto *I = dyn_cast<GetElementPtrInst>(V);
7856 if (!I)
7857 continue;
7858 auto *Op = I->getOperand(1);
7859 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7860 (Op->getType() != Ty1 &&
7861 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7862 Op->getType()->getScalarSizeInBits() >
7863 DL->getIndexSizeInBits(
7864 V->getType()->getPointerAddressSpace())))) {
7865 LLVM_DEBUG(
7866 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7867 return TreeEntry::NeedToGather;
7868 }
7869 }
7870
7871 return TreeEntry::Vectorize;
7872 }
7873 case Instruction::Store: {
7874 // Check if the stores are consecutive or if we need to swizzle them.
7875 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7876 // Avoid types that are padded when being allocated as scalars, while
7877 // being packed together in a vector (such as i1).
7878 if (DL->getTypeSizeInBits(ScalarTy) !=
7879 DL->getTypeAllocSizeInBits(ScalarTy)) {
7880 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7881 return TreeEntry::NeedToGather;
7882 }
7883 // Make sure all stores in the bundle are simple - we can't vectorize
7884 // atomic or volatile stores.
7885 for (Value *V : VL) {
7886 auto *SI = cast<StoreInst>(V);
7887 if (!SI->isSimple()) {
7888 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7889 return TreeEntry::NeedToGather;
7890 }
7891 PointerOps.push_back(SI->getPointerOperand());
7892 }
7893
7894 // Check the order of pointer operands.
7895 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7896 Value *Ptr0;
7897 Value *PtrN;
7898 if (CurrentOrder.empty()) {
7899 Ptr0 = PointerOps.front();
7900 PtrN = PointerOps.back();
7901 } else {
7902 Ptr0 = PointerOps[CurrentOrder.front()];
7903 PtrN = PointerOps[CurrentOrder.back()];
7904 }
7905 std::optional<int> Dist =
7906 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7907 // Check that the sorted pointer operands are consecutive.
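      // E.g. for 4 consecutive stores of i32 the distance between the first
      // and the last (sorted) pointer must be exactly 3 elements.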
7908 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7909 return TreeEntry::Vectorize;
7910 }
7911
7912 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7913 return TreeEntry::NeedToGather;
7914 }
7915 case Instruction::Call: {
7916 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7917 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7918 auto *I = dyn_cast<Instruction>(V);
7919 return I && !I->isFast();
7920 }))
7921 return TreeEntry::NeedToGather;
7922 // Check if the calls are all to the same vectorizable intrinsic or
7923 // library function.
7924 CallInst *CI = cast<CallInst>(VL0);
7925 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7926
7927 VFShape Shape = VFShape::get(
7928 CI->getFunctionType(),
7929 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7930 false /*HasGlobalPred*/);
7931 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7932
7933 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7934 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7935 return TreeEntry::NeedToGather;
7936 }
7937 Function *F = CI->getCalledFunction();
7938 unsigned NumArgs = CI->arg_size();
7939 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7940 for (unsigned J = 0; J != NumArgs; ++J)
7941 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
7942 ScalarArgs[J] = CI->getArgOperand(J);
7943 for (Value *V : VL) {
7944 CallInst *CI2 = dyn_cast<CallInst>(V);
7945 if (!CI2 || CI2->getCalledFunction() != F ||
7946 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7947 (VecFunc &&
7948 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7949 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7950 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7951 << "\n");
7952 return TreeEntry::NeedToGather;
7953 }
7954 // Some intrinsics have scalar arguments and should be same in order for
7955 // them to be vectorized.
7956 for (unsigned J = 0; J != NumArgs; ++J) {
7957 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
7958 Value *A1J = CI2->getArgOperand(J);
7959 if (ScalarArgs[J] != A1J) {
7960 LLVM_DEBUG(dbgs()
7961 << "SLP: mismatched arguments in call:" << *CI
7962 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7963 return TreeEntry::NeedToGather;
7964 }
7965 }
7966 }
7967 // Verify that the bundle operands are identical between the two calls.
7968 if (CI->hasOperandBundles() &&
7969 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7970 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7971 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7972 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7973 << "!=" << *V << '\n');
7974 return TreeEntry::NeedToGather;
7975 }
7976 }
7977
7978 return TreeEntry::Vectorize;
7979 }
7980 case Instruction::ShuffleVector: {
7981 if (!S.isAltShuffle()) {
7982 // REVEC can support non-alternate shuffles.
7983 if (SLPReVec)
7984 return TreeEntry::Vectorize;
7985 // If this is not an alternate sequence of opcodes like add-sub
7986 // then do not vectorize this instruction.
7987 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7988 return TreeEntry::NeedToGather;
7989 }
7990 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7991 LLVM_DEBUG(
7992 dbgs()
7993 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7994 "the whole alt sequence is not profitable.\n");
7995 return TreeEntry::NeedToGather;
7996 }
7997
7998 return TreeEntry::Vectorize;
7999 }
8000 default:
8001 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
8002 return TreeEntry::NeedToGather;
8003 }
8004}
8005
8006namespace {
8007 /// Allows correct handling of the operands of PHI nodes, based on the order
8008 /// of incoming basic blocks/values of the \p Main PHINode.
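/// E.g. if \p Main receives its incoming values from (BB0, BB1) while another
/// phi in the bundle lists the same blocks as (BB1, BB0), the handler still
/// pairs every lane's value with the correct incoming block, so the per-block
/// operand vectors stay consistent.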
8009class PHIHandler {
8010 DominatorTree &DT;
8011 PHINode *Main = nullptr;
8012 SmallVector<Value *> Phis;
8013 SmallVector<SmallVector<Value *>> Operands;
8014
8015public:
8016 PHIHandler() = delete;
8017 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
8018 : DT(DT), Main(Main), Phis(Phis),
8019 Operands(Main->getNumIncomingValues(),
8020 SmallVector<Value *>(Phis.size(), nullptr)) {}
8021 void buildOperands() {
8022 constexpr unsigned FastLimit = 4;
8023 if (Main->getNumIncomingValues() <= FastLimit) {
8024 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
8025 BasicBlock *InBB = Main->getIncomingBlock(I);
8026 if (!DT.isReachableFromEntry(InBB)) {
8027 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
8028 continue;
8029 }
8030 // Prepare the operand vector.
8031 for (auto [Idx, V] : enumerate(Phis)) {
8032 auto *P = dyn_cast<PHINode>(V);
8033 if (!P) {
8034 assert(isa<PoisonValue>(V) &&
8035 "Expected isa instruction or poison value.");
8036 Operands[I][Idx] = V;
8037 continue;
8038 }
8039 if (P->getIncomingBlock(I) == InBB)
8040 Operands[I][Idx] = P->getIncomingValue(I);
8041 else
8042 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
8043 }
8044 }
8045 return;
8046 }
8047 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
8048 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
8049 BasicBlock *InBB = Main->getIncomingBlock(I);
8050 if (!DT.isReachableFromEntry(InBB)) {
8051 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
8052 continue;
8053 }
8054 Blocks.try_emplace(InBB).first->second.push_back(I);
8055 }
8056 for (auto [Idx, V] : enumerate(Phis)) {
8057 if (isa<PoisonValue>(V)) {
8058 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
8059 Operands[I][Idx] = V;
8060 continue;
8061 }
8062 auto *P = cast<PHINode>(V);
8063 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
8064 BasicBlock *InBB = P->getIncomingBlock(I);
8065 if (InBB == Main->getIncomingBlock(I)) {
8066 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8067 continue;
8068 Operands[I][Idx] = P->getIncomingValue(I);
8069 continue;
8070 }
8071 auto It = Blocks.find(InBB);
8072 if (It == Blocks.end())
8073 continue;
8074 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
8075 }
8076 }
8077 for (const auto &P : Blocks) {
8078 if (P.getSecond().size() <= 1)
8079 continue;
8080 unsigned BasicI = P.getSecond().front();
8081 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
8082 assert(all_of(enumerate(Operands[I]),
8083 [&](const auto &Data) {
8084 return !Data.value() ||
8085 Data.value() == Operands[BasicI][Data.index()];
8086 }) &&
8087 "Expected empty operands list.");
8088 Operands[I] = Operands[BasicI];
8089 }
8090 }
8091 }
8092 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
8093};
8094} // namespace
8095
8096void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8097 const EdgeInfo &UserTreeIdx,
8098 unsigned InterleaveFactor) {
8099 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8100
8101 SmallVector<int> ReuseShuffleIndices;
8102 SmallVector<Value *> UniqueValues;
8103 SmallVector<Value *> NonUniqueValueVL;
8104 auto TryToFindDuplicates = [&](const InstructionsState &S,
8105 bool DoNotFail = false) {
8106 // Check that every instruction appears once in this bundle.
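    // E.g. VL = { %a, %b, %a, %c } is narrowed to UniqueValues = { %a, %b, %c }
    // with ReuseShuffleIndices = { 0, 1, 0, 2 }.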
8107 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8108 for (Value *V : VL) {
8109 if (isConstant(V)) {
8110 ReuseShuffleIndices.emplace_back(
8111 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8112 UniqueValues.emplace_back(V);
8113 continue;
8114 }
8115 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8116 ReuseShuffleIndices.emplace_back(Res.first->second);
8117 if (Res.second)
8118 UniqueValues.emplace_back(V);
8119 }
8120 size_t NumUniqueScalarValues = UniqueValues.size();
8121 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8122 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8123 if (NumUniqueScalarValues == VL.size() &&
8124 (VectorizeNonPowerOf2 || IsFullVectors)) {
8125 ReuseShuffleIndices.clear();
8126 } else {
8127 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8128 if ((UserTreeIdx.UserTE &&
8129 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8130 !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
8131 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8132 "for nodes with padding.\n");
8133 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8134 return false;
8135 }
8136 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8137 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8138 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8139 return isa<UndefValue>(V) || !isConstant(V);
8140 }))) {
8141 if (DoNotFail && UniquePositions.size() > 1 &&
8142 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8143 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8144 // Find the number of elements, which forms full vectors.
8145 unsigned PWSz = getFullVectorNumberOfElements(
8146 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8147 if (PWSz == VL.size()) {
8148 ReuseShuffleIndices.clear();
8149 } else {
8150 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8151 NonUniqueValueVL.append(
8152 PWSz - UniqueValues.size(),
8153 PoisonValue::get(UniqueValues.front()->getType()));
8154 // Check that the operations extended with poison are still valid for
8155 // vectorization (div/rem are not allowed).
8156 if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
8157 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8158 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8159 return false;
8160 }
8161 VL = NonUniqueValueVL;
8162 }
8163 return true;
8164 }
8165 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8166 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8167 return false;
8168 }
8169 VL = UniqueValues;
8170 }
8171 return true;
8172 };
8173
8174 InstructionsState S = getSameOpcode(VL, *TLI);
8175
8176 // Don't go into catchswitch blocks, which can happen with PHIs.
8177 // Such blocks can only have PHIs and the catchswitch. There is no
8178 // place to insert a shuffle if we need to, so just avoid that issue.
8179 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8180 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8181 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8182 return;
8183 }
8184
8185 // Check if this is a duplicate of another entry.
8186 if (S) {
8187 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
8188 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
8189 if (E->isSame(VL)) {
8190 // Record the reuse of the tree node.
8191 E->UserTreeIndices.push_back(UserTreeIdx);
8192 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8193 << ".\n");
8194 return;
8195 }
8196 SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end());
8197 if (all_of(VL, [&](Value *V) {
8198 return isa<PoisonValue>(V) || Values.contains(V);
8199 })) {
8200 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8201 if (TryToFindDuplicates(S))
8202 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8203 ReuseShuffleIndices);
8204 return;
8205 }
8206 }
8207 }
8208
8209 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8210 // a load), in which case peek through to include it in the tree, without
8211 // ballooning over-budget.
8212 if (Depth >= RecursionMaxDepth &&
8213 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8214 (match(S.getMainOp(), m_Load(m_Value())) ||
8215 all_of(VL, [&S](const Value *I) {
8216 return match(I,
8217 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8218 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8219 })))) {
8220 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8221 if (TryToFindDuplicates(S))
8222 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8223 ReuseShuffleIndices);
8224 return;
8225 }
8226
8227 // Don't handle scalable vectors
8228 if (S && S.getOpcode() == Instruction::ExtractElement &&
8229 isa<ScalableVectorType>(
8230 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8231 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8232 if (TryToFindDuplicates(S))
8233 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8234 ReuseShuffleIndices);
8235 return;
8236 }
8237
8238 // Don't handle vectors.
8239 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8240 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8241 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8242 return;
8243 }
8244
8245 // If all of the operands are identical or constant we have a simple solution.
8246 // If we deal with insert/extract instructions, they all must have constant
8247 // indices, otherwise we should gather them, not try to vectorize.
8248 // If this is an alternate-opcode node with 2 elements whose operands would be
8249 // gathered - do not vectorize.
8250 auto &&NotProfitableForVectorization = [&S, this,
8251 Depth](ArrayRef<Value *> VL) {
8252 if (!S || !S.isAltShuffle() || VL.size() > 2)
8253 return false;
8254 if (VectorizableTree.size() < MinTreeSize)
8255 return false;
8256 if (Depth >= RecursionMaxDepth - 1)
8257 return true;
8258 // Check if all operands are extracts, part of vector node or can build a
8259 // regular vectorize node.
8260 SmallVector<unsigned, 8> InstsCount;
8261 for (Value *V : VL) {
8262 auto *I = cast<Instruction>(V);
8263 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8264 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8265 }));
8266 }
8267 bool IsCommutative =
8268 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8269 if ((IsCommutative &&
8270 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8271 (!IsCommutative &&
8272 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8273 return true;
8274 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8275 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8276 auto *I1 = cast<Instruction>(VL.front());
8277 auto *I2 = cast<Instruction>(VL.back());
8278 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8279 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8280 I2->getOperand(Op));
8281 if (static_cast<unsigned>(count_if(
8282 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8283 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8284 })) >= S.getMainOp()->getNumOperands() / 2)
8285 return false;
8286 if (S.getMainOp()->getNumOperands() > 2)
8287 return true;
8288 if (IsCommutative) {
8289 // Check permuted operands.
8290 Candidates.clear();
8291 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8292 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8293 I2->getOperand((Op + 1) % E));
8294 if (any_of(
8295 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8296 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8297 }))
8298 return false;
8299 }
8300 return true;
8301 };
8302 SmallVector<unsigned> SortedIndices;
8303 BasicBlock *BB = nullptr;
8304 bool IsScatterVectorizeUserTE =
8305 UserTreeIdx.UserTE &&
8306 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8307 bool AreAllSameBlock = S && allSameBlock(VL);
8308 bool AreScatterAllGEPSameBlock =
8309 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8310 VL.size() > 2 &&
8311 all_of(VL,
8312 [&BB](Value *V) {
8313 auto *I = dyn_cast<GetElementPtrInst>(V);
8314 if (!I)
8315 return doesNotNeedToBeScheduled(V);
8316 if (!BB)
8317 BB = I->getParent();
8318 return BB == I->getParent() && I->getNumOperands() == 2;
8319 }) &&
8320 BB &&
8321 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8322 SortedIndices));
8323 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8324 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
8325 (S &&
8326 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8327 S.getMainOp()) &&
8328 !all_of(VL, isVectorLikeInstWithConstOps)) ||
8329 NotProfitableForVectorization(VL)) {
8330 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8331 if (TryToFindDuplicates(S))
8332 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8333 ReuseShuffleIndices);
8334 return;
8335 }
8336
8337 // Don't vectorize ephemeral values.
8338 if (S && !EphValues.empty()) {
8339 for (Value *V : VL) {
8340 if (EphValues.count(V)) {
8341 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8342 << ") is ephemeral.\n");
8343 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8344 return;
8345 }
8346 }
8347 }
8348
8349 // We now know that this is a vector of instructions of the same type from
8350 // the same block.
8351
8352 // Check that none of the instructions in the bundle are already in the tree.
8353 for (Value *V : VL) {
8354 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8355 doesNotNeedToBeScheduled(V))
8356 continue;
8357 if (isVectorized(V)) {
8358 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8359 << ") is already in tree.\n");
8360 if (TryToFindDuplicates(S))
8361 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8362 ReuseShuffleIndices);
8363 return;
8364 }
8365 }
8366
8367 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8368 if (UserIgnoreList && !UserIgnoreList->empty()) {
8369 for (Value *V : VL) {
8370 if (UserIgnoreList->contains(V)) {
8371 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8372 if (TryToFindDuplicates(S))
8373 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8374 ReuseShuffleIndices);
8375 return;
8376 }
8377 }
8378 }
8379
8380 // Special processing for sorted pointers for ScatterVectorize node with
8381 // constant indices only.
8382 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8383 assert(VL.front()->getType()->isPointerTy() &&
8384 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8385 "Expected pointers only.");
8386 // Reset S to make it GetElementPtr kind of node.
8387 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8388 assert(It != VL.end() && "Expected at least one GEP.");
8389 S = getSameOpcode(*It, *TLI);
8390 }
8391
8392 // Check that all of the users of the scalars that we want to vectorize are
8393 // schedulable.
8394 Instruction *VL0 = S.getMainOp();
8395 BB = VL0->getParent();
8396
8397 if (S &&
8398 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8399 !DT->isReachableFromEntry(BB))) {
8400 // Don't go into unreachable blocks. They may contain instructions with
8401 // dependency cycles which confuse the final scheduling.
8402 // Do not vectorize EH and non-returning blocks, not profitable in most
8403 // cases.
8404 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8405 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8406 return;
8407 }
8408
8409 // Check that every instruction appears once in this bundle.
8410 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8411 return;
8412
8413 // Perform specific checks for each particular instruction kind.
8414 OrdersType CurrentOrder;
8415 SmallVector<Value *> PointerOps;
8416 TreeEntry::EntryState State = getScalarsVectorizationState(
8417 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8418 if (State == TreeEntry::NeedToGather) {
8419 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8420 ReuseShuffleIndices);
8421 return;
8422 }
8423
8424 auto &BSRef = BlocksSchedules[BB];
8425 if (!BSRef)
8426 BSRef = std::make_unique<BlockScheduling>(BB);
8427
8428 BlockScheduling &BS = *BSRef;
8429
8430 std::optional<ScheduleData *> Bundle =
8431 BS.tryScheduleBundle(UniqueValues, this, S);
8432#ifdef EXPENSIVE_CHECKS
8433 // Make sure we didn't break any internal invariants
8434 BS.verify();
8435#endif
8436 if (!Bundle) {
8437 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8438 assert((!BS.getScheduleData(VL0) ||
8439 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8440 "tryScheduleBundle should cancelScheduling on failure");
8441 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8442 ReuseShuffleIndices);
8443 NonScheduledFirst.insert(VL.front());
8444 if (S.getOpcode() == Instruction::Load &&
8445 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8446 registerNonVectorizableLoads(VL);
8447 return;
8448 }
8449 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8450
8451 unsigned ShuffleOrOp =
8452 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8453 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8454 // Postpone PHI nodes creation
8455 SmallVector<unsigned> PHIOps;
8456 for (unsigned I : seq<unsigned>(Operands.size())) {
8457 ArrayRef<Value *> Op = Operands[I];
8458 if (Op.empty())
8459 continue;
8460 InstructionsState S = getSameOpcode(Op, *TLI);
8461 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8462 buildTree_rec(Op, Depth + 1, {TE, I});
8463 else
8464 PHIOps.push_back(I);
8465 }
8466 for (unsigned I : PHIOps)
8467 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8468 };
8469 switch (ShuffleOrOp) {
8470 case Instruction::PHI: {
8471 auto *PH = cast<PHINode>(VL0);
8472
8473 TreeEntry *TE =
8474 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8475 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8476 TE->dump());
8477
8478 // Keeps the reordered operands to avoid code duplication.
8479 PHIHandler Handler(*DT, PH, VL);
8480 Handler.buildOperands();
8481 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8482 TE->setOperand(I, Handler.getOperands(I));
8483 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8484 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8485 Operands[I] = Handler.getOperands(I);
8486 CreateOperandNodes(TE, Operands);
8487 return;
8488 }
8489 case Instruction::ExtractValue:
8490 case Instruction::ExtractElement: {
8491 if (CurrentOrder.empty()) {
8492 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8493 } else {
8494 LLVM_DEBUG({
8495 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8496 "with order";
8497 for (unsigned Idx : CurrentOrder)
8498 dbgs() << " " << Idx;
8499 dbgs() << "\n";
8500 });
8501 fixupOrderingIndices(CurrentOrder);
8502 }
8503 // Insert new order with initial value 0, if it does not exist,
8504 // otherwise return the iterator to the existing one.
8505 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8506 ReuseShuffleIndices, CurrentOrder);
8507 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8508 "(ExtractValueInst/ExtractElementInst).\n";
8509 TE->dump());
8510 // This is a special case, as it does not gather, but at the same time
8511 // we are not extending buildTree_rec() towards the operands.
8512 TE->setOperand(*this);
8513 return;
8514 }
8515 case Instruction::InsertElement: {
8516 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8517
8518 auto OrdCompare = [](const std::pair<int, int> &P1,
8519 const std::pair<int, int> &P2) {
8520 return P1.first > P2.first;
8521 };
8522 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8523 decltype(OrdCompare)>
8524 Indices(OrdCompare);
8525 for (int I = 0, E = VL.size(); I < E; ++I) {
8526 unsigned Idx = *getElementIndex(VL[I]);
8527 Indices.emplace(Idx, I);
8528 }
8529 OrdersType CurrentOrder(VL.size(), VL.size());
8530 bool IsIdentity = true;
8531 for (int I = 0, E = VL.size(); I < E; ++I) {
8532 CurrentOrder[Indices.top().second] = I;
8533 IsIdentity &= Indices.top().second == I;
8534 Indices.pop();
8535 }
8536 if (IsIdentity)
8537 CurrentOrder.clear();
8538 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8539 {}, CurrentOrder);
8540 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8541 TE->dump());
8542
8543 TE->setOperand(*this);
8544 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8545 return;
8546 }
8547 case Instruction::Load: {
8548 // Check that a vectorized load would load the same memory as a scalar
8549 // load. For example, we don't want to vectorize loads that are smaller
8550 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8551 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8552 // from such a struct, we read/write packed bits disagreeing with the
8553 // unvectorized version.
8554 TreeEntry *TE = nullptr;
8555 fixupOrderingIndices(CurrentOrder);
8556 switch (State) {
8557 case TreeEntry::Vectorize:
8558 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8559 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8560 if (CurrentOrder.empty())
8561 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8562 TE->dump());
8563 else
8564 LLVM_DEBUG(dbgs()
8565 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8566 TE->dump());
8567 break;
8568 case TreeEntry::StridedVectorize:
8569 // Vectorizing loads with a constant stride via a strided load intrinsic.
8570 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8571 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8572 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8573 TE->dump());
8574 break;
8575 case TreeEntry::ScatterVectorize:
8576 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8577 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8578 UserTreeIdx, ReuseShuffleIndices);
8579 LLVM_DEBUG(
8580 dbgs()
8581 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8582 TE->dump());
8583 break;
8584 case TreeEntry::CombinedVectorize:
8585 case TreeEntry::NeedToGather:
8586 llvm_unreachable("Unexpected loads state.");
8587 }
8588 TE->setOperand(*this);
8589 if (State == TreeEntry::ScatterVectorize)
8590 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8591 return;
8592 }
8593 case Instruction::ZExt:
8594 case Instruction::SExt:
8595 case Instruction::FPToUI:
8596 case Instruction::FPToSI:
8597 case Instruction::FPExt:
8598 case Instruction::PtrToInt:
8599 case Instruction::IntToPtr:
8600 case Instruction::SIToFP:
8601 case Instruction::UIToFP:
8602 case Instruction::Trunc:
8603 case Instruction::FPTrunc:
8604 case Instruction::BitCast: {
8605 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8606 std::make_pair(std::numeric_limits<unsigned>::min(),
8607 std::numeric_limits<unsigned>::max()));
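      // E.g. for a bundle of "zext i8 %x to i32" this records (max 32, min 8),
      // which the later bit-width analysis can use to narrow the tree.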
8608 if (ShuffleOrOp == Instruction::ZExt ||
8609 ShuffleOrOp == Instruction::SExt) {
8610 CastMaxMinBWSizes = std::make_pair(
8611 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8612 PrevMaxBW),
8613 std::min<unsigned>(
8614 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8615 PrevMinBW));
8616 } else if (ShuffleOrOp == Instruction::Trunc) {
8617 CastMaxMinBWSizes = std::make_pair(
8618 std::max<unsigned>(
8619 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8620 PrevMaxBW),
8621 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8622 PrevMinBW));
8623 }
8624 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8625 ReuseShuffleIndices);
8626 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8627 TE->dump());
8628
8629 TE->setOperand(*this);
8630 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8631 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8632 if (ShuffleOrOp == Instruction::Trunc) {
8633 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8634 } else if (ShuffleOrOp == Instruction::SIToFP ||
8635 ShuffleOrOp == Instruction::UIToFP) {
8636 unsigned NumSignBits =
8637 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8638 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8639 APInt Mask = DB->getDemandedBits(OpI);
8640 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8641 }
8642 if (NumSignBits * 2 >=
8643 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8644 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8645 }
8646 return;
8647 }
8648 case Instruction::ICmp:
8649 case Instruction::FCmp: {
8650 // Check that all of the compares have the same predicate.
8651 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8652 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8653 ReuseShuffleIndices);
8654 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8655 TE->dump());
8656
8657 ValueList Left, Right;
8658 VLOperands Ops(VL, S, *this);
8659 if (cast<CmpInst>(VL0)->isCommutative()) {
8660 // Commutative predicate - collect + sort operands of the instructions
8661 // so that each side is more likely to have the same opcode.
8662 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
8663 "Commutative Predicate mismatch");
8664 Ops.reorder();
8665 Left = Ops.getVL(0);
8666 Right = Ops.getVL(1);
8667 } else {
8668 // Collect operands - commute if it uses the swapped predicate.
8669 for (Value *V : VL) {
8670 if (isa<PoisonValue>(V)) {
8671 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8672 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8673 continue;
8674 }
8675 auto *Cmp = cast<CmpInst>(V);
8676 Value *LHS = Cmp->getOperand(0);
8677 Value *RHS = Cmp->getOperand(1);
8678 if (Cmp->getPredicate() != P0)
8679 std::swap(LHS, RHS);
8680 Left.push_back(LHS);
8681 Right.push_back(RHS);
8682 }
8683 }
8684 TE->setOperand(0, Left);
8685 TE->setOperand(1, Right);
8686 buildTree_rec(Left, Depth + 1, {TE, 0});
8687 buildTree_rec(Right, Depth + 1, {TE, 1});
8688 if (ShuffleOrOp == Instruction::ICmp) {
8689 unsigned NumSignBits0 =
8690 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8691 if (NumSignBits0 * 2 >=
8692 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8693 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8694 unsigned NumSignBits1 =
8695 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8696 if (NumSignBits1 * 2 >=
8697 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8698 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8699 }
8700 return;
8701 }
8702 case Instruction::Select:
8703 case Instruction::FNeg:
8704 case Instruction::Add:
8705 case Instruction::FAdd:
8706 case Instruction::Sub:
8707 case Instruction::FSub:
8708 case Instruction::Mul:
8709 case Instruction::FMul:
8710 case Instruction::UDiv:
8711 case Instruction::SDiv:
8712 case Instruction::FDiv:
8713 case Instruction::URem:
8714 case Instruction::SRem:
8715 case Instruction::FRem:
8716 case Instruction::Shl:
8717 case Instruction::LShr:
8718 case Instruction::AShr:
8719 case Instruction::And:
8720 case Instruction::Or:
8721 case Instruction::Xor:
8722 case Instruction::Freeze: {
8723 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8724 ReuseShuffleIndices);
8725 LLVM_DEBUG(
8726 dbgs() << "SLP: added a new TreeEntry "
8727 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8728 TE->dump());
8729
8730 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8731 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8732 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8733 return;
8734 }
8735 case Instruction::GetElementPtr: {
8736 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8737 ReuseShuffleIndices);
8738 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8739 TE->dump());
8740 SmallVector<ValueList, 2> Operands(2);
8741 // Prepare the operand vector for pointer operands.
8742 for (Value *V : VL) {
8743 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8744 if (!GEP) {
8745 Operands.front().push_back(V);
8746 continue;
8747 }
8748 Operands.front().push_back(GEP->getPointerOperand());
8749 }
8750 TE->setOperand(0, Operands.front());
8751 // Need to cast all indices to the same type before vectorization to
8752 // avoid crash.
8753 // Required to be able to find correct matches between different gather
8754 // nodes and reuse the vectorized values rather than trying to gather them
8755 // again.
8756 int IndexIdx = 1;
8757 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8758 Type *Ty = all_of(VL,
8759 [VL0Ty, IndexIdx](Value *V) {
8760 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8761 if (!GEP)
8762 return true;
8763 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8764 })
8765 ? VL0Ty
8766 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8767 ->getPointerOperandType()
8768 ->getScalarType());
8769 // Prepare the operand vector.
8770 for (Value *V : VL) {
8771 auto *I = dyn_cast<GetElementPtrInst>(V);
8772 if (!I) {
8773 Operands.back().push_back(
8774 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8775 continue;
8776 }
8777 auto *Op = I->getOperand(IndexIdx);
8778 auto *CI = dyn_cast<ConstantInt>(Op);
8779 if (!CI)
8780 Operands.back().push_back(Op);
8781 else
8782 Operands.back().push_back(ConstantFoldIntegerCast(
8783 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8784 }
8785 TE->setOperand(IndexIdx, Operands.back());
8786
8787 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8788 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8789 return;
8790 }
8791 case Instruction::Store: {
8792 bool Consecutive = CurrentOrder.empty();
8793 if (!Consecutive)
8794 fixupOrderingIndices(CurrentOrder);
8795 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8796 ReuseShuffleIndices, CurrentOrder);
8797 if (Consecutive)
8798 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8799 TE->dump());
8800 else
8801 LLVM_DEBUG(
8802 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8803 TE->dump());
8804 TE->setOperand(*this);
8805 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8806 return;
8807 }
8808 case Instruction::Call: {
8809 // Check if the calls are all to the same vectorizable intrinsic or
8810 // library function.
8811 CallInst *CI = cast<CallInst>(VL0);
8812 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8813
8814 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8815 ReuseShuffleIndices);
8816 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8817 TE->dump());
8818 TE->setOperand(*this, isCommutative(VL0));
8819 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8820 // For scalar operands there is no need to create an entry, since there is
8821 // nothing to vectorize.
8822 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
8823 continue;
8824 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8825 }
8826 return;
8827 }
8828 case Instruction::ShuffleVector: {
8829 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8830 ReuseShuffleIndices);
8831 if (S.isAltShuffle()) {
8832 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8833 TE->dump());
8834 } else {
8835 assert(SLPReVec && "Only supported by REVEC.");
8836 LLVM_DEBUG(
8837 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8838 TE->dump());
8839 }
8840
8841 // Reorder operands if reordering would enable vectorization.
8842 auto *CI = dyn_cast<CmpInst>(VL0);
8843 if (CI && any_of(VL, [](Value *V) {
8844 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8845 })) {
8846 auto *MainCI = cast<CmpInst>(S.getMainOp());
8847 auto *AltCI = cast<CmpInst>(S.getAltOp());
8848 CmpInst::Predicate MainP = MainCI->getPredicate();
8849 CmpInst::Predicate AltP = AltCI->getPredicate();
8850 assert(MainP != AltP &&
8851 "Expected different main/alternate predicates.");
8852 ValueList Left, Right;
8853 // Collect operands - commute if it uses the swapped predicate or
8854 // alternate operation.
8855 for (Value *V : VL) {
8856 if (isa<PoisonValue>(V)) {
8857 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8858 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8859 continue;
8860 }
8861 auto *Cmp = cast<CmpInst>(V);
8862 Value *LHS = Cmp->getOperand(0);
8863 Value *RHS = Cmp->getOperand(1);
8864
8865 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8866 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8867 std::swap(LHS, RHS);
8868 } else {
8869 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8870 std::swap(LHS, RHS);
8871 }
8872 Left.push_back(LHS);
8873 Right.push_back(RHS);
8874 }
8875 TE->setOperand(0, Left);
8876 TE->setOperand(1, Right);
8877 buildTree_rec(Left, Depth + 1, {TE, 0});
8878 buildTree_rec(Right, Depth + 1, {TE, 1});
8879 return;
8880 }
8881
8882 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8883 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8884 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8885 return;
8886 }
8887 default:
8888 break;
8889 }
8890 llvm_unreachable("Unexpected vectorization of the instructions.");
8891}
8892
8893 unsigned BoUpSLP::canMapToVector(Type *T) const {
8894 unsigned N = 1;
8895 Type *EltTy = T;
8896
8897 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8898 if (EltTy->isEmptyTy())
8899 return 0;
8900 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8901 // Check that struct is homogeneous.
8902 for (const auto *Ty : ST->elements())
8903 if (Ty != *ST->element_begin())
8904 return 0;
8905 N *= ST->getNumElements();
8906 EltTy = *ST->element_begin();
8907 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8908 N *= AT->getNumElements();
8909 EltTy = AT->getElementType();
8910 } else {
8911 auto *VT = cast<FixedVectorType>(EltTy);
8912 N *= VT->getNumElements();
8913 EltTy = VT->getElementType();
8914 }
8915 }
8916
8917 if (!isValidElementType(EltTy))
8918 return 0;
8919 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8920 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8921 VTSize != DL->getTypeStoreSizeInBits(T))
8922 return 0;
8923 return N;
8924}
8925
8926bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
8927 SmallVectorImpl<unsigned> &CurrentOrder,
8928 bool ResizeAllowed) const {
8929 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8930 assert(It != VL.end() && "Expected at least one extract instruction.");
8931 auto *E0 = cast<Instruction>(*It);
8932 assert(
8933 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8934 "Invalid opcode");
8935 // Check if all of the extracts come from the same vector and from the
8936 // correct offset.
8937 Value *Vec = E0->getOperand(0);
8938
8939 CurrentOrder.clear();
8940
8941 // We have to extract from a vector/aggregate with the same number of elements.
8942 unsigned NElts;
8943 if (E0->getOpcode() == Instruction::ExtractValue) {
8944 NElts = canMapToVector(Vec->getType());
8945 if (!NElts)
8946 return false;
8947 // Check if load can be rewritten as load of vector.
8948 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8949 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8950 return false;
8951 } else {
8952 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8953 }
8954
8955 unsigned E = VL.size();
8956 if (!ResizeAllowed && NElts != E)
8957 return false;
8958 SmallVector<int> Indices(E, PoisonMaskElem);
8959 unsigned MinIdx = NElts, MaxIdx = 0;
8960 for (auto [I, V] : enumerate(VL)) {
8961 auto *Inst = dyn_cast<Instruction>(V);
8962 if (!Inst)
8963 continue;
8964 if (Inst->getOperand(0) != Vec)
8965 return false;
8966 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8967 if (isa<UndefValue>(EE->getIndexOperand()))
8968 continue;
8969 std::optional<unsigned> Idx = getExtractIndex(Inst);
8970 if (!Idx)
8971 return false;
8972 const unsigned ExtIdx = *Idx;
8973 if (ExtIdx >= NElts)
8974 continue;
8975 Indices[I] = ExtIdx;
8976 if (MinIdx > ExtIdx)
8977 MinIdx = ExtIdx;
8978 if (MaxIdx < ExtIdx)
8979 MaxIdx = ExtIdx;
8980 }
8981 if (MaxIdx - MinIdx + 1 > E)
8982 return false;
8983 if (MaxIdx + 1 <= E)
8984 MinIdx = 0;
8985
8986 // Check that all of the indices extract from the correct offset.
8987 bool ShouldKeepOrder = true;
8988 // Assign to all items the initial value E + 1 so we can check if the extract
8989 // instruction index was used already.
8990 // Also, later we can check that all the indices are used and we have a
8991 // consecutive access in the extract instructions, by checking that no
8992 // element of CurrentOrder still has value E + 1.
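  // E.g. extracts of lanes {1,0,3,2} produce CurrentOrder = {1,0,3,2} and the
  // function returns false, while identity extracts {0,1,2,3} return true with
  // CurrentOrder cleared.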
8993 CurrentOrder.assign(E, E);
8994 for (unsigned I = 0; I < E; ++I) {
8995 if (Indices[I] == PoisonMaskElem)
8996 continue;
8997 const unsigned ExtIdx = Indices[I] - MinIdx;
8998 if (CurrentOrder[ExtIdx] != E) {
8999 CurrentOrder.clear();
9000 return false;
9001 }
9002 ShouldKeepOrder &= ExtIdx == I;
9003 CurrentOrder[ExtIdx] = I;
9004 }
9005 if (ShouldKeepOrder)
9006 CurrentOrder.clear();
9007
9008 return ShouldKeepOrder;
9009}
9010
9011bool BoUpSLP::areAllUsersVectorized(
9012 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
9013 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
9014 all_of(I->users(), [this](User *U) {
9015 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
9016 (isa<ExtractElementInst>(U) && MustGather.contains(U));
9017 });
9018}
9019
9020static std::pair<InstructionCost, InstructionCost>
9021 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9022 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9023 ArrayRef<Type *> ArgTys) {
9024 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9025
9026 // Calculate the cost of the scalar and vector calls.
9027 FastMathFlags FMF;
9028 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9029 FMF = FPCI->getFastMathFlags();
9030 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
9031 auto IntrinsicCost =
9032 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9033
9034 auto Shape = VFShape::get(CI->getFunctionType(),
9035 ElementCount::getFixed(VecTy->getNumElements()),
9036 false /*HasGlobalPred*/);
9037 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9038 auto LibCost = IntrinsicCost;
9039 if (!CI->isNoBuiltin() && VecFunc) {
9040 // Calculate the cost of the vector library call.
9041 // If the corresponding vector call is cheaper, return its cost.
9042 LibCost =
9043 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9044 }
9045 return {IntrinsicCost, LibCost};
9046}
9047
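// E.g. for Scalars = { a+b, c-d, e+f, g-h } with IsAltOp selecting the
// subtractions, the resulting Mask is { 0, 5, 2, 7 }: even lanes come from the
// main-opcode vector, odd lanes from the alternate-opcode vector.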
9048void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
9049 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
9050 SmallVectorImpl<Value *> *OpScalars,
9051 SmallVectorImpl<Value *> *AltScalars) const {
9052 unsigned Sz = Scalars.size();
9053 Mask.assign(Sz, PoisonMaskElem);
9054 SmallVector<int> OrderMask;
9055 if (!ReorderIndices.empty())
9056 inversePermutation(ReorderIndices, OrderMask);
9057 for (unsigned I = 0; I < Sz; ++I) {
9058 unsigned Idx = I;
9059 if (!ReorderIndices.empty())
9060 Idx = OrderMask[I];
9061 if (isa<PoisonValue>(Scalars[Idx]))
9062 continue;
9063 auto *OpInst = cast<Instruction>(Scalars[Idx]);
9064 if (IsAltOp(OpInst)) {
9065 Mask[I] = Sz + Idx;
9066 if (AltScalars)
9067 AltScalars->push_back(OpInst);
9068 } else {
9069 Mask[I] = Idx;
9070 if (OpScalars)
9071 OpScalars->push_back(OpInst);
9072 }
9073 }
9074 if (!ReuseShuffleIndices.empty()) {
9075 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9076 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9077 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9078 });
9079 Mask.swap(NewMask);
9080 }
9081}
9082
9083 static bool isAlternateInstruction(const Instruction *I,
9084 const Instruction *MainOp,
9085 const Instruction *AltOp,
9086 const TargetLibraryInfo &TLI) {
9087 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9088 auto *AltCI = cast<CmpInst>(AltOp);
9089 CmpInst::Predicate MainP = MainCI->getPredicate();
9090 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
9091 assert(MainP != AltP && "Expected different main/alternate predicates.");
9092 auto *CI = cast<CmpInst>(I);
9093 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9094 return false;
9095 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9096 return true;
9097 CmpInst::Predicate P = CI->getPredicate();
9098 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
9099
9100 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9101 "CmpInst expected to match either main or alternate predicate or "
9102 "their swap.");
9103 return MainP != P && MainP != SwappedP;
9104 }
9105 return I->getOpcode() == AltOp->getOpcode();
9106}
9107
9108TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9109 assert(!Ops.empty());
9110 const auto *Op0 = Ops.front();
9111
9112 const bool IsConstant = all_of(Ops, [](Value *V) {
9113 // TODO: We should allow undef elements here
9114 return isConstant(V) && !isa<UndefValue>(V);
9115 });
9116 const bool IsUniform = all_of(Ops, [=](Value *V) {
9117 // TODO: We should allow undef elements here
9118 return V == Op0;
9119 });
9120 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9121 // TODO: We should allow undef elements here
9122 if (auto *CI = dyn_cast<ConstantInt>(V))
9123 return CI->getValue().isPowerOf2();
9124 return false;
9125 });
9126 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9127 // TODO: We should allow undef elements here
9128 if (auto *CI = dyn_cast<ConstantInt>(V))
9129 return CI->getValue().isNegatedPowerOf2();
9130 return false;
9131 });
9132
9133 TTI::OperandValueKind VK = TTI::OK_AnyValue;
9134 if (IsConstant && IsUniform)
9135 VK = TTI::OK_UniformConstantValue;
9136 else if (IsConstant)
9137 VK = TTI::OK_NonUniformConstantValue;
9138 else if (IsUniform)
9139 VK = TTI::OK_UniformValue;
9140
9141 TTI::OperandValueProperties VP = TTI::OP_None;
9142 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9143 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9144
9145 return {VK, VP};
9146}
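// Editorial note: illustrative example, not part of the upstream source.
// For Ops = {i32 8, i32 8, i32 8, i32 8} this returns
// {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}; for a mix of different
// non-constant values it falls back to {TTI::OK_AnyValue, TTI::OP_None}.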
9147
9148namespace {
9149/// The base class for shuffle instruction emission and shuffle cost estimation.
9150class BaseShuffleAnalysis {
9151protected:
9152 Type *ScalarTy = nullptr;
9153
9154 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9155
9156 /// V is expected to be a vectorized value.
9157 /// When REVEC is disabled, there is no difference between VF and
9158 /// VNumElements.
9159 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9160 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9161 /// of 8.
9162 unsigned getVF(Value *V) const {
9163 assert(V && "V cannot be nullptr");
9164 assert(isa<FixedVectorType>(V->getType()) &&
9165 "V does not have FixedVectorType");
9166 assert(ScalarTy && "ScalarTy cannot be nullptr");
9167 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9168 unsigned VNumElements =
9169 cast<FixedVectorType>(V->getType())->getNumElements();
9170 assert(VNumElements > ScalarTyNumElements &&
9171 "the number of elements of V is not large enough");
9172 assert(VNumElements % ScalarTyNumElements == 0 &&
9173 "the number of elements of V is not a vectorized value");
9174 return VNumElements / ScalarTyNumElements;
9175 }
9176
9177 /// Checks if the mask is an identity mask.
9178 /// \param IsStrict if true, the function returns false if the mask size does
9179 /// not match the vector size.
9180 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9181 bool IsStrict) {
9182 int Limit = Mask.size();
9183 int VF = VecTy->getNumElements();
9184 int Index = -1;
9185 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9186 return true;
9187 if (!IsStrict) {
9188 // Consider extract subvector starting from index 0.
9189 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9190 Index == 0)
9191 return true;
9192 // All VF-size submasks are identity (e.g.
9193 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9194 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9195 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9196 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9197 ShuffleVectorInst::isIdentityMask(Slice, VF);
9198 }))
9199 return true;
9200 }
9201 return false;
9202 }
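// Editorial note: illustrative examples, not part of the upstream source.
// For a <4 x Ty> vector, Mask = <0, 1, poison, 3> is an identity mask in
// both modes, while Mask = <0, 1> is accepted only with IsStrict == false
// (it is an extract-subvector mask starting at index 0).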
9203
9204 /// Tries to combine 2 different masks into a single one.
9205 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9206 /// change the size of the vector, \p LocalVF is the original size of the
9207 /// shuffled vector.
9208 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9209 ArrayRef<int> ExtMask) {
9210 unsigned VF = Mask.size();
9211 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9212 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9213 if (ExtMask[I] == PoisonMaskElem)
9214 continue;
9215 int MaskedIdx = Mask[ExtMask[I] % VF];
9216 NewMask[I] =
9217 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9218 }
9219 Mask.swap(NewMask);
9220 }
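// Editorial note: illustrative example, not part of the upstream source.
// With LocalVF = 4, Mask = <2, 3, 0, 1> and ExtMask = <1, 0, 3, 2>, the
// combined result stored back into Mask is <3, 2, 1, 0>, i.e. ExtMask
// applied on top of Mask, with indices reduced modulo LocalVF.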
9221
9222 /// Looks through shuffles trying to reduce the final number of shuffles in
9223 /// the code. The function looks through the previously emitted shuffle
9224 /// instructions and properly marks indices in the mask as undef.
9225 /// For example, given the code
9226 /// \code
9227 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9228 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9229 /// \endcode
9230 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9231 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9232 /// <0, 1, 2, 3> for the shuffle.
9233 /// If 2 operands are of different size, the smallest one will be resized and
9234 /// the mask recalculated properly.
9235 /// For example, given the code
9236 /// \code
9237 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9238 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9239 /// \endcode
9240 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9241 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9242 /// <0, 1, 2, 3> for the shuffle.
9243 /// So, it tries to transform permutations into a simple vector merge, if
9244 /// possible.
9245 /// \param V The input vector which must be shuffled using the given \p Mask.
9246 /// If the better candidate is found, \p V is set to this best candidate
9247 /// vector.
9248 /// \param Mask The input mask for the shuffle. If the best candidate is found
9249 /// during looking-through-shuffles attempt, it is updated accordingly.
9250 /// \param SinglePermute true if the shuffle operation is originally a
9251 /// single-value-permutation. In this case the look-through-shuffles procedure
9252 /// may look for resizing shuffles as the best candidates.
9253 /// \return true if the shuffle results in the non-resizing identity shuffle
9254 /// (and thus can be ignored), false - otherwise.
9255 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9256 bool SinglePermute) {
9257 Value *Op = V;
9258 ShuffleVectorInst *IdentityOp = nullptr;
9259 SmallVector<int> IdentityMask;
9260 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9261 // Exit if not a fixed vector type or changing size shuffle.
9262 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9263 if (!SVTy)
9264 break;
9265 // Remember the identity or broadcast mask, if it is not a resizing
9266 // shuffle. If no better candidates are found, this Op and Mask will be
9267 // used in the final shuffle.
9268 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9269 if (!IdentityOp || !SinglePermute ||
9270 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9271 ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9272 IdentityMask.size()))) {
9273 IdentityOp = SV;
9274 // Store the current mask in IdentityMask so we do not lose this
9275 // info if IdentityOp is selected as the best candidate for the
9276 // permutation.
9277 IdentityMask.assign(Mask);
9278 }
9279 }
9280 // Remember the broadcast mask. If no better candidates are found, this Op
9281 // and Mask will be used in the final shuffle.
9282 // Zero splat can be used as identity too, since it might be used with
9283 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9284 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9285 // expensive, and the analysis finds out that the source vector is just a
9286 // broadcast, the original mask can be transformed to the identity mask <0,
9287 // 1, 2, 3>.
9288 // \code
9289 // %0 = shuffle %v, poison, zeroinitializer
9290 // %res = shuffle %0, poison, <3, 1, 2, 0>
9291 // \endcode
9292 // may be transformed to
9293 // \code
9294 // %0 = shuffle %v, poison, zeroinitializer
9295 // %res = shuffle %0, poison, <0, 1, 2, 3>
9296 // \endcode
9297 if (SV->isZeroEltSplat()) {
9298 IdentityOp = SV;
9299 IdentityMask.assign(Mask);
9300 }
9301 int LocalVF = Mask.size();
9302 if (auto *SVOpTy =
9303 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9304 LocalVF = SVOpTy->getNumElements();
9305 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9306 for (auto [Idx, I] : enumerate(Mask)) {
9307 if (I == PoisonMaskElem ||
9308 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9309 continue;
9310 ExtMask[Idx] = SV->getMaskValue(I);
9311 }
9312 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9313 SV->getOperand(0),
9314 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9315 .all();
9316 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9317 SV->getOperand(1),
9318 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9319 .all();
9320 if (!IsOp1Undef && !IsOp2Undef) {
9321 // Update mask and mark undef elems.
9322 for (int &I : Mask) {
9323 if (I == PoisonMaskElem)
9324 continue;
9325 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9326 PoisonMaskElem)
9327 I = PoisonMaskElem;
9328 }
9329 break;
9330 }
9331 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9332 combineMasks(LocalVF, ShuffleMask, Mask);
9333 Mask.swap(ShuffleMask);
9334 if (IsOp2Undef)
9335 Op = SV->getOperand(0);
9336 else
9337 Op = SV->getOperand(1);
9338 }
9339 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9340 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9341 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9342 if (IdentityOp) {
9343 V = IdentityOp;
9344 assert(Mask.size() == IdentityMask.size() &&
9345 "Expected masks of same sizes.");
9346 // Clear known poison elements.
9347 for (auto [I, Idx] : enumerate(Mask))
9348 if (Idx == PoisonMaskElem)
9349 IdentityMask[I] = PoisonMaskElem;
9350 Mask.swap(IdentityMask);
9351 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9352 return SinglePermute &&
9353 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9354 /*IsStrict=*/true) ||
9355 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9356 Shuffle->isZeroEltSplat() &&
9357 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9358 }
9359 V = Op;
9360 return false;
9361 }
9362 V = Op;
9363 return true;
9364 }
9365
9366 /// Smart shuffle instruction emission, walks through shuffles trees and
9367 /// tries to find the best matching vector for the actual shuffle
9368 /// instruction.
9369 template <typename T, typename ShuffleBuilderTy>
9370 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9371 ShuffleBuilderTy &Builder, Type *ScalarTy) {
9372 assert(V1 && "Expected at least one vector value.");
9373 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9374 SmallVector<int> NewMask(Mask);
9375 if (ScalarTyNumElements != 1) {
9376 assert(SLPReVec && "FixedVectorType is not expected.");
9377 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
9378 Mask = NewMask;
9379 }
9380 if (V2)
9381 Builder.resizeToMatch(V1, V2);
9382 int VF = Mask.size();
9383 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9384 VF = FTy->getNumElements();
9385 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9386 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9387 .all()) {
9388 // Peek through shuffles.
9389 Value *Op1 = V1;
9390 Value *Op2 = V2;
9391 int VF =
9392 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9393 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9394 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9395 for (int I = 0, E = Mask.size(); I < E; ++I) {
9396 if (Mask[I] < VF)
9397 CombinedMask1[I] = Mask[I];
9398 else
9399 CombinedMask2[I] = Mask[I] - VF;
9400 }
9401 Value *PrevOp1;
9402 Value *PrevOp2;
9403 do {
9404 PrevOp1 = Op1;
9405 PrevOp2 = Op2;
9406 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9407 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9408 // Check if we have 2 resizing shuffles - need to peek through operands
9409 // again.
9410 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9411 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9412 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9413 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9414 if (I == PoisonMaskElem)
9415 continue;
9416 ExtMask1[Idx] = SV1->getMaskValue(I);
9417 }
9418 SmallBitVector UseMask1 = buildUseMask(
9419 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9420 ->getNumElements(),
9421 ExtMask1, UseMask::SecondArg);
9422 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9423 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9424 if (I == PoisonMaskElem)
9425 continue;
9426 ExtMask2[Idx] = SV2->getMaskValue(I);
9427 }
9428 SmallBitVector UseMask2 = buildUseMask(
9429 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9430 ->getNumElements(),
9431 ExtMask2, UseMask::SecondArg);
9432 if (SV1->getOperand(0)->getType() ==
9433 SV2->getOperand(0)->getType() &&
9434 SV1->getOperand(0)->getType() != SV1->getType() &&
9435 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9436 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9437 Op1 = SV1->getOperand(0);
9438 Op2 = SV2->getOperand(0);
9439 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9440 int LocalVF = ShuffleMask1.size();
9441 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9442 LocalVF = FTy->getNumElements();
9443 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9444 CombinedMask1.swap(ShuffleMask1);
9445 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9446 LocalVF = ShuffleMask2.size();
9447 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9448 LocalVF = FTy->getNumElements();
9449 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9450 CombinedMask2.swap(ShuffleMask2);
9451 }
9452 }
9453 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9454 Builder.resizeToMatch(Op1, Op2);
9455 VF = std::max(cast<VectorType>(Op1->getType())
9456 ->getElementCount()
9457 .getKnownMinValue(),
9458 cast<VectorType>(Op2->getType())
9459 ->getElementCount()
9460 .getKnownMinValue());
9461 for (int I = 0, E = Mask.size(); I < E; ++I) {
9462 if (CombinedMask2[I] != PoisonMaskElem) {
9463 assert(CombinedMask1[I] == PoisonMaskElem &&
9464 "Expected undefined mask element");
9465 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9466 }
9467 }
9468 if (Op1 == Op2 &&
9469 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9470 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9471 isa<ShuffleVectorInst>(Op1) &&
9472 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9473 ArrayRef(CombinedMask1))))
9474 return Builder.createIdentity(Op1);
9475 return Builder.createShuffleVector(
9476 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9477 CombinedMask1);
9478 }
9479 if (isa<PoisonValue>(V1))
9480 return Builder.createPoison(
9481 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9482 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9483 assert(V1 && "Expected non-null value after looking through shuffles.");
9484
9485 if (!IsIdentity)
9486 return Builder.createShuffleVector(V1, NewMask);
9487 return Builder.createIdentity(V1);
9488 }
9489
9490 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
9491 /// shuffle emission.
9492 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
9493 ArrayRef<int> Mask) {
9494 for (unsigned I : seq<unsigned>(CommonMask.size()))
9495 if (Mask[I] != PoisonMaskElem)
9496 CommonMask[I] = I;
9497 }
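// Editorial note: illustrative example, not part of the upstream source.
// If the emitted shuffle consumed Mask = <3, poison, 1, 0>, then after this
// call lanes 0, 2 and 3 of CommonMask simply refer to the corresponding
// lanes of the just-created vector (values 0, 2 and 3), while lane 1 keeps
// its previous value.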
9498};
9499} // namespace
9500
9501/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9502static std::pair<InstructionCost, InstructionCost>
9503getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9504 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9505 Type *ScalarTy, VectorType *VecTy) {
9506 InstructionCost ScalarCost = 0;
9507 InstructionCost VecCost = 0;
9508 // Here we differentiate two cases: (1) when Ptrs represent a regular
9509 // vectorization tree node (as they are pointer arguments of scattered
9510 // loads) or (2) when Ptrs are the arguments of loads or stores being
9511 // vectorized as a plain wide unit-stride load/store since all the
9512 // loads/stores are known to be from/to adjacent locations.
9513 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9514 // Case 2: estimate the pointer-related costs when vectorizing to
9515 // a wide load/store.
9516 // Scalar cost is estimated as a set of pointers with known relationship
9517 // between them.
9518 // For vector code we will use BasePtr as argument for the wide load/store
9519 // but we also need to account all the instructions which are going to
9520 // stay in vectorized code due to uses outside of these scalar
9521 // loads/stores.
9522 ScalarCost = TTI.getPointersChainCost(
9523 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9524 CostKind);
9525
9526 SmallVector<const Value *> PtrsRetainedInVecCode;
9527 for (Value *V : Ptrs) {
9528 if (V == BasePtr) {
9529 PtrsRetainedInVecCode.push_back(V);
9530 continue;
9531 }
9532 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9533 // For simplicity assume Ptr stays in vectorized code if it's not a
9534 // GEP instruction. We don't care since its cost is considered free.
9535 // TODO: We should check for any uses outside of vectorizable tree
9536 // rather than just single use.
9537 if (!Ptr || !Ptr->hasOneUse())
9538 PtrsRetainedInVecCode.push_back(V);
9539 }
9540
9541 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9542 // If all pointers stay in vectorized code then we don't have
9543 // any savings on that.
9544 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9545 }
9546 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9547 TTI::PointersChainInfo::getKnownStride(),
9548 VecTy, CostKind);
9549 } else {
9550 // Case 1: Ptrs are the arguments of loads that we are going to transform
9551 // into masked gather load intrinsic.
9552 // All the scalar GEPs will be removed as a result of vectorization.
9553 // For any external uses of some lanes, extractelement instructions will
9554 // be generated (their cost is estimated separately).
9555 TTI::PointersChainInfo PtrsInfo =
9556 all_of(Ptrs,
9557 [](const Value *V) {
9558 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9559 return Ptr && !Ptr->hasAllConstantIndices();
9560 })
9561 ? TTI::PointersChainInfo::getUnknownStride()
9562 : TTI::PointersChainInfo::getKnownStride();
9563
9564 ScalarCost =
9565 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9566 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9567 if (!BaseGEP) {
9568 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9569 if (It != Ptrs.end())
9570 BaseGEP = cast<GEPOperator>(*It);
9571 }
9572 if (BaseGEP) {
9573 SmallVector<const Value *> Indices(BaseGEP->indices());
9574 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9575 BaseGEP->getPointerOperand(), Indices, VecTy,
9576 CostKind);
9577 }
9578 }
9579
9580 return std::make_pair(ScalarCost, VecCost);
9581}
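// Editorial note: the following sketch is illustrative only and not part of
// the upstream source; the variable names are hypothetical. For a group of
// consecutive loads a caller would typically do
//   auto [ScalarGEPCost, VectorGEPCost] =
//       getGEPCosts(*TTI, PointerOps, PointerOps.front(), Instruction::Load,
//                   CostKind, ScalarTy, VecTy);
// and charge ScalarGEPCost to the scalar side and VectorGEPCost to the
// vector side of the cost comparison.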
9582
9583void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9584 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9585 "Expected gather node without reordering.");
9587 SmallSet<size_t, 2> LoadKeyUsed;
9588
9589 // Do not reorder nodes if the node is small (just 2 elements), all-constant,
9590 // or all instructions already have the same opcode.
9591 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9592 all_of(TE.Scalars, isConstant))
9593 return;
9594
9595 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9596 return VectorizableTree[Idx]->isSame(TE.Scalars);
9597 }))
9598 return;
9599
9600 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9601 Key = hash_combine(hash_value(LI->getParent()), Key);
9602 Value *Ptr =
9604 if (LoadKeyUsed.contains(Key)) {
9605 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9606 if (LIt != LoadsMap.end()) {
9607 for (LoadInst *RLI : LIt->second) {
9608 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9609 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9610 /*StrictCheck=*/true))
9611 return hash_value(RLI->getPointerOperand());
9612 }
9613 for (LoadInst *RLI : LIt->second) {
9615 LI->getPointerOperand(), *TLI)) {
9616 hash_code SubKey = hash_value(RLI->getPointerOperand());
9617 return SubKey;
9618 }
9619 }
9620 if (LIt->second.size() > 2) {
9621 hash_code SubKey =
9622 hash_value(LIt->second.back()->getPointerOperand());
9623 return SubKey;
9624 }
9625 }
9626 }
9627 LoadKeyUsed.insert(Key);
9628 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9629 return hash_value(LI->getPointerOperand());
9630 };
9633 bool IsOrdered = true;
9634 unsigned NumInstructions = 0;
9635 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9636 // nodes.
9637 for (auto [I, V] : enumerate(TE.Scalars)) {
9638 size_t Key = 1, Idx = 1;
9639 if (auto *Inst = dyn_cast<Instruction>(V);
9640 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9641 !isDeleted(Inst) && !isVectorized(V)) {
9642 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9643 /*AllowAlternate=*/false);
9644 ++NumInstructions;
9645 }
9646 auto &Container = SortedValues[Key];
9647 if (IsOrdered && !KeyToIndex.contains(V) &&
9648 !(isa<Constant, ExtractElementInst>(V) ||
9650 ((Container.contains(Idx) &&
9651 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9652 (!Container.empty() && !Container.contains(Idx) &&
9653 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9654 IsOrdered = false;
9655 auto &KTI = KeyToIndex[V];
9656 if (KTI.empty())
9657 Container[Idx].push_back(V);
9658 KTI.push_back(I);
9659 }
9661 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9662 if (!IsOrdered && NumInstructions > 1) {
9663 unsigned Cnt = 0;
9664 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9665 for (const auto &D : SortedValues) {
9666 for (const auto &P : D.second) {
9667 unsigned Sz = 0;
9668 for (Value *V : P.second) {
9669 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9670 for (auto [K, Idx] : enumerate(Indices)) {
9671 TE.ReorderIndices[Cnt + K] = Idx;
9672 TE.Scalars[Cnt + K] = V;
9673 }
9674 Sz += Indices.size();
9675 Cnt += Indices.size();
9676 }
9677 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9678 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9679 *TTI, TE.Scalars.front()->getType(), Sz);
9680 SubVectors.emplace_back(Cnt - Sz, SubVF);
9681 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9682 DemandedElts.clearBit(I);
9683 } else if (!P.second.empty() && isConstant(P.second.front())) {
9684 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9685 DemandedElts.clearBit(I);
9686 }
9687 }
9688 }
9689 }
9690 // Reuses always require shuffles, so consider the reordering profitable.
9691 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9692 return;
9693 // Do simple cost estimation.
9696 auto *ScalarTy = TE.Scalars.front()->getType();
9697 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9698 for (auto [Idx, Sz] : SubVectors) {
9700 Idx, getWidenedType(ScalarTy, Sz));
9701 }
9702 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9703 assert(SLPReVec && "Only supported by REVEC.");
9704 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9705 // of CreateInsertElement.
9706 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9707 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9708 if (DemandedElts[I])
9709 Cost +=
9710 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9711 CostKind, I * ScalarTyNumElements, FTy);
9712 } else {
9713 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9714 /*Extract=*/false, CostKind);
9715 }
9716 int Sz = TE.Scalars.size();
9717 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9718 TE.ReorderIndices.end());
9719 for (unsigned I : seq<unsigned>(Sz)) {
9720 Value *V = TE.getOrdered(I);
9721 if (isa<PoisonValue>(V)) {
9722 ReorderMask[I] = PoisonMaskElem;
9723 } else if (isConstant(V) || DemandedElts[I]) {
9724 ReorderMask[I] = I + TE.ReorderIndices.size();
9725 }
9726 }
9728 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9731 VecTy, ReorderMask);
9732 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9733 ReorderMask.assign(Sz, PoisonMaskElem);
9734 for (unsigned I : seq<unsigned>(Sz)) {
9735 Value *V = TE.getOrdered(I);
9736 if (isConstant(V)) {
9737 DemandedElts.clearBit(I);
9738 if (!isa<PoisonValue>(V))
9739 ReorderMask[I] = I;
9740 } else {
9741 ReorderMask[I] = I + Sz;
9742 }
9743 }
9745 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9746 if (!DemandedElts.isAllOnes())
9747 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9748 if (Cost >= BVCost) {
9749 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9750 reorderScalars(TE.Scalars, Mask);
9751 TE.ReorderIndices.clear();
9752 }
9753}
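// Editorial note: illustrative example, not part of the upstream source.
// Roughly speaking, for a gather node such as Scalars = {a, c, a, b} the
// clustering above may reorder it to {a, a, c, b}, recording the permutation
// in ReorderIndices so that related scalars form a contiguous, potentially
// vectorizable slice; the clustered order is kept only if its estimated cost
// beats the plain build-vector cost computed at the end.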
9754
9757 BaseGraphSize = VectorizableTree.size();
9758 // Turn graph transforming mode on and off, when done.
9759 class GraphTransformModeRAAI {
9760 bool &SavedIsGraphTransformMode;
9761
9762 public:
9763 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9764 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9765 IsGraphTransformMode = true;
9766 }
9767 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9768 } TransformContext(IsGraphTransformMode);
9769 // Operands are profitable if they are:
9770 // 1. At least one constant
9771 // or
9772 // 2. Splats
9773 // or
9774 // 3. Results in good vectorization opportunity, i.e. may generate vector
9775 // nodes and reduce cost of the graph.
9776 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9777 const InstructionsState &S) {
9779 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9780 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9781 I2->getOperand(Op));
9782 return all_of(
9783 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9784 return all_of(Cand,
9785 [](const std::pair<Value *, Value *> &P) {
9786 return isa<Constant>(P.first) ||
9787 isa<Constant>(P.second) || P.first == P.second;
9788 }) ||
9790 });
9791 };
9792
9793 // Try to reorder gather nodes for better vectorization opportunities.
9794 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9795 TreeEntry &E = *VectorizableTree[Idx];
9796 if (E.isGather())
9797 reorderGatherNode(E);
9798 }
9799
9800 // The tree may grow here, so iterate over nodes, built before.
9801 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9802 TreeEntry &E = *VectorizableTree[Idx];
9803 if (E.isGather()) {
9804 ArrayRef<Value *> VL = E.Scalars;
9805 const unsigned Sz = getVectorElementSize(VL.front());
9806 unsigned MinVF = getMinVF(2 * Sz);
9807 // Do not try partial vectorization for small nodes (<= 2), nodes with the
9808 // same opcode and same parent block or all constants.
9809 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9810 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9811 E.isAltShuffle() || !allSameBlock(VL)) ||
9812 allConstant(VL) || isSplat(VL))
9813 continue;
9814 // Try to find vectorizable sequences and transform them into a series of
9815 // insertvector instructions.
9816 unsigned StartIdx = 0;
9817 unsigned End = VL.size();
9818 for (unsigned VF = getFloorFullVectorNumberOfElements(
9819 *TTI, VL.front()->getType(), VL.size() - 1);
9820 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9821 *TTI, VL.front()->getType(), VF - 1)) {
9822 if (StartIdx + VF > End)
9823 continue;
9825 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9826 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9827 // If any instruction is vectorized already - do not try again.
9828 // Reuse the existing node, if it fully matches the slice.
9829 if (isVectorized(Slice.front()) &&
9830 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
9831 continue;
9832 // Constants are already handled effectively - skip.
9833 if (allConstant(Slice))
9834 continue;
9835 // Do not try to vectorize small splats (smaller than a vector register
9836 // and with only a single non-undef element).
9837 bool IsSplat = isSplat(Slice);
9838 bool IsTwoRegisterSplat = true;
9839 if (IsSplat && VF == 2) {
9840 unsigned NumRegs2VF = ::getNumberOfParts(
9841 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
9842 IsTwoRegisterSplat = NumRegs2VF == 2;
9843 }
9844 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
9845 count(Slice, Slice.front()) ==
9846 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9847 : 1)) {
9848 if (IsSplat)
9849 continue;
9850 InstructionsState S = getSameOpcode(Slice, *TLI);
9851 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9852 (S.getOpcode() == Instruction::Load &&
9854 (S.getOpcode() != Instruction::Load &&
9855 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
9856 continue;
9857 if (VF == 2) {
9858 // Try to vectorize reduced values or if all users are vectorized.
9859 // For expensive instructions extra extracts might be profitable.
9860 if ((!UserIgnoreList || E.Idx != 0) &&
9861 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9863 !all_of(Slice, [&](Value *V) {
9864 if (isa<PoisonValue>(V))
9865 return true;
9866 return areAllUsersVectorized(cast<Instruction>(V),
9867 UserIgnoreList);
9868 }))
9869 continue;
9870 if (S.getOpcode() == Instruction::Load) {
9871 OrdersType Order;
9872 SmallVector<Value *> PointerOps;
9873 LoadsState Res =
9874 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9875 // Do not vectorize gathers.
9876 if (Res == LoadsState::ScatterVectorize ||
9877 Res == LoadsState::Gather) {
9878 if (Res == LoadsState::Gather) {
9880 // If reductions and the scalars from the root node are
9881 // analyzed - mark as non-vectorizable reduction.
9882 if (UserIgnoreList && E.Idx == 0)
9883 analyzedReductionVals(Slice);
9884 }
9885 continue;
9886 }
9887 } else if (S.getOpcode() == Instruction::ExtractElement ||
9888 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9890 !CheckOperandsProfitability(
9891 S.getMainOp(),
9892 cast<Instruction>(*find_if(reverse(Slice),
9893 IsaPred<Instruction>)),
9894 S))) {
9895 // Do not vectorize extractelements (handled effectively
9896 // already). Do not vectorize non-profitable instructions (with
9897 // low cost and non-vectorizable operands).
9898 continue;
9899 }
9900 }
9901 }
9902 Slices.emplace_back(Cnt, Slice.size());
9903 }
9904 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9905 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9906 if (StartIdx == Cnt)
9907 StartIdx = Cnt + Sz;
9908 if (End == Cnt + Sz)
9909 End = Cnt;
9910 };
9911 for (auto [Cnt, Sz] : Slices) {
9912 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9913 // If any instruction is vectorized already - do not try again.
9914 if (TreeEntry *SE = getSameValuesTreeEntry(Slice.front(), Slice,
9915 /*SameVF=*/true)) {
9916 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9917 AddCombinedNode(SE->Idx, Cnt, Sz);
9918 continue;
9919 }
9920 unsigned PrevSize = VectorizableTree.size();
9921 [[maybe_unused]] unsigned PrevEntriesSize =
9922 LoadEntriesToVectorize.size();
9923 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9924 if (PrevSize + 1 == VectorizableTree.size() &&
9925 VectorizableTree[PrevSize]->isGather() &&
9926 VectorizableTree[PrevSize]->hasState() &&
9927 VectorizableTree[PrevSize]->getOpcode() !=
9928 Instruction::ExtractElement &&
9929 !isSplat(Slice)) {
9930 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9931 analyzedReductionVals(Slice);
9932 VectorizableTree.pop_back();
9933 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9934 "LoadEntriesToVectorize expected to remain the same");
9935 continue;
9936 }
9937 AddCombinedNode(PrevSize, Cnt, Sz);
9938 }
9939 }
9940 // Restore ordering, if no extra vectorization happened.
9941 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9942 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9943 reorderScalars(E.Scalars, Mask);
9944 E.ReorderIndices.clear();
9945 }
9946 }
9947 if (!E.hasState())
9948 continue;
9949 switch (E.getOpcode()) {
9950 case Instruction::Load: {
9951 // No need to reorder masked gather loads, just reorder the scalar
9952 // operands.
9953 if (E.State != TreeEntry::Vectorize)
9954 break;
9955 Type *ScalarTy = E.getMainOp()->getType();
9956 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9957 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9958 // Check if profitable to represent consecutive load + reverse as strided
9959 // load with stride -1.
9960 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9961 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9962 SmallVector<int> Mask;
9963 inversePermutation(E.ReorderIndices, Mask);
9964 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9965 InstructionCost OriginalVecCost =
9966 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9971 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9972 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9973 if (StridedCost < OriginalVecCost)
9974 // Strided load is more profitable than consecutive load + reverse -
9975 // transform the node to strided load.
9976 E.State = TreeEntry::StridedVectorize;
9977 }
9978 break;
9979 }
9980 case Instruction::Store: {
9981 Type *ScalarTy =
9982 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9983 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9984 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9985 // Check if profitable to represent consecutive store + reverse as strided
9986 // store with stride -1.
9987 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9988 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9989 SmallVector<int> Mask;
9990 inversePermutation(E.ReorderIndices, Mask);
9991 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9992 InstructionCost OriginalVecCost =
9993 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9998 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9999 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
10000 if (StridedCost < OriginalVecCost)
10001 // Strided store is more profitable than reverse + consecutive store -
10002 // transform the node to strided store.
10003 E.State = TreeEntry::StridedVectorize;
10004 } else if (!E.ReorderIndices.empty()) {
10005 // Check for interleaved stores.
10006 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
10007 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
10008 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
10009 if (Mask.size() < 4)
10010 return 0u;
10011 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
10013 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
10015 VecTy, Factor, BaseSI->getAlign(),
10016 BaseSI->getPointerAddressSpace()))
10017 return Factor;
10018 }
10019
10020 return 0u;
10021 };
10022 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
10023 unsigned InterleaveFactor = IsInterleaveMask(Mask);
10024 if (InterleaveFactor != 0)
10025 E.setInterleave(InterleaveFactor);
10026 }
10027 break;
10028 }
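// Editorial note: illustrative example, not part of the upstream source.
// For eight scalar stores whose reorder mask looks like
// <0, 4, 1, 5, 2, 6, 3, 7>, the check above would report an interleave
// factor of 2, provided the target considers such an interleaved store
// legal; the exact accepted shapes are decided by
// ShuffleVectorInst::isInterleaveMask().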
10029 case Instruction::Select: {
10030 if (E.State != TreeEntry::Vectorize)
10031 break;
10032 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
10033 if (MinMaxID == Intrinsic::not_intrinsic)
10034 break;
10035 // This node is a minmax node.
10036 E.CombinedOp = TreeEntry::MinMax;
10037 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
10038 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10039 CondEntry->State == TreeEntry::Vectorize) {
10040 // The condition node is part of the combined minmax node.
10041 CondEntry->State = TreeEntry::CombinedVectorize;
10042 }
10043 break;
10044 }
10045 default:
10046 break;
10047 }
10048 }
10049
10050 if (LoadEntriesToVectorize.empty()) {
10051 // Single load node - exit.
10052 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10053 VectorizableTree.front()->getOpcode() == Instruction::Load)
10054 return;
10055 // Small graph with small VF - exit.
10056 constexpr unsigned SmallTree = 3;
10057 constexpr unsigned SmallVF = 2;
10058 if ((VectorizableTree.size() <= SmallTree &&
10059 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10060 (VectorizableTree.size() <= 2 && UserIgnoreList))
10061 return;
10062
10063 if (VectorizableTree.front()->isNonPowOf2Vec() &&
10064 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
10065 getCanonicalGraphSize() <= SmallTree &&
10066 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10067 [](const std::unique_ptr<TreeEntry> &TE) {
10068 return TE->isGather() && TE->hasState() &&
10069 TE->getOpcode() == Instruction::Load &&
10070 !allSameBlock(TE->Scalars);
10071 }) == 1)
10072 return;
10073 }
10074
10075 // A list of loads to be gathered during the vectorization process. We can
10076 // try to vectorize them at the end, if profitable.
10079 GatheredLoads;
10080
10081 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10082 TreeEntry &E = *TE;
10083 if (E.isGather() &&
10084 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10085 (!E.hasState() && any_of(E.Scalars,
10086 [&](Value *V) {
10087 return isa<LoadInst>(V) &&
10088 !isVectorized(V) &&
10089 !isDeleted(cast<Instruction>(V));
10090 }))) &&
10091 !isSplat(E.Scalars)) {
10092 for (Value *V : E.Scalars) {
10093 auto *LI = dyn_cast<LoadInst>(V);
10094 if (!LI)
10095 continue;
10096 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10097 continue;
10099 *this, V, *DL, *SE, *TTI,
10100 GatheredLoads[std::make_tuple(
10101 LI->getParent(),
10103 LI->getType())]);
10104 }
10105 }
10106 }
10107 // Try to vectorize gathered loads if this is not just a gather of loads.
10108 if (!GatheredLoads.empty())
10109 tryToVectorizeGatheredLoads(GatheredLoads);
10110}
10111
10112/// Merges shuffle masks and emits the final shuffle instruction, if required. It
10113/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
10114/// the actual shuffle instruction is generated only if it is actually
10115/// required. Otherwise, the shuffle instruction emission is delayed till the
10116/// end of the process, to reduce the number of emitted instructions and to
10117/// simplify further analysis/transformations.
10118class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10119 bool IsFinalized = false;
10120 SmallVector<int> CommonMask;
10122 const TargetTransformInfo &TTI;
10124 SmallDenseSet<Value *> VectorizedVals;
10125 BoUpSLP &R;
10126 SmallPtrSetImpl<Value *> &CheckedExtracts;
10127 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10128 /// While set, we are still trying to estimate the cost for the same nodes and
10129 /// can delay the actual cost estimation (virtual shuffle instruction emission).
10130 /// This may help to better estimate the cost if the same nodes must be permuted,
10131 /// and allows moving most of the long shuffle cost estimation to TTI.
10132 bool SameNodesEstimated = true;
10133
10134 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10135 if (Ty->getScalarType()->isPointerTy()) {
10139 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10140 Ty->getScalarType());
10141 if (auto *VTy = dyn_cast<VectorType>(Ty))
10142 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10143 return Res;
10144 }
10145 return Constant::getAllOnesValue(Ty);
10146 }
10147
10148 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10149 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10150 return TTI::TCC_Free;
10151 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10152 InstructionCost GatherCost = 0;
10153 SmallVector<Value *> Gathers(VL);
10154 if (!Root && isSplat(VL)) {
10155 // Found a broadcast of a single scalar; calculate the cost as
10156 // the broadcast.
10157 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10158 assert(It != VL.end() && "Expected at least one non-undef value.");
10159 // Add broadcast for non-identity shuffle only.
10160 bool NeedShuffle =
10161 count(VL, *It) > 1 &&
10162 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10163 if (!NeedShuffle) {
10164 if (isa<FixedVectorType>(ScalarTy)) {
10165 assert(SLPReVec && "FixedVectorType is not expected.");
10166 return TTI.getShuffleCost(
10167 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10168 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10169 cast<FixedVectorType>(ScalarTy));
10170 }
10171 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10172 CostKind, std::distance(VL.begin(), It),
10173 PoisonValue::get(VecTy), *It);
10174 }
10175
10176 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10177 transform(VL, ShuffleMask.begin(), [](Value *V) {
10178 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10179 });
10180 InstructionCost InsertCost =
10181 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10182 PoisonValue::get(VecTy), *It);
10183 return InsertCost + ::getShuffleCost(TTI,
10185 VecTy, ShuffleMask, CostKind,
10186 /*Index=*/0, /*SubTp=*/nullptr,
10187 /*Args=*/*It);
10188 }
10189 return GatherCost +
10190 (all_of(Gathers, IsaPred<UndefValue>)
10192 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10193 ScalarTy));
10194 };
10195
10196 /// Compute the cost of creating a vector containing the extracted values from
10197 /// \p VL.
10199 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10200 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10201 unsigned NumParts) {
10202 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10203 unsigned NumElts =
10204 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10205 auto *EE = dyn_cast<ExtractElementInst>(V);
10206 if (!EE)
10207 return Sz;
10208 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10209 if (!VecTy)
10210 return Sz;
10211 return std::max(Sz, VecTy->getNumElements());
10212 });
10213 // FIXME: this must be moved to TTI for better estimation.
10214 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10215 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10217 -> std::optional<TTI::ShuffleKind> {
10218 if (NumElts <= EltsPerVector)
10219 return std::nullopt;
10220 int OffsetReg0 =
10221 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10222 [](int S, int I) {
10223 if (I == PoisonMaskElem)
10224 return S;
10225 return std::min(S, I);
10226 }),
10227 EltsPerVector);
10228 int OffsetReg1 = OffsetReg0;
10229 DenseSet<int> RegIndices;
10230 // Check if we are trying to permute the same single or 2 input vectors.
10232 int FirstRegId = -1;
10233 Indices.assign(1, OffsetReg0);
10234 for (auto [Pos, I] : enumerate(Mask)) {
10235 if (I == PoisonMaskElem)
10236 continue;
10237 int Idx = I - OffsetReg0;
10238 int RegId =
10239 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10240 if (FirstRegId < 0)
10241 FirstRegId = RegId;
10242 RegIndices.insert(RegId);
10243 if (RegIndices.size() > 2)
10244 return std::nullopt;
10245 if (RegIndices.size() == 2) {
10246 ShuffleKind = TTI::SK_PermuteTwoSrc;
10247 if (Indices.size() == 1) {
10248 OffsetReg1 = alignDown(
10249 std::accumulate(
10250 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10251 [&](int S, int I) {
10252 if (I == PoisonMaskElem)
10253 return S;
10254 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10255 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10256 if (RegId == FirstRegId)
10257 return S;
10258 return std::min(S, I);
10259 }),
10260 EltsPerVector);
10261 Indices.push_back(OffsetReg1 % NumElts);
10262 }
10263 Idx = I - OffsetReg1;
10264 }
10265 I = (Idx % NumElts) % EltsPerVector +
10266 (RegId == FirstRegId ? 0 : EltsPerVector);
10267 }
10268 return ShuffleKind;
10269 };
10271
10272 // Process extracts in blocks of EltsPerVector to check if the source vector
10273 // operand can be re-used directly. If not, add the cost of creating a
10274 // shuffle to extract the values into a vector register.
10275 for (unsigned Part : seq<unsigned>(NumParts)) {
10276 if (!ShuffleKinds[Part])
10277 continue;
10278 ArrayRef<int> MaskSlice = Mask.slice(
10279 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10280 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10281 copy(MaskSlice, SubMask.begin());
10283 std::optional<TTI::ShuffleKind> RegShuffleKind =
10284 CheckPerRegistersShuffle(SubMask, Indices);
10285 if (!RegShuffleKind) {
10286 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10288 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10289 Cost +=
10290 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10291 getWidenedType(ScalarTy, NumElts), MaskSlice);
10292 continue;
10293 }
10294 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10295 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10296 Cost +=
10297 ::getShuffleCost(TTI, *RegShuffleKind,
10298 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10299 }
10300 const unsigned BaseVF = getFullVectorNumberOfElements(
10301 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10302 for (unsigned Idx : Indices) {
10303 assert((Idx + EltsPerVector) <= BaseVF &&
10304 "SK_ExtractSubvector index out of range");
10306 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10307 Idx, getWidenedType(ScalarTy, EltsPerVector));
10308 }
10309 // Second attempt to check if just a permute is estimated cheaper than a
10310 // subvector extract.
10311 SubMask.assign(NumElts, PoisonMaskElem);
10312 copy(MaskSlice, SubMask.begin());
10313 InstructionCost OriginalCost = ::getShuffleCost(
10314 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10315 if (OriginalCost < Cost)
10316 Cost = OriginalCost;
10317 }
10318 return Cost;
10319 }
10320 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
10321 /// mask \p Mask for register number \p Part, which includes \p SliceSize
10322 /// elements.
10323 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10324 ArrayRef<int> Mask, unsigned Part,
10325 unsigned SliceSize) {
10326 if (SameNodesEstimated) {
10327 // Delay the cost estimation if the same nodes are being reshuffled.
10328 // If we already requested the cost of reshuffling of E1 and E2 before, no
10329 // need to estimate another cost with the sub-Mask, instead include this
10330 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10331 // estimation.
10332 if ((InVectors.size() == 2 &&
10333 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10334 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10335 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10336 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10337 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10338 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10339 "Expected all poisoned elements.");
10340 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10341 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10342 return;
10343 }
10344 // Found non-matching nodes - need to estimate the cost for the matched
10345 // nodes and transform the mask.
10346 Cost += createShuffle(InVectors.front(),
10347 InVectors.size() == 1 ? nullptr : InVectors.back(),
10348 CommonMask);
10349 transformMaskAfterShuffle(CommonMask, CommonMask);
10350 } else if (InVectors.size() == 2) {
10351 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10352 transformMaskAfterShuffle(CommonMask, CommonMask);
10353 }
10354 SameNodesEstimated = false;
10355 if (!E2 && InVectors.size() == 1) {
10356 unsigned VF = E1.getVectorFactor();
10357 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10358 VF = std::max(VF,
10359 cast<FixedVectorType>(V1->getType())->getNumElements());
10360 } else {
10361 const auto *E = cast<const TreeEntry *>(InVectors.front());
10362 VF = std::max(VF, E->getVectorFactor());
10363 }
10364 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10365 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10366 CommonMask[Idx] = Mask[Idx] + VF;
10367 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10368 transformMaskAfterShuffle(CommonMask, CommonMask);
10369 } else {
10370 auto P = InVectors.front();
10371 Cost += createShuffle(&E1, E2, Mask);
10372 unsigned VF = Mask.size();
10373 if (Value *V1 = P.dyn_cast<Value *>()) {
10374 VF = std::max(VF,
10375 getNumElements(V1->getType()));
10376 } else {
10377 const auto *E = cast<const TreeEntry *>(P);
10378 VF = std::max(VF, E->getVectorFactor());
10379 }
10380 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10381 if (Mask[Idx] != PoisonMaskElem)
10382 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10383 Cost += createShuffle(P, InVectors.front(), CommonMask);
10384 transformMaskAfterShuffle(CommonMask, CommonMask);
10385 }
10386 }
10387
10388 class ShuffleCostBuilder {
10389 const TargetTransformInfo &TTI;
10390
10391 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10392 int Index = -1;
10393 return Mask.empty() ||
10394 (VF == Mask.size() &&
10397 Index == 0);
10398 }
10399
10400 public:
10401 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10402 ~ShuffleCostBuilder() = default;
10403 InstructionCost createShuffleVector(Value *V1, Value *,
10404 ArrayRef<int> Mask) const {
10405 // Empty mask or identity mask are free.
10406 unsigned VF =
10407 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10408 if (isEmptyOrIdentity(Mask, VF))
10409 return TTI::TCC_Free;
10410 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10411 cast<VectorType>(V1->getType()), Mask);
10412 }
10413 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10414 // Empty mask or identity mask are free.
10415 unsigned VF =
10416 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10417 if (isEmptyOrIdentity(Mask, VF))
10418 return TTI::TCC_Free;
10419 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10420 cast<VectorType>(V1->getType()), Mask);
10421 }
10422 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10423 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10424 return TTI::TCC_Free;
10425 }
10426 void resizeToMatch(Value *&, Value *&) const {}
10427 };
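// Editorial note: illustrative sketch, not part of the upstream source.
// This builder lets the generic BaseShuffleAnalysis::createShuffle template
// produce an InstructionCost instead of IR:
//   ShuffleCostBuilder Builder(TTI);
//   InstructionCost C = BaseShuffleAnalysis::createShuffle<InstructionCost>(
//       V1, V2, Mask, Builder, ScalarTy);
// mirroring the way the real shuffle is emitted later when the tree is
// vectorized.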
10428
10429 /// Smart shuffle instruction emission, walks through shuffles trees and
10430 /// tries to find the best matching vector for the actual shuffle
10431 /// instruction.
10433 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10435 ArrayRef<int> Mask) {
10436 ShuffleCostBuilder Builder(TTI);
10437 SmallVector<int> CommonMask(Mask);
10438 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10439 unsigned CommonVF = Mask.size();
10440 InstructionCost ExtraCost = 0;
10441 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10442 unsigned VF) -> InstructionCost {
10443 if (E.isGather() && allConstant(E.Scalars))
10444 return TTI::TCC_Free;
10445 Type *EScalarTy = E.Scalars.front()->getType();
10446 bool IsSigned = true;
10447 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10448 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10449 IsSigned = It->second.second;
10450 }
10451 if (EScalarTy != ScalarTy) {
10452 unsigned CastOpcode = Instruction::Trunc;
10453 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10454 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10455 if (DstSz > SrcSz)
10456 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10457 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10458 getWidenedType(EScalarTy, VF),
10459 TTI::CastContextHint::None, CostKind);
10460 }
10461 return TTI::TCC_Free;
10462 };
10463 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10464 if (isa<Constant>(V))
10465 return TTI::TCC_Free;
10466 auto *VecTy = cast<VectorType>(V->getType());
10467 Type *EScalarTy = VecTy->getElementType();
10468 if (EScalarTy != ScalarTy) {
10469 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10470 unsigned CastOpcode = Instruction::Trunc;
10471 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10472 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10473 if (DstSz > SrcSz)
10474 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10475 return TTI.getCastInstrCost(
10476 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10477 VecTy, TTI::CastContextHint::None, CostKind);
10478 }
10479 return TTI::TCC_Free;
10480 };
10481 if (!V1 && !V2 && !P2.isNull()) {
10482 // Shuffle 2 entry nodes.
10483 const TreeEntry *E = cast<const TreeEntry *>(P1);
10484 unsigned VF = E->getVectorFactor();
10485 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10486 CommonVF = std::max(VF, E2->getVectorFactor());
10487 assert(all_of(Mask,
10488 [=](int Idx) {
10489 return Idx < 2 * static_cast<int>(CommonVF);
10490 }) &&
10491 "All elements in mask must be less than 2 * CommonVF.");
10492 if (E->Scalars.size() == E2->Scalars.size()) {
10493 SmallVector<int> EMask = E->getCommonMask();
10494 SmallVector<int> E2Mask = E2->getCommonMask();
10495 if (!EMask.empty() || !E2Mask.empty()) {
10496 for (int &Idx : CommonMask) {
10497 if (Idx == PoisonMaskElem)
10498 continue;
10499 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10500 Idx = EMask[Idx];
10501 else if (Idx >= static_cast<int>(CommonVF))
10502 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10503 E->Scalars.size();
10504 }
10505 }
10506 CommonVF = E->Scalars.size();
10507 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10508 GetNodeMinBWAffectedCost(*E2, CommonVF);
10509 } else {
10510 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10511 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10512 }
10513 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10514 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10515 } else if (!V1 && P2.isNull()) {
10516 // Shuffle single entry node.
10517 const TreeEntry *E = cast<const TreeEntry *>(P1);
10518 unsigned VF = E->getVectorFactor();
10519 CommonVF = VF;
10520 assert(
10521 all_of(Mask,
10522 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10523 "All elements in mask must be less than CommonVF.");
10524 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10525 SmallVector<int> EMask = E->getCommonMask();
10526 assert(!EMask.empty() && "Expected non-empty common mask.");
10527 for (int &Idx : CommonMask) {
10528 if (Idx != PoisonMaskElem)
10529 Idx = EMask[Idx];
10530 }
10531 CommonVF = E->Scalars.size();
10532 } else if (unsigned Factor = E->getInterleaveFactor();
10533 Factor > 0 && E->Scalars.size() != Mask.size() &&
10535 Factor)) {
10536 // Deinterleaved nodes are free.
10537 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10538 }
10539 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10540 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10541 // Not identity/broadcast? Try to see if the original vector is better.
10542 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10543 CommonVF == CommonMask.size() &&
10544 any_of(enumerate(CommonMask),
10545 [](const auto &&P) {
10546 return P.value() != PoisonMaskElem &&
10547 static_cast<unsigned>(P.value()) != P.index();
10548 }) &&
10549 any_of(CommonMask,
10550 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10551 SmallVector<int> ReorderMask;
10552 inversePermutation(E->ReorderIndices, ReorderMask);
10553 ::addMask(CommonMask, ReorderMask);
10554 }
10555 } else if (V1 && P2.isNull()) {
10556 // Shuffle single vector.
10557 ExtraCost += GetValueMinBWAffectedCost(V1);
10558 CommonVF = getVF(V1);
10559 assert(
10560 all_of(Mask,
10561 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10562 "All elements in mask must be less than CommonVF.");
10563 } else if (V1 && !V2) {
10564 // Shuffle vector and tree node.
10565 unsigned VF = getVF(V1);
10566 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10567 CommonVF = std::max(VF, E2->getVectorFactor());
10568 assert(all_of(Mask,
10569 [=](int Idx) {
10570 return Idx < 2 * static_cast<int>(CommonVF);
10571 }) &&
10572 "All elements in mask must be less than 2 * CommonVF.");
10573 if (E2->Scalars.size() == VF && VF != CommonVF) {
10574 SmallVector<int> E2Mask = E2->getCommonMask();
10575 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10576 for (int &Idx : CommonMask) {
10577 if (Idx == PoisonMaskElem)
10578 continue;
10579 if (Idx >= static_cast<int>(CommonVF))
10580 Idx = E2Mask[Idx - CommonVF] + VF;
10581 }
10582 CommonVF = VF;
10583 }
10584 ExtraCost += GetValueMinBWAffectedCost(V1);
10585 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10586 ExtraCost += GetNodeMinBWAffectedCost(
10587 *E2, std::min(CommonVF, E2->getVectorFactor()));
10588 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10589 } else if (!V1 && V2) {
10590 // Shuffle vector and tree node.
10591 unsigned VF = getVF(V2);
10592 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10593 CommonVF = std::max(VF, E1->getVectorFactor());
10594 assert(all_of(Mask,
10595 [=](int Idx) {
10596 return Idx < 2 * static_cast<int>(CommonVF);
10597 }) &&
10598 "All elements in mask must be less than 2 * CommonVF.");
10599 if (E1->Scalars.size() == VF && VF != CommonVF) {
10600 SmallVector<int> E1Mask = E1->getCommonMask();
10601 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10602 for (int &Idx : CommonMask) {
10603 if (Idx == PoisonMaskElem)
10604 continue;
10605 if (Idx >= static_cast<int>(CommonVF))
10606 Idx = E1Mask[Idx - CommonVF] + VF;
10607 else
10608 Idx = E1Mask[Idx];
10609 }
10610 CommonVF = VF;
10611 }
10612 ExtraCost += GetNodeMinBWAffectedCost(
10613 *E1, std::min(CommonVF, E1->getVectorFactor()));
10614 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10615 ExtraCost += GetValueMinBWAffectedCost(V2);
10616 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10617 } else {
10618 assert(V1 && V2 && "Expected both vectors.");
10619 unsigned VF = getVF(V1);
10620 CommonVF = std::max(VF, getVF(V2));
10621 assert(all_of(Mask,
10622 [=](int Idx) {
10623 return Idx < 2 * static_cast<int>(CommonVF);
10624 }) &&
10625 "All elements in mask must be less than 2 * CommonVF.");
10626 ExtraCost +=
10627 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10628 if (V1->getType() != V2->getType()) {
10629 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10630 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10631 } else {
10632 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10633 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10634 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10635 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10636 }
10637 }
10638 InVectors.front() =
10639 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10640 if (InVectors.size() == 2)
10641 InVectors.pop_back();
10642 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10643 V1, V2, CommonMask, Builder, ScalarTy);
10644 }
10645
10646 public:
10647 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10648 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10649 SmallPtrSetImpl<Value *> &CheckedExtracts)
10650 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10651 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10652 CheckedExtracts(CheckedExtracts) {}
10653 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10654 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10655 unsigned NumParts, bool &UseVecBaseAsInput) {
10656 UseVecBaseAsInput = false;
10657 if (Mask.empty())
10658 return nullptr;
10659 Value *VecBase = nullptr;
10660 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10661 if (!E->ReorderIndices.empty()) {
10662 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10663 E->ReorderIndices.end());
10664 reorderScalars(VL, ReorderMask);
10665 }
10666 // Check if it can be considered reused if the same extractelements were
10667 // already vectorized.
10668 bool PrevNodeFound = any_of(
10669 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10670 [&](const std::unique_ptr<TreeEntry> &TE) {
10671 return ((TE->hasState() && !TE->isAltShuffle() &&
10672 TE->getOpcode() == Instruction::ExtractElement) ||
10673 TE->isGather()) &&
10674 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10675 return VL.size() > Data.index() &&
10676 (Mask[Data.index()] == PoisonMaskElem ||
10677 isa<UndefValue>(VL[Data.index()]) ||
10678 Data.value() == VL[Data.index()]);
10679 });
10680 });
10681 SmallPtrSet<Value *, 4> UniqueBases;
10682 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10683 for (unsigned Part : seq<unsigned>(NumParts)) {
10684 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10685 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10686 for (auto [I, V] :
10687 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10688 // Ignore non-extractelement scalars.
10689 if (isa<UndefValue>(V) ||
10690 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10691 continue;
10692 // If all users of the instruction are going to be vectorized and the
10693 // instruction itself is not going to be vectorized, consider the
10694 // instruction dead and remove its cost from the final cost of the
10695 // vectorized tree.
10696 // Also, avoid adjusting the cost for extractelements with multiple uses
10697 // in different graph entries.
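// For example, if %e = extractelement <4 x float> %v, i32 1 is only used by
// scalars that end up in the vectorized tree, %e becomes dead after
// vectorization, so its extract cost is credited back (subtracted) here.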
10698 auto *EE = cast<ExtractElementInst>(V);
10699 VecBase = EE->getVectorOperand();
10700 UniqueBases.insert(VecBase);
10701 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
10702 if (!CheckedExtracts.insert(V).second ||
10703 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10704 any_of(EE->users(),
10705 [&](User *U) {
10706 return isa<GetElementPtrInst>(U) &&
10707 !R.areAllUsersVectorized(cast<Instruction>(U),
10708 &VectorizedVals);
10709 }) ||
10710 (!VEs.empty() && !is_contained(VEs, E)))
10711 continue;
10712 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10713 if (!EEIdx)
10714 continue;
10715 unsigned Idx = *EEIdx;
10716 // Take credit for instruction that will become dead.
10717 if (EE->hasOneUse() || !PrevNodeFound) {
10718 Instruction *Ext = EE->user_back();
10719 if (isa<SExtInst, ZExtInst>(Ext) &&
10720 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10721 // Use getExtractWithExtendCost() to calculate the cost of
10722 // extractelement/ext pair.
10723 Cost -=
10724 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10725 EE->getVectorOperandType(), Idx);
10726 // Add back the cost of s|zext which is subtracted separately.
10727 Cost += TTI.getCastInstrCost(
10728 Ext->getOpcode(), Ext->getType(), EE->getType(),
10729 TTI::getCastContextHint(Ext), CostKind, Ext);
10730 continue;
10731 }
10732 }
10733 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10734 CostKind, Idx);
10735 }
10736 }
10737 // Check that the gather of extractelements can be represented as just a
10738 // shuffle of one or two vectors from which the scalars are extracted.
10739 // We found a bunch of extractelement instructions that must be gathered
10740 // into a vector and that can be represented as a permutation of elements
10741 // of a single input vector or of 2 input vectors.
10742 // This is skipped if the same extractelements were already vectorized.
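// For example, the gathered scalars
//   %a = extractelement <4 x i32> %v, i32 0
//   %b = extractelement <4 x i32> %v, i32 2
// can be modeled as a single shuffle:
//   shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 0, i32 2>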
10743 if (!PrevNodeFound)
10744 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10745 InVectors.assign(1, E);
10746 CommonMask.assign(Mask.begin(), Mask.end());
10747 transformMaskAfterShuffle(CommonMask, CommonMask);
10748 SameNodesEstimated = false;
10749 if (NumParts != 1 && UniqueBases.size() != 1) {
10750 UseVecBaseAsInput = true;
10751 VecBase =
10752 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10753 }
10754 return VecBase;
10755 }
10756 /// Checks if the specified entry \p E needs to be delayed because of its
10757 /// dependency nodes.
10758 std::optional<InstructionCost>
10759 needToDelay(const TreeEntry *,
10760 ArrayRef<SmallVector<const TreeEntry *>>) const {
10761 // No need to delay the cost estimation during analysis.
10762 return std::nullopt;
10763 }
10764 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10765 if (&E1 == &E2) {
10766 assert(all_of(Mask,
10767 [&](int Idx) {
10768 return Idx < static_cast<int>(E1.getVectorFactor());
10769 }) &&
10770 "Expected single vector shuffle mask.");
10771 add(E1, Mask);
10772 return;
10773 }
10774 if (InVectors.empty()) {
10775 CommonMask.assign(Mask.begin(), Mask.end());
10776 InVectors.assign({&E1, &E2});
10777 return;
10778 }
10779 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10780 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10781 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10782 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10783 const auto *It =
10784 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10785 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10786 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10787 }
10788 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10789 if (InVectors.empty()) {
10790 CommonMask.assign(Mask.begin(), Mask.end());
10791 InVectors.assign(1, &E1);
10792 return;
10793 }
10794 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10795 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10796 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10797 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10798 const auto *It =
10799 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10800 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10801 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10802 if (!SameNodesEstimated && InVectors.size() == 1)
10803 InVectors.emplace_back(&E1);
10804 }
10805 /// Adds 2 input vectors and the mask for their shuffling.
10806 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10807 // May come only for shuffling of 2 vectors with extractelements, already
10808 // handled in adjustExtracts.
10809 assert(InVectors.size() == 1 &&
10810 all_of(enumerate(CommonMask),
10811 [&](auto P) {
10812 if (P.value() == PoisonMaskElem)
10813 return Mask[P.index()] == PoisonMaskElem;
10814 auto *EI = cast<ExtractElementInst>(
10815 cast<const TreeEntry *>(InVectors.front())
10816 ->getOrdered(P.index()));
10817 return EI->getVectorOperand() == V1 ||
10818 EI->getVectorOperand() == V2;
10819 }) &&
10820 "Expected extractelement vectors.");
10821 }
10822 /// Adds another one input vector and the mask for the shuffling.
10823 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10824 if (InVectors.empty()) {
10825 assert(CommonMask.empty() && !ForExtracts &&
10826 "Expected empty input mask/vectors.");
10827 CommonMask.assign(Mask.begin(), Mask.end());
10828 InVectors.assign(1, V1);
10829 return;
10830 }
10831 if (ForExtracts) {
10832 // No need to add vectors here, already handled them in adjustExtracts.
10833 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10834 !CommonMask.empty() &&
10835 all_of(enumerate(CommonMask),
10836 [&](auto P) {
10837 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10838 ->getOrdered(P.index());
10839 if (P.value() == PoisonMaskElem)
10840 return P.value() == Mask[P.index()] ||
10841 isa<UndefValue>(Scalar);
10842 if (isa<Constant>(V1))
10843 return true;
10844 auto *EI = cast<ExtractElementInst>(Scalar);
10845 return EI->getVectorOperand() == V1;
10846 }) &&
10847 "Expected only tree entry for extractelement vectors.");
10848 return;
10849 }
10850 assert(!InVectors.empty() && !CommonMask.empty() &&
10851 "Expected only tree entries from extracts/reused buildvectors.");
10852 unsigned VF = getVF(V1);
10853 if (InVectors.size() == 2) {
10854 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10855 transformMaskAfterShuffle(CommonMask, CommonMask);
10856 VF = std::max<unsigned>(VF, CommonMask.size());
10857 } else if (const auto *InTE =
10858 InVectors.front().dyn_cast<const TreeEntry *>()) {
10859 VF = std::max(VF, InTE->getVectorFactor());
10860 } else {
10861 VF = std::max(
10862 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10863 ->getNumElements());
10864 }
10865 InVectors.push_back(V1);
10866 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10867 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10868 CommonMask[Idx] = Mask[Idx] + VF;
10869 }
10870 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10871 Value *Root = nullptr) {
10872 Cost += getBuildVectorCost(VL, Root);
10873 if (!Root) {
10874 // FIXME: Need to find a way to avoid use of getNullValue here.
10875 SmallVector<Constant *> Vals;
10876 unsigned VF = VL.size();
10877 if (MaskVF != 0)
10878 VF = std::min(VF, MaskVF);
10879 for (Value *V : VL.take_front(VF)) {
10880 if (isa<UndefValue>(V)) {
10881 Vals.push_back(cast<Constant>(V));
10882 continue;
10883 }
10884 Vals.push_back(Constant::getNullValue(V->getType()));
10885 }
10886 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10887 assert(SLPReVec && "FixedVectorType is not expected.");
10888 // When REVEC is enabled, we need to expand vector types into scalar
10889 // types.
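// For example, with VF = 2 and <2 x i32> elements, each element is expanded
// into two i32 scalars, producing a flat <4 x i32> constant.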
10890 unsigned VecTyNumElements = VecTy->getNumElements();
10891 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10892 for (auto [I, V] : enumerate(Vals)) {
10893 Type *ScalarTy = V->getType()->getScalarType();
10894 Constant *NewVal;
10895 if (isa<PoisonValue>(V))
10896 NewVal = PoisonValue::get(ScalarTy);
10897 else if (isa<UndefValue>(V))
10898 NewVal = UndefValue::get(ScalarTy);
10899 else
10900 NewVal = Constant::getNullValue(ScalarTy);
10901 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10902 NewVal);
10903 }
10904 Vals.swap(NewVals);
10905 }
10906 return ConstantVector::get(Vals);
10907 }
10908 return ConstantVector::getSplat(
10909 ElementCount::getFixed(
10910 cast<FixedVectorType>(Root->getType())->getNumElements()),
10911 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10912 }
10914 /// Finalize emission of the shuffles.
10915 InstructionCost
10916 finalize(ArrayRef<int> ExtMask,
10917 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10918 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10919 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10920 IsFinalized = true;
10921 if (Action) {
10922 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10923 if (InVectors.size() == 2)
10924 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10925 else
10926 Cost += createShuffle(Vec, nullptr, CommonMask);
10927 transformMaskAfterShuffle(CommonMask, CommonMask);
10928 assert(VF > 0 &&
10929 "Expected vector length for the final value before action.");
10930 Value *V = cast<Value *>(Vec);
10931 Action(V, CommonMask);
10932 InVectors.front() = V;
10933 }
10934 if (!SubVectors.empty()) {
10935 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10936 if (InVectors.size() == 2)
10937 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10938 else
10939 Cost += createShuffle(Vec, nullptr, CommonMask);
10940 transformMaskAfterShuffle(CommonMask, CommonMask);
10941 // Add subvectors permutation cost.
10942 if (!SubVectorsMask.empty()) {
10943 assert(SubVectorsMask.size() <= CommonMask.size() &&
10944 "Expected same size of masks for subvectors and common mask.");
10945 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10946 copy(SubVectorsMask, SVMask.begin());
10947 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10948 if (I2 != PoisonMaskElem) {
10949 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10950 I1 = I2 + CommonMask.size();
10951 }
10952 }
10953 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10954 getWidenedType(ScalarTy, CommonMask.size()),
10955 SVMask, CostKind);
10956 }
10957 for (auto [E, Idx] : SubVectors) {
10958 Type *EScalarTy = E->Scalars.front()->getType();
10959 bool IsSigned = true;
10960 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10961 EScalarTy =
10962 IntegerType::get(EScalarTy->getContext(), It->second.first);
10963 IsSigned = It->second.second;
10964 }
10965 if (ScalarTy != EScalarTy) {
10966 unsigned CastOpcode = Instruction::Trunc;
10967 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10968 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10969 if (DstSz > SrcSz)
10970 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10971 Cost += TTI.getCastInstrCost(
10972 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10973 getWidenedType(EScalarTy, E->getVectorFactor()),
10974 TTI::CastContextHint::None, CostKind);
10975 }
10976 Cost += ::getShuffleCost(
10977 TTI, TTI::SK_InsertSubvector,
10978 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10979 getWidenedType(ScalarTy, E->getVectorFactor()));
10980 if (!CommonMask.empty()) {
10981 std::iota(std::next(CommonMask.begin(), Idx),
10982 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10983 Idx);
10984 }
10985 }
10986 }
10987
10988 if (!ExtMask.empty()) {
10989 if (CommonMask.empty()) {
10990 CommonMask.assign(ExtMask.begin(), ExtMask.end());
10991 } else {
10992 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
10993 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
10994 if (ExtMask[I] == PoisonMaskElem)
10995 continue;
10996 NewMask[I] = CommonMask[ExtMask[I]];
10997 }
10998 CommonMask.swap(NewMask);
10999 }
11000 }
11001 if (CommonMask.empty()) {
11002 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11003 return Cost;
11004 }
11005 return Cost +
11006 createShuffle(InVectors.front(),
11007 InVectors.size() == 2 ? InVectors.back() : nullptr,
11008 CommonMask);
11009 }
11010
11011 ~ShuffleCostEstimator() {
11012 assert((IsFinalized || CommonMask.empty()) &&
11013 "Shuffle construction must be finalized.");
11014 }
11015};
11016
11017const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
11018 unsigned Idx) const {
11019 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
11020 return VE;
11021 const auto *It =
11022 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11023 return TE->isGather() &&
11024 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
11025 return EI.EdgeIdx == Idx && EI.UserTE == E;
11026 }) != TE->UserTreeIndices.end();
11027 });
11028 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
11029 return It->get();
11030}
11031
11032TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
11033 if (TE.State == TreeEntry::ScatterVectorize ||
11034 TE.State == TreeEntry::StridedVectorize)
11035 return TTI::CastContextHint::GatherScatter;
11036 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11037 !TE.isAltShuffle()) {
11038 if (TE.ReorderIndices.empty())
11039 return TTI::CastContextHint::Normal;
11040 SmallVector<int> Mask;
11041 inversePermutation(TE.ReorderIndices, Mask);
11042 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
11043 return TTI::CastContextHint::Reversed;
11044 }
11045 return TTI::CastContextHint::None;
11046}
11047
11048 /// Builds the argument types vector for the given call instruction with the
11049 /// given \p ID for the specified vector factor.
11050 static SmallVector<Type *>
11051 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
11052 const unsigned VF, unsigned MinBW,
11053 const TargetTransformInfo *TTI) {
11054 SmallVector<Type *> ArgTys;
11055 for (auto [Idx, Arg] : enumerate(CI->args())) {
11056 if (ID != Intrinsic::not_intrinsic) {
11057 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
11058 ArgTys.push_back(Arg->getType());
11059 continue;
11060 }
11061 if (MinBW > 0) {
11062 ArgTys.push_back(
11063 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11064 continue;
11065 }
11066 }
11067 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11068 }
11069 return ArgTys;
11070}
11071
11072 InstructionCost
11073 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11074 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11075 ArrayRef<Value *> VL = E->Scalars;
11076
11077 Type *ScalarTy = getValueType(VL[0]);
11078 if (!isValidElementType(ScalarTy))
11079 return InstructionCost::getInvalid();
11080 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11081
11082 // If we have computed a smaller type for the expression, update VecTy so
11083 // that the costs will be accurate.
11084 auto It = MinBWs.find(E);
11085 Type *OrigScalarTy = ScalarTy;
11086 if (It != MinBWs.end()) {
11087 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11088 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11089 if (VecTy)
11090 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11091 }
11092 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11093 unsigned EntryVF = E->getVectorFactor();
11094 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11095
11096 if (E->isGather()) {
11097 if (allConstant(VL))
11098 return 0;
11099 if (isa<InsertElementInst>(VL[0]))
11100 return InstructionCost::getInvalid();
11101 if (isa<CmpInst>(VL.front()))
11102 ScalarTy = VL.front()->getType();
11103 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11104 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11105 }
11106 InstructionCost CommonCost = 0;
11107 SmallVector<int> Mask;
11108 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11109 !isReverseOrder(E->ReorderIndices))) {
11110 SmallVector<int> NewMask;
11111 if (E->getOpcode() == Instruction::Store) {
11112 // For stores the order is actually a mask.
11113 NewMask.resize(E->ReorderIndices.size());
11114 copy(E->ReorderIndices, NewMask.begin());
11115 } else {
11116 inversePermutation(E->ReorderIndices, NewMask);
11117 }
11118 ::addMask(Mask, NewMask);
11119 }
11120 if (!E->ReuseShuffleIndices.empty())
11121 ::addMask(Mask, E->ReuseShuffleIndices);
11122 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11123 CommonCost =
11124 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11125 assert((E->State == TreeEntry::Vectorize ||
11126 E->State == TreeEntry::ScatterVectorize ||
11127 E->State == TreeEntry::StridedVectorize) &&
11128 "Unhandled state");
11129 assert(E->getOpcode() &&
11130 ((allSameType(VL) && allSameBlock(VL)) ||
11131 (E->getOpcode() == Instruction::GetElementPtr &&
11132 E->getMainOp()->getType()->isPointerTy())) &&
11133 "Invalid VL");
11134 Instruction *VL0 = E->getMainOp();
11135 unsigned ShuffleOrOp =
11136 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11137 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11138 ShuffleOrOp = E->CombinedOp;
11139 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11140 const unsigned Sz = UniqueValues.size();
11141 SmallBitVector UsedScalars(Sz, false);
11142 for (unsigned I = 0; I < Sz; ++I) {
11143 if (isa<Instruction>(UniqueValues[I]) &&
11144 is_contained(getTreeEntries(UniqueValues[I]), E))
11145 continue;
11146 UsedScalars.set(I);
11147 }
11148 auto GetCastContextHint = [&](Value *V) {
11149 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
11150 return getCastContextHint(*OpTEs.front());
11151 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11152 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11153 !SrcState.isAltShuffle())
11154 return TTI::CastContextHint::GatherScatter;
11155 return TTI::CastContextHint::None;
11156 };
11157 auto GetCostDiff =
11158 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11159 function_ref<InstructionCost(InstructionCost)> VectorCost) {
11160 // Calculate the cost of this instruction.
11161 InstructionCost ScalarCost = 0;
11162 if (isa<CastInst, CallInst>(VL0)) {
11163 // For some of the instructions no need to calculate cost for each
11164 // particular instruction, we can use the cost of the single
11165 // instruction x total number of scalar instructions.
11166 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11167 } else {
11168 for (unsigned I = 0; I < Sz; ++I) {
11169 if (UsedScalars.test(I))
11170 continue;
11171 ScalarCost += ScalarEltCost(I);
11172 }
11173 }
11174
11175 InstructionCost VecCost = VectorCost(CommonCost);
11176 // Check if the current node must be resized, if the parent node is not
11177 // resized.
11178 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11179 E->Idx != 0 &&
11180 (E->getOpcode() != Instruction::Load ||
11181 !E->UserTreeIndices.empty())) {
11182 const EdgeInfo &EI =
11183 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11184 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11185 });
11186 if (EI.UserTE->getOpcode() != Instruction::Select ||
11187 EI.EdgeIdx != 0) {
11188 auto UserBWIt = MinBWs.find(EI.UserTE);
11189 Type *UserScalarTy =
11190 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11191 if (UserBWIt != MinBWs.end())
11192 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11193 UserBWIt->second.first);
11194 if (ScalarTy != UserScalarTy) {
11195 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11196 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11197 unsigned VecOpcode;
11198 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11199 if (BWSz > SrcBWSz)
11200 VecOpcode = Instruction::Trunc;
11201 else
11202 VecOpcode =
11203 It->second.second ? Instruction::SExt : Instruction::ZExt;
11204 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11205 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11206 CostKind);
11207 }
11208 }
11209 }
11210 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11211 ScalarCost, "Calculated costs for Tree"));
11212 return VecCost - ScalarCost;
11213 };
11214 // Calculate cost difference from vectorizing set of GEPs.
11215 // Negative value means vectorizing is profitable.
11216 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11217 assert((E->State == TreeEntry::Vectorize ||
11218 E->State == TreeEntry::StridedVectorize) &&
11219 "Entry state expected to be Vectorize or StridedVectorize here.");
11220 InstructionCost ScalarCost = 0;
11221 InstructionCost VecCost = 0;
11222 std::tie(ScalarCost, VecCost) = getGEPCosts(
11223 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11224 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11225 "Calculated GEPs cost for Tree"));
11226
11227 return VecCost - ScalarCost;
11228 };
11229
11230 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11231 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11232 if (MinMaxID == Intrinsic::not_intrinsic)
11233 return InstructionCost::getInvalid();
11234 Type *CanonicalType = Ty;
11235 if (CanonicalType->isPtrOrPtrVectorTy())
11236 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11237 CanonicalType->getContext(),
11238 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11239
11240 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11241 {CanonicalType, CanonicalType});
11242 InstructionCost IntrinsicCost =
11243 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11244 // If the selects are the only uses of the compares, they will be
11245 // dead and we can adjust the cost by removing their cost.
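// For example, for
//   %c = icmp slt i32 %x, %y
//   %m = select i1 %c, i32 %x, i32 %y
// costed as an smin intrinsic, a compare whose only user is the select
// becomes dead, so its cost is subtracted from the intrinsic cost.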
11246 if (VI && SelectOnly) {
11247 assert((!Ty->isVectorTy() || SLPReVec) &&
11248 "Expected only for scalar type.");
11249 auto *CI = cast<CmpInst>(VI->getOperand(0));
11250 IntrinsicCost -= TTI->getCmpSelInstrCost(
11251 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11252 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11253 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11254 }
11255 return IntrinsicCost;
11256 };
11257 switch (ShuffleOrOp) {
11258 case Instruction::PHI: {
11259 // Count reused scalars.
11260 InstructionCost ScalarCost = 0;
11261 SmallPtrSet<const TreeEntry *, 4> CountedOps;
11262 for (Value *V : UniqueValues) {
11263 auto *PHI = dyn_cast<PHINode>(V);
11264 if (!PHI)
11265 continue;
11266
11267 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11268 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11269 Value *Op = PHI->getIncomingValue(I);
11270 Operands[I] = Op;
11271 }
11272 if (const TreeEntry *OpTE =
11273 getSameValuesTreeEntry(Operands.front(), Operands))
11274 if (CountedOps.insert(OpTE).second &&
11275 !OpTE->ReuseShuffleIndices.empty())
11276 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11277 OpTE->Scalars.size());
11278 }
11279
11280 return CommonCost - ScalarCost;
11281 }
11282 case Instruction::ExtractValue:
11283 case Instruction::ExtractElement: {
11284 auto GetScalarCost = [&](unsigned Idx) {
11285 if (isa<PoisonValue>(UniqueValues[Idx]))
11286 return InstructionCost(TTI::TCC_Free);
11287
11288 auto *I = cast<Instruction>(UniqueValues[Idx]);
11289 VectorType *SrcVecTy;
11290 if (ShuffleOrOp == Instruction::ExtractElement) {
11291 auto *EE = cast<ExtractElementInst>(I);
11292 SrcVecTy = EE->getVectorOperandType();
11293 } else {
11294 auto *EV = cast<ExtractValueInst>(I);
11295 Type *AggregateTy = EV->getAggregateOperand()->getType();
11296 unsigned NumElts;
11297 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11298 NumElts = ATy->getNumElements();
11299 else
11300 NumElts = AggregateTy->getStructNumElements();
11301 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11302 }
11303 if (I->hasOneUse()) {
11304 Instruction *Ext = I->user_back();
11305 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11306 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11307 // Use getExtractWithExtendCost() to calculate the cost of
11308 // extractelement/ext pair.
11309 InstructionCost Cost = TTI->getExtractWithExtendCost(
11310 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11311 // Subtract the cost of s|zext which is subtracted separately.
11312 Cost -= TTI->getCastInstrCost(
11313 Ext->getOpcode(), Ext->getType(), I->getType(),
11314 TTI::getCastContextHint(Ext), CostKind, Ext);
11315 return Cost;
11316 }
11317 }
11318 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11319 CostKind, *getExtractIndex(I));
11320 };
11321 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11322 return GetCostDiff(GetScalarCost, GetVectorCost);
11323 }
11324 case Instruction::InsertElement: {
11325 assert(E->ReuseShuffleIndices.empty() &&
11326 "Unique insertelements only are expected.");
11327 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11328 unsigned const NumElts = SrcVecTy->getNumElements();
11329 unsigned const NumScalars = VL.size();
11330
11331 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
11332
11333 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11334 unsigned OffsetBeg = *getElementIndex(VL.front());
11335 unsigned OffsetEnd = OffsetBeg;
11336 InsertMask[OffsetBeg] = 0;
11337 for (auto [I, V] : enumerate(VL.drop_front())) {
11338 unsigned Idx = *getElementIndex(V);
11339 if (OffsetBeg > Idx)
11340 OffsetBeg = Idx;
11341 else if (OffsetEnd < Idx)
11342 OffsetEnd = Idx;
11343 InsertMask[Idx] = I + 1;
11344 }
11345 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11346 if (NumOfParts > 0 && NumOfParts < NumElts)
11347 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11348 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11349 VecScalarsSz;
11350 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11351 unsigned InsertVecSz = std::min<unsigned>(
11352 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11353 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11354 bool IsWholeSubvector =
11355 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11356 // Check if we can safely insert a subvector. If it is not possible, just
11357 // generate a whole-sized vector and shuffle the source vector and the new
11358 // subvector.
11359 if (OffsetBeg + InsertVecSz > VecSz) {
11360 // Align OffsetBeg to generate correct mask.
11361 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11362 InsertVecSz = VecSz;
11363 }
11364
11365 APInt DemandedElts = APInt::getZero(NumElts);
11366 // TODO: Add support for Instruction::InsertValue.
11367 SmallVector<int> Mask;
11368 if (!E->ReorderIndices.empty()) {
11369 inversePermutation(E->ReorderIndices, Mask);
11370 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11371 } else {
11372 Mask.assign(VecSz, PoisonMaskElem);
11373 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11374 }
11375 bool IsIdentity = true;
11376 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11377 Mask.swap(PrevMask);
11378 for (unsigned I = 0; I < NumScalars; ++I) {
11379 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11380 DemandedElts.setBit(InsertIdx);
11381 IsIdentity &= InsertIdx - OffsetBeg == I;
11382 Mask[InsertIdx - OffsetBeg] = I;
11383 }
11384 assert(Offset < NumElts && "Failed to find vector index offset");
11385
11386 InstructionCost Cost = 0;
11387 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11388 /*Insert*/ true, /*Extract*/ false,
11389 CostKind);
11390
11391 // First cost - resize to actual vector size if not identity shuffle or
11392 // need to shift the vector.
11393 // Do not calculate the cost if the actual size is the register size and
11394 // we can merge this shuffle with the following SK_Select.
11395 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11396 if (!IsIdentity)
11397 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
11398 InsertVecTy, Mask);
11399 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11400 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11401 }));
11402 // Second cost - permutation with subvector, if some elements are from the
11403 // initial vector or inserting a subvector.
11404 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11405 // subvector of ActualVecTy.
11406 SmallBitVector InMask =
11407 isUndefVector(FirstInsert->getOperand(0),
11408 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11409 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11410 if (InsertVecSz != VecSz) {
11411 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11412 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11413 CostKind, OffsetBeg - Offset, InsertVecTy);
11414 } else {
11415 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11416 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11417 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11418 I <= End; ++I)
11419 if (Mask[I] != PoisonMaskElem)
11420 Mask[I] = I + VecSz;
11421 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11422 Mask[I] =
11423 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11424 Cost +=
11425 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11426 }
11427 }
11428 return Cost;
11429 }
11430 case Instruction::ZExt:
11431 case Instruction::SExt:
11432 case Instruction::FPToUI:
11433 case Instruction::FPToSI:
11434 case Instruction::FPExt:
11435 case Instruction::PtrToInt:
11436 case Instruction::IntToPtr:
11437 case Instruction::SIToFP:
11438 case Instruction::UIToFP:
11439 case Instruction::Trunc:
11440 case Instruction::FPTrunc:
11441 case Instruction::BitCast: {
11442 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11443 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11444 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11445 unsigned Opcode = ShuffleOrOp;
11446 unsigned VecOpcode = Opcode;
11447 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11448 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11449 // Check if the values are candidates to demote.
11450 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11451 if (SrcIt != MinBWs.end()) {
11452 SrcBWSz = SrcIt->second.first;
11453 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11454 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11455 SrcVecTy =
11456 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11457 }
11458 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11459 if (BWSz == SrcBWSz) {
11460 VecOpcode = Instruction::BitCast;
11461 } else if (BWSz < SrcBWSz) {
11462 VecOpcode = Instruction::Trunc;
11463 } else if (It != MinBWs.end()) {
11464 assert(BWSz > SrcBWSz && "Invalid cast!");
11465 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11466 } else if (SrcIt != MinBWs.end()) {
11467 assert(BWSz > SrcBWSz && "Invalid cast!");
11468 VecOpcode =
11469 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11470 }
11471 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11472 !SrcIt->second.second) {
11473 VecOpcode = Instruction::UIToFP;
11474 }
11475 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11476 assert(Idx == 0 && "Expected 0 index only");
11477 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11478 VL0->getOperand(0)->getType(),
11479 TTI::CastContextHint::None, CostKind);
11480 };
11481 auto GetVectorCost = [=](InstructionCost CommonCost) {
11482 // Do not count cost here if minimum bitwidth is in effect and it is just
11483 // a bitcast (here it is just a noop).
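// For example, a zext i8 -> i32 whose result was demoted to i8 by the MinBWs
// analysis turns into a same-width cast, i.e. a no-op bitcast, leaving only
// CommonCost.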
11484 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11485 return CommonCost;
11486 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11487 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11488
11489 bool IsArithmeticExtendedReduction =
11490 E->Idx == 0 && UserIgnoreList &&
11491 all_of(*UserIgnoreList, [](Value *V) {
11492 auto *I = cast<Instruction>(V);
11493 return is_contained({Instruction::Add, Instruction::FAdd,
11494 Instruction::Mul, Instruction::FMul,
11495 Instruction::And, Instruction::Or,
11496 Instruction::Xor},
11497 I->getOpcode());
11498 });
11499 if (IsArithmeticExtendedReduction &&
11500 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11501 return CommonCost;
11502 return CommonCost +
11503 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11504 VecOpcode == Opcode ? VI : nullptr);
11505 };
11506 return GetCostDiff(GetScalarCost, GetVectorCost);
11507 }
11508 case Instruction::FCmp:
11509 case Instruction::ICmp:
11510 case Instruction::Select: {
11511 CmpPredicate VecPred, SwappedVecPred;
11512 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11513 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11514 match(VL0, MatchCmp))
11515 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11516 else
11517 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11518 ? CmpInst::BAD_FCMP_PREDICATE
11519 : CmpInst::BAD_ICMP_PREDICATE;
11520 auto GetScalarCost = [&](unsigned Idx) {
11521 if (isa<PoisonValue>(UniqueValues[Idx]))
11522 return InstructionCost(TTI::TCC_Free);
11523
11524 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11525 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11526 ? CmpInst::BAD_FCMP_PREDICATE
11527 : CmpInst::BAD_ICMP_PREDICATE;
11528 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11529 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11530 !match(VI, MatchCmp)) ||
11531 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11532 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11533 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11534 ? CmpInst::BAD_FCMP_PREDICATE
11535 : CmpInst::BAD_ICMP_PREDICATE;
11536
11537 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
11538 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11539 CostKind, getOperandInfo(VI->getOperand(0)),
11540 getOperandInfo(VI->getOperand(1)), VI);
11541 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11542 if (IntrinsicCost.isValid())
11543 ScalarCost = IntrinsicCost;
11544
11545 return ScalarCost;
11546 };
11547 auto GetVectorCost = [&](InstructionCost CommonCost) {
11548 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11549
11550 InstructionCost VecCost =
11551 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11552 CostKind, getOperandInfo(E->getOperand(0)),
11553 getOperandInfo(E->getOperand(1)), VL0);
11554 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11555 auto *CondType =
11556 getWidenedType(SI->getCondition()->getType(), VL.size());
11557 unsigned CondNumElements = CondType->getNumElements();
11558 unsigned VecTyNumElements = getNumElements(VecTy);
11559 assert(VecTyNumElements >= CondNumElements &&
11560 VecTyNumElements % CondNumElements == 0 &&
11561 "Cannot vectorize Instruction::Select");
11562 if (CondNumElements != VecTyNumElements) {
11563 // When the return type is i1 but the source is fixed vector type, we
11564 // need to duplicate the condition value.
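// For example, a <2 x i1> condition selecting between <4 x i32> values is
// widened with the replicated mask <0, 0, 1, 1> so that each condition bit
// covers its whole sub-element group.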
11565 VecCost += ::getShuffleCost(
11566 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11567 createReplicatedMask(VecTyNumElements / CondNumElements,
11568 CondNumElements));
11569 }
11570 }
11571 return VecCost + CommonCost;
11572 };
11573 return GetCostDiff(GetScalarCost, GetVectorCost);
11574 }
11575 case TreeEntry::MinMax: {
11576 auto GetScalarCost = [&](unsigned Idx) {
11577 return GetMinMaxCost(OrigScalarTy);
11578 };
11579 auto GetVectorCost = [&](InstructionCost CommonCost) {
11580 InstructionCost VecCost = GetMinMaxCost(VecTy);
11581 return VecCost + CommonCost;
11582 };
11583 return GetCostDiff(GetScalarCost, GetVectorCost);
11584 }
11585 case Instruction::FNeg:
11586 case Instruction::Add:
11587 case Instruction::FAdd:
11588 case Instruction::Sub:
11589 case Instruction::FSub:
11590 case Instruction::Mul:
11591 case Instruction::FMul:
11592 case Instruction::UDiv:
11593 case Instruction::SDiv:
11594 case Instruction::FDiv:
11595 case Instruction::URem:
11596 case Instruction::SRem:
11597 case Instruction::FRem:
11598 case Instruction::Shl:
11599 case Instruction::LShr:
11600 case Instruction::AShr:
11601 case Instruction::And:
11602 case Instruction::Or:
11603 case Instruction::Xor: {
11604 auto GetScalarCost = [&](unsigned Idx) {
11605 if (isa<PoisonValue>(UniqueValues[Idx]))
11606 return InstructionCost(TTI::TCC_Free);
11607
11608 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11609 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11610 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11611 TTI::OperandValueInfo Op2Info =
11612 TTI::getOperandInfo(VI->getOperand(OpIdx));
11613 SmallVector<const Value *> Operands(VI->operand_values());
11614 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11615 Op1Info, Op2Info, Operands, VI);
11616 };
11617 auto GetVectorCost = [=](InstructionCost CommonCost) {
11618 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11619 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11620 ArrayRef<Value *> Ops = E->getOperand(I);
11621 if (all_of(Ops, [&](Value *Op) {
11622 auto *CI = dyn_cast<ConstantInt>(Op);
11623 return CI && CI->getValue().countr_one() >= It->second.first;
11624 }))
11625 return CommonCost;
11626 }
11627 }
11628 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11629 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11630 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11631 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11632 Op2Info, {}, nullptr, TLI) +
11633 CommonCost;
11634 };
11635 return GetCostDiff(GetScalarCost, GetVectorCost);
11636 }
11637 case Instruction::GetElementPtr: {
11638 return CommonCost + GetGEPCostDiff(VL, VL0);
11639 }
11640 case Instruction::Load: {
11641 auto GetScalarCost = [&](unsigned Idx) {
11642 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11643 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11644 VI->getAlign(), VI->getPointerAddressSpace(),
11645 CostKind, TTI::OperandValueInfo(), VI);
11646 };
11647 auto *LI0 = cast<LoadInst>(VL0);
11648 auto GetVectorCost = [&](InstructionCost CommonCost) {
11649 InstructionCost VecLdCost;
11650 switch (E->State) {
11651 case TreeEntry::Vectorize:
11652 if (unsigned Factor = E->getInterleaveFactor()) {
11653 VecLdCost = TTI->getInterleavedMemoryOpCost(
11654 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11655 LI0->getPointerAddressSpace(), CostKind);
11656
11657 } else {
11658 VecLdCost = TTI->getMemoryOpCost(
11659 Instruction::Load, VecTy, LI0->getAlign(),
11660 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11661 }
11662 break;
11663 case TreeEntry::StridedVectorize: {
11664 Align CommonAlignment =
11665 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11666 VecLdCost = TTI->getStridedMemoryOpCost(
11667 Instruction::Load, VecTy, LI0->getPointerOperand(),
11668 /*VariableMask=*/false, CommonAlignment, CostKind);
11669 break;
11670 }
11671 case TreeEntry::ScatterVectorize: {
11672 Align CommonAlignment =
11673 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11674 VecLdCost = TTI->getGatherScatterOpCost(
11675 Instruction::Load, VecTy, LI0->getPointerOperand(),
11676 /*VariableMask=*/false, CommonAlignment, CostKind);
11677 break;
11678 }
11679 case TreeEntry::CombinedVectorize:
11680 case TreeEntry::NeedToGather:
11681 llvm_unreachable("Unexpected vectorization state.");
11682 }
11683 return VecLdCost + CommonCost;
11684 };
11685
11686 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11687 // If this node generates masked gather load then it is not a terminal node.
11688 // Hence address operand cost is estimated separately.
11689 if (E->State == TreeEntry::ScatterVectorize)
11690 return Cost;
11691
11692 // Estimate the cost of GEPs since this tree node is a terminal node.
11693 SmallVector<Value *> PointerOps(VL.size());
11694 for (auto [I, V] : enumerate(VL))
11695 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11696 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11697 }
11698 case Instruction::Store: {
11699 bool IsReorder = !E->ReorderIndices.empty();
11700 auto GetScalarCost = [=](unsigned Idx) {
11701 auto *VI = cast<StoreInst>(VL[Idx]);
11702 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11703 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11704 VI->getAlign(), VI->getPointerAddressSpace(),
11705 CostKind, OpInfo, VI);
11706 };
11707 auto *BaseSI =
11708 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11709 auto GetVectorCost = [=](InstructionCost CommonCost) {
11710 // We know that we can merge the stores. Calculate the cost.
11711 InstructionCost VecStCost;
11712 if (E->State == TreeEntry::StridedVectorize) {
11713 Align CommonAlignment =
11714 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11715 VecStCost = TTI->getStridedMemoryOpCost(
11716 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11717 /*VariableMask=*/false, CommonAlignment, CostKind);
11718 } else {
11719 assert(E->State == TreeEntry::Vectorize &&
11720 "Expected either strided or consecutive stores.");
11721 if (unsigned Factor = E->getInterleaveFactor()) {
11722 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11723 "No reused shuffles expected");
11724 CommonCost = 0;
11725 VecStCost = TTI->getInterleavedMemoryOpCost(
11726 Instruction::Store, VecTy, Factor, std::nullopt,
11727 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11728 } else {
11729 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11730 VecStCost = TTI->getMemoryOpCost(
11731 Instruction::Store, VecTy, BaseSI->getAlign(),
11732 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11733 }
11734 }
11735 return VecStCost + CommonCost;
11736 };
11737 SmallVector<Value *> PointerOps(VL.size());
11738 for (auto [I, V] : enumerate(VL)) {
11739 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11740 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11741 }
11742
11743 return GetCostDiff(GetScalarCost, GetVectorCost) +
11744 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11745 }
11746 case Instruction::Call: {
11747 auto GetScalarCost = [&](unsigned Idx) {
11748 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11749 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11750 if (ID != Intrinsic::not_intrinsic) {
11751 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11752 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11753 }
11754 return TTI->getCallInstrCost(CI->getCalledFunction(),
11755 CI->getFunctionType()->getReturnType(),
11756 CI->getFunctionType()->params(), CostKind);
11757 };
11758 auto GetVectorCost = [=](InstructionCost CommonCost) {
11759 auto *CI = cast<CallInst>(VL0);
11760 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11761 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
11762 CI, ID, VecTy->getNumElements(),
11763 It != MinBWs.end() ? It->second.first : 0, TTI);
11764 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11765 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11766 };
11767 return GetCostDiff(GetScalarCost, GetVectorCost);
11768 }
11769 case Instruction::ShuffleVector: {
11770 if (!SLPReVec || E->isAltShuffle())
11771 assert(E->isAltShuffle() &&
11772 ((Instruction::isBinaryOp(E->getOpcode()) &&
11773 Instruction::isBinaryOp(E->getAltOpcode())) ||
11774 (Instruction::isCast(E->getOpcode()) &&
11775 Instruction::isCast(E->getAltOpcode())) ||
11776 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11777 "Invalid Shuffle Vector Operand");
11778 // Try to find the previous shuffle node with the same operands and same
11779 // main/alternate ops.
11780 auto TryFindNodeWithEqualOperands = [=]() {
11781 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11782 if (TE.get() == E)
11783 break;
11784 if (TE->hasState() && TE->isAltShuffle() &&
11785 ((TE->getOpcode() == E->getOpcode() &&
11786 TE->getAltOpcode() == E->getAltOpcode()) ||
11787 (TE->getOpcode() == E->getAltOpcode() &&
11788 TE->getAltOpcode() == E->getOpcode())) &&
11789 TE->hasEqualOperands(*E))
11790 return true;
11791 }
11792 return false;
11793 };
11794 auto GetScalarCost = [&](unsigned Idx) {
11795 if (isa<PoisonValue>(UniqueValues[Idx]))
11796 return InstructionCost(TTI::TCC_Free);
11797
11798 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11799 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11800 (void)E;
11801 return TTI->getInstructionCost(VI, CostKind);
11802 };
11803 // Need to clear CommonCost since the final shuffle cost is included into
11804 // vector cost.
11805 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11806 // VecCost is equal to sum of the cost of creating 2 vectors
11807 // and the cost of creating shuffle.
11808 InstructionCost VecCost = 0;
11809 if (TryFindNodeWithEqualOperands()) {
11810 LLVM_DEBUG({
11811 dbgs() << "SLP: diamond match for alternate node found.\n";
11812 E->dump();
11813 });
11814 // No need to add new vector costs here since we're going to reuse
11815 // same main/alternate vector ops, just do different shuffling.
11816 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11817 VecCost =
11818 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11819 VecCost +=
11820 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11821 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11822 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11823 VecCost = TTIRef.getCmpSelInstrCost(
11824 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11825 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11826 VL0);
11827 VecCost += TTIRef.getCmpSelInstrCost(
11828 E->getOpcode(), VecTy, MaskTy,
11829 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11830 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11831 E->getAltOp());
11832 } else {
11833 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11834 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11835 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11836 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11837 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11838 unsigned SrcBWSz =
11839 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11840 if (SrcIt != MinBWs.end()) {
11841 SrcBWSz = SrcIt->second.first;
11842 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11843 SrcTy = getWidenedType(SrcSclTy, VL.size());
11844 }
11845 if (BWSz <= SrcBWSz) {
11846 if (BWSz < SrcBWSz)
11847 VecCost =
11848 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11849 TTI::CastContextHint::None, CostKind);
11850 LLVM_DEBUG({
11851 dbgs()
11852 << "SLP: alternate extension, which should be truncated.\n";
11853 E->dump();
11854 });
11855 return VecCost;
11856 }
11857 }
11858 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11859 TTI::CastContextHint::None, CostKind);
11860 VecCost +=
11861 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11862 TTI::CastContextHint::None, CostKind);
11863 }
11864 SmallVector<int> Mask;
11865 E->buildAltOpShuffleMask(
11866 [&](Instruction *I) {
11867 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11868 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11869 *TLI);
11870 },
11871 Mask);
11872 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
11873 FinalVecTy, Mask, CostKind);
11874 // Patterns like [fadd,fsub] can be combined into a single instruction
11875 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11876 // need to take into account their order when looking for the most used
11877 // order.
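// For example, x86's (v)addsubps subtracts in the even lanes and adds in the
// odd lanes, so only one of the two fadd/fsub interleavings maps onto a
// single instruction.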
11878 unsigned Opcode0 = E->getOpcode();
11879 unsigned Opcode1 = E->getAltOpcode();
11880 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11881 // If this pattern is supported by the target then we consider the
11882 // order.
11883 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11884 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11885 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11886 return AltVecCost < VecCost ? AltVecCost : VecCost;
11887 }
11888 // TODO: Check the reverse order too.
11889 return VecCost;
11890 };
11891 if (SLPReVec && !E->isAltShuffle())
11892 return GetCostDiff(
11893 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11894 // If a group uses mask in order, the shufflevector can be
11895 // eliminated by instcombine. Then the cost is 0.
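// For example, shufflevectors extracting the <4 x i32> sub-vectors at
// offsets 0, 4, 8 and 12 of a <16 x i32> source form an in-order split of
// that source, so no extra shuffle is required.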
11896 assert(isa<ShuffleVectorInst>(VL.front()) &&
11897 "Not supported shufflevector usage.");
11898 auto *SV = cast<ShuffleVectorInst>(VL.front());
11899 unsigned SVNumElements =
11900 cast<FixedVectorType>(SV->getOperand(0)->getType())
11901 ->getNumElements();
11902 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11903 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11904 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11905 int NextIndex = 0;
11906 if (!all_of(Group, [&](Value *V) {
11907 assert(isa<ShuffleVectorInst>(V) &&
11908 "Not supported shufflevector usage.");
11909 auto *SV = cast<ShuffleVectorInst>(V);
11910 int Index;
11911 [[maybe_unused]] bool IsExtractSubvectorMask =
11912 SV->isExtractSubvectorMask(Index);
11913 assert(IsExtractSubvectorMask &&
11914 "Not supported shufflevector usage.");
11915 if (NextIndex != Index)
11916 return false;
11917 NextIndex += SV->getShuffleMask().size();
11918 return true;
11919 }))
11920 return ::getShuffleCost(
11922 calculateShufflevectorMask(E->Scalars));
11923 }
11924 return TTI::TCC_Free;
11925 });
11926 return GetCostDiff(GetScalarCost, GetVectorCost);
11927 }
11928 case Instruction::Freeze:
11929 return CommonCost;
11930 default:
11931 llvm_unreachable("Unknown instruction");
11932 }
11933}
11934
11935bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11936 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11937 << VectorizableTree.size() << " is fully vectorizable.\n");
11938
11939 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11940 SmallVector<int> Mask;
11941 return TE->isGather() &&
11942 !any_of(TE->Scalars,
11943 [this](Value *V) { return EphValues.contains(V); }) &&
11944 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11945 TE->Scalars.size() < Limit ||
11946 (((TE->hasState() &&
11947 TE->getOpcode() == Instruction::ExtractElement) ||
11948 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11949 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11950 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
11951 !TE->isAltShuffle()) ||
11952 any_of(TE->Scalars, IsaPred<LoadInst>));
11953 };
11954
11955 // We only handle trees of heights 1 and 2.
11956 if (VectorizableTree.size() == 1 &&
11957 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11958 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11959 (ForReduction &&
11960 AreVectorizableGathers(VectorizableTree[0].get(),
11961 VectorizableTree[0]->Scalars.size()) &&
11962 VectorizableTree[0]->getVectorFactor() > 2)))
11963 return true;
11964
11965 if (VectorizableTree.size() != 2)
11966 return false;
11967
11968 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11969 // whose second node is a gather with fewer scalar operands than the initial
11970 // tree element (it may be profitable to shuffle the second gather), or whose
11971 // gathered scalars are extractelements that form a shuffle.
11973 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11974 AreVectorizableGathers(VectorizableTree[1].get(),
11975 VectorizableTree[0]->Scalars.size()))
11976 return true;
11977
11978 // Gathering cost would be too much for tiny trees.
11979 if (VectorizableTree[0]->isGather() ||
11980 (VectorizableTree[1]->isGather() &&
11981 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11982 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11983 return false;
11984
11985 return true;
11986}
11987
11988static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11989 TargetTransformInfo *TTI,
11990 bool MustMatchOrInst) {
11991 // Look past the root to find a source value. Arbitrarily follow the
11992 // path through operand 0 of any 'or'. Also, peek through optional
11993 // shift-left-by-multiple-of-8-bits.
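// For example, starting from
//   or (shl (zext i8 %b1 to i32), 8), (zext i8 %b0 to i32)
// the walk follows operand 0 through the 'or' and the shift-by-8 down to the
// zero extension of the loaded byte %b1.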
11994 Value *ZextLoad = Root;
11995 const APInt *ShAmtC;
11996 bool FoundOr = false;
11997 while (!isa<ConstantExpr>(ZextLoad) &&
11998 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
11999 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
12000 ShAmtC->urem(8) == 0))) {
12001 auto *BinOp = cast<BinaryOperator>(ZextLoad);
12002 ZextLoad = BinOp->getOperand(0);
12003 if (BinOp->getOpcode() == Instruction::Or)
12004 FoundOr = true;
12005 }
12006 // Check if the input is an extended load of the required or/shift expression.
12007 Value *Load;
12008 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
12009 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
12010 return false;
12011
12012 // Require that the total load bit width is a legal integer type.
12013 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
12014 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
12015 Type *SrcTy = Load->getType();
12016 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12017 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12018 return false;
12019
12020 // Everything matched - assume that we can fold the whole sequence using
12021 // load combining.
12022 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
12023 << *(cast<Instruction>(Root)) << "\n");
12024
12025 return true;
12026}
12027
12028 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
12029 if (RdxKind != RecurKind::Or)
12030 return false;
12031
12032 unsigned NumElts = VectorizableTree[0]->Scalars.size();
12033 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12034 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
12035 /* MatchOr */ false);
12036}
12037
12038 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
12039 // Peek through a final sequence of stores and check if all operations are
12040 // likely to be load-combined.
12041 unsigned NumElts = Stores.size();
12042 for (Value *Scalar : Stores) {
12043 Value *X;
12044 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
12045 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
12046 return false;
12047 }
12048 return true;
12049}
12050
12051bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
12052 if (!DebugCounter::shouldExecute(VectorizedGraphs))
12053 return true;
12054
12055 // Graph is empty - do nothing.
12056 if (VectorizableTree.empty()) {
12057 assert(ExternalUses.empty() && "We shouldn't have any external users");
12058
12059 return true;
12060 }
12061
12062 // No need to vectorize inserts of gathered values.
12063 if (VectorizableTree.size() == 2 &&
12064 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12065 VectorizableTree[1]->isGather() &&
12066 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12067 !(isSplat(VectorizableTree[1]->Scalars) ||
12068 allConstant(VectorizableTree[1]->Scalars))))
12069 return true;
12070
12071 // If the graph includes only PHI nodes and gathers, it is definitely not
12072 // profitable for vectorization and we can skip it, provided the cost
12073 // threshold is the default one. The cost of vectorized PHI nodes is almost
12074 // always 0 plus the cost of gathers/buildvectors.
12075 constexpr int Limit = 4;
12076 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12077 !VectorizableTree.empty() &&
12078 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12079 return (TE->isGather() &&
12080 (!TE->hasState() ||
12081 TE->getOpcode() != Instruction::ExtractElement) &&
12082 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12083 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
12084 }))
12085 return true;
12086
12087 // We can vectorize the tree if its size is greater than or equal to the
12088 // minimum size specified by the MinTreeSize command line option.
12089 if (VectorizableTree.size() >= MinTreeSize)
12090 return false;
12091
12092 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12093 // can vectorize it if we can prove it fully vectorizable.
12094 if (isFullyVectorizableTinyTree(ForReduction))
12095 return false;
12096
12097 // Check if any of the gather nodes forms an insertelement buildvector
12098 // somewhere.
12099 bool IsAllowedSingleBVNode =
12100 VectorizableTree.size() > 1 ||
12101 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12102 !VectorizableTree.front()->isAltShuffle() &&
12103 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12104 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12105 allSameBlock(VectorizableTree.front()->Scalars));
12106 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12107 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12108 return isa<ExtractElementInst, UndefValue>(V) ||
12109 (IsAllowedSingleBVNode &&
12110 !V->hasNUsesOrMore(UsesLimit) &&
12111 any_of(V->users(), IsaPred<InsertElementInst>));
12112 });
12113 }))
12114 return false;
12115
12116 if (VectorizableTree.back()->isGather() &&
12117 VectorizableTree.back()->hasState() &&
12118 VectorizableTree.back()->isAltShuffle() &&
12119 VectorizableTree.back()->getVectorFactor() > 2 &&
12120 allSameBlock(VectorizableTree.back()->Scalars) &&
12121 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12123 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12124 VectorizableTree.back()->getVectorFactor()),
12125 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12126 /*Insert=*/true, /*Extract=*/false,
12128 return false;
12129
12130 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12131 // vectorizable.
12132 return true;
12133}
12134
12135bool BoUpSLP::isTreeNotExtendable() const {
12136 if (getCanonicalGraphSize() != getTreeSize()) {
12137 constexpr unsigned SmallTree = 3;
12138 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12139 getCanonicalGraphSize() <= SmallTree &&
12140 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12141 [](const std::unique_ptr<TreeEntry> &TE) {
12142 return TE->isGather() && TE->hasState() &&
12143 TE->getOpcode() == Instruction::Load &&
12144 !allSameBlock(TE->Scalars);
12145 }) == 1)
12146 return true;
12147 return false;
12148 }
12149 bool Res = false;
12150 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12151 TreeEntry &E = *VectorizableTree[Idx];
12152 if (!E.isGather())
12153 continue;
12154 if (E.hasState() && E.getOpcode() != Instruction::Load)
12155 return false;
12156 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12157 continue;
12158 Res = true;
12159 }
12160 return Res;
12161}
12162
12163InstructionCost BoUpSLP::getSpillCost() const {
12164 // Walk from the bottom of the tree to the top, tracking which values are
12165 // live. When we see a call instruction that is not part of our tree,
12166 // query TTI to see if there is a cost to keeping values live over it
12167 // (for example, if spills and fills are required).
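 // Illustrative example (hypothetical, not taken from this file): if a call
 // to an external function sits between two scalars of a vectorized bundle,
 // the already-computed part of the bundle stays live across the call, and
 // the target may have to spill and refill a vector register; the
 // getCostOfKeepingLiveOverCall() query below is what models that extra cost.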
12168 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12170
12172 Instruction *PrevInst = nullptr;
12173
12174 // The entries in VectorizableTree are not necessarily ordered by their
12175 // position in basic blocks. Collect them and order them by dominance so later
12176 // instructions are guaranteed to be visited first. For instructions in
12177 // different basic blocks, we only scan to the beginning of the block, so
12178 // their order does not matter, as long as all instructions in a basic block
12179 // are grouped together. Using dominance ensures a deterministic order.
12180 SmallVector<Instruction *, 16> OrderedScalars;
12181 for (const auto &TEPtr : VectorizableTree) {
12182 if (TEPtr->State != TreeEntry::Vectorize)
12183 continue;
12184 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12185 if (!Inst)
12186 continue;
12187 OrderedScalars.push_back(Inst);
12188 }
12189 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12190 auto *NodeA = DT->getNode(A->getParent());
12191 auto *NodeB = DT->getNode(B->getParent());
12192 assert(NodeA && "Should only process reachable instructions");
12193 assert(NodeB && "Should only process reachable instructions");
12194 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12195 "Different nodes should have different DFS numbers");
12196 if (NodeA != NodeB)
12197 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12198 return B->comesBefore(A);
12199 });
12200
12201 for (Instruction *Inst : OrderedScalars) {
12202 if (!PrevInst) {
12203 PrevInst = Inst;
12204 continue;
12205 }
12206
12207 // Update LiveValues.
12208 LiveValues.erase(PrevInst);
12209 for (auto &J : PrevInst->operands()) {
12210 if (isa<Instruction>(&*J) && isVectorized(&*J))
12211 LiveValues.insert(cast<Instruction>(&*J));
12212 }
12213
12214 LLVM_DEBUG({
12215 dbgs() << "SLP: #LV: " << LiveValues.size();
12216 for (auto *X : LiveValues)
12217 dbgs() << " " << X->getName();
12218 dbgs() << ", Looking at ";
12219 Inst->dump();
12220 });
12221
12222 // Now find the sequence of instructions between PrevInst and Inst.
12223 unsigned NumCalls = 0;
12224 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12225 PrevInstIt =
12226 PrevInst->getIterator().getReverse();
12227 while (InstIt != PrevInstIt) {
12228 if (PrevInstIt == PrevInst->getParent()->rend()) {
12229 PrevInstIt = Inst->getParent()->rbegin();
12230 continue;
12231 }
12232
12233 auto NoCallIntrinsic = [this](Instruction *I) {
12234 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12235 if (II->isAssumeLikeIntrinsic())
12236 return true;
12237 FastMathFlags FMF;
12239 for (auto &ArgOp : II->args())
12240 Tys.push_back(ArgOp->getType());
12241 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12242 FMF = FPMO->getFastMathFlags();
12243 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12244 FMF);
12245 InstructionCost IntrCost =
12246 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12247 InstructionCost CallCost = TTI->getCallInstrCost(
12248 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12249 if (IntrCost < CallCost)
12250 return true;
12251 }
12252 return false;
12253 };
12254
12255 // Debug information does not impact spill cost.
12256 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12257 &*PrevInstIt != PrevInst)
12258 NumCalls++;
12259
12260 ++PrevInstIt;
12261 }
12262
12263 if (NumCalls) {
12265 for (auto *II : LiveValues) {
12266 auto *ScalarTy = II->getType();
12267 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12268 ScalarTy = VectorTy->getElementType();
12269 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12270 }
12271 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12272 }
12273
12274 PrevInst = Inst;
12275 }
12276
12277 return Cost;
12278}
12279
12280/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
12281/// the buildvector sequence.
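/// Illustrative example (hypothetical IR, not taken from this file): in the
/// buildvector chain
///   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
///   %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
/// calling this helper with IE1 == %v0 and IE2 == %v1 returns true, because
/// %v0 is followed by %v1 in the sequence.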
12282static bool isFirstInsertElement(const InsertElementInst *IE1,
12283 const InsertElementInst *IE2) {
12284 if (IE1 == IE2)
12285 return false;
12286 const auto *I1 = IE1;
12287 const auto *I2 = IE2;
12288 const InsertElementInst *PrevI1;
12289 const InsertElementInst *PrevI2;
12290 unsigned Idx1 = *getElementIndex(IE1);
12291 unsigned Idx2 = *getElementIndex(IE2);
12292 do {
12293 if (I2 == IE1)
12294 return true;
12295 if (I1 == IE2)
12296 return false;
12297 PrevI1 = I1;
12298 PrevI2 = I2;
12299 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12300 getElementIndex(I1).value_or(Idx2) != Idx2)
12301 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12302 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12303 getElementIndex(I2).value_or(Idx1) != Idx1)
12304 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12305 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12306 llvm_unreachable("Two different buildvectors not expected.");
12307}
12308
12309namespace {
12310/// Returns the incoming Value * if the requested type is Value * too, or a
12311/// default value otherwise.
12312struct ValueSelect {
12313 template <typename U>
12314 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12315 return V;
12316 }
12317 template <typename U>
12318 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12319 return U();
12320 }
12321};
12322} // namespace
12323
12324/// Analyzes the provided shuffle masks and performs the requested actions on
12325/// the vectors with the given shuffle masks. It tries to do it in several
12326/// steps.
12327/// 1. If the Base vector is not an undef vector, resize the very first mask to
12328/// have a common VF and perform the action for 2 input vectors (including the
12329/// non-undef Base). Other shuffle masks are combined with the result of the
12330/// first stage and processed as a shuffle of 2 vectors.
12331/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12332/// the action only for 1 vector with the given mask, if it is not the identity
12333/// mask.
12334/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12335/// vectors, combining the masks properly between the steps.
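/// Illustrative example (hypothetical, not taken from this file): with an
/// undef Base and two masks, {0, 1, poison, poison} over V1 and
/// {poison, poison, 0, 1} over V2 (both of VF 4), the masks are merged into
/// the single two-source mask {0, 1, 4, 5} and Action is invoked once for
/// {V1, V2}.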
12336template <typename T>
12338 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12339 function_ref<unsigned(T *)> GetVF,
12340 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12342 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12343 SmallVector<int> Mask(ShuffleMask.begin()->second);
12344 auto VMIt = std::next(ShuffleMask.begin());
12345 T *Prev = nullptr;
12346 SmallBitVector UseMask =
12347 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12348 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12349 if (!IsBaseUndef.all()) {
12350 // Base is not undef, need to combine it with the next subvectors.
12351 std::pair<T *, bool> Res =
12352 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12353 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12354 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12355 if (Mask[Idx] == PoisonMaskElem)
12356 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12357 else
12358 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12359 }
12360 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
12361 assert((!V || GetVF(V) == Mask.size()) &&
12362 "Expected base vector of VF number of elements.");
12363 Prev = Action(Mask, {nullptr, Res.first});
12364 } else if (ShuffleMask.size() == 1) {
12365 // Base is undef and only 1 vector is shuffled - perform the action only for
12366 // a single vector, if the mask is not the identity mask.
12367 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12368 /*ForSingleMask=*/true);
12369 if (Res.second)
12370 // Identity mask is found.
12371 Prev = Res.first;
12372 else
12373 Prev = Action(Mask, {ShuffleMask.begin()->first});
12374 } else {
12375 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
12376 // shuffles step by step, combining the shuffles between the steps.
12377 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12378 unsigned Vec2VF = GetVF(VMIt->first);
12379 if (Vec1VF == Vec2VF) {
12380 // No need to resize the input vectors since they are of the same size; we
12381 // can shuffle them directly.
12382 ArrayRef<int> SecMask = VMIt->second;
12383 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12384 if (SecMask[I] != PoisonMaskElem) {
12385 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12386 Mask[I] = SecMask[I] + Vec1VF;
12387 }
12388 }
12389 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12390 } else {
12391 // Vectors of different sizes - resize and reshuffle.
12392 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12393 /*ForSingleMask=*/false);
12394 std::pair<T *, bool> Res2 =
12395 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12396 ArrayRef<int> SecMask = VMIt->second;
12397 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12398 if (Mask[I] != PoisonMaskElem) {
12399 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12400 if (Res1.second)
12401 Mask[I] = I;
12402 } else if (SecMask[I] != PoisonMaskElem) {
12403 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12404 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12405 }
12406 }
12407 Prev = Action(Mask, {Res1.first, Res2.first});
12408 }
12409 VMIt = std::next(VMIt);
12410 }
12411 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
12412 // Perform requested actions for the remaining masks/vectors.
12413 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12414 // Shuffle other input vectors, if any.
12415 std::pair<T *, bool> Res =
12416 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12417 ArrayRef<int> SecMask = VMIt->second;
12418 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12419 if (SecMask[I] != PoisonMaskElem) {
12420 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12421 "Multiple uses of scalars.");
12422 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12423 } else if (Mask[I] != PoisonMaskElem) {
12424 Mask[I] = I;
12425 }
12426 }
12427 Prev = Action(Mask, {Prev, Res.first});
12428 }
12429 return Prev;
12430}
12431
12432namespace {
12433/// Data type for handling buildvector sequences with the reused scalars from
12434/// other tree entries.
12435template <typename T> struct ShuffledInsertData {
12436 /// List of insertelements to be replaced by shuffles.
12437 SmallVector<InsertElementInst *> InsertElements;
12438 /// The parent vectors and shuffle mask for the given list of inserts.
12440};
12441} // namespace
12442
12443InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12444 InstructionCost Cost = 0;
12445 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12446 << VectorizableTree.size() << ".\n");
12447
12448 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12449
12450 SmallPtrSet<Value *, 4> CheckedExtracts;
12451 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12452 TreeEntry &TE = *VectorizableTree[I];
12453 // No need to count the cost for combined entries; they are combined, so
12454 // just skip their cost.
12455 if (TE.State == TreeEntry::CombinedVectorize) {
12456 LLVM_DEBUG(
12457 dbgs() << "SLP: Skipping cost for combined node that starts with "
12458 << *TE.Scalars[0] << ".\n";
12459 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12460 continue;
12461 }
12462 if (TE.isGather() && TE.hasState()) {
12463 if (const TreeEntry *E =
12464 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
12465 E && E->getVectorFactor() == TE.getVectorFactor()) {
12466 // Some gather nodes might be exactly the same as some vectorizable
12467 // nodes after reordering; this needs to be handled here.
12468 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12469 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12470 << "SLP: Current total cost = " << Cost << "\n");
12471 continue;
12472 }
12473 }
12474
12475 // Exclude the cost of gather load nodes which are not used. These nodes were
12476 // built as part of the final attempt to vectorize gathered loads.
12477 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12478 "Expected gather nodes with users only.");
12479
12480 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12481 Cost += C;
12482 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12483 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12484 << "SLP: Current total cost = " << Cost << "\n");
12485 }
12486
12487 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12488 InstructionCost ExtractCost = 0;
12490 SmallVector<APInt> DemandedElts;
12491 SmallDenseSet<Value *, 4> UsedInserts;
12493 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12495 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12496 // Keep track of the {Scalar, Index, User} tuples.
12497 // On AArch64, this helps in fusing a mov instruction, associated with
12498 // extractelement, with fmul in the backend so that extractelement is free.
12500 for (ExternalUser &EU : ExternalUses) {
12501 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12502 }
12503 for (ExternalUser &EU : ExternalUses) {
12504 // Uses by ephemeral values are free (because the ephemeral value will be
12505 // removed prior to code generation, and so the extraction will be
12506 // removed as well).
12507 if (EphValues.count(EU.User))
12508 continue;
12509
12510 // Skip users located in unreachable blocks or in EH pads (rarely executed)
12511 // or in blocks terminated with an unreachable instruction.
12512 if (BasicBlock *UserParent =
12513 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12514 UserParent &&
12515 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12516 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12517 continue;
12518
12519 // We only add extract cost once for the same scalar.
12520 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12521 !ExtractCostCalculated.insert(EU.Scalar).second)
12522 continue;
12523
12524 // No extract cost for vector "scalar"
12525 if (isa<FixedVectorType>(EU.Scalar->getType()))
12526 continue;
12527
12528 // If found user is an insertelement, do not calculate extract cost but try
12529 // to detect it as a final shuffled/identity match.
12530 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12531 VU && VU->getOperand(1) == EU.Scalar) {
12532 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12533 if (!UsedInserts.insert(VU).second)
12534 continue;
12535 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12536 if (InsertIdx) {
12537 const TreeEntry *ScalarTE = &EU.E;
12538 auto *It = find_if(
12539 ShuffledInserts,
12540 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12541 // Checks if 2 insertelements are from the same buildvector.
12542 InsertElementInst *VecInsert = Data.InsertElements.front();
12543 return areTwoInsertFromSameBuildVector(
12544 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12545 Value *Op0 = II->getOperand(0);
12546 if (isVectorized(II) && !isVectorized(Op0))
12547 return nullptr;
12548 return Op0;
12549 });
12550 });
12551 int VecId = -1;
12552 if (It == ShuffledInserts.end()) {
12553 auto &Data = ShuffledInserts.emplace_back();
12554 Data.InsertElements.emplace_back(VU);
12555 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12556 VecId = ShuffledInserts.size() - 1;
12557 auto It = MinBWs.find(ScalarTE);
12558 if (It != MinBWs.end() &&
12559 VectorCasts
12560 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12561 .second) {
12562 unsigned BWSz = It->second.first;
12563 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12564 unsigned VecOpcode;
12565 if (DstBWSz < BWSz)
12566 VecOpcode = Instruction::Trunc;
12567 else
12568 VecOpcode =
12569 It->second.second ? Instruction::SExt : Instruction::ZExt;
12572 VecOpcode, FTy,
12573 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12574 FTy->getNumElements()),
12576 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12577 << " for extending externally used vector with "
12578 "non-equal minimum bitwidth.\n");
12579 Cost += C;
12580 }
12581 } else {
12582 if (isFirstInsertElement(VU, It->InsertElements.front()))
12583 It->InsertElements.front() = VU;
12584 VecId = std::distance(ShuffledInserts.begin(), It);
12585 }
12586 int InIdx = *InsertIdx;
12587 SmallVectorImpl<int> &Mask =
12588 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12589 if (Mask.empty())
12590 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12591 Mask[InIdx] = EU.Lane;
12592 DemandedElts[VecId].setBit(InIdx);
12593 continue;
12594 }
12595 }
12596 }
12597
12599 // If we plan to rewrite the tree in a smaller type, we will need to sign
12600 // or zero extend the extracted value back to the original type. Here, we
12601 // account for the extract and the added cost of the extend if needed.
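 // Illustrative example (hypothetical, not taken from this file): if the
 // bundle was narrowed to i16 but the external user still consumes an i32,
 // the sequence being priced is roughly
 //   %e = extractelement <4 x i16> %vec, i32 Lane
 //   %x = sext i16 %e to i32   ; or zext if the value is known non-negative
 // and getExtractWithExtendCost() accounts for both operations in one query.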
12602 InstructionCost ExtraCost = TTI::TCC_Free;
12603 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12604 const TreeEntry *Entry = &EU.E;
12605 auto It = MinBWs.find(Entry);
12606 if (It != MinBWs.end()) {
12607 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12608 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12609 ? Instruction::ZExt
12610 : Instruction::SExt;
12611 VecTy = getWidenedType(MinTy, BundleWidth);
12612 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12613 VecTy, EU.Lane);
12614 } else {
12615 ExtraCost =
12616 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12617 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12618 }
12619 // Leave the scalar instructions as is if they are cheaper than extracts.
12620 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12621 Entry->getOpcode() == Instruction::Load) {
12622 // Checks if the user of the external scalar is a phi in a loop body.
12623 auto IsPhiInLoop = [&](const ExternalUser &U) {
12624 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12625 auto *I = cast<Instruction>(U.Scalar);
12626 const Loop *L = LI->getLoopFor(Phi->getParent());
12627 return L && (Phi->getParent() == I->getParent() ||
12628 L == LI->getLoopFor(I->getParent()));
12629 }
12630 return false;
12631 };
12632 if (!ValueToExtUses) {
12633 ValueToExtUses.emplace();
12634 for_each(enumerate(ExternalUses), [&](const auto &P) {
12635 // Ignore phis in loops.
12636 if (IsPhiInLoop(P.value()))
12637 return;
12638
12639 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12640 });
12641 }
12642 // Can use the original instruction if no operands are vectorized or they
12643 // are already marked as externally used.
12644 auto *Inst = cast<Instruction>(EU.Scalar);
12645 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12646 auto OperandIsScalar = [&](Value *V) {
12647 if (!isVectorized(V)) {
12648 // Some extractelements might not be vectorized but instead
12649 // transformed into a shuffle and removed from the function;
12650 // consider that here.
12651 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12652 return !EE->hasOneUse() || !MustGather.contains(EE);
12653 return true;
12654 }
12655 return ValueToExtUses->contains(V);
12656 };
12657 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12658 bool CanBeUsedAsScalarCast = false;
12659 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12660 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12661 Op && all_of(Op->operands(), OperandIsScalar)) {
12662 InstructionCost OpCost =
12663 (isVectorized(Op) && !ValueToExtUses->contains(Op))
12665 : 0;
12666 if (ScalarCost + OpCost <= ExtraCost) {
12667 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12668 ScalarCost += OpCost;
12669 }
12670 }
12671 }
12672 if (CanBeUsedAsScalar) {
12673 bool KeepScalar = ScalarCost <= ExtraCost;
12674 // Try to keep the original scalar if the user is a phi node from the same
12675 // block as the root phis that are currently being vectorized. It allows
12676 // keeping better ordering info for the PHIs being vectorized.
12677 bool IsProfitablePHIUser =
12678 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12679 VectorizableTree.front()->Scalars.size() > 2)) &&
12680 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12681 !Inst->hasNUsesOrMore(UsesLimit) &&
12682 none_of(Inst->users(),
12683 [&](User *U) {
12684 auto *PHIUser = dyn_cast<PHINode>(U);
12685 return (!PHIUser ||
12686 PHIUser->getParent() !=
12687 cast<Instruction>(
12688 VectorizableTree.front()->getMainOp())
12689 ->getParent()) &&
12690 !isVectorized(U);
12691 }) &&
12692 count_if(Entry->Scalars, [&](Value *V) {
12693 return ValueToExtUses->contains(V);
12694 }) <= 2;
12695 if (IsProfitablePHIUser) {
12696 KeepScalar = true;
12697 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12698 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12699 (!GatheredLoadsEntriesFirst.has_value() ||
12700 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12701 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12702 return ValueToExtUses->contains(V);
12703 });
12704 auto It = ExtractsCount.find(Entry);
12705 if (It != ExtractsCount.end()) {
12706 assert(ScalarUsesCount >= It->getSecond().size() &&
12707 "Expected total number of external uses not less than "
12708 "number of scalar uses.");
12709 ScalarUsesCount -= It->getSecond().size();
12710 }
12711 // Keep the original scalar if the number of externally used instructions
12712 // in the same entry is not a power of 2. It may help to do some extra
12713 // vectorization for now.
12714 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12715 }
12716 if (KeepScalar) {
12717 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12718 for_each(Inst->operands(), [&](Value *V) {
12719 auto It = ValueToExtUses->find(V);
12720 if (It != ValueToExtUses->end()) {
12721 // Replace all uses to avoid compiler crash.
12722 ExternalUses[It->second].User = nullptr;
12723 }
12724 });
12725 ExtraCost = ScalarCost;
12726 if (!IsPhiInLoop(EU))
12727 ExtractsCount[Entry].insert(Inst);
12728 if (CanBeUsedAsScalarCast) {
12729 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12730 // Update the users of the operands of the cast operand to avoid
12731 // compiler crash.
12732 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12733 for_each(IOp->operands(), [&](Value *V) {
12734 auto It = ValueToExtUses->find(V);
12735 if (It != ValueToExtUses->end()) {
12736 // Replace all uses to avoid compiler crash.
12737 ExternalUses[It->second].User = nullptr;
12738 }
12739 });
12740 }
12741 }
12742 }
12743 }
12744 }
12745
12746 ExtractCost += ExtraCost;
12747 }
12748 // Insert external uses for the operands of casts so they are emitted as
12749 // scalars instead of extractelements.
12750 for (Value *V : ScalarOpsFromCasts) {
12751 ExternalUsesAsOriginalScalar.insert(V);
12752 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
12753 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
12754 TEs.front()->findLaneForValue(V));
12755 }
12756 }
12757 // Add reduced value cost, if resized.
12758 if (!VectorizedVals.empty()) {
12759 const TreeEntry &Root = *VectorizableTree.front();
12760 auto BWIt = MinBWs.find(&Root);
12761 if (BWIt != MinBWs.end()) {
12762 Type *DstTy = Root.Scalars.front()->getType();
12763 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12764 unsigned SrcSz =
12765 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12766 if (OriginalSz != SrcSz) {
12767 unsigned Opcode = Instruction::Trunc;
12768 if (OriginalSz > SrcSz)
12769 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12770 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12771 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12772 assert(SLPReVec && "Only supported by REVEC.");
12773 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12774 }
12775 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12778 }
12779 }
12780 }
12781
12782 InstructionCost SpillCost = getSpillCost();
12783 Cost += SpillCost + ExtractCost;
12784 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12785 bool) {
12786 InstructionCost C = 0;
12787 unsigned VF = Mask.size();
12788 unsigned VecVF = TE->getVectorFactor();
12789 if (VF != VecVF &&
12790 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12792 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12793 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12794 OrigMask.begin());
12796 getWidenedType(TE->getMainOp()->getType(), VecVF),
12797 OrigMask);
12798 LLVM_DEBUG(
12799 dbgs() << "SLP: Adding cost " << C
12800 << " for final shuffle of insertelement external users.\n";
12801 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12802 Cost += C;
12803 return std::make_pair(TE, true);
12804 }
12805 return std::make_pair(TE, false);
12806 };
12807 // Calculate the cost of the reshuffled vectors, if any.
12808 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12809 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12810 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12811 unsigned VF = 0;
12812 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12814 assert((TEs.size() == 1 || TEs.size() == 2) &&
12815 "Expected exactly 1 or 2 tree entries.");
12816 if (TEs.size() == 1) {
12817 if (VF == 0)
12818 VF = TEs.front()->getVectorFactor();
12819 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12820 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12821 !all_of(enumerate(Mask), [=](const auto &Data) {
12822 return Data.value() == PoisonMaskElem ||
12823 (Data.index() < VF &&
12824 static_cast<int>(Data.index()) == Data.value());
12825 })) {
12828 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12829 << " for final shuffle of insertelement "
12830 "external users.\n";
12831 TEs.front()->dump();
12832 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12833 Cost += C;
12834 }
12835 } else {
12836 if (VF == 0) {
12837 if (TEs.front() &&
12838 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12839 VF = TEs.front()->getVectorFactor();
12840 else
12841 VF = Mask.size();
12842 }
12843 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12846 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12847 << " for final shuffle of vector node and external "
12848 "insertelement users.\n";
12849 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12850 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12851 Cost += C;
12852 }
12853 VF = Mask.size();
12854 return TEs.back();
12855 };
12856 (void)performExtractsShuffleAction<const TreeEntry>(
12857 MutableArrayRef(Vector.data(), Vector.size()), Base,
12858 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12859 EstimateShufflesCost);
12861 cast<FixedVectorType>(
12862 ShuffledInserts[I].InsertElements.front()->getType()),
12863 DemandedElts[I],
12864 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12865 Cost -= InsertCost;
12866 }
12867
12868 // Add the cost for reduced value resize (if required).
12869 if (ReductionBitWidth != 0) {
12870 assert(UserIgnoreList && "Expected reduction tree.");
12871 const TreeEntry &E = *VectorizableTree.front();
12872 auto It = MinBWs.find(&E);
12873 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12874 unsigned SrcSize = It->second.first;
12875 unsigned DstSize = ReductionBitWidth;
12876 unsigned Opcode = Instruction::Trunc;
12877 if (SrcSize < DstSize) {
12878 bool IsArithmeticExtendedReduction =
12879 all_of(*UserIgnoreList, [](Value *V) {
12880 auto *I = cast<Instruction>(V);
12881 return is_contained({Instruction::Add, Instruction::FAdd,
12882 Instruction::Mul, Instruction::FMul,
12883 Instruction::And, Instruction::Or,
12884 Instruction::Xor},
12885 I->getOpcode());
12886 });
12887 if (IsArithmeticExtendedReduction)
12888 Opcode =
12889 Instruction::BitCast; // Handle it by getExtendedReductionCost
12890 else
12891 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12892 }
12893 if (Opcode != Instruction::BitCast) {
12894 auto *SrcVecTy =
12895 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12896 auto *DstVecTy =
12897 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12898 TTI::CastContextHint CCH = getCastContextHint(E);
12899 InstructionCost CastCost;
12900 switch (E.getOpcode()) {
12901 case Instruction::SExt:
12902 case Instruction::ZExt:
12903 case Instruction::Trunc: {
12904 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12905 CCH = getCastContextHint(*OpTE);
12906 break;
12907 }
12908 default:
12909 break;
12910 }
12911 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12913 Cost += CastCost;
12914 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12915 << " for final resize for reduction from " << SrcVecTy
12916 << " to " << DstVecTy << "\n";
12917 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12918 }
12919 }
12920 }
12921
12922#ifndef NDEBUG
12923 SmallString<256> Str;
12924 {
12926 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12927 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12928 << "SLP: Total Cost = " << Cost << ".\n";
12929 }
12930 LLVM_DEBUG(dbgs() << Str);
12931 if (ViewSLPTree)
12932 ViewGraph(this, "SLP" + F->getName(), false, Str);
12933#endif
12934
12935 return Cost;
12936}
12937
12938/// Tries to find extractelement instructions with constant indices from a
12939/// fixed vector type and gathers such instructions into a bunch, which is
12940/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
12941/// attempt was successful, the matched scalars are replaced by poison values
12942/// in \p VL for future analysis.
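/// Illustrative example (hypothetical IR, not taken from this file): if the
/// gathered scalars are
///   %a = extractelement <4 x float> %v, i32 0
///   %b = extractelement <4 x float> %v, i32 2
/// they can be represented as a single-source shuffle of %v with mask
/// <0, 2>, so both scalars are replaced by poison in \p VL and the detected
/// shuffle kind is returned.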
12943std::optional<TTI::ShuffleKind>
12944BoUpSLP::tryToGatherSingleRegisterExtractElements(
12946 // Scan list of gathered scalars for extractelements that can be represented
12947 // as shuffles.
12949 SmallVector<int> UndefVectorExtracts;
12950 for (int I = 0, E = VL.size(); I < E; ++I) {
12951 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12952 if (!EI) {
12953 if (isa<UndefValue>(VL[I]))
12954 UndefVectorExtracts.push_back(I);
12955 continue;
12956 }
12957 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12958 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12959 continue;
12960 std::optional<unsigned> Idx = getExtractIndex(EI);
12961 // Undefined index.
12962 if (!Idx) {
12963 UndefVectorExtracts.push_back(I);
12964 continue;
12965 }
12966 if (Idx >= VecTy->getNumElements()) {
12967 UndefVectorExtracts.push_back(I);
12968 continue;
12969 }
12970 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12971 ExtractMask.reset(*Idx);
12972 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12973 UndefVectorExtracts.push_back(I);
12974 continue;
12975 }
12976 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12977 }
12978 // Sort the vector operands by the maximum number of uses in extractelements.
12980 VectorOpToIdx.takeVector();
12981 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12982 return P1.second.size() > P2.second.size();
12983 });
12984 // Find the best pair of the vectors or a single vector.
12985 const int UndefSz = UndefVectorExtracts.size();
12986 unsigned SingleMax = 0;
12987 unsigned PairMax = 0;
12988 if (!Vectors.empty()) {
12989 SingleMax = Vectors.front().second.size() + UndefSz;
12990 if (Vectors.size() > 1) {
12991 auto *ItNext = std::next(Vectors.begin());
12992 PairMax = SingleMax + ItNext->second.size();
12993 }
12994 }
12995 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12996 return std::nullopt;
12997 // Check whether it is better to perform a shuffle of 2 vectors or just of
12998 // a single vector.
12999 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
13000 SmallVector<Value *> GatheredExtracts(
13001 VL.size(), PoisonValue::get(VL.front()->getType()));
13002 if (SingleMax >= PairMax && SingleMax) {
13003 for (int Idx : Vectors.front().second)
13004 std::swap(GatheredExtracts[Idx], VL[Idx]);
13005 } else if (!Vectors.empty()) {
13006 for (unsigned Idx : {0, 1})
13007 for (int Idx : Vectors[Idx].second)
13008 std::swap(GatheredExtracts[Idx], VL[Idx]);
13009 }
13010 // Add extracts from undefs too.
13011 for (int Idx : UndefVectorExtracts)
13012 std::swap(GatheredExtracts[Idx], VL[Idx]);
13013 // Check that the gather of extractelements can be represented as just a
13014 // shuffle of the single/two vectors the scalars are extracted from.
13015 std::optional<TTI::ShuffleKind> Res =
13016 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13017 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
13018 // TODO: try to check other subsets if possible.
13019 // Restore the original VL if attempt was not successful.
13020 copy(SavedVL, VL.begin());
13021 return std::nullopt;
13022 }
13023 // Restore unused scalars from mask, if some of the extractelements were not
13024 // selected for shuffle.
13025 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
13026 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13027 isa<UndefValue>(GatheredExtracts[I])) {
13028 std::swap(VL[I], GatheredExtracts[I]);
13029 continue;
13030 }
13031 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13032 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13033 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13034 is_contained(UndefVectorExtracts, I))
13035 continue;
13036 }
13037 return Res;
13038}
13039
13040/// Tries to find extractelement instructions with constant indices from a
13041/// fixed vector type and gathers such instructions into a bunch, which is
13042/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
13043/// attempt was successful, the matched scalars are replaced by poison values
13044/// in \p VL for future analysis.
13046BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13048 unsigned NumParts) const {
13049 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
13050 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13051 Mask.assign(VL.size(), PoisonMaskElem);
13052 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13053 for (unsigned Part : seq<unsigned>(NumParts)) {
13054 // Scan list of gathered scalars for extractelements that can be represented
13055 // as shuffles.
13057 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13058 SmallVector<int> SubMask;
13059 std::optional<TTI::ShuffleKind> Res =
13060 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13061 ShufflesRes[Part] = Res;
13062 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13063 }
13064 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13065 return Res.has_value();
13066 }))
13067 ShufflesRes.clear();
13068 return ShufflesRes;
13069}
13070
13071std::optional<TargetTransformInfo::ShuffleKind>
13072BoUpSLP::isGatherShuffledSingleRegisterEntry(
13073 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13074 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13075 Entries.clear();
13076 // TODO: currently checking only for Scalars in the tree entry, need to count
13077 // reused elements too for better cost estimation.
13078 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13079 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13080 : TE->UserTreeIndices.front();
13081 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13082 const BasicBlock *TEInsertBlock = nullptr;
13083 // Main node of PHI entries keeps the correct order of operands/incoming
13084 // blocks.
13085 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13086 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13087 TEInsertPt = TEInsertBlock->getTerminator();
13088 } else {
13089 TEInsertBlock = TEInsertPt->getParent();
13090 }
13091 if (!DT->isReachableFromEntry(TEInsertBlock))
13092 return std::nullopt;
13093 auto *NodeUI = DT->getNode(TEInsertBlock);
13094 assert(NodeUI && "Should only process reachable instructions");
13095 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13096 auto CheckOrdering = [&](const Instruction *InsertPt) {
13097 // Argument InsertPt is an instruction where vector code for some other
13098 // tree entry (one that shares one or more scalars with TE) is going to be
13099 // generated. This lambda returns true if insertion point of vector code
13100 // for the TE dominates that point (otherwise dependency is the other way
13101 // around). The other node is not limited to be of a gather kind. Gather
13102 // nodes are not scheduled and their vector code is inserted before their
13103 // first user. If user is PHI, that is supposed to be at the end of a
13104 // predecessor block. Otherwise it is the last instruction among scalars of
13105 // the user node. So, instead of checking dependency between instructions
13106 // themselves, we check dependency between their insertion points for vector
13107 // code (since each scalar instruction ends up as a lane of a vector
13108 // instruction).
13109 const BasicBlock *InsertBlock = InsertPt->getParent();
13110 auto *NodeEUI = DT->getNode(InsertBlock);
13111 if (!NodeEUI)
13112 return false;
13113 assert((NodeUI == NodeEUI) ==
13114 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13115 "Different nodes should have different DFS numbers");
13116 // Check the order of the gather nodes users.
13117 if (TEInsertPt->getParent() != InsertBlock &&
13118 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13119 return false;
13120 if (TEInsertPt->getParent() == InsertBlock &&
13121 TEInsertPt->comesBefore(InsertPt))
13122 return false;
13123 return true;
13124 };
13125 // Find all tree entries used by the gathered values. If no common entries
13126 // are found - this is not a shuffle.
13127 // Here we build a set of tree nodes for each gathered value and try to
13128 // find the intersection between these sets. If we have at least one common
13129 // tree node for each gathered value - we have just a permutation of a
13130 // single vector. If we have 2 different sets, we are in a situation where
13131 // we have a permutation of 2 input vectors.
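 // Illustrative example (hypothetical, not taken from this file): for a
 // gather of {a, b, c, d}, if every scalar is also found in tree entry E1,
 // UsedTEs stays {{E1}} and the gather is a permutation of E1's vector; if
 // {a, b} only match E1 while {c, d} only match E2, UsedTEs becomes
 // {{E1}, {E2}} and the gather is a permutation of 2 input vectors.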
13133 DenseMap<Value *, int> UsedValuesEntry;
13134 for (Value *V : VL) {
13135 if (isConstant(V))
13136 continue;
13137 // Build a list of tree entries where V is used.
13139 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13140 if (TEPtr == TE || TEPtr->Idx == 0)
13141 continue;
13142 assert(any_of(TEPtr->Scalars,
13143 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13144 "Must contain at least single gathered value.");
13145 assert(TEPtr->UserTreeIndices.size() == 1 &&
13146 "Expected only single user of a gather node.");
13147 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13148
13149 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13150 const Instruction *InsertPt =
13151 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13152 : &getLastInstructionInBundle(UseEI.UserTE);
13153 if (TEInsertPt == InsertPt) {
13154 // If 2 gathers are operands of the same entry (regardless of whether
13155 // the user is a PHI or not), compare the operand indices and use the
13156 // earlier one as the base.
13157 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13158 continue;
13159 // If the user instruction is used for some reason in different
13160 // vectorized nodes - make it depend on index.
13161 if (TEUseEI.UserTE != UseEI.UserTE &&
13162 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13163 continue;
13164 }
13165
13166 // Check if the user node of the TE comes after user node of TEPtr,
13167 // otherwise TEPtr depends on TE.
13168 if ((TEInsertBlock != InsertPt->getParent() ||
13169 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13170 !CheckOrdering(InsertPt))
13171 continue;
13172 VToTEs.insert(TEPtr);
13173 }
13174 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
13175 const TreeEntry *VTE = VTEs.front();
13176 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
13177 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
13178 VTEs = VTEs.drop_front();
13179 // Iterate through all vectorized nodes.
13180 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
13181 return MTE->State == TreeEntry::Vectorize;
13182 });
13183 if (MIt == VTEs.end())
13184 continue;
13185 VTE = *MIt;
13186 }
13187 if (none_of(TE->CombinedEntriesWithIndices,
13188 [&](const auto &P) { return P.first == VTE->Idx; })) {
13189 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13190 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13191 continue;
13192 }
13193 VToTEs.insert(VTE);
13194 }
13195 if (VToTEs.empty())
13196 continue;
13197 if (UsedTEs.empty()) {
13198 // The first iteration, just insert the list of nodes to vector.
13199 UsedTEs.push_back(VToTEs);
13200 UsedValuesEntry.try_emplace(V, 0);
13201 } else {
13202 // Need to check if there are any previously used tree nodes which use V.
13203 // If there are no such nodes, consider that we have another input
13204 // vector.
13205 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13206 unsigned Idx = 0;
13207 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13208 // Do we have a non-empty intersection of previously listed tree entries
13209 // and tree entries using current V?
13210 set_intersect(VToTEs, Set);
13211 if (!VToTEs.empty()) {
13212 // Yes, write the new subset and continue analysis for the next
13213 // scalar.
13214 Set.swap(VToTEs);
13215 break;
13216 }
13217 VToTEs = SavedVToTEs;
13218 ++Idx;
13219 }
13220 // No non-empty intersection found - need to add a second set of possible
13221 // source vectors.
13222 if (Idx == UsedTEs.size()) {
13223 // If the number of input vectors is greater than 2 - not a permutation,
13224 // fall back to the regular gather.
13225 // TODO: support multiple reshuffled nodes.
13226 if (UsedTEs.size() == 2)
13227 continue;
13228 UsedTEs.push_back(SavedVToTEs);
13229 Idx = UsedTEs.size() - 1;
13230 }
13231 UsedValuesEntry.try_emplace(V, Idx);
13232 }
13233 }
13234
13235 if (UsedTEs.empty()) {
13236 Entries.clear();
13237 return std::nullopt;
13238 }
13239
13240 unsigned VF = 0;
13241 if (UsedTEs.size() == 1) {
13242 // Keep the order to avoid non-determinism.
13243 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13244 UsedTEs.front().end());
13245 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13246 return TE1->Idx < TE2->Idx;
13247 });
13248 // Try to find the perfect match in another gather node at first.
13249 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13250 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13251 });
13252 if (It != FirstEntries.end() &&
13253 ((*It)->getVectorFactor() == VL.size() ||
13254 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13255 TE->ReuseShuffleIndices.size() == VL.size() &&
13256 (*It)->isSame(TE->Scalars)))) {
13257 Entries.push_back(*It);
13258 if ((*It)->getVectorFactor() == VL.size()) {
13259 std::iota(std::next(Mask.begin(), Part * VL.size()),
13260 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13261 } else {
13262 SmallVector<int> CommonMask = TE->getCommonMask();
13263 copy(CommonMask, Mask.begin());
13264 }
13265 // Clear undef scalars.
13266 for (unsigned I : seq<unsigned>(VL.size()))
13267 if (isa<PoisonValue>(VL[I]))
13268 Mask[Part * VL.size() + I] = PoisonMaskElem;
13270 }
13271 // No perfect match, just shuffle, so choose the first tree node from the
13272 // tree.
13273 Entries.push_back(FirstEntries.front());
13274 VF = FirstEntries.front()->getVectorFactor();
13275 } else {
13276 // Try to find nodes with the same vector factor.
13277 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13278 // Keep the order of tree nodes to avoid non-determinism.
13280 for (const TreeEntry *TE : UsedTEs.front()) {
13281 unsigned VF = TE->getVectorFactor();
13282 auto It = VFToTE.find(VF);
13283 if (It != VFToTE.end()) {
13284 if (It->second->Idx > TE->Idx)
13285 It->getSecond() = TE;
13286 continue;
13287 }
13288 VFToTE.try_emplace(VF, TE);
13289 }
13290 // Same, keep the order to avoid non-determinism.
13291 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13292 UsedTEs.back().end());
13293 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13294 return TE1->Idx < TE2->Idx;
13295 });
13296 for (const TreeEntry *TE : SecondEntries) {
13297 auto It = VFToTE.find(TE->getVectorFactor());
13298 if (It != VFToTE.end()) {
13299 VF = It->first;
13300 Entries.push_back(It->second);
13301 Entries.push_back(TE);
13302 break;
13303 }
13304 }
13305 // No 2 source vectors with the same vector factor - just choose 2 with max
13306 // index.
13307 if (Entries.empty()) {
13308 Entries.push_back(*llvm::max_element(
13309 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13310 return TE1->Idx < TE2->Idx;
13311 }));
13312 Entries.push_back(SecondEntries.front());
13313 VF = std::max(Entries.front()->getVectorFactor(),
13314 Entries.back()->getVectorFactor());
13315 } else {
13316 VF = Entries.front()->getVectorFactor();
13317 }
13318 }
13319
13320 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13321 // Checks if the 2 PHIs are compatible in terms of how likely they are to
13322 // be vectorized together.
13323 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13324 auto *PHI = cast<PHINode>(V);
13325 auto *PHI1 = cast<PHINode>(V1);
13326 // Check that all incoming values are compatible/from the same parent (if
13327 // they are instructions).
13328 // The incoming values are compatible if they all are constants, or
13329 // instructions with the same/alternate opcodes from the same basic block.
13330 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13331 Value *In = PHI->getIncomingValue(I);
13332 Value *In1 = PHI1->getIncomingValue(I);
13333 if (isConstant(In) && isConstant(In1))
13334 continue;
13335 if (!getSameOpcode({In, In1}, *TLI))
13336 return false;
13337 if (cast<Instruction>(In)->getParent() !=
13338 cast<Instruction>(In1)->getParent())
13339 return false;
13340 }
13341 return true;
13342 };
13343 // Check if the value can be ignored during analysis for shuffled gathers.
13344 // We suppose it is better to ignore instructions which do not form splats,
13345 // are not vectorized/not extractelements (these instructions will be handled
13346 // by extractelement processing) or may form a vector node in the future.
13347 auto MightBeIgnored = [=](Value *V) {
13348 auto *I = dyn_cast<Instruction>(V);
13349 return I && !IsSplatOrUndefs && !isVectorized(I) &&
13351 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13352 };
13353 // Check that the neighbor instruction may form a full vector node with the
13354 // current instruction V. It is possible if they have the same/alternate
13355 // opcode and the same parent basic block.
13356 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13357 Value *V1 = VL[Idx];
13358 bool UsedInSameVTE = false;
13359 auto It = UsedValuesEntry.find(V1);
13360 if (It != UsedValuesEntry.end())
13361 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13362 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13363 getSameOpcode({V, V1}, *TLI) &&
13364 cast<Instruction>(V)->getParent() ==
13365 cast<Instruction>(V1)->getParent() &&
13366 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13367 };
13368 // Build a shuffle mask for better cost estimation and vector emission.
13369 SmallBitVector UsedIdxs(Entries.size());
13371 for (int I = 0, E = VL.size(); I < E; ++I) {
13372 Value *V = VL[I];
13373 auto It = UsedValuesEntry.find(V);
13374 if (It == UsedValuesEntry.end())
13375 continue;
13376 // Do not try to shuffle scalars if they are constants or instructions
13377 // that can be vectorized as a result of the subsequent buildvector
13378 // vectorization.
13379 if (isConstant(V) || (MightBeIgnored(V) &&
13380 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13381 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13382 continue;
13383 unsigned Idx = It->second;
13384 EntryLanes.emplace_back(Idx, I);
13385 UsedIdxs.set(Idx);
13386 }
13387 // Iterate through all shuffled scalars and select entries which can be
13388 // used for the final shuffle.
13390 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13391 if (!UsedIdxs.test(I))
13392 continue;
13393 // Fix the entry number for the given scalar. If it is the first entry, set
13394 // Pair.first to 0, otherwise to 1 (we currently select at most 2 nodes).
13395 // These indices are used as the vector offset when calculating the final
13396 // shuffle mask.
13397 for (std::pair<unsigned, int> &Pair : EntryLanes)
13398 if (Pair.first == I)
13399 Pair.first = TempEntries.size();
13400 TempEntries.push_back(Entries[I]);
13401 }
13402 Entries.swap(TempEntries);
13403 if (EntryLanes.size() == Entries.size() &&
13404 !VL.equals(ArrayRef(TE->Scalars)
13405 .slice(Part * VL.size(),
13406 std::min<int>(VL.size(), TE->Scalars.size())))) {
13407 // We may have only 1 or 2 entries here. If the number of scalars is equal
13408 // to the number of entries, there is no need to do the analysis - it is not
13409 // very profitable. Since VL is not the same as TE->Scalars, we already
13410 // have some shuffles before this point. Cut off this unprofitable case.
13411 Entries.clear();
13412 return std::nullopt;
13413 }
13414 // Build the final mask, check for the identity shuffle, if possible.
13415 bool IsIdentity = Entries.size() == 1;
13416 // Pair.first is the offset into the vector, while Pair.second is the index
13417 // of the scalar in the list.
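 // Illustrative example (hypothetical, not taken from this file): with
 // VF == 4, an EntryLanes element {1, 2} maps the scalar VL[2] to mask
 // element 1 * 4 + lane-of-VL[2]-in-Entries[1], i.e. into the second source
 // vector of the final shuffle.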
13418 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13419 unsigned Idx = Part * VL.size() + Pair.second;
13420 Mask[Idx] =
13421 Pair.first * VF +
13422 (ForOrder ? std::distance(
13423 Entries[Pair.first]->Scalars.begin(),
13424 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13425 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13426 IsIdentity &= Mask[Idx] == Pair.second;
13427 }
13428 if (ForOrder || IsIdentity || Entries.empty()) {
13429 switch (Entries.size()) {
13430 case 1:
13431 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13433 break;
13434 case 2:
13435 if (EntryLanes.size() > 2 || VL.size() <= 2)
13437 break;
13438 default:
13439 break;
13440 }
13441 } else if (!isa<VectorType>(VL.front()->getType()) &&
13442 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13443 // Do the cost estimation if the shuffle is more beneficial than a buildvector.
13444 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13445 std::next(Mask.begin(), (Part + 1) * VL.size()));
13446 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13447 for (int Idx : SubMask) {
13448 if (Idx == PoisonMaskElem)
13449 continue;
13450 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13451 MinElement = Idx;
13452 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13453 MaxElement = Idx;
13454 }
13455 assert(MaxElement >= 0 && MinElement >= 0 &&
13456 MaxElement % VF >= MinElement % VF &&
13457 "Expected at least single element.");
13458 unsigned NewVF = std::max<unsigned>(
13459 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13460 (MaxElement % VF) -
13461 (MinElement % VF) + 1));
13462 if (NewVF < VF) {
13463 for_each(SubMask, [&](int &Idx) {
13464 if (Idx == PoisonMaskElem)
13465 return;
13466 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13467 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13468 });
13469 } else {
13470 NewVF = VF;
13471 }
13472
13473 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13474 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
13475 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13476 auto GetShuffleCost = [&,
13477 &TTI = *TTI](ArrayRef<int> Mask,
13478 ArrayRef<const TreeEntry *> Entries,
13479 VectorType *VecTy) -> InstructionCost {
13480 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13481 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13482 Mask, Entries.front()->getInterleaveFactor()))
13483 return TTI::TCC_Free;
13484 return ::getShuffleCost(TTI,
13485 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13486 : TTI::SK_PermuteSingleSrc,
13487 VecTy, Mask, CostKind);
13488 };
13489 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13490 InstructionCost FirstShuffleCost = 0;
13491 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13492 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13493 FirstShuffleCost = ShuffleCost;
13494 } else {
13495 // Transform mask to include only the first entry.
13496 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13497 bool IsIdentity = true;
13498 for (auto [I, Idx] : enumerate(FirstMask)) {
13499 if (Idx >= static_cast<int>(NewVF)) {
13500 Idx = PoisonMaskElem;
13501 } else {
13502 DemandedElts.clearBit(I);
13503 if (Idx != PoisonMaskElem)
13504 IsIdentity &= static_cast<int>(I) == Idx;
13505 }
13506 }
13507 if (!IsIdentity)
13508 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13509 FirstShuffleCost += TTI->getScalarizationOverhead(
13510 MaskVecTy, DemandedElts, /*Insert=*/true,
13511 /*Extract=*/false, CostKind);
13512 }
13513 InstructionCost SecondShuffleCost = 0;
13514 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13515 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13516 SecondShuffleCost = ShuffleCost;
13517 } else {
13518 // Transform mask to include only the second entry.
13519 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13520 bool IsIdentity = true;
13521 for (auto [I, Idx] : enumerate(SecondMask)) {
13522 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
13523 Idx = PoisonMaskElem;
13524 } else {
13525 DemandedElts.clearBit(I);
13526 if (Idx != PoisonMaskElem) {
13527 Idx -= NewVF;
13528 IsIdentity &= static_cast<int>(I) == Idx;
13529 }
13530 }
13531 }
13532 if (!IsIdentity)
13533 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13534 SecondShuffleCost += TTI->getScalarizationOverhead(
13535 MaskVecTy, DemandedElts, /*Insert=*/true,
13536 /*Extract=*/false, CostKind);
13537 }
13538 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13539 for (auto [I, Idx] : enumerate(SubMask))
13540 if (Idx == PoisonMaskElem)
13541 DemandedElts.clearBit(I);
13542 InstructionCost BuildVectorCost =
13543 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13544 /*Extract=*/false, CostKind);
13545 const TreeEntry *BestEntry = nullptr;
13546 if (FirstShuffleCost < ShuffleCost) {
13547 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13548 std::next(Mask.begin(), (Part + 1) * VL.size()),
13549 [&](int &Idx) {
13550 if (Idx >= static_cast<int>(VF))
13551 Idx = PoisonMaskElem;
13552 });
13553 BestEntry = Entries.front();
13554 ShuffleCost = FirstShuffleCost;
13555 }
13556 if (SecondShuffleCost < ShuffleCost) {
13557 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13558 std::next(Mask.begin(), (Part + 1) * VL.size()),
13559 [&](int &Idx) {
13560 if (Idx < static_cast<int>(VF))
13561 Idx = PoisonMaskElem;
13562 else
13563 Idx -= VF;
13564 });
13565 BestEntry = Entries[1];
13566 ShuffleCost = SecondShuffleCost;
13567 }
13568 if (BuildVectorCost >= ShuffleCost) {
13569 if (BestEntry) {
13570 Entries.clear();
13571 Entries.push_back(BestEntry);
13572 }
13573 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13574 : TargetTransformInfo::SK_PermuteSingleSrc;
13575 }
13576 }
13577 Entries.clear();
13578 // Clear the corresponding mask elements.
13579 std::fill(std::next(Mask.begin(), Part * VL.size()),
13580 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13581 return std::nullopt;
13582}
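// Illustrative example for the function above (assumed values): if VL is
// {%a, %b, %c, %d} and two already-built tree entries hold %a, %b and %c, %d
// respectively, Entries may be filled with those two nodes, the mask slice set
// to <0, 1, VF + 0, VF + 1>, and TTI::SK_PermuteTwoSrc returned.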
13583
13584SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
13585BoUpSLP::isGatherShuffledEntry(
13586 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13587 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13588 bool ForOrder) {
13589 assert(NumParts > 0 && NumParts < VL.size() &&
13590 "Expected positive number of registers.");
13591 Entries.clear();
13592 // No need to check for the topmost gather node.
13593 if (TE == VectorizableTree.front().get() &&
13594 (!GatheredLoadsEntriesFirst.has_value() ||
13595 none_of(ArrayRef(VectorizableTree).drop_front(),
13596 [](const std::unique_ptr<TreeEntry> &TE) {
13597 return !TE->isGather();
13598 })))
13599 return {};
13600 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13601 // implemented yet.
13602 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
13603 return {};
13604 Mask.assign(VL.size(), PoisonMaskElem);
13605 assert((TE->UserTreeIndices.size() == 1 ||
13606 TE == VectorizableTree.front().get()) &&
13607 "Expected only single user of the gather node.");
13608 assert(VL.size() % NumParts == 0 &&
13609 "Number of scalars must be divisible by NumParts.");
13610 if (!TE->UserTreeIndices.empty() &&
13611 TE->UserTreeIndices.front().UserTE->isGather() &&
13612 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13613 assert(
13614 (TE->Idx == 0 ||
13615 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
13616 isSplat(TE->Scalars)) &&
13617 "Expected splat or extractelements only node.");
13618 return {};
13619 }
13620 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13621 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13622 for (unsigned Part : seq<unsigned>(NumParts)) {
13623 ArrayRef<Value *> SubVL =
13624 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13625 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13626 std::optional<TTI::ShuffleKind> SubRes =
13627 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13628 ForOrder);
13629 if (!SubRes)
13630 SubEntries.clear();
13631 Res.push_back(SubRes);
13632 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13633 SubEntries.front()->getVectorFactor() == VL.size() &&
13634 (SubEntries.front()->isSame(TE->Scalars) ||
13635 SubEntries.front()->isSame(VL))) {
13636 SmallVector<const TreeEntry *> LocalSubEntries;
13637 LocalSubEntries.swap(SubEntries);
13638 Entries.clear();
13639 Res.clear();
13640 std::iota(Mask.begin(), Mask.end(), 0);
13641 // Clear undef scalars.
13642 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13643 if (isa<PoisonValue>(VL[I]))
13644 Mask[I] = PoisonMaskElem;
13645 Entries.emplace_back(1, LocalSubEntries.front());
13646 Res.push_back(TTI::SK_PermuteSingleSrc);
13647 return Res;
13648 }
13649 }
13650 if (all_of(Res,
13651 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13652 Entries.clear();
13653 return {};
13654 }
13655 return Res;
13656}
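// Minimal sketch of the per-part processing above: with VL.size() == 8 and
// NumParts == 2, SliceSize is 4, so scalars [0..3] and [4..7] are matched
// against existing tree entries independently, yielding one optional shuffle
// kind per part in the returned vector.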
13657
13658InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13659 Type *ScalarTy) const {
13660 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13661 bool DuplicateNonConst = false;
13662 // Find the cost of inserting/extracting values from the vector.
13663 // Check if the same elements are inserted several times and count them as
13664 // shuffle candidates.
13665 APInt ShuffledElements = APInt::getZero(VL.size());
13666 DenseMap<Value *, unsigned> UniqueElements;
13667 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13668 InstructionCost Cost;
13669 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13670 if (V->getType() != ScalarTy) {
13671 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13672 TTI::CastContextHint::None, CostKind);
13673 V = nullptr;
13674 }
13675 if (!ForPoisonSrc)
13676 Cost +=
13677 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13678 I, Constant::getNullValue(VecTy), V);
13679 };
13680 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13681 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13682 Value *V = VL[I];
13683 // No need to shuffle duplicates for constants.
13684 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13685 ShuffledElements.setBit(I);
13686 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13687 continue;
13688 }
13689
13690 auto Res = UniqueElements.try_emplace(V, I);
13691 if (Res.second) {
13692 EstimateInsertCost(I, V);
13693 ShuffleMask[I] = I;
13694 continue;
13695 }
13696
13697 DuplicateNonConst = true;
13698 ShuffledElements.setBit(I);
13699 ShuffleMask[I] = Res.first->second;
13700 }
13701 if (ForPoisonSrc) {
13702 if (isa<FixedVectorType>(ScalarTy)) {
13703 assert(SLPReVec && "Only supported by REVEC.");
13704 // We don't need to insert elements one by one. Instead, we can insert the
13705 // entire vector into the destination.
13706 Cost = 0;
13707 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13708 for (unsigned I : seq<unsigned>(VL.size()))
13709 if (!ShuffledElements[I])
13710 Cost += ::getShuffleCost(*TTI,
13711 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13712 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13713 } else {
13714 Cost = TTI->getScalarizationOverhead(VecTy,
13715 /*DemandedElts*/ ~ShuffledElements,
13716 /*Insert*/ true,
13717 /*Extract*/ false, CostKind, VL);
13718 }
13719 }
13720 if (DuplicateNonConst)
13721 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13722 VecTy, ShuffleMask);
13723 return Cost;
13724}
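// Rough illustration of the cost model above (hypothetical scalars): for
// VL = {%x, %y, %x, 7} with a poison source, the constant 7 is effectively
// free in this estimate, %x and %y are accounted as element insertions, and
// the repeated %x only contributes to a final single-source permute instead of
// another insertion.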
13725
13726Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13727 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13728 if (Res)
13729 return *Res;
13730 // Get the basic block this bundle is in. All instructions in the bundle
13731 // should be in this block (except for extractelement-like instructions with
13732 // constant indices or gathered loads).
13733 auto *Front = E->getMainOp();
13734 auto *BB = Front->getParent();
13735 assert(((GatheredLoadsEntriesFirst.has_value() &&
13736 E->getOpcode() == Instruction::Load && E->isGather() &&
13737 E->Idx < *GatheredLoadsEntriesFirst) ||
13738 all_of(E->Scalars,
13739 [=](Value *V) -> bool {
13740 if (E->getOpcode() == Instruction::GetElementPtr &&
13741 !isa<GetElementPtrInst>(V))
13742 return true;
13743 auto *I = dyn_cast<Instruction>(V);
13744 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13745 isVectorLikeInstWithConstOps(I);
13746 })) &&
13747 "Expected gathered loads or GEPs or instructions from same basic "
13748 "block.");
13749
13750 auto FindLastInst = [&]() {
13751 Instruction *LastInst = Front;
13752 for (Value *V : E->Scalars) {
13753 auto *I = dyn_cast<Instruction>(V);
13754 if (!I)
13755 continue;
13756 if (LastInst->getParent() == I->getParent()) {
13757 if (LastInst->comesBefore(I))
13758 LastInst = I;
13759 continue;
13760 }
13761 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13762 !isa<GetElementPtrInst>(I)) ||
13763 (isVectorLikeInstWithConstOps(LastInst) &&
13764 isVectorLikeInstWithConstOps(I)) ||
13765 (GatheredLoadsEntriesFirst.has_value() &&
13766 E->getOpcode() == Instruction::Load && E->isGather() &&
13767 E->Idx < *GatheredLoadsEntriesFirst)) &&
13768 "Expected vector-like or non-GEP in GEP node insts only.");
13769 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13770 LastInst = I;
13771 continue;
13772 }
13773 if (!DT->isReachableFromEntry(I->getParent()))
13774 continue;
13775 auto *NodeA = DT->getNode(LastInst->getParent());
13776 auto *NodeB = DT->getNode(I->getParent());
13777 assert(NodeA && "Should only process reachable instructions");
13778 assert(NodeB && "Should only process reachable instructions");
13779 assert((NodeA == NodeB) ==
13780 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13781 "Different nodes should have different DFS numbers");
13782 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13783 LastInst = I;
13784 }
13785 BB = LastInst->getParent();
13786 return LastInst;
13787 };
13788
13789 auto FindFirstInst = [&]() {
13790 Instruction *FirstInst = Front;
13791 for (Value *V : E->Scalars) {
13792 auto *I = dyn_cast<Instruction>(V);
13793 if (!I)
13794 continue;
13795 if (FirstInst->getParent() == I->getParent()) {
13796 if (I->comesBefore(FirstInst))
13797 FirstInst = I;
13798 continue;
13799 }
13800 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13801 !isa<GetElementPtrInst>(I)) ||
13802 (isVectorLikeInstWithConstOps(FirstInst) &&
13803 isVectorLikeInstWithConstOps(I))) &&
13804 "Expected vector-like or non-GEP in GEP node insts only.");
13805 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13806 FirstInst = I;
13807 continue;
13808 }
13809 if (!DT->isReachableFromEntry(I->getParent()))
13810 continue;
13811 auto *NodeA = DT->getNode(FirstInst->getParent());
13812 auto *NodeB = DT->getNode(I->getParent());
13813 assert(NodeA && "Should only process reachable instructions");
13814 assert(NodeB && "Should only process reachable instructions");
13815 assert((NodeA == NodeB) ==
13816 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13817 "Different nodes should have different DFS numbers");
13818 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13819 FirstInst = I;
13820 }
13821 return FirstInst;
13822 };
13823
13824 // Set the insert point for gathered loads to the very first load.
13825 if (GatheredLoadsEntriesFirst.has_value() &&
13826 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13827 E->getOpcode() == Instruction::Load) {
13828 Res = FindFirstInst();
13829 return *Res;
13830 }
13831
13832 // Set the insert point to the beginning of the basic block if the entry
13833 // should not be scheduled.
13834 if (doesNotNeedToSchedule(E->Scalars) ||
13835 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13836 if ((E->getOpcode() == Instruction::GetElementPtr &&
13837 any_of(E->Scalars,
13838 [](Value *V) {
13839 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13840 })) ||
13841 all_of(E->Scalars,
13842 [](Value *V) {
13843 return isa<PoisonValue>(V) ||
13844 (!isVectorLikeInstWithConstOps(V) &&
13845 isUsedOutsideBlock(V));
13846 }) ||
13847 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13848 return isa<ExtractElementInst, UndefValue>(V) ||
13849 areAllOperandsNonInsts(V);
13850 })))
13851 Res = FindLastInst();
13852 else
13853 Res = FindFirstInst();
13854 return *Res;
13855 }
13856
13857 // Find the last instruction. The common case should be that BB has been
13858 // scheduled, and the last instruction is VL.back(). So we start with
13859 // VL.back() and iterate over schedule data until we reach the end of the
13860 // bundle. The end of the bundle is marked by null ScheduleData.
13861 if (BlocksSchedules.count(BB) && !E->isGather()) {
13862 Value *V = E->isOneOf(E->Scalars.back());
13863 if (doesNotNeedToBeScheduled(V))
13864 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13865 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13866 if (Bundle && Bundle->isPartOfBundle())
13867 for (; Bundle; Bundle = Bundle->NextInBundle)
13868 Res = Bundle->Inst;
13869 }
13870
13871 // LastInst can still be null at this point if there's either not an entry
13872 // for BB in BlocksSchedules or there's no ScheduleData available for
13873 // VL.back(). This can be the case if buildTree_rec aborts for various
13874 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13875 // size is reached, etc.). ScheduleData is initialized in the scheduling
13876 // "dry-run".
13877 //
13878 // If this happens, we can still find the last instruction by brute force. We
13879 // iterate forwards from Front (inclusive) until we either see all
13880 // instructions in the bundle or reach the end of the block. If Front is the
13881 // last instruction in program order, LastInst will be set to Front, and we
13882 // will visit all the remaining instructions in the block.
13883 //
13884 // One of the reasons we exit early from buildTree_rec is to place an upper
13885 // bound on compile-time. Thus, taking an additional compile-time hit here is
13886 // not ideal. However, this should be exceedingly rare since it requires that
13887 // we both exit early from buildTree_rec and that the bundle be out-of-order
13888 // (causing us to iterate all the way to the end of the block).
13889 if (!Res)
13890 Res = FindLastInst();
13891 assert(Res && "Failed to find last instruction in bundle");
13892 return *Res;
13893}
13894
13895void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13896 auto *Front = E->getMainOp();
13897 Instruction *LastInst = &getLastInstructionInBundle(E);
13898 assert(LastInst && "Failed to find last instruction in bundle");
13899 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13900 // If the instruction is PHI, set the insert point after all the PHIs.
13901 bool IsPHI = isa<PHINode>(LastInst);
13902 if (IsPHI)
13903 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13904 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13905 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13906 } else {
13907 // Set the insertion point after the last instruction in the bundle. Set the
13908 // debug location to Front.
13909 Builder.SetInsertPoint(
13910 LastInst->getParent(),
13911 LastInst->getNextNonDebugInstruction()->getIterator());
13912 }
13913 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13914}
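// For instance (illustrative only): if the bundle ends with a PHI, the builder
// is positioned at the first non-PHI instruction of the block; for a bundle
// that needs scheduling it is positioned right after the last bundle
// instruction. In both cases the debug location is taken from the bundle's
// main operation.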
13915
13916Value *BoUpSLP::gather(
13917 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13918 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13919 // List of instructions/lanes from the current block and/or the blocks which
13920 // are part of the current loop. These instructions will be inserted at the
13921 // end to make it possible to optimize loops and hoist invariant instructions
13922 // out of the loop's body with better chances for success.
13923 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
13924 SmallSet<int, 4> PostponedIndices;
13925 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13926 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13927 SmallPtrSet<BasicBlock *, 4> Visited;
13928 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13929 InsertBB = InsertBB->getSinglePredecessor();
13930 return InsertBB && InsertBB == InstBB;
13931 };
13932 for (int I = 0, E = VL.size(); I < E; ++I) {
13933 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13934 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13935 isVectorized(Inst) ||
13936 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13937 PostponedIndices.insert(I).second)
13938 PostponedInsts.emplace_back(Inst, I);
13939 }
13940
13941 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13942 Type *Ty) {
13943 Value *Scalar = V;
13944 if (Scalar->getType() != Ty) {
13945 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13946 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13947 Value *V = Scalar;
13948 if (auto *CI = dyn_cast<CastInst>(Scalar);
13949 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13950 Value *Op = CI->getOperand(0);
13951 if (auto *IOp = dyn_cast<Instruction>(Op);
13952 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
13953 V = Op;
13954 }
13955 Scalar = Builder.CreateIntCast(
13956 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13957 }
13958
13959 Instruction *InsElt;
13960 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13961 assert(SLPReVec && "FixedVectorType is not expected.");
13962 Vec =
13963 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
13964 auto *II = dyn_cast<IntrinsicInst>(Vec);
13965 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13966 return Vec;
13967 InsElt = II;
13968 } else {
13969 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13970 InsElt = dyn_cast<InsertElementInst>(Vec);
13971 if (!InsElt)
13972 return Vec;
13973 }
13974 GatherShuffleExtractSeq.insert(InsElt);
13975 CSEBlocks.insert(InsElt->getParent());
13976 // Add to our 'need-to-extract' list.
13977 if (isa<Instruction>(V)) {
13978 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
13979 // Find which lane we need to extract.
13980 User *UserOp = nullptr;
13981 if (Scalar != V) {
13982 if (auto *SI = dyn_cast<Instruction>(Scalar))
13983 UserOp = SI;
13984 } else {
13985 UserOp = InsElt;
13986 }
13987 if (UserOp) {
13988 unsigned FoundLane = Entries.front()->findLaneForValue(V);
13989 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
13990 }
13991 }
13992 }
13993 return Vec;
13994 };
13995 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13996 Value *Vec = PoisonValue::get(VecTy);
13997 SmallVector<int> NonConsts;
13998 SmallVector<int> Mask(VL.size());
13999 std::iota(Mask.begin(), Mask.end(), 0);
14000 Value *OriginalRoot = Root;
14001 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14002 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14003 SV->getOperand(0)->getType() == VecTy) {
14004 Root = SV->getOperand(0);
14005 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14006 }
14007 // Insert constant values first.
14008 for (int I = 0, E = VL.size(); I < E; ++I) {
14009 if (PostponedIndices.contains(I))
14010 continue;
14011 if (!isConstant(VL[I])) {
14012 NonConsts.push_back(I);
14013 continue;
14014 }
14015 if (isa<PoisonValue>(VL[I]))
14016 continue;
14017 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14018 Mask[I] = I + E;
14019 }
14020 if (Root) {
14021 if (isa<PoisonValue>(Vec)) {
14022 Vec = OriginalRoot;
14023 } else {
14024 Vec = CreateShuffle(Root, Vec, Mask);
14025 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14026 OI && OI->hasNUses(0) &&
14027 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14028 return TE->VectorizedValue == OI;
14029 }))
14030 eraseInstruction(OI);
14031 }
14032 }
14033 // Insert non-constant values.
14034 for (int I : NonConsts)
14035 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14036 // Append instructions, which are or may be part of the loop, at the end to
14037 // make it possible to hoist non-loop-based instructions.
14038 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14039 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14040
14041 return Vec;
14042}
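// A minimal sketch of the sequence emitted above (assuming i32 scalars, no
// Root and no postponed loop values): gathering {1, %x, 2, %y} first inserts
// the constants and then the non-constant scalars, e.g.
// \code
//   %v0 = insertelement <4 x i32> poison, i32 1, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 2, i32 2
//   %v2 = insertelement <4 x i32> %v1, i32 %x, i32 1
//   %v3 = insertelement <4 x i32> %v2, i32 %y, i32 3
// \endcode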
14043
14044/// Merges shuffle masks and emits final shuffle instruction, if required. It
14045/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
14046/// the actual shuffle instruction is generated only if it is actually
14047/// required. Otherwise, the shuffle instruction emission is delayed till the
14048/// end of the process, to reduce the number of emitted instructions and enable
14049/// further analysis/transformations.
14050/// The class will also look through the previously emitted shuffle instructions
14051/// and properly mark indices in the mask as undef.
14052/// For example, given the code
14053/// \code
14054/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14055/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14056/// \endcode
14057/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14058/// look through %s1 and %s2 and emit
14059/// \code
14060/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14061/// \endcode
14062/// instead.
14063/// If 2 operands are of different size, the smallest one will be resized and
14064/// the mask recalculated properly.
14065/// For example, given the code
14066/// \code
14067/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14068/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14069/// \endcode
14070/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14071/// look through %s1 and %s2 and emit
14072/// \code
14073/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14074/// \endcode
14075/// instead.
14076class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
14077 bool IsFinalized = false;
14078 /// Combined mask for all applied operands and masks. It is built during
14079 /// analysis and actual emission of shuffle vector instructions.
14080 SmallVector<int> CommonMask;
14081 /// List of operands for the shuffle vector instruction. It holds at most 2
14082 /// operands; if a 3rd is going to be added, the first 2 are combined into a
14083 /// shuffle with the \p CommonMask mask, the first operand is set to the
14084 /// resulting shuffle and the second operand is set to the newly added
14085 /// operand. The \p CommonMask is transformed in the proper way after that.
14086 SmallVector<Value *, 2> InVectors;
14087 IRBuilderBase &Builder;
14088 BoUpSLP &R;
14089
14090 class ShuffleIRBuilder {
14091 IRBuilderBase &Builder;
14092 /// Holds all of the instructions that we gathered.
14093 SetVector<Instruction *> &GatherShuffleExtractSeq;
14094 /// A list of blocks that we are going to CSE.
14095 DenseSet<BasicBlock *> &CSEBlocks;
14096 /// Data layout.
14097 const DataLayout &DL;
14098
14099 public:
14100 ShuffleIRBuilder(IRBuilderBase &Builder,
14101 SetVector<Instruction *> &GatherShuffleExtractSeq,
14102 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14103 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14104 CSEBlocks(CSEBlocks), DL(DL) {}
14105 ~ShuffleIRBuilder() = default;
14106 /// Creates shufflevector for the 2 operands with the given mask.
14107 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14108 if (V1->getType() != V2->getType()) {
14109 assert(V2->getType()->isIntOrIntVectorTy() &&
14110 V1->getType()->isIntOrIntVectorTy() &&
14111 "Expected integer vector types only.");
14112 if (V1->getType() != V2->getType()) {
14113 if (cast<VectorType>(V2->getType())
14114 ->getElementType()
14115 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14116 ->getElementType()
14117 ->getIntegerBitWidth())
14118 V2 = Builder.CreateIntCast(
14119 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14120 else
14121 V1 = Builder.CreateIntCast(
14122 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14123 }
14124 }
14125 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14126 if (auto *I = dyn_cast<Instruction>(Vec)) {
14127 GatherShuffleExtractSeq.insert(I);
14128 CSEBlocks.insert(I->getParent());
14129 }
14130 return Vec;
14131 }
14132 /// Creates permutation of the single vector operand with the given mask, if
14133 /// it is not identity mask.
14134 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14135 if (Mask.empty())
14136 return V1;
14137 unsigned VF = Mask.size();
14138 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14139 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14140 return V1;
14141 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14142 if (auto *I = dyn_cast<Instruction>(Vec)) {
14143 GatherShuffleExtractSeq.insert(I);
14144 CSEBlocks.insert(I->getParent());
14145 }
14146 return Vec;
14147 }
14148 Value *createIdentity(Value *V) { return V; }
14149 Value *createPoison(Type *Ty, unsigned VF) {
14150 return PoisonValue::get(getWidenedType(Ty, VF));
14151 }
14152 /// Resizes 2 input vectors to match their sizes, if they are not equal
14153 /// yet. The smallest vector is resized to the size of the larger vector.
14154 void resizeToMatch(Value *&V1, Value *&V2) {
14155 if (V1->getType() == V2->getType())
14156 return;
14157 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14158 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14159 int VF = std::max(V1VF, V2VF);
14160 int MinVF = std::min(V1VF, V2VF);
14161 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14162 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14163 0);
14164 Value *&Op = MinVF == V1VF ? V1 : V2;
14165 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14166 if (auto *I = dyn_cast<Instruction>(Op)) {
14167 GatherShuffleExtractSeq.insert(I);
14168 CSEBlocks.insert(I->getParent());
14169 }
14170 if (MinVF == V1VF)
14171 V1 = Op;
14172 else
14173 V2 = Op;
14174 }
14175 };
14176
14177 /// Smart shuffle instruction emission; walks through shuffle trees and
14178 /// tries to find the best matching vector for the actual shuffle
14179 /// instruction.
14180 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14181 assert(V1 && "Expected at least one vector value.");
14182 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14183 R.CSEBlocks, *R.DL);
14184 return BaseShuffleAnalysis::createShuffle<Value *>(
14185 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14186 }
14187
14188 /// Cast value \p V to the vector type with the same number of elements, but
14189 /// the base type \p ScalarTy.
14190 Value *castToScalarTyElem(Value *V,
14191 std::optional<bool> IsSigned = std::nullopt) {
14192 auto *VecTy = cast<VectorType>(V->getType());
14193 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14194 if (VecTy->getElementType() == ScalarTy->getScalarType())
14195 return V;
14196 return Builder.CreateIntCast(
14197 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14198 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14199 }
14200
14201public:
14202 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
14203 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14204
14205 /// Adjusts extractelements after reusing them.
14206 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14207 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14208 unsigned NumParts, bool &UseVecBaseAsInput) {
14209 UseVecBaseAsInput = false;
14210 SmallPtrSet<Value *, 4> UniqueBases;
14211 Value *VecBase = nullptr;
14212 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14213 if (!E->ReorderIndices.empty()) {
14214 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14215 E->ReorderIndices.end());
14216 reorderScalars(VL, ReorderMask);
14217 }
14218 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14219 int Idx = Mask[I];
14220 if (Idx == PoisonMaskElem)
14221 continue;
14222 auto *EI = cast<ExtractElementInst>(VL[I]);
14223 VecBase = EI->getVectorOperand();
14224 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
14225 VecBase = TEs.front()->VectorizedValue;
14226 assert(VecBase && "Expected vectorized value.");
14227 UniqueBases.insert(VecBase);
14228 // If the only use is vectorized - the extractelement itself can be
14229 // deleted.
14230 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14231 (NumParts != 1 && count(VL, EI) > 1) ||
14232 any_of(EI->users(), [&](User *U) {
14233 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
14234 return UTEs.empty() || UTEs.size() > 1 ||
14235 (isa<GetElementPtrInst>(U) &&
14236 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14237 (!UTEs.empty() &&
14238 count_if(R.VectorizableTree,
14239 [&](const std::unique_ptr<TreeEntry> &TE) {
14240 return any_of(TE->UserTreeIndices,
14241 [&](const EdgeInfo &Edge) {
14242 return Edge.UserTE ==
14243 UTEs.front();
14244 }) &&
14245 is_contained(VL, EI);
14246 }) != 1);
14247 }))
14248 continue;
14249 R.eraseInstruction(EI);
14250 }
14251 if (NumParts == 1 || UniqueBases.size() == 1) {
14252 assert(VecBase && "Expected vectorized value.");
14253 return castToScalarTyElem(VecBase);
14254 }
14255 UseVecBaseAsInput = true;
14256 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14257 for (auto [I, Idx] : enumerate(Mask))
14258 if (Idx != PoisonMaskElem)
14259 Idx = I;
14260 };
14261 // Perform a multi-register vector shuffle, joining the parts into a single
14262 // virtual long vector.
14263 // Need to shuffle each part independently and then insert all these parts
14264 // into a long virtual vector register, forming the original vector.
14265 Value *Vec = nullptr;
14266 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14267 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14268 for (unsigned Part : seq<unsigned>(NumParts)) {
14269 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14270 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14271 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14272 constexpr int MaxBases = 2;
14273 SmallVector<Value *, MaxBases> Bases(MaxBases);
14274 auto VLMask = zip(SubVL, SubMask);
14275 const unsigned VF = std::accumulate(
14276 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14277 if (std::get<1>(D) == PoisonMaskElem)
14278 return S;
14279 Value *VecOp =
14280 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14281 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
14282 !TEs.empty())
14283 VecOp = TEs.front()->VectorizedValue;
14284 assert(VecOp && "Expected vectorized value.");
14285 const unsigned Size =
14286 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14287 return std::max(S, Size);
14288 });
14289 for (const auto [V, I] : VLMask) {
14290 if (I == PoisonMaskElem)
14291 continue;
14292 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14293 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
14294 VecOp = TEs.front()->VectorizedValue;
14295 assert(VecOp && "Expected vectorized value.");
14296 VecOp = castToScalarTyElem(VecOp);
14297 Bases[I / VF] = VecOp;
14298 }
14299 if (!Bases.front())
14300 continue;
14301 Value *SubVec;
14302 if (Bases.back()) {
14303 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14304 TransformToIdentity(SubMask);
14305 } else {
14306 SubVec = Bases.front();
14307 }
14308 if (!Vec) {
14309 Vec = SubVec;
14310 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14311 [&](unsigned P) {
14312 ArrayRef<int> SubMask =
14313 Mask.slice(P * SliceSize,
14314 getNumElems(Mask.size(),
14315 SliceSize, P));
14316 return all_of(SubMask, [](int Idx) {
14317 return Idx == PoisonMaskElem;
14318 });
14319 })) &&
14320 "Expected first part or all previous parts masked.");
14321 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14322 } else {
14323 unsigned NewVF =
14324 cast<FixedVectorType>(Vec->getType())->getNumElements();
14325 if (Vec->getType() != SubVec->getType()) {
14326 unsigned SubVecVF =
14327 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14328 NewVF = std::max(NewVF, SubVecVF);
14329 }
14330 // Adjust SubMask.
14331 for (int &Idx : SubMask)
14332 if (Idx != PoisonMaskElem)
14333 Idx += NewVF;
14334 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14335 Vec = createShuffle(Vec, SubVec, VecMask);
14336 TransformToIdentity(VecMask);
14337 }
14338 }
14339 copy(VecMask, Mask.begin());
14340 return Vec;
14341 }
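// Sketch of the multi-register case handled above: with NumParts == 2 and the
// extracts of each part coming from different wide vectors, every part is
// first shuffled from its own base(s) and the sub-results are then joined with
// an extra shuffle, while Mask is rewritten into an identity-like form for the
// joined vector.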
14342 /// Checks if the specified entry \p E needs to be delayed because of its
14343 /// dependency nodes.
14344 std::optional<Value *>
14345 needToDelay(const TreeEntry *E,
14346 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14347 // No need to delay emission if all deps are ready.
14348 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14349 return all_of(
14350 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14351 }))
14352 return std::nullopt;
14353 // Postpone gather emission, will be emitted after the end of the
14354 // process to keep correct order.
14355 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14356 return Builder.CreateAlignedLoad(
14357 ResVecTy,
14358 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14359 MaybeAlign());
14360 }
14361 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
14362 /// shuffling.
14363 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14364 Value *V1 = E1.VectorizedValue;
14365 if (V1->getType()->isIntOrIntVectorTy())
14366 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14367 if (isa<PoisonValue>(V))
14368 return false;
14369 return !isKnownNonNegative(
14370 V, SimplifyQuery(*R.DL));
14371 }));
14372 Value *V2 = E2.VectorizedValue;
14373 if (V2->getType()->isIntOrIntVectorTy())
14374 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14375 if (isa<PoisonValue>(V))
14376 return false;
14377 return !isKnownNonNegative(
14378 V, SimplifyQuery(*R.DL));
14379 }));
14380 add(V1, V2, Mask);
14381 }
14382 /// Adds a single input vector (in the form of a tree entry) and the mask for its
14383 /// shuffling.
14384 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14385 Value *V1 = E1.VectorizedValue;
14386 if (V1->getType()->isIntOrIntVectorTy())
14387 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14388 if (isa<PoisonValue>(V))
14389 return false;
14390 return !isKnownNonNegative(
14391 V, SimplifyQuery(*R.DL));
14392 }));
14393 add(V1, Mask);
14394 }
14395 /// Adds 2 input vectors and the mask for their shuffling.
14396 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14397 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14398 assert(isa<FixedVectorType>(V1->getType()) &&
14399 isa<FixedVectorType>(V2->getType()) &&
14400 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14401 V1 = castToScalarTyElem(V1);
14402 V2 = castToScalarTyElem(V2);
14403 if (InVectors.empty()) {
14404 InVectors.push_back(V1);
14405 InVectors.push_back(V2);
14406 CommonMask.assign(Mask.begin(), Mask.end());
14407 return;
14408 }
14409 Value *Vec = InVectors.front();
14410 if (InVectors.size() == 2) {
14411 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14412 transformMaskAfterShuffle(CommonMask, CommonMask);
14413 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14414 Mask.size()) {
14415 Vec = createShuffle(Vec, nullptr, CommonMask);
14416 transformMaskAfterShuffle(CommonMask, CommonMask);
14417 }
14418 V1 = createShuffle(V1, V2, Mask);
14419 unsigned VF = std::max(getVF(V1), getVF(Vec));
14420 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14421 if (Mask[Idx] != PoisonMaskElem)
14422 CommonMask[Idx] = Idx + VF;
14423 InVectors.front() = Vec;
14424 if (InVectors.size() == 2)
14425 InVectors.back() = V1;
14426 else
14427 InVectors.push_back(V1);
14428 }
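// Example of the bookkeeping above (illustrative only): if two vectors were
// already combined under CommonMask and another pair is added, the existing
// pair is first folded into a single shuffle, and every lane taken from the
// new pair is remapped in CommonMask to Idx + VF, where VF is the widest
// vector factor seen so far.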
14429 /// Adds another input vector and the mask for the shuffling.
14430 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14431 assert(isa<FixedVectorType>(V1->getType()) &&
14432 "castToScalarTyElem expects V1 to be FixedVectorType");
14433 V1 = castToScalarTyElem(V1);
14434 if (InVectors.empty()) {
14435 InVectors.push_back(V1);
14436 CommonMask.assign(Mask.begin(), Mask.end());
14437 return;
14438 }
14439 const auto *It = find(InVectors, V1);
14440 if (It == InVectors.end()) {
14441 if (InVectors.size() == 2 ||
14442 InVectors.front()->getType() != V1->getType()) {
14443 Value *V = InVectors.front();
14444 if (InVectors.size() == 2) {
14445 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14446 transformMaskAfterShuffle(CommonMask, CommonMask);
14447 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14448 CommonMask.size()) {
14449 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14450 transformMaskAfterShuffle(CommonMask, CommonMask);
14451 }
14452 unsigned VF = std::max(CommonMask.size(), Mask.size());
14453 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14454 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14455 CommonMask[Idx] =
14456 V->getType() != V1->getType()
14457 ? Idx + VF
14458 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14459 ->getNumElements();
14460 if (V->getType() != V1->getType())
14461 V1 = createShuffle(V1, nullptr, Mask);
14462 InVectors.front() = V;
14463 if (InVectors.size() == 2)
14464 InVectors.back() = V1;
14465 else
14466 InVectors.push_back(V1);
14467 return;
14468 }
14469 // Check if the second vector is required when the used elements are already
14470 // taken from the first one.
14471 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14472 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14473 InVectors.push_back(V1);
14474 break;
14475 }
14476 }
14477 unsigned VF = 0;
14478 for (Value *V : InVectors)
14479 VF = std::max(VF, getVF(V));
14480 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14481 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14482 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14483 }
14484 /// Adds another input vector and the order mask for the shuffling.
14485 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14486 SmallVector<int> NewMask;
14487 inversePermutation(Order, NewMask);
14488 add(V1, NewMask);
14489 }
14490 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14491 Value *Root = nullptr) {
14492 return R.gather(VL, Root, ScalarTy,
14493 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14494 return createShuffle(V1, V2, Mask);
14495 });
14496 }
14497 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14498 /// Finalize emission of the shuffles.
14499 /// \param Action the action (if any) to be performed before the final
14500 /// application of the \p ExtMask mask.
14501 Value *
14502 finalize(ArrayRef<int> ExtMask,
14503 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14504 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14505 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14506 IsFinalized = true;
14507 if (Action) {
14508 Value *Vec = InVectors.front();
14509 if (InVectors.size() == 2) {
14510 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14511 InVectors.pop_back();
14512 } else {
14513 Vec = createShuffle(Vec, nullptr, CommonMask);
14514 }
14515 transformMaskAfterShuffle(CommonMask, CommonMask);
14516 assert(VF > 0 &&
14517 "Expected vector length for the final value before action.");
14518 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14519 if (VecVF < VF) {
14520 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14521 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14522 Vec = createShuffle(Vec, nullptr, ResizeMask);
14523 }
14524 Action(Vec, CommonMask);
14525 InVectors.front() = Vec;
14526 }
14527 if (!SubVectors.empty()) {
14528 Value *Vec = InVectors.front();
14529 if (InVectors.size() == 2) {
14530 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14531 InVectors.pop_back();
14532 } else {
14533 Vec = createShuffle(Vec, nullptr, CommonMask);
14534 }
14535 transformMaskAfterShuffle(CommonMask, CommonMask);
14536 auto CreateSubVectors = [&](Value *Vec,
14537 SmallVectorImpl<int> &CommonMask) {
14538 for (auto [E, Idx] : SubVectors) {
14539 Value *V = E->VectorizedValue;
14540 if (V->getType()->isIntOrIntVectorTy())
14541 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14542 if (isa<PoisonValue>(V))
14543 return false;
14544 return !isKnownNonNegative(
14545 V, SimplifyQuery(*R.DL));
14546 }));
14547 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
14548 Vec = createInsertVector(
14549 Builder, Vec, V, InsertionIndex,
14550 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
14551 _3));
14552 if (!CommonMask.empty()) {
14553 std::iota(std::next(CommonMask.begin(), Idx),
14554 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14555 Idx);
14556 }
14557 }
14558 return Vec;
14559 };
14560 if (SubVectorsMask.empty()) {
14561 Vec = CreateSubVectors(Vec, CommonMask);
14562 } else {
14563 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14564 copy(SubVectorsMask, SVMask.begin());
14565 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14566 if (I2 != PoisonMaskElem) {
14567 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14568 I1 = I2 + CommonMask.size();
14569 }
14570 }
14571 Value *InsertVec =
14572 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14573 Vec = createShuffle(InsertVec, Vec, SVMask);
14574 transformMaskAfterShuffle(CommonMask, SVMask);
14575 }
14576 InVectors.front() = Vec;
14577 }
14578
14579 if (!ExtMask.empty()) {
14580 if (CommonMask.empty()) {
14581 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14582 } else {
14583 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14584 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14585 if (ExtMask[I] == PoisonMaskElem)
14586 continue;
14587 NewMask[I] = CommonMask[ExtMask[I]];
14588 }
14589 CommonMask.swap(NewMask);
14590 }
14591 }
14592 if (CommonMask.empty()) {
14593 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14594 return InVectors.front();
14595 }
14596 if (InVectors.size() == 2)
14597 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14598 return createShuffle(InVectors.front(), nullptr, CommonMask);
14599 }
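// Small illustration of finalize() above (hypothetical masks): with
// CommonMask = <1, 0, 3, 2> and ExtMask = <2, 3, 0, 1>, the external mask is
// applied on top of the common one (NewMask[I] = CommonMask[ExtMask[I]]),
// giving <3, 2, 1, 0> before the final shufflevector is emitted.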
14600
14601 ~ShuffleInstructionBuilder() {
14602 assert((IsFinalized || CommonMask.empty()) &&
14603 "Shuffle construction must be finalized.");
14604 }
14605};
14606
14607BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14608 unsigned NodeIdx) {
14609 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14610 InstructionsState S = getSameOpcode(VL, *TLI);
14611 // Special processing for GEPs bundle, which may include non-gep values.
14612 if (!S && VL.front()->getType()->isPointerTy()) {
14613 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14614 if (It != VL.end())
14615 S = getSameOpcode(*It, *TLI);
14616 }
14617 if (!S)
14618 return nullptr;
14619 auto CheckSameVE = [&](const TreeEntry *VE) {
14620 return any_of(VE->UserTreeIndices,
14621 [E, NodeIdx](const EdgeInfo &EI) {
14622 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14623 }) ||
14624 any_of(VectorizableTree,
14625 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14626 return TE->isOperandGatherNode(
14627 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14628 VE->isSame(TE->Scalars);
14629 });
14630 };
14631 TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
14632 if (VE && CheckSameVE(VE))
14633 return VE;
14634 return nullptr;
14635}
14636
14637Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14638 bool PostponedPHIs) {
14639 ValueList &VL = E->getOperand(NodeIdx);
14640 const unsigned VF = VL.size();
14641 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14642 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14643 // V may be affected by MinBWs.
14644 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14645 // factor is the number of elements, not their type.
14646 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14647 unsigned NumElements = getNumElements(VL.front()->getType());
14648 ShuffleInstructionBuilder ShuffleBuilder(
14649 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14650 : ScalarTy,
14651 Builder, *this);
14652 ShuffleBuilder.add(V, Mask);
14653 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14654 E->CombinedEntriesWithIndices.size());
14655 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14656 [&](const auto &P) {
14657 return std::make_pair(VectorizableTree[P.first].get(),
14658 P.second);
14659 });
14660 assert((E->CombinedEntriesWithIndices.empty() ||
14661 E->ReorderIndices.empty()) &&
14662 "Expected either combined subnodes or reordering");
14663 return ShuffleBuilder.finalize({}, SubVectors, {});
14664 };
14665 Value *V = vectorizeTree(VE, PostponedPHIs);
14666 if (VF * getNumElements(VL[0]->getType()) !=
14667 cast<FixedVectorType>(V->getType())->getNumElements()) {
14668 if (!VE->ReuseShuffleIndices.empty()) {
14669 // Reshuffle to get only unique values.
14670 // If some of the scalars are duplicated in the vectorization
14671 // tree entry, we do not vectorize them but instead generate a
14672 // mask for the reuses. But if there are several users of the
14673 // same entry, they may have different vectorization factors.
14674 // This is especially important for PHI nodes. In this case, we
14675 // need to adapt the resulting instruction for the user
14676 // vectorization factor and have to reshuffle it again to take
14677 // only unique elements of the vector. Without this code the
14678 // function incorrectly returns reduced vector instruction with
14679 // the same elements, not with the unique ones.
14680
14681 // block:
14682 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14683 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14684 // ... (use %2)
14685 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14686 // br %block
14687 SmallVector<int> Mask(VF, PoisonMaskElem);
14688 for (auto [I, V] : enumerate(VL)) {
14689 if (isa<PoisonValue>(V))
14690 continue;
14691 Mask[I] = VE->findLaneForValue(V);
14692 }
14693 V = FinalShuffle(V, Mask);
14694 } else {
14695 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14696 "Expected vectorization factor less "
14697 "than original vector size.");
14698 SmallVector<int> UniformMask(VF, 0);
14699 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14700 V = FinalShuffle(V, UniformMask);
14701 }
14702 }
14703 // Need to update the operand gather node, if the operand is actually not a
14704 // vectorized node but a buildvector/gather node, which matches one of
14705 // the vectorized nodes.
14706 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14707 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14708 }) == VE->UserTreeIndices.end()) {
14709 auto *It =
14710 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14711 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14712 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14713 });
14714 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14715 (*It)->VectorizedValue = V;
14716 }
14717 return V;
14718 }
14719
14720 // Find the corresponding gather entry and vectorize it.
14721 // This allows us to be more accurate with tree/graph transformations; it
14722 // checks the correctness of the transformations in many cases.
14723 auto *I = find_if(VectorizableTree,
14724 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14725 return TE->isOperandGatherNode({E, NodeIdx});
14726 });
14727 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14728 assert(I->get()->UserTreeIndices.size() == 1 &&
14729 "Expected only single user for the gather node.");
14730 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14731 return vectorizeTree(I->get(), PostponedPHIs);
14732}
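// Sketch of the operand handling above: if the operand already matches a
// vectorized tree entry, the cached vector value is reused (reshuffled when
// the user requires a different vector factor or unique elements); otherwise
// the corresponding operand gather node is vectorized on demand.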
14733
14734template <typename BVTy, typename ResTy, typename... Args>
14735ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14736 Args &...Params) {
14737 assert(E->isGather() && "Expected gather node.");
14738 unsigned VF = E->getVectorFactor();
14739
14740 bool NeedFreeze = false;
14741 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14742 E->ReuseShuffleIndices.end());
14743 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14744 // Clear values, to be replaced by insertvector instructions.
14745 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14746 for_each(MutableArrayRef(GatheredScalars)
14747 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14748 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14749 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14750 E->CombinedEntriesWithIndices.size());
14751 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14752 [&](const auto &P) {
14753 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14754 });
14755 // Build a mask out of the reorder indices and reorder scalars per this
14756 // mask.
14757 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14758 E->ReorderIndices.end());
14759 if (!ReorderMask.empty())
14760 reorderScalars(GatheredScalars, ReorderMask);
14761 SmallVector<int> SubVectorsMask;
14762 inversePermutation(E->ReorderIndices, SubVectorsMask);
14763 // Transform non-clustered elements in the mask to poison (-1).
14764 // "Clustered" operations will be reordered using this mask later.
14765 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14766 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14767 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14768 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14769 } else {
14770 SubVectorsMask.clear();
14771 }
14772 SmallVector<Value *> StoredGS(GatheredScalars);
14773 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14774 unsigned I, unsigned SliceSize,
14775 bool IsNotPoisonous) {
14776 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14777 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14778 }))
14779 return false;
14780 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14781 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14782 if (UserTE->getNumOperands() != 2)
14783 return false;
14784 if (!IsNotPoisonous) {
14785 auto *It =
14786 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14787 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14788 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14789 }) != TE->UserTreeIndices.end();
14790 });
14791 if (It == VectorizableTree.end())
14792 return false;
14793 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14794 if (!(*It)->ReorderIndices.empty()) {
14795 inversePermutation((*It)->ReorderIndices, ReorderMask);
14796 reorderScalars(GS, ReorderMask);
14797 }
14798 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14799 Value *V0 = std::get<0>(P);
14800 Value *V1 = std::get<1>(P);
14801 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14802 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14803 is_contained(E->Scalars, V1));
14804 }))
14805 return false;
14806 }
14807 int Idx;
14808 if ((Mask.size() < InputVF &&
14809 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
14810 Idx == 0) ||
14811 (Mask.size() == InputVF &&
14812 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14813 std::iota(
14814 std::next(Mask.begin(), I * SliceSize),
14815 std::next(Mask.begin(),
14816 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14817 0);
14818 } else {
14819 unsigned IVal =
14820 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14821 std::fill(
14822 std::next(Mask.begin(), I * SliceSize),
14823 std::next(Mask.begin(),
14824 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14825 IVal);
14826 }
14827 return true;
14828 };
14829 BVTy ShuffleBuilder(ScalarTy, Params...);
14830 ResTy Res = ResTy();
14831 SmallVector<int> Mask;
14832 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14833 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14834 Value *ExtractVecBase = nullptr;
14835 bool UseVecBaseAsInput = false;
14836 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14837 SmallVector<SmallVector<const TreeEntry *>> Entries;
14838 Type *OrigScalarTy = GatheredScalars.front()->getType();
14839 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14840 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
14841 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14842 // Check for gathered extracts.
14843 bool Resized = false;
14844 ExtractShuffles =
14845 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14846 if (!ExtractShuffles.empty()) {
14847 SmallVector<const TreeEntry *> ExtractEntries;
14848 for (auto [Idx, I] : enumerate(ExtractMask)) {
14849 if (I == PoisonMaskElem)
14850 continue;
14851 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
14852 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
14853 !TEs.empty())
14854 ExtractEntries.append(TEs.begin(), TEs.end());
14855 }
14856 if (std::optional<ResTy> Delayed =
14857 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14858 // Delay emission of gathers which are not ready yet.
14859 PostponedGathers.insert(E);
14860 // Postpone gather emission, will be emitted after the end of the
14861 // process to keep correct order.
14862 return *Delayed;
14863 }
14864 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14865 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14866 ExtractVecBase = VecBase;
14867 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14868 if (VF == VecBaseTy->getNumElements() &&
14869 GatheredScalars.size() != VF) {
14870 Resized = true;
14871 GatheredScalars.append(VF - GatheredScalars.size(),
14872 PoisonValue::get(OrigScalarTy));
14873 NumParts =
14874 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
14875 }
14876 }
14877 }
14878 // Gather extracts only after we check for fully matched gathers.
14879 if (!ExtractShuffles.empty() || !E->hasState() ||
14880 E->getOpcode() != Instruction::Load ||
14881 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14882 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14883 any_of(E->Scalars,
14884 [this](Value *V) {
14885 return isa<LoadInst>(V) && isVectorized(V);
14886 })) ||
14887 (E->hasState() && E->isAltShuffle()) ||
14888 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
14889 isSplat(E->Scalars) ||
14890 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14891 GatherShuffles =
14892 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14893 }
14894 if (!GatherShuffles.empty()) {
14895 if (std::optional<ResTy> Delayed =
14896 ShuffleBuilder.needToDelay(E, Entries)) {
14897 // Delay emission of gathers which are not ready yet.
14898 PostponedGathers.insert(E);
14899 // Postpone gather emission, will be emitted after the end of the
14900 // process to keep correct order.
14901 return *Delayed;
14902 }
14903 if (GatherShuffles.size() == 1 &&
14904 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14905 Entries.front().front()->isSame(E->Scalars)) {
14906 // Perfect match in the graph, will reuse the previously vectorized
14907 // node. Cost is 0.
14908 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14909 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14910 // Restore the mask for previous partially matched values.
14911 Mask.resize(E->Scalars.size());
14912 const TreeEntry *FrontTE = Entries.front().front();
14913 if (FrontTE->ReorderIndices.empty() &&
14914 ((FrontTE->ReuseShuffleIndices.empty() &&
14915 E->Scalars.size() == FrontTE->Scalars.size()) ||
14916 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14917 std::iota(Mask.begin(), Mask.end(), 0);
14918 } else {
14919 for (auto [I, V] : enumerate(E->Scalars)) {
14920 if (isa<PoisonValue>(V)) {
14921 Mask[I] = PoisonMaskElem;
14922 continue;
14923 }
14924 Mask[I] = FrontTE->findLaneForValue(V);
14925 }
14926 }
14927 ShuffleBuilder.add(*FrontTE, Mask);
14928 // Full matched entry found, no need to insert subvectors.
14929 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14930 return Res;
14931 }
14932 if (!Resized) {
14933 if (GatheredScalars.size() != VF &&
14934 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14935 return any_of(TEs, [&](const TreeEntry *TE) {
14936 return TE->getVectorFactor() == VF;
14937 });
14938 }))
14939 GatheredScalars.append(VF - GatheredScalars.size(),
14940 PoisonValue::get(OrigScalarTy));
14941 }
14942 // Remove shuffled elements from list of gathers.
14943 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14944 if (Mask[I] != PoisonMaskElem)
14945 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14946 }
14947 }
14948 }
14949 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14950 SmallVectorImpl<int> &ReuseMask,
14951 bool IsRootPoison) {
14952 // For splats we can emit broadcasts instead of gathers, so try to find
14953 // such sequences.
14954 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14955 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14956 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14957 SmallVector<int> UndefPos;
14958 DenseMap<Value *, unsigned> UniquePositions;
14959 // Gather unique non-const values and all constant values.
14960 // For repeated values, just shuffle them.
14961 int NumNonConsts = 0;
14962 int SinglePos = 0;
14963 for (auto [I, V] : enumerate(Scalars)) {
14964 if (isa<UndefValue>(V)) {
14965 if (!isa<PoisonValue>(V)) {
14966 ReuseMask[I] = I;
14967 UndefPos.push_back(I);
14968 }
14969 continue;
14970 }
14971 if (isConstant(V)) {
14972 ReuseMask[I] = I;
14973 continue;
14974 }
14975 ++NumNonConsts;
14976 SinglePos = I;
14977 Value *OrigV = V;
14978 Scalars[I] = PoisonValue::get(OrigScalarTy);
14979 if (IsSplat) {
14980 Scalars.front() = OrigV;
14981 ReuseMask[I] = 0;
14982 } else {
14983 const auto Res = UniquePositions.try_emplace(OrigV, I);
14984 Scalars[Res.first->second] = OrigV;
14985 ReuseMask[I] = Res.first->second;
14986 }
14987 }
14988 if (NumNonConsts == 1) {
14989 // Restore single insert element.
14990 if (IsSplat) {
14991 ReuseMask.assign(VF, PoisonMaskElem);
14992 std::swap(Scalars.front(), Scalars[SinglePos]);
14993 if (!UndefPos.empty() && UndefPos.front() == 0)
14994 Scalars.front() = UndefValue::get(OrigScalarTy);
14995 }
14996 ReuseMask[SinglePos] = SinglePos;
14997 } else if (!UndefPos.empty() && IsSplat) {
14998 // For undef values, try to replace them with the simple broadcast.
14999 // We can do it if the broadcasted value is guaranteed to be
15000 // non-poisonous, or by freezing the incoming scalar value first.
15001 auto *It = find_if(Scalars, [this, E](Value *V) {
15002 return !isa<UndefValue>(V) &&
15003 (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
15004 (E->UserTreeIndices.size() == 1 &&
15005 any_of(V->uses(), [E](const Use &U) {
15006 // Check if the value already used in the same operation in
15007 // one of the nodes already.
15008 return E->UserTreeIndices.front().EdgeIdx !=
15009 U.getOperandNo() &&
15010 is_contained(
15011 E->UserTreeIndices.front().UserTE->Scalars,
15012 U.getUser());
15013 })));
15014 });
15015 if (It != Scalars.end()) {
15016 // Replace undefs by the non-poisoned scalars and emit broadcast.
15017 int Pos = std::distance(Scalars.begin(), It);
15018 for (int I : UndefPos) {
15019 // Set the undef position to the non-poisoned scalar.
15020 ReuseMask[I] = Pos;
15021 // Replace the undef by poison; in the mask it has already been
15022 // replaced by the non-poisoned scalar.
15023 if (I != Pos)
15024 Scalars[I] = PoisonValue::get(OrigScalarTy);
15025 }
15026 } else {
15027 // Replace undefs by the poisons, emit broadcast and then emit
15028 // freeze.
15029 for (int I : UndefPos) {
15030 ReuseMask[I] = PoisonMaskElem;
15031 if (isa<UndefValue>(Scalars[I]))
15032 Scalars[I] = PoisonValue::get(OrigScalarTy);
15033 }
15034 NeedFreeze = true;
15035 }
15036 }
15037 };
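// In effect, TryPackScalars compacts the gathered scalars so that each unique
// non-constant value is materialized once and repeats are expressed through the
// reuse mask: e.g., roughly, scalars <a, b, a, 7> become the build vector
// <a, b, poison, 7> with reuse mask <0, 1, 0, 3>, and a splat collapses to a
// single broadcast lane. If no provably non-poisonous scalar is available to
// fill the undef lanes of a splat, those lanes become poison and NeedFreeze is
// set so the final vector is frozen below.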
15038 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15039 bool IsNonPoisoned = true;
15040 bool IsUsedInExpr = true;
15041 Value *Vec1 = nullptr;
15042 if (!ExtractShuffles.empty()) {
15043 // A gather of extractelements can be represented as just a shuffle of
15044 // one or two vectors the scalars are extracted from.
15045 // Find input vectors.
15046 Value *Vec2 = nullptr;
15047 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15048 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15049 ExtractMask[I] = PoisonMaskElem;
15050 }
15051 if (UseVecBaseAsInput) {
15052 Vec1 = ExtractVecBase;
15053 } else {
15054 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15055 if (ExtractMask[I] == PoisonMaskElem)
15056 continue;
15057 if (isa<UndefValue>(E->Scalars[I]))
15058 continue;
15059 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15060 Value *VecOp = EI->getVectorOperand();
15061 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
15062 !TEs.empty() && TEs.front()->VectorizedValue)
15063 VecOp = TEs.front()->VectorizedValue;
15064 if (!Vec1) {
15065 Vec1 = VecOp;
15066 } else if (Vec1 != VecOp) {
15067 assert((!Vec2 || Vec2 == VecOp) &&
15068 "Expected only 1 or 2 vectors shuffle.");
15069 Vec2 = VecOp;
15070 }
15071 }
15072 }
15073 if (Vec2) {
15074 IsUsedInExpr = false;
15075 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15076 isGuaranteedNotToBePoison(Vec2, AC);
15077 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15078 } else if (Vec1) {
15079 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15080 IsUsedInExpr &= FindReusedSplat(
15081 ExtractMask,
15082 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15083 ExtractMask.size(), IsNotPoisonedVec);
15084 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15085 IsNonPoisoned &= IsNotPoisonedVec;
15086 } else {
15087 IsUsedInExpr = false;
15088 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15089 /*ForExtracts=*/true);
15090 }
15091 }
15092 if (!GatherShuffles.empty()) {
15093 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15094 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15095 for (const auto [I, TEs] : enumerate(Entries)) {
15096 if (TEs.empty()) {
15097 assert(!GatherShuffles[I] &&
15098 "No shuffles with empty entries list expected.");
15099 continue;
15100 }
15101 assert((TEs.size() == 1 || TEs.size() == 2) &&
15102 "Expected shuffle of 1 or 2 entries.");
15103 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15104 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15105 VecMask.assign(VecMask.size(), PoisonMaskElem);
15106 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15107 if (TEs.size() == 1) {
15108 bool IsNotPoisonedVec =
15109 TEs.front()->VectorizedValue
15110 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15111 : true;
15112 IsUsedInExpr &=
15113 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15114 SliceSize, IsNotPoisonedVec);
15115 ShuffleBuilder.add(*TEs.front(), VecMask);
15116 IsNonPoisoned &= IsNotPoisonedVec;
15117 } else {
15118 IsUsedInExpr = false;
15119 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15120 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15121 IsNonPoisoned &=
15122 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15123 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15124 }
15125 }
15126 }
15127 // Try to figure out best way to combine values: build a shuffle and insert
15128 // elements or just build several shuffles.
15129 // Insert non-constant scalars.
15130 SmallVector<Value *> NonConstants(GatheredScalars);
15131 int EMSz = ExtractMask.size();
15132 int MSz = Mask.size();
15133 // Try to build a constant vector and shuffle with it only if currently we
15134 // have a single permutation and more than one scalar constant.
15135 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15136 bool IsIdentityShuffle =
15137 ((UseVecBaseAsInput ||
15138 all_of(ExtractShuffles,
15139 [](const std::optional<TTI::ShuffleKind> &SK) {
15140 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15141 TTI::SK_PermuteSingleSrc;
15142 })) &&
15143 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15144 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15145 (!GatherShuffles.empty() &&
15146 all_of(GatherShuffles,
15147 [](const std::optional<TTI::ShuffleKind> &SK) {
15148 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15149 TTI::SK_PermuteSingleSrc;
15150 }) &&
15151 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15152 ShuffleVectorInst::isIdentityMask(Mask, MSz));
15153 bool EnoughConstsForShuffle =
15154 IsSingleShuffle &&
15155 (none_of(GatheredScalars,
15156 [](Value *V) {
15157 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15158 }) ||
15159 any_of(GatheredScalars,
15160 [](Value *V) {
15161 return isa<Constant>(V) && !isa<UndefValue>(V);
15162 })) &&
15163 (!IsIdentityShuffle ||
15164 (GatheredScalars.size() == 2 &&
15165 any_of(GatheredScalars,
15166 [](Value *V) { return !isa<UndefValue>(V); })) ||
15167 count_if(GatheredScalars, [](Value *V) {
15168 return isa<Constant>(V) && !isa<PoisonValue>(V);
15169 }) > 1);
15170 // The NonConstants array contains just the non-constant values; GatheredScalars
15171 // contains only the constants used to build the final vector, which is then shuffled.
15172 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15173 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15174 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15175 else
15176 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15177 }
15178 // Generate constants for final shuffle and build a mask for them.
15179 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15180 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15181 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15182 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15183 ShuffleBuilder.add(BV, BVMask);
15184 }
15185 if (all_of(NonConstants, [=](Value *V) {
15186 return isa<PoisonValue>(V) ||
15187 (IsSingleShuffle && ((IsIdentityShuffle &&
15188 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15189 }))
15190 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15191 SubVectorsMask);
15192 else
15193 Res = ShuffleBuilder.finalize(
15194 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15195 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15196 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15197 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15198 });
15199 } else if (!allConstant(GatheredScalars)) {
15200 // Gather unique scalars and all constants.
15201 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15202 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15203 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15204 ShuffleBuilder.add(BV, ReuseMask);
15205 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15206 SubVectorsMask);
15207 } else {
15208 // Gather all constants.
15209 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15210 for (auto [I, V] : enumerate(GatheredScalars)) {
15211 if (!isa<PoisonValue>(V))
15212 Mask[I] = I;
15213 }
15214 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15215 ShuffleBuilder.add(BV, Mask);
15216 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15217 SubVectorsMask);
15218 }
15219
15220 if (NeedFreeze)
15221 Res = ShuffleBuilder.createFreeze(Res);
15222 return Res;
15223}
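// In summary, the gather emission above prefers, in order: reusing an already
// vectorized node that matches perfectly (the zero-cost "perfect diamond"
// case), shuffling one or two existing vectors (either the sources of gathered
// extractelements or other tree entries), and only then building a constant
// vector and/or inserting the remaining non-constant scalars, freezing the
// result when poison-safety cannot be proven.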
15224
15225Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15226 bool PostponedPHIs) {
15227 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15228 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15229 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15230 Builder, *this);
15231}
15232
15233/// \returns \p I after propagating metadata from \p VL only for instructions in
15234 /// \p VL.
15235 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15236 SmallVector<Value *> Insts;
15237 for (Value *V : VL)
15238 if (isa<Instruction>(V))
15239 Insts.push_back(V);
15240 return llvm::propagateMetadata(Inst, Insts);
15241}
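// Note: llvm::propagateMetadata intersects metadata such as !tbaa,
// !alias.scope, !noalias, !fpmath and !nontemporal across the given
// instructions, so VL is filtered down to instructions first to avoid passing
// constants or arguments to it.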
15242
15243Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15244 IRBuilderBase::InsertPointGuard Guard(Builder);
15245
15246 if (E->VectorizedValue &&
15247 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15248 E->isAltShuffle())) {
15249 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15250 return E->VectorizedValue;
15251 }
15252
15253 Value *V = E->Scalars.front();
15254 Type *ScalarTy = V->getType();
15255 if (!isa<CmpInst>(V))
15256 ScalarTy = getValueType(V);
15257 auto It = MinBWs.find(E);
15258 if (It != MinBWs.end()) {
15259 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15260 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15261 if (VecTy)
15262 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15263 }
15264 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15265 if (E->isGather()) {
15266 // Set insert point for non-reduction initial nodes.
15267 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15268 setInsertPointAfterBundle(E);
15269 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15270 E->VectorizedValue = Vec;
15271 return Vec;
15272 }
15273
15274 bool IsReverseOrder =
15275 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
15276 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15277 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15278 if (E->getOpcode() == Instruction::Store &&
15279 E->State == TreeEntry::Vectorize) {
15280 ArrayRef<int> Mask =
15281 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15282 E->ReorderIndices.size());
15283 ShuffleBuilder.add(V, Mask);
15284 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15285 ShuffleBuilder.addOrdered(V, {});
15286 } else {
15287 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15288 }
15289 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15290 E->CombinedEntriesWithIndices.size());
15291 transform(
15292 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15293 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15294 });
15295 assert(
15296 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15297 "Expected either combined subnodes or reordering");
15298 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15299 };
15300
15301 assert(!E->isGather() && "Unhandled state");
15302 unsigned ShuffleOrOp =
15303 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15304 Instruction *VL0 = E->getMainOp();
15305 auto GetOperandSignedness = [&](unsigned Idx) {
15306 const TreeEntry *OpE = getOperandEntry(E, Idx);
15307 bool IsSigned = false;
15308 auto It = MinBWs.find(OpE);
15309 if (It != MinBWs.end())
15310 IsSigned = It->second.second;
15311 else
15312 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15313 if (isa<PoisonValue>(R))
15314 return false;
15315 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15316 });
15317 return IsSigned;
15318 };
15319 switch (ShuffleOrOp) {
15320 case Instruction::PHI: {
15321 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15322 E != VectorizableTree.front().get() ||
15323 !E->UserTreeIndices.empty()) &&
15324 "PHI reordering is free.");
15325 if (PostponedPHIs && E->VectorizedValue)
15326 return E->VectorizedValue;
15327 auto *PH = cast<PHINode>(VL0);
15328 Builder.SetInsertPoint(PH->getParent(),
15329 PH->getParent()->getFirstNonPHIIt());
15330 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15331 if (PostponedPHIs || !E->VectorizedValue) {
15332 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15333 E->PHI = NewPhi;
15334 Value *V = NewPhi;
15335
15336 // Adjust insertion point once all PHI's have been generated.
15337 Builder.SetInsertPoint(PH->getParent(),
15338 PH->getParent()->getFirstInsertionPt());
15339 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15340
15341 V = FinalShuffle(V, E);
15342
15343 E->VectorizedValue = V;
15344 if (PostponedPHIs)
15345 return V;
15346 }
15347 PHINode *NewPhi = cast<PHINode>(E->PHI);
15348 // If phi node is fully emitted - exit.
15349 if (NewPhi->getNumIncomingValues() != 0)
15350 return NewPhi;
15351
15352 // PHINodes may have multiple entries from the same block. We want to
15353 // visit every block once.
15354 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15355
15356 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15358 BasicBlock *IBB = PH->getIncomingBlock(I);
15359
15360 // Stop emission if all incoming values are generated.
15361 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15362 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15363 return NewPhi;
15364 }
15365
15366 if (!VisitedBBs.insert(IBB).second) {
15367 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15368 continue;
15369 }
15370
15371 Builder.SetInsertPoint(IBB->getTerminator());
15372 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15373 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15374 if (VecTy != Vec->getType()) {
15375 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15376 MinBWs.contains(getOperandEntry(E, I))) &&
15377 "Expected item in MinBWs.");
15378 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15379 }
15380 NewPhi->addIncoming(Vec, IBB);
15381 }
15382
15383 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15384 "Invalid number of incoming values");
15385 assert(E->VectorizedValue && "Expected vectorized value.");
15386 return E->VectorizedValue;
15387 }
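// PHIs are emitted in two phases: with PostponedPHIs set only the empty vector
// phi is created, so cyclic use-def chains through the phi do not recurse;
// the incoming values are filled in on a second visit once every phi of the
// block has its vector placeholder.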
15388
15389 case Instruction::ExtractElement: {
15390 Value *V = E->getSingleOperand(0);
15391 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
15392 V = TEs.front()->VectorizedValue;
15393 setInsertPointAfterBundle(E);
15394 V = FinalShuffle(V, E);
15395 E->VectorizedValue = V;
15396 return V;
15397 }
15398 case Instruction::ExtractValue: {
15399 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15400 Builder.SetInsertPoint(LI);
15401 Value *Ptr = LI->getPointerOperand();
15402 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15403 Value *NewV = ::propagateMetadata(V, E->Scalars);
15404 NewV = FinalShuffle(NewV, E);
15405 E->VectorizedValue = NewV;
15406 return NewV;
15407 }
15408 case Instruction::InsertElement: {
15409 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15410 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15411 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15412 ArrayRef<Value *> Op = E->getOperand(1);
15413 Type *ScalarTy = Op.front()->getType();
15414 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15415 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15416 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15417 assert(Res.first > 0 && "Expected item in MinBWs.");
15418 V = Builder.CreateIntCast(
15419 V,
15420 getWidenedType(
15421 ScalarTy,
15422 cast<FixedVectorType>(V->getType())->getNumElements()),
15423 Res.second);
15424 }
15425
15426 // Create InsertVector shuffle if necessary
15427 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15428 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15429 }));
15430 const unsigned NumElts =
15431 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15432 const unsigned NumScalars = E->Scalars.size();
15433
15434 unsigned Offset = *getElementIndex(VL0);
15435 assert(Offset < NumElts && "Failed to find vector index offset");
15436
15437 // Create shuffle to resize vector
15438 SmallVector<int> Mask;
15439 if (!E->ReorderIndices.empty()) {
15440 inversePermutation(E->ReorderIndices, Mask);
15441 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15442 } else {
15443 Mask.assign(NumElts, PoisonMaskElem);
15444 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15445 }
15446 // Create InsertVector shuffle if necessary
15447 bool IsIdentity = true;
15448 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15449 Mask.swap(PrevMask);
15450 for (unsigned I = 0; I < NumScalars; ++I) {
15451 Value *Scalar = E->Scalars[PrevMask[I]];
15452 unsigned InsertIdx = *getElementIndex(Scalar);
15453 IsIdentity &= InsertIdx - Offset == I;
15454 Mask[InsertIdx - Offset] = I;
15455 }
15456 if (!IsIdentity || NumElts != NumScalars) {
15457 Value *V2 = nullptr;
15458 bool IsVNonPoisonous =
15459 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
15460 SmallVector<int> InsertMask(Mask);
15461 if (NumElts != NumScalars && Offset == 0) {
15462 // Follow all insert element instructions from the current buildvector
15463 // sequence.
15464 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15465 do {
15466 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15467 if (!InsertIdx)
15468 break;
15469 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15470 InsertMask[*InsertIdx] = *InsertIdx;
15471 if (!Ins->hasOneUse())
15472 break;
15473 Ins = dyn_cast_or_null<InsertElementInst>(
15474 Ins->getUniqueUndroppableUser());
15475 } while (Ins);
15476 SmallBitVector UseMask =
15477 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15478 SmallBitVector IsFirstPoison =
15479 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15480 SmallBitVector IsFirstUndef =
15481 isUndefVector(FirstInsert->getOperand(0), UseMask);
15482 if (!IsFirstPoison.all()) {
15483 unsigned Idx = 0;
15484 for (unsigned I = 0; I < NumElts; I++) {
15485 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15486 IsFirstUndef.test(I)) {
15487 if (IsVNonPoisonous) {
15488 InsertMask[I] = I < NumScalars ? I : 0;
15489 continue;
15490 }
15491 if (!V2)
15492 V2 = UndefValue::get(V->getType());
15493 if (Idx >= NumScalars)
15494 Idx = NumScalars - 1;
15495 InsertMask[I] = NumScalars + Idx;
15496 ++Idx;
15497 } else if (InsertMask[I] != PoisonMaskElem &&
15498 Mask[I] == PoisonMaskElem) {
15499 InsertMask[I] = PoisonMaskElem;
15500 }
15501 }
15502 } else {
15503 InsertMask = Mask;
15504 }
15505 }
15506 if (!V2)
15507 V2 = PoisonValue::get(V->getType());
15508 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15509 if (auto *I = dyn_cast<Instruction>(V)) {
15510 GatherShuffleExtractSeq.insert(I);
15511 CSEBlocks.insert(I->getParent());
15512 }
15513 }
15514
15515 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15516 for (unsigned I = 0; I < NumElts; I++) {
15517 if (Mask[I] != PoisonMaskElem)
15518 InsertMask[Offset + I] = I;
15519 }
15520 SmallBitVector UseMask =
15521 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15522 SmallBitVector IsFirstUndef =
15523 isUndefVector(FirstInsert->getOperand(0), UseMask);
15524 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15525 NumElts != NumScalars) {
15526 if (IsFirstUndef.all()) {
15527 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15528 SmallBitVector IsFirstPoison =
15529 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15530 if (!IsFirstPoison.all()) {
15531 for (unsigned I = 0; I < NumElts; I++) {
15532 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15533 InsertMask[I] = I + NumElts;
15534 }
15535 }
15536 V = Builder.CreateShuffleVector(
15537 V,
15538 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15539 : FirstInsert->getOperand(0),
15540 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15541 if (auto *I = dyn_cast<Instruction>(V)) {
15542 GatherShuffleExtractSeq.insert(I);
15543 CSEBlocks.insert(I->getParent());
15544 }
15545 }
15546 } else {
15547 SmallBitVector IsFirstPoison =
15548 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15549 for (unsigned I = 0; I < NumElts; I++) {
15550 if (InsertMask[I] == PoisonMaskElem)
15551 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15552 else
15553 InsertMask[I] += NumElts;
15554 }
15555 V = Builder.CreateShuffleVector(
15556 FirstInsert->getOperand(0), V, InsertMask,
15557 cast<Instruction>(E->Scalars.back())->getName());
15558 if (auto *I = dyn_cast<Instruction>(V)) {
15559 GatherShuffleExtractSeq.insert(I);
15560 CSEBlocks.insert(I->getParent());
15561 }
15562 }
15563 }
15564
15565 ++NumVectorInstructions;
15566 E->VectorizedValue = V;
15567 return V;
15568 }
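// The net effect for a buildvector of insertelements is one vectorized operand
// plus at most two shufflevectors: one to resize/permute the operand to the
// destination vector factor and one to merge it with the original base vector
// when that base is not entirely undef/poison.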
15569 case Instruction::ZExt:
15570 case Instruction::SExt:
15571 case Instruction::FPToUI:
15572 case Instruction::FPToSI:
15573 case Instruction::FPExt:
15574 case Instruction::PtrToInt:
15575 case Instruction::IntToPtr:
15576 case Instruction::SIToFP:
15577 case Instruction::UIToFP:
15578 case Instruction::Trunc:
15579 case Instruction::FPTrunc:
15580 case Instruction::BitCast: {
15581 setInsertPointAfterBundle(E);
15582
15583 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15584 if (E->VectorizedValue) {
15585 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15586 return E->VectorizedValue;
15587 }
15588
15589 auto *CI = cast<CastInst>(VL0);
15590 Instruction::CastOps VecOpcode = CI->getOpcode();
15591 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15592 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15593 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15594 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15595 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15596 // Check if the values are candidates to demote.
15597 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15598 if (SrcIt != MinBWs.end())
15599 SrcBWSz = SrcIt->second.first;
15600 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15601 if (BWSz == SrcBWSz) {
15602 VecOpcode = Instruction::BitCast;
15603 } else if (BWSz < SrcBWSz) {
15604 VecOpcode = Instruction::Trunc;
15605 } else if (It != MinBWs.end()) {
15606 assert(BWSz > SrcBWSz && "Invalid cast!");
15607 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15608 } else if (SrcIt != MinBWs.end()) {
15609 assert(BWSz > SrcBWSz && "Invalid cast!");
15610 VecOpcode =
15611 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15612 }
15613 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15614 !SrcIt->second.second) {
15615 VecOpcode = Instruction::UIToFP;
15616 }
15617 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15618 ? InVec
15619 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15620 V = FinalShuffle(V, E);
15621
15622 E->VectorizedValue = V;
15623 ++NumVectorInstructions;
15624 return V;
15625 }
15626 case Instruction::FCmp:
15627 case Instruction::ICmp: {
15628 setInsertPointAfterBundle(E);
15629
15630 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15631 if (E->VectorizedValue) {
15632 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15633 return E->VectorizedValue;
15634 }
15635 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15636 if (E->VectorizedValue) {
15637 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15638 return E->VectorizedValue;
15639 }
15640 if (L->getType() != R->getType()) {
15641 assert((getOperandEntry(E, 0)->isGather() ||
15642 getOperandEntry(E, 1)->isGather() ||
15643 MinBWs.contains(getOperandEntry(E, 0)) ||
15644 MinBWs.contains(getOperandEntry(E, 1))) &&
15645 "Expected item in MinBWs.");
15646 if (cast<VectorType>(L->getType())
15647 ->getElementType()
15648 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15649 ->getElementType()
15650 ->getIntegerBitWidth()) {
15651 Type *CastTy = R->getType();
15652 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15653 } else {
15654 Type *CastTy = L->getType();
15655 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15656 }
15657 }
15658
15659 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15660 Value *V = Builder.CreateCmp(P0, L, R);
15661 propagateIRFlags(V, E->Scalars, VL0);
15662 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15663 ICmp->setSameSign(/*B=*/false);
15664 // Do not cast for cmps.
15665 VecTy = cast<FixedVectorType>(V->getType());
15666 V = FinalShuffle(V, E);
15667
15668 E->VectorizedValue = V;
15669 ++NumVectorInstructions;
15670 return V;
15671 }
15672 case Instruction::Select: {
15673 setInsertPointAfterBundle(E);
15674
15675 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15676 if (E->VectorizedValue) {
15677 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15678 return E->VectorizedValue;
15679 }
15680 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15681 if (E->VectorizedValue) {
15682 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15683 return E->VectorizedValue;
15684 }
15685 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15686 if (E->VectorizedValue) {
15687 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15688 return E->VectorizedValue;
15689 }
15690 if (True->getType() != VecTy || False->getType() != VecTy) {
15691 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15692 getOperandEntry(E, 2)->isGather() ||
15693 MinBWs.contains(getOperandEntry(E, 1)) ||
15694 MinBWs.contains(getOperandEntry(E, 2))) &&
15695 "Expected item in MinBWs.");
15696 if (True->getType() != VecTy)
15697 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15698 if (False->getType() != VecTy)
15699 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15700 }
15701
15702 unsigned CondNumElements = getNumElements(Cond->getType());
15703 unsigned TrueNumElements = getNumElements(True->getType());
15704 assert(TrueNumElements >= CondNumElements &&
15705 TrueNumElements % CondNumElements == 0 &&
15706 "Cannot vectorize Instruction::Select");
15707 assert(TrueNumElements == getNumElements(False->getType()) &&
15708 "Cannot vectorize Instruction::Select");
15709 if (CondNumElements != TrueNumElements) {
15710 // When the return type is i1 but the source is fixed vector type, we
15711 // need to duplicate the condition value.
15712 Cond = Builder.CreateShuffleVector(
15713 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15714 CondNumElements));
15715 }
15716 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15717 "Cannot vectorize Instruction::Select");
15718 Value *V = Builder.CreateSelect(Cond, True, False);
15719 V = FinalShuffle(V, E);
15720
15721 E->VectorizedValue = V;
15722 ++NumVectorInstructions;
15723 return V;
15724 }
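// For example, if the condition was built from two scalar i1 values while the
// selected operands were widened to four lanes (the revectorization case
// above), the replicated mask is <0, 0, 1, 1>, so each condition bit guards
// the lanes that came from the same scalar select.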
15725 case Instruction::FNeg: {
15726 setInsertPointAfterBundle(E);
15727
15728 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15729
15730 if (E->VectorizedValue) {
15731 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15732 return E->VectorizedValue;
15733 }
15734
15735 Value *V = Builder.CreateUnOp(
15736 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15737 propagateIRFlags(V, E->Scalars, VL0);
15738 if (auto *I = dyn_cast<Instruction>(V))
15739 V = ::propagateMetadata(I, E->Scalars);
15740
15741 V = FinalShuffle(V, E);
15742
15743 E->VectorizedValue = V;
15744 ++NumVectorInstructions;
15745
15746 return V;
15747 }
15748 case Instruction::Freeze: {
15749 setInsertPointAfterBundle(E);
15750
15751 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15752
15753 if (E->VectorizedValue) {
15754 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15755 return E->VectorizedValue;
15756 }
15757
15758 if (Op->getType() != VecTy) {
15759 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15760 MinBWs.contains(getOperandEntry(E, 0))) &&
15761 "Expected item in MinBWs.");
15762 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15763 }
15764 Value *V = Builder.CreateFreeze(Op);
15765 V = FinalShuffle(V, E);
15766
15767 E->VectorizedValue = V;
15768 ++NumVectorInstructions;
15769
15770 return V;
15771 }
15772 case Instruction::Add:
15773 case Instruction::FAdd:
15774 case Instruction::Sub:
15775 case Instruction::FSub:
15776 case Instruction::Mul:
15777 case Instruction::FMul:
15778 case Instruction::UDiv:
15779 case Instruction::SDiv:
15780 case Instruction::FDiv:
15781 case Instruction::URem:
15782 case Instruction::SRem:
15783 case Instruction::FRem:
15784 case Instruction::Shl:
15785 case Instruction::LShr:
15786 case Instruction::AShr:
15787 case Instruction::And:
15788 case Instruction::Or:
15789 case Instruction::Xor: {
15790 setInsertPointAfterBundle(E);
15791
15792 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15793 if (E->VectorizedValue) {
15794 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15795 return E->VectorizedValue;
15796 }
15797 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15798 if (E->VectorizedValue) {
15799 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15800 return E->VectorizedValue;
15801 }
15802 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15803 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15804 ArrayRef<Value *> Ops = E->getOperand(I);
15805 if (all_of(Ops, [&](Value *Op) {
15806 auto *CI = dyn_cast<ConstantInt>(Op);
15807 return CI && CI->getValue().countr_one() >= It->second.first;
15808 })) {
15809 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15810 E->VectorizedValue = V;
15811 ++NumVectorInstructions;
15812 return V;
15813 }
15814 }
15815 }
15816 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15817 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15818 getOperandEntry(E, 1)->isGather() ||
15819 MinBWs.contains(getOperandEntry(E, 0)) ||
15820 MinBWs.contains(getOperandEntry(E, 1))) &&
15821 "Expected item in MinBWs.");
15822 if (LHS->getType() != VecTy)
15823 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15824 if (RHS->getType() != VecTy)
15825 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15826 }
15827
15828 Value *V = Builder.CreateBinOp(
15829 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15830 RHS);
15831 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15832 if (auto *I = dyn_cast<Instruction>(V)) {
15833 V = ::propagateMetadata(I, E->Scalars);
15834 // Drop nuw flags for abs(sub(commutative), true).
15835 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15836 any_of(E->Scalars, [](Value *V) {
15837 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15838 }))
15839 I->setHasNoUnsignedWrap(/*b=*/false);
15840 }
15841
15842 V = FinalShuffle(V, E);
15843
15844 E->VectorizedValue = V;
15845 ++NumVectorInstructions;
15846
15847 return V;
15848 }
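// Note that the Instruction::And shortcut above relies on the MinBWs demotion:
// if one operand consists of constants whose low It->second.first bits are all
// ones, the 'and' cannot change the bits that survive the later truncation, so
// the other (shuffled) operand is returned directly.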
15849 case Instruction::Load: {
15850 // Loads are inserted at the head of the tree because we don't want to
15851 // sink them all the way down past store instructions.
15852 setInsertPointAfterBundle(E);
15853
15854 LoadInst *LI = cast<LoadInst>(VL0);
15855 Instruction *NewLI;
15856 Value *PO = LI->getPointerOperand();
15857 if (E->State == TreeEntry::Vectorize) {
15858 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15859 } else if (E->State == TreeEntry::StridedVectorize) {
15860 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15861 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15862 PO = IsReverseOrder ? PtrN : Ptr0;
15863 std::optional<int> Diff = getPointersDiff(
15864 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15865 Type *StrideTy = DL->getIndexType(PO->getType());
15866 Value *StrideVal;
15867 if (Diff) {
15868 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15869 StrideVal =
15870 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15871 DL->getTypeAllocSize(ScalarTy));
15872 } else {
15873 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15874 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15875 return cast<LoadInst>(V)->getPointerOperand();
15876 });
15877 OrdersType Order;
15878 std::optional<Value *> Stride =
15879 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15880 &*Builder.GetInsertPoint());
15881 Value *NewStride =
15882 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15883 StrideVal = Builder.CreateMul(
15884 NewStride,
15885 ConstantInt::get(
15886 StrideTy,
15887 (IsReverseOrder ? -1 : 1) *
15888 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15889 }
15890 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15891 auto *Inst = Builder.CreateIntrinsic(
15892 Intrinsic::experimental_vp_strided_load,
15893 {VecTy, PO->getType(), StrideTy},
15894 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15895 Builder.getInt32(E->Scalars.size())});
15896 Inst->addParamAttr(
15897 /*ArgNo=*/0,
15898 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15899 NewLI = Inst;
15900 } else {
15901 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15902 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15903 if (E->VectorizedValue) {
15904 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15905 return E->VectorizedValue;
15906 }
15907 if (isa<FixedVectorType>(ScalarTy)) {
15908 assert(SLPReVec && "FixedVectorType is not expected.");
15909 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We need
15910 // to expand VecPtr if ScalarTy is a vector type.
15911 unsigned ScalarTyNumElements =
15912 cast<FixedVectorType>(ScalarTy)->getNumElements();
15913 unsigned VecTyNumElements =
15914 cast<FixedVectorType>(VecTy)->getNumElements();
15915 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15916 "Cannot expand getelementptr.");
15917 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15918 SmallVector<Constant *> Indices(VecTyNumElements);
15919 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15920 return Builder.getInt64(I % ScalarTyNumElements);
15921 });
15922 VecPtr = Builder.CreateGEP(
15923 VecTy->getElementType(),
15924 Builder.CreateShuffleVector(
15925 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15926 ConstantVector::get(Indices));
15927 }
15928 // Use the minimum alignment of the gathered loads.
15929 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15930 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15931 }
15932 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15933
15934 V = FinalShuffle(V, E);
15935 E->VectorizedValue = V;
15936 ++NumVectorInstructions;
15937 return V;
15938 }
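// For the strided case this emits a call roughly of the form
//   <N x Ty> @llvm.experimental.vp.strided.load(ptr %base, iM %stride,
//                                               <N x i1> all-ones, i32 N)
// where the stride is in bytes and negative when the scalars are visited in
// reverse pointer order, and the common alignment is attached as a parameter
// attribute on the pointer argument.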
15939 case Instruction::Store: {
15940 auto *SI = cast<StoreInst>(VL0);
15941
15942 setInsertPointAfterBundle(E);
15943
15944 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15945 if (VecValue->getType() != VecTy)
15946 VecValue =
15947 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15948 VecValue = FinalShuffle(VecValue, E);
15949
15950 Value *Ptr = SI->getPointerOperand();
15951 Instruction *ST;
15952 if (E->State == TreeEntry::Vectorize) {
15953 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15954 } else {
15955 assert(E->State == TreeEntry::StridedVectorize &&
15956 "Expected either strided or consecutive stores.");
15957 if (!E->ReorderIndices.empty()) {
15958 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15959 Ptr = SI->getPointerOperand();
15960 }
15961 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15962 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
15963 auto *Inst = Builder.CreateIntrinsic(
15964 Intrinsic::experimental_vp_strided_store,
15965 {VecTy, Ptr->getType(), StrideTy},
15966 {VecValue, Ptr,
15967 ConstantInt::get(
15968 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15969 Builder.getAllOnesMask(VecTy->getElementCount()),
15970 Builder.getInt32(E->Scalars.size())});
15971 Inst->addParamAttr(
15972 /*ArgNo=*/1,
15973 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15974 ST = Inst;
15975 }
15976
15977 Value *V = ::propagateMetadata(ST, E->Scalars);
15978
15979 E->VectorizedValue = V;
15980 ++NumVectorInstructions;
15981 return V;
15982 }
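// The strided-store path mirrors the strided load: it emits
// llvm.experimental.vp.strided.store with an all-ones mask, an EVL equal to
// the number of scalars, and a byte stride that, as emitted here, is always
// the negated element size (i.e. a reversed consecutive access).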
15983 case Instruction::GetElementPtr: {
15984 auto *GEP0 = cast<GetElementPtrInst>(VL0);
15985 setInsertPointAfterBundle(E);
15986
15987 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
15988 if (E->VectorizedValue) {
15989 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15990 return E->VectorizedValue;
15991 }
15992
15993 SmallVector<Value *> OpVecs;
15994 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
15995 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
15996 if (E->VectorizedValue) {
15997 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15998 return E->VectorizedValue;
15999 }
16000 OpVecs.push_back(OpVec);
16001 }
16002
16003 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16004 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16005 SmallVector<Value *> GEPs;
16006 for (Value *V : E->Scalars) {
16007 if (isa<GetElementPtrInst>(V))
16008 GEPs.push_back(V);
16009 }
16010 V = ::propagateMetadata(I, GEPs);
16011 }
16012
16013 V = FinalShuffle(V, E);
16014
16015 E->VectorizedValue = V;
16016 ++NumVectorInstructions;
16017
16018 return V;
16019 }
16020 case Instruction::Call: {
16021 CallInst *CI = cast<CallInst>(VL0);
16022 setInsertPointAfterBundle(E);
16023
16024 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
16025
16026 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
16027 CI, ID, VecTy->getNumElements(),
16028 It != MinBWs.end() ? It->second.first : 0, TTI);
16029 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16030 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
16031 VecCallCosts.first <= VecCallCosts.second;
16032
16033 Value *ScalarArg = nullptr;
16034 SmallVector<Value *> OpVecs;
16035 SmallVector<Type *, 2> TysForDecl;
16036 // Add return type if intrinsic is overloaded on it.
16037 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
16038 TysForDecl.push_back(VecTy);
16039 auto *CEI = cast<CallInst>(VL0);
16040 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16041 ValueList OpVL;
16042 // Some intrinsics have scalar arguments. This argument should not be
16043 // vectorized.
16044 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16045 ScalarArg = CEI->getArgOperand(I);
16046 // If we decided to reduce the bitwidth of the abs intrinsic, its second
16047 // argument must be set to false (do not return poison if the value is signed min).
16048 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16049 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16050 ScalarArg = Builder.getFalse();
16051 OpVecs.push_back(ScalarArg);
16052 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16053 TysForDecl.push_back(ScalarArg->getType());
16054 continue;
16055 }
16056
16057 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16058 if (E->VectorizedValue) {
16059 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16060 return E->VectorizedValue;
16061 }
16062 ScalarArg = CEI->getArgOperand(I);
16063 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16064 ScalarArg->getType()->getScalarType() &&
16065 It == MinBWs.end()) {
16066 auto *CastTy =
16067 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16068 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16069 } else if (It != MinBWs.end()) {
16070 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16071 }
16072 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16073 OpVecs.push_back(OpVec);
16074 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16075 TysForDecl.push_back(OpVec->getType());
16076 }
16077
16078 Function *CF;
16079 if (!UseIntrinsic) {
16080 VFShape Shape =
16081 VFShape::get(CI->getFunctionType(),
16082 ElementCount::getFixed(
16083 static_cast<unsigned>(VecTy->getNumElements())),
16084 false /*HasGlobalPred*/);
16085 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16086 } else {
16087 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16088 }
16089
16090 SmallVector<OperandBundleDef, 1> OpBundles;
16091 CI->getOperandBundlesAsDefs(OpBundles);
16092 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16093
16094 propagateIRFlags(V, E->Scalars, VL0);
16095 V = FinalShuffle(V, E);
16096
16097 E->VectorizedValue = V;
16098 ++NumVectorInstructions;
16099 return V;
16100 }
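// Calls are widened either to the matching vector intrinsic or, when the cost
// model prefers it, to the vector library function that VFDatabase returns for
// this VF. Arguments the intrinsic requires to stay scalar are passed through
// unchanged, and overloaded types are collected in TysForDecl to build the
// declaration.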
16101 case Instruction::ShuffleVector: {
16102 Value *V;
16103 if (SLPReVec && !E->isAltShuffle()) {
16104 setInsertPointAfterBundle(E);
16105 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16106 if (E->VectorizedValue) {
16107 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16108 return E->VectorizedValue;
16109 }
16110 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16111 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16112 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16113 "Not supported shufflevector usage.");
16114 SmallVector<int> NewMask(ThisMask.size());
16115 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16116 return SVSrc->getShuffleMask()[Mask];
16117 });
16118 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16119 } else {
16120 V = Builder.CreateShuffleVector(Src, ThisMask);
16121 }
16122 propagateIRFlags(V, E->Scalars, VL0);
16123 if (auto *I = dyn_cast<Instruction>(V))
16124 V = ::propagateMetadata(I, E->Scalars);
16125 V = FinalShuffle(V, E);
16126 } else {
16127 assert(E->isAltShuffle() &&
16128 ((Instruction::isBinaryOp(E->getOpcode()) &&
16129 Instruction::isBinaryOp(E->getAltOpcode())) ||
16130 (Instruction::isCast(E->getOpcode()) &&
16131 Instruction::isCast(E->getAltOpcode())) ||
16132 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16133 "Invalid Shuffle Vector Operand");
16134
16135 Value *LHS = nullptr, *RHS = nullptr;
16136 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16137 setInsertPointAfterBundle(E);
16138 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16139 if (E->VectorizedValue) {
16140 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16141 return E->VectorizedValue;
16142 }
16143 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16144 } else {
16145 setInsertPointAfterBundle(E);
16146 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16147 }
16148 if (E->VectorizedValue) {
16149 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16150 return E->VectorizedValue;
16151 }
16152 if (LHS && RHS &&
16153 ((Instruction::isBinaryOp(E->getOpcode()) &&
16154 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16155 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16156 assert((It != MinBWs.end() ||
16157 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16158 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16159 MinBWs.contains(getOperandEntry(E, 0)) ||
16160 MinBWs.contains(getOperandEntry(E, 1))) &&
16161 "Expected item in MinBWs.");
16162 Type *CastTy = VecTy;
16163 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16164 if (cast<VectorType>(LHS->getType())
16165 ->getElementType()
16166 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16167 ->getElementType()
16168 ->getIntegerBitWidth())
16169 CastTy = RHS->getType();
16170 else
16171 CastTy = LHS->getType();
16172 }
16173 if (LHS->getType() != CastTy)
16174 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16175 if (RHS->getType() != CastTy)
16176 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16177 }
16178
16179 Value *V0, *V1;
16180 if (Instruction::isBinaryOp(E->getOpcode())) {
16181 V0 = Builder.CreateBinOp(
16182 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16183 V1 = Builder.CreateBinOp(
16184 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16185 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16186 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16187 auto *AltCI = cast<CmpInst>(E->getAltOp());
16188 CmpInst::Predicate AltPred = AltCI->getPredicate();
16189 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16190 } else {
16191 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16192 unsigned SrcBWSz = DL->getTypeSizeInBits(
16193 cast<VectorType>(LHS->getType())->getElementType());
16194 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16195 if (BWSz <= SrcBWSz) {
16196 if (BWSz < SrcBWSz)
16197 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16198 assert(LHS->getType() == VecTy &&
16199 "Expected same type as operand.");
16200 if (auto *I = dyn_cast<Instruction>(LHS))
16201 LHS = ::propagateMetadata(I, E->Scalars);
16202 LHS = FinalShuffle(LHS, E);
16203 E->VectorizedValue = LHS;
16204 ++NumVectorInstructions;
16205 return LHS;
16206 }
16207 }
16208 V0 = Builder.CreateCast(
16209 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16210 V1 = Builder.CreateCast(
16211 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16212 }
16213 // Add V0 and V1 to later analysis to try to find and remove matching
16214 // instruction, if any.
16215 for (Value *V : {V0, V1}) {
16216 if (auto *I = dyn_cast<Instruction>(V)) {
16217 GatherShuffleExtractSeq.insert(I);
16218 CSEBlocks.insert(I->getParent());
16219 }
16220 }
16221
16222 // Create shuffle to take alternate operations from the vector.
16223 // Also, gather up main and alt scalar ops to propagate IR flags to
16224 // each vector operation.
16225 ValueList OpScalars, AltScalars;
16226 SmallVector<int> Mask;
16227 E->buildAltOpShuffleMask(
16228 [E, this](Instruction *I) {
16229 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16230 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16231 *TLI);
16232 },
16233 Mask, &OpScalars, &AltScalars);
16234
16235 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16236 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16237 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16238 // Drop nuw flags for abs(sub(commutative), true).
16239 if (auto *I = dyn_cast<Instruction>(Vec);
16240 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16241 any_of(E->Scalars, [](Value *V) {
16242 if (isa<PoisonValue>(V))
16243 return false;
16244 auto *IV = cast<Instruction>(V);
16245 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16246 }))
16247 I->setHasNoUnsignedWrap(/*b=*/false);
16248 };
16249 DropNuwFlag(V0, E->getOpcode());
16250 DropNuwFlag(V1, E->getAltOpcode());
16251
16252 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16253 assert(SLPReVec && "FixedVectorType is not expected.");
16255 }
16256 V = Builder.CreateShuffleVector(V0, V1, Mask);
16257 if (auto *I = dyn_cast<Instruction>(V)) {
16258 V = ::propagateMetadata(I, E->Scalars);
16259 GatherShuffleExtractSeq.insert(I);
16260 CSEBlocks.insert(I->getParent());
16261 }
16262 }
16263
16264 E->VectorizedValue = V;
16265 ++NumVectorInstructions;
16266
16267 return V;
16268 }
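// For alternate opcodes both full-width instructions V0 and V1 are created and
// then blended: buildAltOpShuffleMask produces a mask that selects lane I from
// V0 when scalar I uses the main opcode and lane VF + I from V1 otherwise,
// e.g. <0, 5, 2, 7> for an add/sub pattern over four lanes.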
16269 default:
16270 llvm_unreachable("unknown inst");
16271 }
16272 return nullptr;
16273}
16274
16275 Value *BoUpSLP::vectorizeTree() {
16276 ExtraValueToDebugLocsMap ExternallyUsedValues;
16277 return vectorizeTree(ExternallyUsedValues);
16278}
16279
16280 Value *
16281 BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16282 Instruction *ReductionRoot) {
16283 // All blocks must be scheduled before any instructions are inserted.
16284 for (auto &BSIter : BlocksSchedules) {
16285 scheduleBlock(BSIter.second.get());
16286 }
16287 // Clear the Entry-to-LastInstruction table. It can be affected by scheduling
16288 // and needs to be rebuilt.
16289 EntryToLastInstruction.clear();
16290
16291 if (ReductionRoot)
16292 Builder.SetInsertPoint(ReductionRoot->getParent(),
16293 ReductionRoot->getIterator());
16294 else
16295 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16296
16297 // Emit gathered loads first to emit better code for the users of those
16298 // gathered loads.
16299 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16300 if (GatheredLoadsEntriesFirst.has_value() &&
16301 TE->Idx >= *GatheredLoadsEntriesFirst &&
16302 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16303 assert((!TE->UserTreeIndices.empty() ||
16304 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16305 "Expected gathered load node.");
16306 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16307 }
16308 }
16309 // Postpone emission of PHI operands to avoid cyclic dependency issues.
16310 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16311 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16312 if (TE->State == TreeEntry::Vectorize &&
16313 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16314 TE->VectorizedValue)
16315 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16316 // Run through the list of postponed gathers and emit them, replacing the temp
16317 // emitted allocas with actual vector instructions.
16318 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16319 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16320 for (const TreeEntry *E : PostponedNodes) {
16321 auto *TE = const_cast<TreeEntry *>(E);
16322 if (auto *VecTE = getSameValuesTreeEntry(
16323 TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
16324 TE->UserTreeIndices.front().EdgeIdx));
16325 VecTE && VecTE->isSame(TE->Scalars))
16326 // Found a gather node that is exactly the same as one of the
16327 // vectorized nodes. This may happen after reordering.
16328 continue;
16329 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16330 TE->VectorizedValue = nullptr;
16331 auto *UserI =
16332 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16333 // If the user is a PHI node, its vector code has to be inserted right before
16334 // the block terminator. Since the node was delayed, there were some unresolved
16335 // dependencies at the moment when the stub instruction was emitted. In a case
16336 // when any of these dependencies turns out to be an operand of another PHI
16337 // coming from this same block, the position of the stub instruction becomes
16338 // invalid. This is because the source vector that is supposed to feed this
16339 // gather node was inserted at the end of the block [after the stub
16340 // instruction]. So we need to adjust the insertion point again to the end of the block.
16341 if (isa<PHINode>(UserI)) {
16342 // Insert before all users.
16343 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16344 for (User *U : PrevVec->users()) {
16345 if (U == UserI)
16346 continue;
16347 auto *UI = dyn_cast<Instruction>(U);
16348 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16349 continue;
16350 if (UI->comesBefore(InsertPt))
16351 InsertPt = UI;
16352 }
16353 Builder.SetInsertPoint(InsertPt);
16354 } else {
16355 Builder.SetInsertPoint(PrevVec);
16356 }
16357 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16358 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16359 if (auto *VecI = dyn_cast<Instruction>(Vec);
16360 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16361 Builder.GetInsertPoint()->comesBefore(VecI))
16362 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16363 Builder.GetInsertPoint());
16364 if (Vec->getType() != PrevVec->getType()) {
16365 assert(Vec->getType()->isIntOrIntVectorTy() &&
16366 PrevVec->getType()->isIntOrIntVectorTy() &&
16367 "Expected integer vector types only.");
16368 std::optional<bool> IsSigned;
16369 for (Value *V : TE->Scalars) {
16370 if (isVectorized(V)) {
16371 for (const TreeEntry *MNTE : getTreeEntries(V)) {
16372 auto It = MinBWs.find(MNTE);
16373 if (It != MinBWs.end()) {
16374 IsSigned = IsSigned.value_or(false) || It->second.second;
16375 if (*IsSigned)
16376 break;
16377 }
16378 }
16379 if (IsSigned.value_or(false))
16380 break;
16381 // Scan through gather nodes.
16382 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16383 auto It = MinBWs.find(BVE);
16384 if (It != MinBWs.end()) {
16385 IsSigned = IsSigned.value_or(false) || It->second.second;
16386 if (*IsSigned)
16387 break;
16388 }
16389 }
16390 if (IsSigned.value_or(false))
16391 break;
16392 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16393 IsSigned =
16394 IsSigned.value_or(false) ||
16395 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16396 continue;
16397 }
16398 if (IsSigned.value_or(false))
16399 break;
16400 }
16401 }
16402 if (IsSigned.value_or(false)) {
16403 // Final attempt - check user node.
16404 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16405 if (It != MinBWs.end())
16406 IsSigned = It->second.second;
16407 }
16408 assert(IsSigned &&
16409 "Expected user node or perfect diamond match in MinBWs.");
16410 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16411 }
16412 PrevVec->replaceAllUsesWith(Vec);
16413 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16414 // Replace the stub vector node, if it was already used for one of the
16415 // buildvector nodes.
16416 auto It = PostponedValues.find(PrevVec);
16417 if (It != PostponedValues.end()) {
16418 for (TreeEntry *VTE : It->getSecond())
16419 VTE->VectorizedValue = Vec;
16420 }
16421 eraseInstruction(PrevVec);
16422 }
16423
16424 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16425 << " values .\n");
16426
16427 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16428 // Maps vector instruction to original insertelement instruction
16429 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16430 // Maps extract Scalar to the corresponding extractelement instruction in the
16431 // basic block. Only one extractelement per block should be emitted.
16432 DenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16433 ScalarToEEs;
16434 SmallDenseSet<Value *, 4> UsedInserts;
16435 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
16436 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16437 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16438 // Extract all of the elements with the external uses.
16439 for (const auto &ExternalUse : ExternalUses) {
16440 Value *Scalar = ExternalUse.Scalar;
16441 llvm::User *User = ExternalUse.User;
16442
16443 // Skip users that we already RAUW. This happens when one instruction
16444 // has multiple uses of the same value.
16445 if (User && !is_contained(Scalar->users(), User))
16446 continue;
16447 const TreeEntry *E = &ExternalUse.E;
16448 assert(E && "Invalid scalar");
16449 assert(!E->isGather() && "Extracting from a gather list");
16450 // Non-instruction pointers are not deleted, just skip them.
16451 if (E->getOpcode() == Instruction::GetElementPtr &&
16452 !isa<GetElementPtrInst>(Scalar))
16453 continue;
16454
16455 Value *Vec = E->VectorizedValue;
16456 assert(Vec && "Can't find vectorizable value");
16457
16458 Value *Lane = Builder.getInt32(ExternalUse.Lane);
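// Emits (or reuses) an extractelement that produces Scalar from Vec at the
// current insertion point and, if the vectorized value was narrowed by the
// minimal-bitwidth analysis, casts the result back to the original scalar
// type. If the types already match, the vectorized value is returned as is.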
16459 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16460 if (Scalar->getType() != Vec->getType()) {
16461 Value *Ex = nullptr;
16462 Value *ExV = nullptr;
16463 auto *Inst = dyn_cast<Instruction>(Scalar);
16464 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16465 auto It = ScalarToEEs.find(Scalar);
16466 if (It != ScalarToEEs.end()) {
16467 // No need to emit many extracts, just move the only one in the
16468 // current block.
16469 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16470 : Builder.GetInsertBlock());
16471 if (EEIt != It->second.end()) {
16472 Value *PrevV = EEIt->second.first;
16473 if (auto *I = dyn_cast<Instruction>(PrevV);
16474 I && !ReplaceInst &&
16475 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16476 Builder.GetInsertPoint()->comesBefore(I)) {
16477 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16478 Builder.GetInsertPoint());
16479 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16480 CI->moveAfter(I);
16481 }
16482 Ex = PrevV;
16483 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16484 }
16485 }
16486 if (!Ex) {
16487 // "Reuse" the existing extract to improve final codegen.
16488 if (ReplaceInst) {
16489 // Leave the instruction as is if it is a cheap extract and all of its
16490 // operands are scalar.
16491 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16492 IgnoredExtracts.insert(EE);
16493 Ex = EE;
16494 } else {
16495 auto *CloneInst = Inst->clone();
16496 CloneInst->insertBefore(Inst->getIterator());
16497 if (Inst->hasName())
16498 CloneInst->takeName(Inst);
16499 Ex = CloneInst;
16500 }
16501 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16502 ES && isa<Instruction>(Vec)) {
16503 Value *V = ES->getVectorOperand();
16504 auto *IVec = cast<Instruction>(Vec);
16505 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
16506 V = ETEs.front()->VectorizedValue;
16507 if (auto *IV = dyn_cast<Instruction>(V);
16508 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16509 IV->comesBefore(IVec))
16510 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16511 else
16512 Ex = Builder.CreateExtractElement(Vec, Lane);
16513 } else if (auto *VecTy =
16514 dyn_cast<FixedVectorType>(Scalar->getType())) {
16515 assert(SLPReVec && "FixedVectorType is not expected.");
16516 unsigned VecTyNumElements = VecTy->getNumElements();
16517 // When REVEC is enabled, we need to extract a vector.
16518 // Note: The element size of Scalar may be different from the
16519 // element size of Vec.
16520 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
16521 ExternalUse.Lane * VecTyNumElements);
16522 } else {
16523 Ex = Builder.CreateExtractElement(Vec, Lane);
16524 }
16525 // If necessary, sign-extend or zero-extend ScalarRoot
16526 // to the larger type.
16527 ExV = Ex;
16528 if (Scalar->getType() != Ex->getType())
16529 ExV = Builder.CreateIntCast(
16530 Ex, Scalar->getType(),
16531 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16532 auto *I = dyn_cast<Instruction>(Ex);
16533 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16534 : &F->getEntryBlock(),
16535 std::make_pair(Ex, ExV));
16536 }
16537 // The 'then' branch of the previous 'if' may produce constants, since
16538 // operand 0 might be a constant.
16539 if (auto *ExI = dyn_cast<Instruction>(Ex);
16540 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16541 GatherShuffleExtractSeq.insert(ExI);
16542 CSEBlocks.insert(ExI->getParent());
16543 }
16544 return ExV;
16545 }
16546 assert(isa<FixedVectorType>(Scalar->getType()) &&
16547 isa<InsertElementInst>(Scalar) &&
16548 "In-tree scalar of vector type is not insertelement?");
16549 auto *IE = cast<InsertElementInst>(Scalar);
16550 VectorToInsertElement.try_emplace(Vec, IE);
16551 return Vec;
16552 };
16553 // If User == nullptr, the Scalar remains as scalar in vectorized
16554 // instructions or is used as an extra arg. Generate an ExtractElement instruction
16555 // and update the record for this scalar in ExternallyUsedValues.
16556 if (!User) {
16557 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16558 continue;
16559 assert(
16560 (ExternallyUsedValues.count(Scalar) ||
16561 Scalar->hasNUsesOrMore(UsesLimit) ||
16562 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16563 any_of(
16564 Scalar->users(),
16565 [&, TTI = TTI](llvm::User *U) {
16566 if (ExternalUsesAsOriginalScalar.contains(U))
16567 return true;
16568 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
16569 return !UseEntries.empty() &&
16570 (E->State == TreeEntry::Vectorize ||
16571 E->State == TreeEntry::StridedVectorize) &&
16572 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
16573 return (UseEntry->State == TreeEntry::Vectorize ||
16574 UseEntry->State ==
16575 TreeEntry::StridedVectorize) &&
16576 doesInTreeUserNeedToExtract(
16577 Scalar, getRootEntryInstruction(*UseEntry),
16578 TLI, TTI);
16579 });
16580 })) &&
16581 "Scalar with nullptr User must be registered in "
16582 "ExternallyUsedValues map or remain as scalar in vectorized "
16583 "instructions");
16584 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16585 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16586 if (PHI->getParent()->isLandingPad())
16587 Builder.SetInsertPoint(
16588 PHI->getParent(),
16589 std::next(
16590 PHI->getParent()->getLandingPadInst()->getIterator()));
16591 else
16592 Builder.SetInsertPoint(PHI->getParent(),
16593 PHI->getParent()->getFirstNonPHIIt());
16594 } else {
16595 Builder.SetInsertPoint(VecI->getParent(),
16596 std::next(VecI->getIterator()));
16597 }
16598 } else {
16599 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16600 }
16601 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16602 // Required to update internally referenced instructions.
16603 if (Scalar != NewInst) {
16604 assert((!isa<ExtractElementInst>(Scalar) ||
16605 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16606 "Extractelements should not be replaced.");
16607 Scalar->replaceAllUsesWith(NewInst);
16608 }
16609 continue;
16610 }
16611
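// If the external user is an insertelement that consumes this scalar, try not
// to emit an extract/insert pair: record the (vector, lane) in ShuffledInserts
// so the whole buildvector chain can later be rebuilt as a shuffle of the
// vectorized value.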
16612 if (auto *VU = dyn_cast<InsertElementInst>(User);
16613 VU && VU->getOperand(1) == Scalar) {
16614 // Skip if the scalar is another vector op or Vec is not an instruction.
16615 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16616 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16617 if (!UsedInserts.insert(VU).second)
16618 continue;
16619 // Need to use original vector, if the root is truncated.
16620 auto BWIt = MinBWs.find(E);
16621 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16622 auto *ScalarTy = FTy->getElementType();
16623 auto Key = std::make_pair(Vec, ScalarTy);
16624 auto VecIt = VectorCasts.find(Key);
16625 if (VecIt == VectorCasts.end()) {
16626 IRBuilderBase::InsertPointGuard Guard(Builder);
16627 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16628 if (IVec->getParent()->isLandingPad())
16629 Builder.SetInsertPoint(IVec->getParent(),
16630 std::next(IVec->getParent()
16631 ->getLandingPadInst()
16632 ->getIterator()));
16633 else
16634 Builder.SetInsertPoint(
16635 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16636 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16637 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16638 }
16639 Vec = Builder.CreateIntCast(
16640 Vec,
16641 getWidenedType(
16642 ScalarTy,
16643 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16644 BWIt->second.second);
16645 VectorCasts.try_emplace(Key, Vec);
16646 } else {
16647 Vec = VecIt->second;
16648 }
16649 }
16650
16651 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16652 if (InsertIdx) {
16653 auto *It = find_if(
16654 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16655 // Checks if 2 insertelements are from the same buildvector.
16656 InsertElementInst *VecInsert = Data.InsertElements.front();
16657 return areTwoInsertFromSameBuildVector(
16658 VU, VecInsert,
16659 [](InsertElementInst *II) { return II->getOperand(0); });
16660 });
16661 unsigned Idx = *InsertIdx;
16662 if (It == ShuffledInserts.end()) {
16663 (void)ShuffledInserts.emplace_back();
16664 It = std::next(ShuffledInserts.begin(),
16665 ShuffledInserts.size() - 1);
16666 }
16667 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16668 if (Mask.empty())
16669 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16670 Mask[Idx] = ExternalUse.Lane;
16671 It->InsertElements.push_back(cast<InsertElementInst>(User));
16672 continue;
16673 }
16674 }
16675 }
16676 }
16677
16678 // Generate extracts for out-of-tree users.
16679 // Find the insertion point for the extractelement lane.
16680 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16681 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16682 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16683 if (PH->getIncomingValue(I) == Scalar) {
16684 Instruction *IncomingTerminator =
16685 PH->getIncomingBlock(I)->getTerminator();
16686 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16687 Builder.SetInsertPoint(VecI->getParent(),
16688 std::next(VecI->getIterator()));
16689 } else {
16690 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16691 }
16692 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16693 PH->setOperand(I, NewInst);
16694 }
16695 }
16696 } else {
16697 Builder.SetInsertPoint(cast<Instruction>(User));
16698 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16699 User->replaceUsesOfWith(Scalar, NewInst);
16700 }
16701 } else {
16702 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16703 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16704 User->replaceUsesOfWith(Scalar, NewInst);
16705 }
16706
16707 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16708 }
16709
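// Builds a shuffle of V1 (and optionally V2) for the given combined mask by
// splitting it into the per-operand masks expected by
// ShuffleInstructionBuilder.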
16710 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16711 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16712 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16713 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16714 for (int I = 0, E = Mask.size(); I < E; ++I) {
16715 if (Mask[I] < VF)
16716 CombinedMask1[I] = Mask[I];
16717 else
16718 CombinedMask2[I] = Mask[I] - VF;
16719 }
16720 ShuffleInstructionBuilder ShuffleBuilder(
16721 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16722 ShuffleBuilder.add(V1, CombinedMask1);
16723 if (V2)
16724 ShuffleBuilder.add(V2, CombinedMask2);
16725 return ShuffleBuilder.finalize({}, {}, {});
16726 };
16727
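// Brings Vec to the vector factor expected by Mask. If the mask selects lanes
// beyond that VF it is applied directly (the returned flag reports that the
// mask has been consumed); otherwise only a resizing shuffle is emitted and
// the caller applies the mask itself.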
16728 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16729 bool ForSingleMask) {
16730 unsigned VF = Mask.size();
16731 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16732 if (VF != VecVF) {
16733 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16734 Vec = CreateShuffle(Vec, nullptr, Mask);
16735 return std::make_pair(Vec, true);
16736 }
16737 if (!ForSingleMask) {
16738 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16739 for (unsigned I = 0; I < VF; ++I) {
16740 if (Mask[I] != PoisonMaskElem)
16741 ResizeMask[Mask[I]] = Mask[I];
16742 }
16743 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16744 }
16745 }
16746
16747 return std::make_pair(Vec, false);
16748 };
16749 // Perform shuffling of the vectorized tree entries for better handling of
16750 // external extracts.
16751 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16752 // Find the first and the last instruction in the list of insertelements.
16753 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16754 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16755 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16756 Builder.SetInsertPoint(LastInsert);
16757 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16758 Value *NewInst = performExtractsShuffleAction<Value>(
16759 MutableArrayRef(Vector.data(), Vector.size()),
16760 FirstInsert->getOperand(0),
16761 [](Value *Vec) {
16762 return cast<VectorType>(Vec->getType())
16763 ->getElementCount()
16764 .getKnownMinValue();
16765 },
16766 ResizeToVF,
16767 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16768 ArrayRef<Value *> Vals) {
16769 assert((Vals.size() == 1 || Vals.size() == 2) &&
16770 "Expected exactly 1 or 2 input values.");
16771 if (Vals.size() == 1) {
16772 // Do not create shuffle if the mask is a simple identity
16773 // non-resizing mask.
16774 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16775 ->getNumElements() ||
16776 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16777 return CreateShuffle(Vals.front(), nullptr, Mask);
16778 return Vals.front();
16779 }
16780 return CreateShuffle(Vals.front() ? Vals.front()
16781 : FirstInsert->getOperand(0),
16782 Vals.back(), Mask);
16783 });
16784 auto It = ShuffledInserts[I].InsertElements.rbegin();
16785 // Rebuild buildvector chain.
16786 InsertElementInst *II = nullptr;
16787 if (It != ShuffledInserts[I].InsertElements.rend())
16788 II = *It;
16789 SmallVector<Instruction *> Inserts;
16790 while (It != ShuffledInserts[I].InsertElements.rend()) {
16791 assert(II && "Must be an insertelement instruction.");
16792 if (*It == II)
16793 ++It;
16794 else
16795 Inserts.push_back(cast<Instruction>(II));
16796 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16797 }
16798 for (Instruction *II : reverse(Inserts)) {
16799 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16800 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16801 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16802 II->moveAfter(NewI);
16803 NewInst = II;
16804 }
16805 LastInsert->replaceAllUsesWith(NewInst);
16806 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16807 IE->replaceUsesOfWith(IE->getOperand(0),
16808 PoisonValue::get(IE->getOperand(0)->getType()));
16809 IE->replaceUsesOfWith(IE->getOperand(1),
16810 PoisonValue::get(IE->getOperand(1)->getType()));
16811 eraseInstruction(IE);
16812 }
16813 CSEBlocks.insert(LastInsert->getParent());
16814 }
16815
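// All external uses are rewired to the vectorized values at this point.
// Collect the original scalar instructions of every vectorized node so they
// can be erased below.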
16816 SmallVector<Instruction *> RemovedInsts;
16817 // For each vectorized value:
16818 for (auto &TEPtr : VectorizableTree) {
16819 TreeEntry *Entry = TEPtr.get();
16820
16821 // No need to handle users of gathered values.
16822 if (Entry->isGather())
16823 continue;
16824
16825 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16826
16827 // For each lane:
16828 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16829 Value *Scalar = Entry->Scalars[Lane];
16830
16831 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16832 !isa<GetElementPtrInst>(Scalar))
16833 continue;
16834 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16835 EE && IgnoredExtracts.contains(EE))
16836 continue;
16837 if (isa<PoisonValue>(Scalar))
16838 continue;
16839#ifndef NDEBUG
16840 Type *Ty = Scalar->getType();
16841 if (!Ty->isVoidTy()) {
16842 for (User *U : Scalar->users()) {
16843 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16844
16845 // It is legal to delete users in the ignorelist.
16846 assert((isVectorized(U) ||
16847 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16848 (isa_and_nonnull<Instruction>(U) &&
16849 isDeleted(cast<Instruction>(U)))) &&
16850 "Deleting out-of-tree value");
16851 }
16852 }
16853#endif
16854 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16855 auto *I = cast<Instruction>(Scalar);
16856 RemovedInsts.push_back(I);
16857 }
16858 }
16859
16860 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16861 // new vector instruction.
16862 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16863 V->mergeDIAssignID(RemovedInsts);
16864
16865 // Clear up reduction references, if any.
16866 if (UserIgnoreList) {
16867 for (Instruction *I : RemovedInsts) {
16868 const TreeEntry *IE = getTreeEntries(I).front();
16869 if (IE->Idx != 0 &&
16870 !(VectorizableTree.front()->isGather() &&
16871 !IE->UserTreeIndices.empty() &&
16872 (ValueToGatherNodes.lookup(I).contains(
16873 VectorizableTree.front().get()) ||
16874 any_of(IE->UserTreeIndices,
16875 [&](const EdgeInfo &EI) {
16876 return EI.UserTE == VectorizableTree.front().get() &&
16877 EI.EdgeIdx == UINT_MAX;
16878 }))) &&
16879 !(GatheredLoadsEntriesFirst.has_value() &&
16880 IE->Idx >= *GatheredLoadsEntriesFirst &&
16881 VectorizableTree.front()->isGather() &&
16882 is_contained(VectorizableTree.front()->Scalars, I)))
16883 continue;
16884 SmallVector<SelectInst *> LogicalOpSelects;
16885 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16886 // Do not replace condition of the logical op in form select <cond>.
16887 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16888 (match(U.getUser(), m_LogicalAnd()) ||
16889 match(U.getUser(), m_LogicalOr())) &&
16890 U.getOperandNo() == 0;
16891 if (IsPoisoningLogicalOp) {
16892 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16893 return false;
16894 }
16895 return UserIgnoreList->contains(U.getUser());
16896 });
16897 // Replace conditions of the poisoning logical ops with the non-poison
16898 // constant value.
16899 for (SelectInst *SI : LogicalOpSelects)
16900 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16901 }
16902 }
16903 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16904 // cache correctness.
16905 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
16906 // - instructions are not deleted until later.
16907 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16908
16909 Builder.ClearInsertionPoint();
16910 InstrElementSize.clear();
16911
16912 const TreeEntry &RootTE = *VectorizableTree.front();
16913 Value *Vec = RootTE.VectorizedValue;
16914 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16915 It != MinBWs.end() &&
16916 ReductionBitWidth != It->second.first) {
16917 IRBuilder<>::InsertPointGuard Guard(Builder);
16918 Builder.SetInsertPoint(ReductionRoot->getParent(),
16919 ReductionRoot->getIterator());
16920 Vec = Builder.CreateIntCast(
16921 Vec,
16922 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16923 cast<VectorType>(Vec->getType())->getElementCount()),
16924 It->second.second);
16925 }
16926 return Vec;
16927}
16928
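// Hoists loop-invariant gather/shuffle/extract sequences into loop preheaders
// and then CSEs identical (or less-defined) shuffles across the blocks that
// received new instructions.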
16929 void BoUpSLP::optimizeGatherSequence() {
16930 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16931 << " gather sequence instructions.\n");
16932 // LICM InsertElementInst sequences.
16933 for (Instruction *I : GatherShuffleExtractSeq) {
16934 if (isDeleted(I))
16935 continue;
16936
16937 // Check if this block is inside a loop.
16938 Loop *L = LI->getLoopFor(I->getParent());
16939 if (!L)
16940 continue;
16941
16942 // Check if it has a preheader.
16943 BasicBlock *PreHeader = L->getLoopPreheader();
16944 if (!PreHeader)
16945 continue;
16946
16947 // If the vector or the element that we insert into it are
16948 // instructions that are defined inside this loop, then we can't
16949 // hoist this instruction out of the loop.
16950 if (any_of(I->operands(), [L](Value *V) {
16951 auto *OpI = dyn_cast<Instruction>(V);
16952 return OpI && L->contains(OpI);
16953 }))
16954 continue;
16955
16956 // We can hoist this instruction. Move it to the pre-header.
16957 I->moveBefore(PreHeader->getTerminator()->getIterator());
16958 CSEBlocks.insert(PreHeader);
16959 }
16960
16961 // Make a list of all reachable blocks in our CSE queue.
16962 SmallVector<const DomTreeNode *, 8> CSEWorkList;
16963 CSEWorkList.reserve(CSEBlocks.size());
16964 for (BasicBlock *BB : CSEBlocks)
16965 if (DomTreeNode *N = DT->getNode(BB)) {
16966 assert(DT->isReachableFromEntry(N));
16967 CSEWorkList.push_back(N);
16968 }
16969
16970 // Sort blocks by domination. This ensures we visit a block after all blocks
16971 // dominating it are visited.
16972 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
16973 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
16974 "Different nodes should have different DFS numbers");
16975 return A->getDFSNumIn() < B->getDFSNumIn();
16976 });
16977
16978 // Less defined shuffles can be replaced by more defined copies.
16979 // Between two shuffles, one is less defined if it has the same vector operands
16980 // and its mask indices are either the same as in the other one or undefs. E.g.
16981 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
16982 // poison, <0, 0, 0, 0>.
16983 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
16984 Instruction *I2,
16985 SmallVectorImpl<int> &NewMask) {
16986 if (I1->getType() != I2->getType())
16987 return false;
16988 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
16989 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
16990 if (!SI1 || !SI2)
16991 return I1->isIdenticalTo(I2);
16992 if (SI1->isIdenticalTo(SI2))
16993 return true;
16994 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
16995 if (SI1->getOperand(I) != SI2->getOperand(I))
16996 return false;
16997 // Check if the second instruction is more defined than the first one.
16998 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
16999 ArrayRef<int> SM1 = SI1->getShuffleMask();
17000 // Count trailing undefs in the mask to check the final number of used
17001 // registers.
17002 unsigned LastUndefsCnt = 0;
17003 for (int I = 0, E = NewMask.size(); I < E; ++I) {
17004 if (SM1[I] == PoisonMaskElem)
17005 ++LastUndefsCnt;
17006 else
17007 LastUndefsCnt = 0;
17008 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
17009 NewMask[I] != SM1[I])
17010 return false;
17011 if (NewMask[I] == PoisonMaskElem)
17012 NewMask[I] = SM1[I];
17013 }
17014 // Check if the last undefs actually change the final number of used vector
17015 // registers.
17016 return SM1.size() - LastUndefsCnt > 1 &&
17017 ::getNumberOfParts(*TTI, SI1->getType()) ==
17018 ::getNumberOfParts(
17019 *TTI, getWidenedType(SI1->getType()->getElementType(),
17020 SM1.size() - LastUndefsCnt));
17021 };
17022 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
17023 // instructions. TODO: We can further optimize this scan if we split the
17024 // instructions into different buckets based on the insert lane.
17025 SmallVector<Instruction *, 16> Visited;
17026 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
17027 assert(*I &&
17028 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17029 "Worklist not sorted properly!");
17030 BasicBlock *BB = (*I)->getBlock();
17031 // For all instructions in blocks containing gather sequences:
17032 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
17033 if (isDeleted(&In))
17034 continue;
17035 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17036 !GatherShuffleExtractSeq.contains(&In))
17037 continue;
17038
17039 // Check if we can replace this instruction with any of the
17040 // visited instructions.
17041 bool Replaced = false;
17042 for (Instruction *&V : Visited) {
17043 SmallVector<int> NewMask;
17044 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17045 DT->dominates(V->getParent(), In.getParent())) {
17046 In.replaceAllUsesWith(V);
17047 eraseInstruction(&In);
17048 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17049 if (!NewMask.empty())
17050 SI->setShuffleMask(NewMask);
17051 Replaced = true;
17052 break;
17053 }
17054 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17055 GatherShuffleExtractSeq.contains(V) &&
17056 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17057 DT->dominates(In.getParent(), V->getParent())) {
17058 In.moveAfter(V);
17059 V->replaceAllUsesWith(&In);
17060 eraseInstruction(V);
17061 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17062 if (!NewMask.empty())
17063 SI->setShuffleMask(NewMask);
17064 V = &In;
17065 Replaced = true;
17066 break;
17067 }
17068 }
17069 if (!Replaced) {
17070 assert(!is_contained(Visited, &In));
17071 Visited.push_back(&In);
17072 }
17073 }
17074 }
17075 CSEBlocks.clear();
17076 GatherShuffleExtractSeq.clear();
17077}
17078
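// Links the ScheduleData of every schedulable value in VL into a single
// bundle via the FirstInBundle/NextInBundle pointers and returns its head.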
17079BoUpSLP::ScheduleData *
17080BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17081 ScheduleData *Bundle = nullptr;
17082 ScheduleData *PrevInBundle = nullptr;
17083 for (Value *V : VL) {
17084 if (doesNotNeedToBeScheduled(V))
17085 continue;
17086 ScheduleData *BundleMember = getScheduleData(V);
17087 assert(BundleMember &&
17088 "no ScheduleData for bundle member "
17089 "(maybe not in same basic block)");
17090 assert(BundleMember->isSchedulingEntity() &&
17091 "bundle member already part of other bundle");
17092 if (PrevInBundle) {
17093 PrevInBundle->NextInBundle = BundleMember;
17094 } else {
17095 Bundle = BundleMember;
17096 }
17097
17098 // Group the instructions to a bundle.
17099 BundleMember->FirstInBundle = Bundle;
17100 PrevInBundle = BundleMember;
17101 }
17102 assert(Bundle && "Failed to find schedule bundle");
17103 return Bundle;
17104}
17105
17106 // Groups the instructions into a bundle (which is then a single scheduling entity)
17107// and schedules instructions until the bundle gets ready.
17108std::optional<BoUpSLP::ScheduleData *>
17109BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17110 const InstructionsState &S) {
17111 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17112 // instructions.
17113 if (isa<PHINode>(S.getMainOp()) ||
17114 isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
17115 return nullptr;
17116
17117 // Initialize the instruction bundle.
17118 Instruction *OldScheduleEnd = ScheduleEnd;
17119 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17120
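// Clears all dependencies if the scheduling region grew (forcing a
// recalculation), computes dependencies for the new bundle, optionally resets
// and refills the ready list, and then schedules ready bundles until the new
// bundle itself becomes ready or the ready list drains.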
17121 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17122 ScheduleData *Bundle) {
17123 // The scheduling region got new instructions at the lower end (or it is a
17124 // new region for the first bundle). This makes it necessary to
17125 // recalculate all dependencies.
17126 // It is seldom that this needs to be done a second time after adding the
17127 // initial bundle to the region.
17128 if (ScheduleEnd != OldScheduleEnd) {
17129 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17130 if (ScheduleData *SD = getScheduleData(I))
17131 SD->clearDependencies();
17132 ReSchedule = true;
17133 }
17134 if (Bundle) {
17135 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17136 << " in block " << BB->getName() << "\n");
17137 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17138 }
17139
17140 if (ReSchedule) {
17141 resetSchedule();
17142 initialFillReadyList(ReadyInsts);
17143 }
17144
17145 // Now try to schedule the new bundle or (if no bundle) just calculate
17146 // dependencies. As soon as the bundle is "ready" it means that there are no
17147 // cyclic dependencies and we can schedule it. Note that it's important that we
17148 // don't "schedule" the bundle yet (see cancelScheduling).
17149 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17150 !ReadyInsts.empty()) {
17151 ScheduleData *Picked = ReadyInsts.pop_back_val();
17152 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17153 "must be ready to schedule");
17154 schedule(Picked, ReadyInsts);
17155 }
17156 };
17157
17158 // Make sure that the scheduling region contains all
17159 // instructions of the bundle.
17160 for (Value *V : VL) {
17161 if (doesNotNeedToBeScheduled(V))
17162 continue;
17163 if (!extendSchedulingRegion(V, S)) {
17164 // The scheduling region got new instructions at the lower end (or it
17165 // is a new region for the first bundle), which makes it necessary to
17166 // recalculate all dependencies.
17167 // Otherwise the compiler may crash trying to calculate dependencies
17168 // incorrectly and emit instructions in the wrong order during the actual
17169 // scheduling.
17170 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17171 return std::nullopt;
17172 }
17173 }
17174
17175 bool ReSchedule = false;
17176 for (Value *V : VL) {
17177 if (doesNotNeedToBeScheduled(V))
17178 continue;
17179 ScheduleData *BundleMember = getScheduleData(V);
17180 assert(BundleMember &&
17181 "no ScheduleData for bundle member (maybe not in same basic block)");
17182
17183 // Make sure we don't leave the pieces of the bundle in the ready list when
17184 // the whole bundle might not be ready.
17185 ReadyInsts.remove(BundleMember);
17186
17187 if (!BundleMember->IsScheduled)
17188 continue;
17189 // A bundle member was scheduled as a single instruction before and now
17190 // needs to be scheduled as part of the bundle. We just get rid of the
17191 // existing schedule.
17192 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17193 << " was already scheduled\n");
17194 ReSchedule = true;
17195 }
17196
17197 auto *Bundle = buildBundle(VL);
17198 TryScheduleBundleImpl(ReSchedule, Bundle);
17199 if (!Bundle->isReady()) {
17200 cancelScheduling(VL, S.getMainOp());
17201 return std::nullopt;
17202 }
17203 return Bundle;
17204}
17205
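// Undoes bundling for VL: splits the bundle back into single-instruction
// scheduling entities and returns members with no remaining unscheduled
// in-bundle dependencies to the ready list.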
17206void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17207 Value *OpValue) {
17208 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17209 doesNotNeedToSchedule(VL))
17210 return;
17211
17212 if (doesNotNeedToBeScheduled(OpValue))
17213 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17214 ScheduleData *Bundle = getScheduleData(OpValue);
17215 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17216 assert(!Bundle->IsScheduled &&
17217 "Can't cancel bundle which is already scheduled");
17218 assert(Bundle->isSchedulingEntity() &&
17219 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17220 "tried to unbundle something which is not a bundle");
17221
17222 // Remove the bundle from the ready list.
17223 if (Bundle->isReady())
17224 ReadyInsts.remove(Bundle);
17225
17226 // Un-bundle: make single instructions out of the bundle.
17227 ScheduleData *BundleMember = Bundle;
17228 while (BundleMember) {
17229 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17230 BundleMember->FirstInBundle = BundleMember;
17231 ScheduleData *Next = BundleMember->NextInBundle;
17232 BundleMember->NextInBundle = nullptr;
17233 BundleMember->TE = nullptr;
17234 if (BundleMember->unscheduledDepsInBundle() == 0) {
17235 ReadyInsts.insert(BundleMember);
17236 }
17237 BundleMember = Next;
17238 }
17239}
17240
17241BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17242 // Allocate a new ScheduleData for the instruction.
17243 if (ChunkPos >= ChunkSize) {
17244 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17245 ChunkPos = 0;
17246 }
17247 return &(ScheduleDataChunks.back()[ChunkPos++]);
17248}
17249
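// Tries to grow the scheduling region so that it contains V, scanning
// upwards and downwards from the current region boundaries. Fails if the
// region would exceed ScheduleRegionSizeLimit.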
17250bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17251 Value *V, const InstructionsState &S) {
17252 Instruction *I = dyn_cast<Instruction>(V);
17253 assert(I && "bundle member must be an instruction");
17254 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17255 !doesNotNeedToBeScheduled(I) &&
17256 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17257 "be scheduled");
17258 if (getScheduleData(I))
17259 return true;
17260 if (!ScheduleStart) {
17261 // It's the first instruction in the new region.
17262 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17263 ScheduleStart = I;
17264 ScheduleEnd = I->getNextNode();
17265 assert(ScheduleEnd && "tried to vectorize a terminator?");
17266 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17267 return true;
17268 }
17269 // Search up and down at the same time, because we don't know if the new
17270 // instruction is above or below the existing scheduling region.
17271 // Ignore debug info (and other "AssumeLike" intrinsics) so they are not counted
17272 // against the budget. Otherwise debug info could affect codegen.
17273 BasicBlock::reverse_iterator UpIter =
17274 ++ScheduleStart->getIterator().getReverse();
17275 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17276 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17277 BasicBlock::iterator LowerEnd = BB->end();
17278 auto IsAssumeLikeIntr = [](const Instruction &I) {
17279 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17280 return II->isAssumeLikeIntrinsic();
17281 return false;
17282 };
17283 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17284 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17285 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17286 &*DownIter != I) {
17287 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17288 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17289 return false;
17290 }
17291
17292 ++UpIter;
17293 ++DownIter;
17294
17295 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17296 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17297 }
17298 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17299 assert(I->getParent() == ScheduleStart->getParent() &&
17300 "Instruction is in wrong basic block.");
17301 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17302 ScheduleStart = I;
17303 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17304 << "\n");
17305 return true;
17306 }
17307 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17308 "Expected to reach top of the basic block or instruction down the "
17309 "lower end.");
17310 assert(I->getParent() == ScheduleEnd->getParent() &&
17311 "Instruction is in wrong basic block.");
17312 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17313 nullptr);
17314 ScheduleEnd = I->getNextNode();
17315 assert(ScheduleEnd && "tried to vectorize a terminator?");
17316 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17317 return true;
17318}
17319
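// Creates (or reuses) ScheduleData for every schedulable instruction in
// [FromI, ToI), attaches it to the current scheduling region, and threads
// the linked list of memory-accessing instructions used for memory
// dependence analysis.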
17320void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17321 Instruction *ToI,
17322 ScheduleData *PrevLoadStore,
17323 ScheduleData *NextLoadStore) {
17324 ScheduleData *CurrentLoadStore = PrevLoadStore;
17325 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17326 // No need to allocate data for non-schedulable instructions.
17327 if (doesNotNeedToBeScheduled(I))
17328 continue;
17329 ScheduleData *SD = ScheduleDataMap.lookup(I);
17330 if (!SD) {
17331 SD = allocateScheduleDataChunks();
17332 ScheduleDataMap[I] = SD;
17333 }
17334 assert(!isInSchedulingRegion(SD) &&
17335 "new ScheduleData already in scheduling region");
17336 SD->init(SchedulingRegionID, I);
17337
17338 if (I->mayReadOrWriteMemory() &&
17339 (!isa<IntrinsicInst>(I) ||
17340 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17341 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17342 Intrinsic::pseudoprobe))) {
17343 // Update the linked list of memory accessing instructions.
17344 if (CurrentLoadStore) {
17345 CurrentLoadStore->NextLoadStore = SD;
17346 } else {
17347 FirstLoadStoreInRegion = SD;
17348 }
17349 CurrentLoadStore = SD;
17350 }
17351
17352 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17353 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17354 RegionHasStackSave = true;
17355 }
17356 if (NextLoadStore) {
17357 if (CurrentLoadStore)
17358 CurrentLoadStore->NextLoadStore = NextLoadStore;
17359 } else {
17360 LastLoadStoreInRegion = CurrentLoadStore;
17361 }
17362}
17363
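// Computes def-use, control and memory dependencies for every member of the
// bundle rooted at SD and, transitively, for any dependent bundle discovered
// along the way. If InsertInReadyList is set and SD ends up ready, it is
// added to the ready list.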
17364void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17365 bool InsertInReadyList,
17366 BoUpSLP *SLP) {
17367 assert(SD->isSchedulingEntity());
17368
17369 SmallVector<ScheduleData *> WorkList;
17370 WorkList.push_back(SD);
17371
17372 while (!WorkList.empty()) {
17373 ScheduleData *SD = WorkList.pop_back_val();
17374 for (ScheduleData *BundleMember = SD; BundleMember;
17375 BundleMember = BundleMember->NextInBundle) {
17376 assert(isInSchedulingRegion(BundleMember));
17377 if (BundleMember->hasValidDependencies())
17378 continue;
17379
17380 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17381 << "\n");
17382 BundleMember->Dependencies = 0;
17383 BundleMember->resetUnscheduledDeps();
17384
17385 // Handle def-use chain dependencies.
17386 for (User *U : BundleMember->Inst->users()) {
17387 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17388 BundleMember->Dependencies++;
17389 ScheduleData *DestBundle = UseSD->FirstInBundle;
17390 if (!DestBundle->IsScheduled)
17391 BundleMember->incrementUnscheduledDeps(1);
17392 if (!DestBundle->hasValidDependencies())
17393 WorkList.push_back(DestBundle);
17394 }
17395 }
17396
17397 auto MakeControlDependent = [&](Instruction *I) {
17398 auto *DepDest = getScheduleData(I);
17399 assert(DepDest && "must be in schedule window");
17400 DepDest->ControlDependencies.push_back(BundleMember);
17401 BundleMember->Dependencies++;
17402 ScheduleData *DestBundle = DepDest->FirstInBundle;
17403 if (!DestBundle->IsScheduled)
17404 BundleMember->incrementUnscheduledDeps(1);
17405 if (!DestBundle->hasValidDependencies())
17406 WorkList.push_back(DestBundle);
17407 };
17408
17409 // Any instruction which isn't safe to speculate at the beginning of the
17410 // block is control dependent on any early exit or non-willreturn call
17411 // which precedes it.
17412 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17413 for (Instruction *I = BundleMember->Inst->getNextNode();
17414 I != ScheduleEnd; I = I->getNextNode()) {
17415 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17416 continue;
17417
17418 // Add the dependency
17419 MakeControlDependent(I);
17420
17421 if (!isGuaranteedToTransferExecutionToSuccessor(I))
17422 // Everything past here must be control dependent on I.
17423 break;
17424 }
17425 }
17426
17427 if (RegionHasStackSave) {
17428 // If we have an inalloca alloca instruction, it needs to be scheduled
17429 // after any preceding stacksave. We also need to prevent any alloca
17430 // from reordering above a preceding stackrestore.
17431 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17432 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17433 for (Instruction *I = BundleMember->Inst->getNextNode();
17434 I != ScheduleEnd; I = I->getNextNode()) {
17435 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17436 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17437 // Any allocas past here must be control dependent on I, and I
17438 // must be memory dependent on BundleMember->Inst.
17439 break;
17440
17441 if (!isa<AllocaInst>(I))
17442 continue;
17443
17444 // Add the dependency
17445 MakeControlDependent(I);
17446 }
17447 }
17448
17449 // In addition to the cases handled just above, we need to prevent
17450 // allocas and loads/stores from moving below a stacksave or a
17451 // stackrestore. Avoiding moving allocas below a stackrestore is currently
17452 // thought to be merely conservative. Moving loads/stores below a
17453 // stackrestore can lead to incorrect code.
17454 if (isa<AllocaInst>(BundleMember->Inst) ||
17455 BundleMember->Inst->mayReadOrWriteMemory()) {
17456 for (Instruction *I = BundleMember->Inst->getNextNode();
17457 I != ScheduleEnd; I = I->getNextNode()) {
17458 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17459 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17460 continue;
17461
17462 // Add the dependency
17463 MakeControlDependent(I);
17464 break;
17465 }
17466 }
17467 }
17468
17469 // Handle the memory dependencies (if any).
17470 ScheduleData *DepDest = BundleMember->NextLoadStore;
17471 if (!DepDest)
17472 continue;
17473 Instruction *SrcInst = BundleMember->Inst;
17474 assert(SrcInst->mayReadOrWriteMemory() &&
17475 "NextLoadStore list for a non-memory-affecting bundle?");
17476 MemoryLocation SrcLoc = getLocation(SrcInst);
17477 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17478 unsigned NumAliased = 0;
17479 unsigned DistToSrc = 1;
17480
17481 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17482 assert(isInSchedulingRegion(DepDest));
17483
17484 // We have two limits to reduce the complexity:
17485 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17486 // SLP->isAliased (which is the expensive part in this loop).
17487 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17488 // the whole loop (even if the loop is fast, it's quadratic).
17489 // It's important for the loop break condition (see below) to
17490 // check this limit even between two read-only instructions.
17491 if (DistToSrc >= MaxMemDepDistance ||
17492 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17493 (NumAliased >= AliasedCheckLimit ||
17494 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17495
17496 // We increment the counter only if the locations are aliased
17497 // (instead of counting all alias checks). This gives a better
17498 // balance between reduced runtime and accurate dependencies.
17499 NumAliased++;
17500
17501 DepDest->MemoryDependencies.push_back(BundleMember);
17502 BundleMember->Dependencies++;
17503 ScheduleData *DestBundle = DepDest->FirstInBundle;
17504 if (!DestBundle->IsScheduled) {
17505 BundleMember->incrementUnscheduledDeps(1);
17506 }
17507 if (!DestBundle->hasValidDependencies()) {
17508 WorkList.push_back(DestBundle);
17509 }
17510 }
17511
17512 // Example, explaining the loop break condition: Let's assume our
17513 // starting instruction is i0 and MaxMemDepDistance = 3.
17514 //
17515 // +--------v--v--v
17516 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17517 // +--------^--^--^
17518 //
17519 // MaxMemDepDistance lets us stop alias-checking at i3, and we add
17520 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17521 // Previously we already added dependencies from i3 to i6,i7,i8
17522 // (because of MaxMemDepDistance). As we added a dependency from
17523 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17524 // and we can abort this loop at i6.
17525 if (DistToSrc >= 2 * MaxMemDepDistance)
17526 break;
17527 DistToSrc++;
17528 }
17529 }
17530 if (InsertInReadyList && SD->isReady()) {
17531 ReadyInsts.insert(SD);
17532 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17533 << "\n");
17534 }
17535 }
17536}
17537
17538void BoUpSLP::BlockScheduling::resetSchedule() {
17539 assert(ScheduleStart &&
17540 "tried to reset schedule on block which has not been scheduled");
17541 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17542 if (ScheduleData *SD = getScheduleData(I)) {
17543 assert(isInSchedulingRegion(SD) &&
17544 "ScheduleData not in scheduling region");
17545 SD->IsScheduled = false;
17546 SD->resetUnscheduledDeps();
17547 }
17548 }
17549 ReadyInsts.clear();
17550}
17551
17552void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17553 if (!BS->ScheduleStart)
17554 return;
17555
17556 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17557
17558 // A key point - if we got here, pre-scheduling was able to find a valid
17559 // scheduling of the sub-graph of the scheduling window which consists
17560 // of all vector bundles and their transitive users. As such, we do not
17561 // need to reschedule anything *outside of* that subgraph.
17562
17563 BS->resetSchedule();
17564
17565 // For the real scheduling we use a more sophisticated ready-list: it is
17566 // sorted by the original instruction location. This lets the final schedule
17567 // be as close as possible to the original instruction order.
17568 // WARNING: If changing this order causes a correctness issue, that means
17569 // there is some missing dependence edge in the schedule data graph.
17570 struct ScheduleDataCompare {
17571 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17572 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17573 }
17574 };
17575 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17576
17577 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17578 // and fill the ready-list with initial instructions.
17579 int Idx = 0;
17580 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17581 I = I->getNextNode()) {
17582 if (ScheduleData *SD = BS->getScheduleData(I)) {
17583 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
17584 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17585 SD->isPartOfBundle() ==
17586 (!SDTEs.empty() &&
17587 !doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
17588 "scheduler and vectorizer bundle mismatch");
17589 SD->FirstInBundle->SchedulingPriority = Idx++;
17590
17591 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17592 BS->calculateDependencies(SD, false, this);
17593 }
17594 }
17595 BS->initialFillReadyList(ReadyInsts);
17596
17597 Instruction *LastScheduledInst = BS->ScheduleEnd;
17598
17599 // Do the "real" scheduling.
17600 while (!ReadyInsts.empty()) {
17601 ScheduleData *Picked = *ReadyInsts.begin();
17602 ReadyInsts.erase(ReadyInsts.begin());
17603
17604 // Move the scheduled instruction(s) to their dedicated places, if not
17605 // there yet.
17606 for (ScheduleData *BundleMember = Picked; BundleMember;
17607 BundleMember = BundleMember->NextInBundle) {
17608 Instruction *PickedInst = BundleMember->Inst;
17609 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17610 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17611 LastScheduledInst = PickedInst;
17612 }
17613
17614 BS->schedule(Picked, ReadyInsts);
17615 }
17616
17617 // Check that we didn't break any of our invariants.
17618#ifdef EXPENSIVE_CHECKS
17619 BS->verify();
17620#endif
17621
17622#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17623 // Check that all schedulable entities got scheduled
17624 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17625 ScheduleData *SD = BS->getScheduleData(I);
17626 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17627 assert(SD->IsScheduled && "must be scheduled at this point");
17628 }
17629#endif
17630
17631 // Avoid duplicate scheduling of the block.
17632 BS->ScheduleStart = nullptr;
17633}
17634
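// Determines the scalar element width (in bits) used to compute the maximum
// vectorization factor for the expression rooted at V, preferring the width
// of the memory operations feeding the expression.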
17635 unsigned BoUpSLP::getVectorElementSize(Value *V) {
17636 // If V is a store, just return the width of the stored value (or value
17637 // truncated just before storing) without traversing the expression tree.
17638 // This is the common case.
17639 if (auto *Store = dyn_cast<StoreInst>(V))
17640 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17641
17642 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17643 return getVectorElementSize(IEI->getOperand(1));
17644
17645 auto E = InstrElementSize.find(V);
17646 if (E != InstrElementSize.end())
17647 return E->second;
17648
17649 // If V is not a store, we can traverse the expression tree to find loads
17650 // that feed it. The type of the loaded value may indicate a more suitable
17651 // width than V's type. We want to base the vector element size on the width
17652 // of memory operations where possible.
17653 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17654 SmallPtrSet<Instruction *, 16> Visited;
17655 if (auto *I = dyn_cast<Instruction>(V)) {
17656 Worklist.emplace_back(I, I->getParent(), 0);
17657 Visited.insert(I);
17658 }
17659
17660 // Traverse the expression tree in bottom-up order looking for loads. If we
17661 // encounter an instruction we don't yet handle, we give up.
17662 auto Width = 0u;
17663 Value *FirstNonBool = nullptr;
17664 while (!Worklist.empty()) {
17665 auto [I, Parent, Level] = Worklist.pop_back_val();
17666
17667 // We should only be looking at scalar instructions here. If the current
17668 // instruction has a vector type, skip.
17669 auto *Ty = I->getType();
17670 if (isa<VectorType>(Ty))
17671 continue;
17672 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17673 FirstNonBool = I;
17674 if (Level > RecursionMaxDepth)
17675 continue;
17676
17677 // If the current instruction is a load (or extractelement/extractvalue),
17678 // update Width to reflect the width of the loaded/extracted value.
17679 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17680 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17681
17682 // Otherwise, we need to visit the operands of the instruction. We only
17683 // handle the interesting cases from buildTree here. If an operand is an
17684 // instruction we haven't yet visited and from the same basic block as the
17685 // user or the use is a PHI node, we add it to the worklist.
17686 if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
17687 BinaryOperator, UnaryOperator>(I)) {
17688 for (Use &U : I->operands()) {
17689 if (auto *J = dyn_cast<Instruction>(U.get()))
17690 if (Visited.insert(J).second &&
17691 (isa<PHINode>(I) || J->getParent() == Parent)) {
17692 Worklist.emplace_back(J, J->getParent(), Level + 1);
17693 continue;
17694 }
17695 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17696 FirstNonBool = U.get();
17697 }
17698 } else {
17699 break;
17700 }
17701 }
17702
17703 // If we didn't encounter a memory access in the expression tree, or if we
17704 // gave up for some reason, just return the width of V. Otherwise, return the
17705 // maximum width we found.
17706 if (!Width) {
17707 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17708 V = FirstNonBool;
17709 Width = DL->getTypeSizeInBits(V->getType());
17710 }
17711
17712 for (Instruction *I : Visited)
17713 InstrElementSize[I] = Width;
17714
17715 return Width;
17716}
17717
17718bool BoUpSLP::collectValuesToDemote(
17719 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17720 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
17721 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17722 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17723 // We can always demote constants.
17724 if (all_of(E.Scalars, IsaPred<Constant>))
17725 return true;
17726
17727 unsigned OrigBitWidth =
17728 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17729 if (OrigBitWidth == BitWidth) {
17730 MaxDepthLevel = 1;
17731 return true;
17732 }
17733
17734 // Check if the node was analyzed already and must keep its original bitwidth.
17735 if (NodesToKeepBWs.contains(E.Idx))
17736 return false;
17737
17738 // If the value is not a vectorized instruction in the expression, is not used
17739 // by an insertelement instruction, and is not used in multiple vector nodes, it
17740 // cannot be demoted.
17741 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17742 if (isa<PoisonValue>(R))
17743 return false;
17744 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17745 });
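// Checks whether V can be represented in fewer bits: derives the minimal
// width required for V from known-zero bits, sign bits and demanded bits,
// grows BitWidth to at least that width, and succeeds only if the (possibly
// grown) width is still at most half of the original bit width.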
17746 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17747 if (isa<PoisonValue>(V))
17748 return true;
17749 if (getTreeEntries(V).size() > 1)
17750 return false;
17751 // For the last shuffle of sext/zext with many uses, we need to check the extra
17752 // bit for unsigned values; otherwise we may end up with incorrect casts for the
17753 // reused scalars.
17754 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17755 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17756 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17757 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17758 return true;
17759 }
17760 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17761 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17762 if (IsSignedNode)
17763 ++BitWidth1;
17764 if (auto *I = dyn_cast<Instruction>(V)) {
17765 APInt Mask = DB->getDemandedBits(I);
17766 unsigned BitWidth2 =
17767 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17768 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17769 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17770 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17771 break;
17772 BitWidth2 *= 2;
17773 }
17774 BitWidth1 = std::min(BitWidth1, BitWidth2);
17775 }
17776 BitWidth = std::max(BitWidth, BitWidth1);
17777 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17778 };
17779 auto FinalAnalysis = [&, TTI = TTI]() {
17780 if (!IsProfitableToDemote)
17781 return false;
17782 bool Res = all_of(
17783 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17784 // Demote gathers.
17785 if (Res && E.isGather()) {
17786 // Check the bases of possible extractelement instructions and the final
17787 // vector length.
17788 SmallPtrSet<Value *, 4> UniqueBases;
17789 for (Value *V : E.Scalars) {
17790 auto *EE = dyn_cast<ExtractElementInst>(V);
17791 if (!EE)
17792 continue;
17793 UniqueBases.insert(EE->getVectorOperand());
17794 }
17795 const unsigned VF = E.Scalars.size();
17796 Type *OrigScalarTy = E.Scalars.front()->getType();
17797 if (UniqueBases.size() <= 2 ||
17798 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
17799 ::getNumberOfParts(
17800 *TTI,
17801 getWidenedType(
17802 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
17803 VF)))
17804 ToDemote.push_back(E.Idx);
17805 }
17806 return Res;
17807 };
17808 if (E.isGather() || !Visited.insert(&E).second ||
17809 any_of(E.Scalars, [&](Value *V) {
17810 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17811 return isa<InsertElementInst>(U) && !isVectorized(U);
17812 });
17813 }))
17814 return FinalAnalysis();
17815
17816 if (any_of(E.Scalars, [&](Value *V) {
17817 return !all_of(V->users(), [=](User *U) {
17818 return isVectorized(U) ||
17819 (E.Idx == 0 && UserIgnoreList &&
17820 UserIgnoreList->contains(U)) ||
17821 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17822 !U->getType()->isScalableTy() &&
17823 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17824 }) && !IsPotentiallyTruncated(V, BitWidth);
17825 }))
17826 return false;
17827
17828 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17829 bool &NeedToExit) {
17830 NeedToExit = false;
17831 unsigned InitLevel = MaxDepthLevel;
17832 for (const TreeEntry *Op : Operands) {
17833 unsigned Level = InitLevel;
17834 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17835 ToDemote, Visited, NodesToKeepBWs, Level,
17836 IsProfitableToDemote, IsTruncRoot)) {
17837 if (!IsProfitableToDemote)
17838 return false;
17839 NeedToExit = true;
17840 if (!FinalAnalysis())
17841 return false;
17842 continue;
17843 }
17844 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17845 }
17846 return true;
17847 };
17848 auto AttemptCheckBitwidth =
17849 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17850 // Try all bitwidth < OrigBitWidth.
17851 NeedToExit = false;
17852 unsigned BestFailBitwidth = 0;
17853 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17854 if (Checker(BitWidth, OrigBitWidth))
17855 return true;
17856 if (BestFailBitwidth == 0 && FinalAnalysis())
17857 BestFailBitwidth = BitWidth;
17858 }
17859 if (BitWidth >= OrigBitWidth) {
17860 if (BestFailBitwidth == 0) {
17861 BitWidth = OrigBitWidth;
17862 return false;
17863 }
17864 MaxDepthLevel = 1;
17865 BitWidth = BestFailBitwidth;
17866 NeedToExit = true;
17867 return true;
17868 }
17869 return false;
17870 };
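// Common driver for the per-opcode cases below: checks that the scalars are
// potentially truncatable, runs the optional opcode-specific legality Checker
// over growing bit widths, recurses into the given operand entries and, on
// success, records this entry in ToDemote.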
17871 auto TryProcessInstruction =
17872 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17873 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17874 if (Operands.empty()) {
17875 if (!IsTruncRoot)
17876 MaxDepthLevel = 1;
17877 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17878 std::ref(BitWidth)));
17879 } else {
17880 // Several vectorized uses? Check if we can truncate it, otherwise -
17881 // exit.
17882 if (E.UserTreeIndices.size() > 1 &&
17883 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17884 std::ref(BitWidth))))
17885 return false;
17886 bool NeedToExit = false;
17887 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17888 return false;
17889 if (NeedToExit)
17890 return true;
17891 if (!ProcessOperands(Operands, NeedToExit))
17892 return false;
17893 if (NeedToExit)
17894 return true;
17895 }
17896
17897 ++MaxDepthLevel;
17898 // Record the entry that we can demote.
17899 ToDemote.push_back(E.Idx);
17900 return IsProfitableToDemote;
17901 };
17902 switch (E.getOpcode()) {
17903
17904 // We can always demote truncations and extensions. Since truncations can
17905 // seed additional demotion, we save the truncated value.
17906 case Instruction::Trunc:
17907 if (IsProfitableToDemoteRoot)
17908 IsProfitableToDemote = true;
17909 return TryProcessInstruction(BitWidth);
17910 case Instruction::ZExt:
17911 case Instruction::SExt:
17912 IsProfitableToDemote = true;
17913 return TryProcessInstruction(BitWidth);
17914
17915 // We can demote certain binary operations if we can demote both of their
17916 // operands.
17917 case Instruction::Add:
17918 case Instruction::Sub:
17919 case Instruction::Mul:
17920 case Instruction::And:
17921 case Instruction::Or:
17922 case Instruction::Xor: {
17923 return TryProcessInstruction(
17924 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17925 }
17926 case Instruction::Freeze:
17927 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17928 case Instruction::Shl: {
17929 // If we are truncating the result of this SHL, and if it's a shift of an
17930 // in-range amount, we can always perform a SHL in a smaller type.
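// Illustrative example (not taken from the checker below): with BitWidth ==
// 16, a scalar "shl i32 %x, 3" can be performed as an i16 shift once its
// result is truncated, because the shift amount (3) is known to be below 16.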
17931 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17932 return all_of(E.Scalars, [&](Value *V) {
17933 if (isa<PoisonValue>(V))
17934 return true;
17935 auto *I = cast<Instruction>(V);
17936 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17937 return AmtKnownBits.getMaxValue().ult(BitWidth);
17938 });
17939 };
17940 return TryProcessInstruction(
17941 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17942 }
17943 case Instruction::LShr: {
17944 // If this is a truncate of a logical shr, we can truncate it to a smaller
17945 // lshr iff we know that the bits we would otherwise be shifting in are
17946 // already zeros.
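// For illustration: "lshr i32 (and i32 %x, 255), 4" can be demoted to an i8
// lshr, since the shift amount is below 8 and bits 8..31 of the shifted value
// are known to be zero.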
17947 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17948 return all_of(E.Scalars, [&](Value *V) {
17949 if (isa<PoisonValue>(V))
17950 return true;
17951 auto *I = cast<Instruction>(V);
17952 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17953 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17954 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17955 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17956 SimplifyQuery(*DL));
17957 });
17958 };
17959 return TryProcessInstruction(
17960 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17961 LShrChecker);
17962 }
17963 case Instruction::AShr: {
17964 // If this is a truncate of an arithmetic shr, we can truncate it to a
17965 // smaller ashr iff we know that all the bits between the sign bit of the
17966 // original type and the sign bit of the truncated type are sign-bit copies.
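// For illustration: if the shifted value is "sext i8 %v to i32" (at least 25
// sign bits), truncating the ashr to i16 discards the top 16 bits, which is
// fewer than the known sign bits, so (together with a shift amount known to
// be below 16) the result is unchanged.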
17967 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17968 return all_of(E.Scalars, [&](Value *V) {
17969 if (isa<PoisonValue>(V))
17970 return true;
17971 auto *I = cast<Instruction>(V);
17972 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17973 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17974 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17975 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
17976 nullptr, DT);
17977 });
17978 };
17979 return TryProcessInstruction(
17980 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17981 AShrChecker);
17982 }
17983 case Instruction::UDiv:
17984 case Instruction::URem: {
17985 // UDiv and URem can be truncated if all the truncated bits are zero.
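// For illustration: "udiv i32 (and i32 %a, 255), (and i32 %b, 255)" can be
// performed as an i8 udiv, because bits 8..31 of both operands are known
// zero.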
17986 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17987 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17988 return all_of(E.Scalars, [&](Value *V) {
17989 auto *I = cast<Instruction>(V);
17990 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17991 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
17992 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
17993 });
17994 };
17995 return TryProcessInstruction(
17996 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
17997 }
17998
17999 // We can demote selects if we can demote their true and false values.
18000 case Instruction::Select: {
18001 return TryProcessInstruction(
18002 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18003 }
18004
18005 // We can demote phis if we can demote all their incoming operands. Note that
18006 // we don't need to worry about cycles since we ensure single use above.
18007 case Instruction::PHI: {
18008 const unsigned NumOps = E.getNumOperands();
18009 SmallVector<const TreeEntry *> Ops(NumOps);
18010 transform(seq<unsigned>(0, NumOps), Ops.begin(),
18011 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
18012
18013 return TryProcessInstruction(BitWidth, Ops);
18014 }
18015
18016 case Instruction::Call: {
18017 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18018 if (!IC)
18019 break;
18020 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
18021 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
18022 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
18023 break;
18024 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
18025 function_ref<bool(unsigned, unsigned)> CallChecker;
18026 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18027 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18028 return all_of(E.Scalars, [&](Value *V) {
18029 auto *I = cast<Instruction>(V);
18030 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18031 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18032 return MaskedValueIsZero(I->getOperand(0), Mask,
18033 SimplifyQuery(*DL)) &&
18034 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18035 }
18036 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
18037 "Expected min/max intrinsics only.");
18038 unsigned SignBits = OrigBitWidth - BitWidth;
18039 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18040 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18041 nullptr, DT);
18042 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18043 nullptr, DT);
18044 return SignBits <= Op0SignBits &&
18045 ((SignBits != Op0SignBits &&
18046 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18047 MaskedValueIsZero(I->getOperand(0), Mask,
18048 SimplifyQuery(*DL))) &&
18049 SignBits <= Op1SignBits &&
18050 ((SignBits != Op1SignBits &&
18051 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18052 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18053 });
18054 };
18055 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18056 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18057 return all_of(E.Scalars, [&](Value *V) {
18058 auto *I = cast<Instruction>(V);
18059 unsigned SignBits = OrigBitWidth - BitWidth;
18060 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18061 unsigned Op0SignBits =
18062 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18063 return SignBits <= Op0SignBits &&
18064 ((SignBits != Op0SignBits &&
18065 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18066 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18067 });
18068 };
18069 if (ID != Intrinsic::abs) {
18070 Operands.push_back(getOperandEntry(&E, 1));
18071 CallChecker = CompChecker;
18072 } else {
18073 CallChecker = AbsChecker;
18074 }
18075 InstructionCost BestCost =
18076 std::numeric_limits<InstructionCost::CostType>::max();
18077 unsigned BestBitWidth = BitWidth;
18078 unsigned VF = E.Scalars.size();
18079 // Choose the best bitwidth based on cost estimations.
18080 auto Checker = [&](unsigned BitWidth, unsigned) {
18081 unsigned MinBW = PowerOf2Ceil(BitWidth);
18082 SmallVector<Type *> ArgTys =
18083 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18084 auto VecCallCosts = getVectorCallCosts(
18085 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18086 TTI, TLI, ArgTys);
18087 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18088 if (Cost < BestCost) {
18089 BestCost = Cost;
18090 BestBitWidth = BitWidth;
18091 }
18092 return false;
18093 };
18094 [[maybe_unused]] bool NeedToExit;
18095 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18096 BitWidth = BestBitWidth;
18097 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18098 }
18099
18100 // Otherwise, conservatively give up.
18101 default:
18102 break;
18103 }
18104 MaxDepthLevel = 1;
18105 return FinalAnalysis();
18106}
18107
18108static RecurKind getRdxKind(Value *V);
18109
18110void BoUpSLP::computeMinimumValueSizes() {
18111 // We only attempt to truncate integer expressions.
18112 bool IsStoreOrInsertElt =
18113 VectorizableTree.front()->hasState() &&
18114 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
18115 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
18116 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18117 ExtraBitWidthNodes.size() <= 1 &&
18118 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18119 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18120 return;
18121
18122 unsigned NodeIdx = 0;
18123 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18124 NodeIdx = 1;
18125
18126 // Ensure the roots of the vectorizable tree don't form a cycle.
18127 if (VectorizableTree[NodeIdx]->isGather() ||
18128 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18129 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18130 [NodeIdx](const EdgeInfo &EI) {
18131 return EI.UserTE->Idx > NodeIdx;
18132 })))
18133 return;
18134
18135 // If the first value node for a store/insertelement is a sext/zext/trunc,
18136 // skip it and resize to the final type.
18137 bool IsTruncRoot = false;
18138 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18139 SmallVector<unsigned> RootDemotes;
18140 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18141 if (NodeIdx != 0 &&
18142 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18143 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18144 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18145 IsTruncRoot = true;
18146 RootDemotes.push_back(NodeIdx);
18147 IsProfitableToDemoteRoot = true;
18148 ++NodeIdx;
18149 }
18150
18151 // The reduction was already analyzed and found not profitable - exit.
18152 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18153 return;
18154
18155 SmallVector<unsigned> ToDemote;
18156 auto ComputeMaxBitWidth =
18157 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
18158 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
18159 ToDemote.clear();
18160 // If the root is a trunc and the next node is a gather/buildvector, keep
18161 // the trunc in scalars, which is free in most cases.
18162 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18163 !NodesToKeepBWs.contains(E.Idx) &&
18164 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18165 all_of(E.Scalars, [&](Value *V) {
18166 return V->hasOneUse() || isa<Constant>(V) ||
18167 (!V->hasNUsesOrMore(UsesLimit) &&
18168 none_of(V->users(), [&](User *U) {
18169 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
18170 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18171 if (TEs.empty() || is_contained(TEs, UserTE))
18172 return false;
18173 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18174 SelectInst>(U) ||
18175 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18176 SelectInst>(UserTE->getMainOp()))
18177 return true;
18178 unsigned UserTESz = DL->getTypeSizeInBits(
18179 UserTE->Scalars.front()->getType());
18180 if (all_of(TEs, [&](const TreeEntry *TE) {
18181 auto It = MinBWs.find(TE);
18182 return It != MinBWs.end() &&
18183 It->second.first > UserTESz;
18184 }))
18185 return true;
18186 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18187 }));
18188 })) {
18189 ToDemote.push_back(E.Idx);
18190 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18191 auto It = MinBWs.find(UserTE);
18192 if (It != MinBWs.end())
18193 return It->second.first;
18194 unsigned MaxBitWidth =
18195 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18196 MaxBitWidth = bit_ceil(MaxBitWidth);
18197 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18198 MaxBitWidth = 8;
18199 return MaxBitWidth;
18200 }
18201
18202 if (!E.hasState())
18203 return 0u;
18204
18205 unsigned VF = E.getVectorFactor();
18206 Type *ScalarTy = E.Scalars.front()->getType();
18207 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18208 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18209 if (!TreeRootIT)
18210 return 0u;
18211
18212 if (any_of(E.Scalars,
18213 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18214 return 0u;
18215
18216 unsigned NumParts = ::getNumberOfParts(
18217 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18218
18219 // The maximum bit width required to represent all the values that can be
18220 // demoted without loss of precision. It would be safe to truncate the roots
18221 // of the expression to this width.
18222 unsigned MaxBitWidth = 1u;
18223
18224 // True if the roots can be zero-extended back to their original type,
18225 // rather than sign-extended. We know that if the leading bits are not
18226 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18227 // True.
18228 // Determine if the sign bit of all the roots is known to be zero. If not,
18229 // IsKnownPositive is set to False.
18230 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18231 if (isa<PoisonValue>(R))
18232 return true;
18233 KnownBits Known = computeKnownBits(R, *DL);
18234 return Known.isNonNegative();
18235 });
18236
18237 // We first check if all the bits of the roots are demanded. If they're not,
18238 // we can truncate the roots to this narrower type.
18239 for (Value *Root : E.Scalars) {
18240 if (isa<PoisonValue>(Root))
18241 continue;
18242 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18243 TypeSize NumTypeBits =
18244 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18245 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18246 // If we can't prove that the sign bit is zero, we must add one to the
18247 // maximum bit width to account for the unknown sign bit. This preserves
18248 // the existing sign bit so we can safely sign-extend the root back to the
18249 // original type. Otherwise, if we know the sign bit is zero, we will
18250 // zero-extend the root instead.
18251 //
18252 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18253 // one to the maximum bit width will yield a larger-than-necessary
18254 // type. In general, we need to add an extra bit only if we can't
18255 // prove that the upper bit of the original type is equal to the
18256 // upper bit of the proposed smaller type. If these two bits are
18257 // the same (either zero or one) we know that sign-extending from
18258 // the smaller type will result in the same value. Here, since we
18259 // can't yet prove this, we are just making the proposed smaller
18260 // type larger to ensure correctness.
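// Worked example (illustrative): for an i32 root that is "sext i8 %v to i32",
// ComputeNumSignBits returns 25, so BitWidth1 = 32 - 25 = 7 (8 after the
// unknown-sign-bit adjustment below). If DemandedBits reports that only the
// low 6 bits are used, BitWidth2 = 6 and this root contributes
// min(BitWidth1, BitWidth2) = 6 to MaxBitWidth.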
18261 if (!IsKnownPositive)
18262 ++BitWidth1;
18263
18264 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18265 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18266 MaxBitWidth =
18267 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18268 }
18269
18270 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18271 MaxBitWidth = 8;
18272
18273 // If the original type is large but the reduced type does not improve
18274 // register usage - ignore it.
18275 if (NumParts > 1 &&
18276 NumParts ==
18277 ::getNumberOfParts(
18278 *TTI, getWidenedType(IntegerType::get(F->getContext(),
18279 bit_ceil(MaxBitWidth)),
18280 VF)))
18281 return 0u;
18282
18283 unsigned Opcode = E.getOpcode();
18284 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18285 Opcode == Instruction::SExt ||
18286 Opcode == Instruction::ZExt || NumParts > 1;
18287 // Conservatively determine if we can actually truncate the roots of the
18288 // expression. Collect the values that can be demoted in ToDemote and
18289 // additional roots that require investigating in Roots.
18290 DenseSet<const TreeEntry *> Visited;
18291 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18292 bool NeedToDemote = IsProfitableToDemote;
18293
18294 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18295 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18296 NeedToDemote, IsTruncRoot) ||
18297 (MaxDepthLevel <= Limit &&
18298 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18299 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18300 DL->getTypeSizeInBits(TreeRootIT) /
18301 DL->getTypeSizeInBits(
18302 E.getMainOp()->getOperand(0)->getType()) >
18303 2)))))
18304 return 0u;
18305 // Round MaxBitWidth up to the next power-of-two.
18306 MaxBitWidth = bit_ceil(MaxBitWidth);
18307
18308 return MaxBitWidth;
18309 };
18310
18311 // If we can truncate the root, we must collect additional values that might
18312 // be demoted as a result. That is, those seeded by truncations we will
18313 // modify.
18314 // Add reduction ops sizes, if any.
18315 if (UserIgnoreList &&
18316 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18317 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
18318 // x i1> to in)).
18319 if (all_of(*UserIgnoreList,
18320 [](Value *V) {
18321 return isa<PoisonValue>(V) ||
18322 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18323 }) &&
18324 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18325 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18326 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18327 Builder.getInt1Ty()) {
18328 ReductionBitWidth = 1;
18329 } else {
18330 for (Value *V : *UserIgnoreList) {
18331 if (isa<PoisonValue>(V))
18332 continue;
18333 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18334 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18335 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18336 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18337 ++BitWidth1;
18338 unsigned BitWidth2 = BitWidth1;
18339 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
18340 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18341 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18342 }
18343 ReductionBitWidth =
18344 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18345 }
18346 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18347 ReductionBitWidth = 8;
18348
18349 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18350 }
18351 }
18352 bool IsTopRoot = NodeIdx == 0;
18353 while (NodeIdx < VectorizableTree.size() &&
18354 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18355 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18356 RootDemotes.push_back(NodeIdx);
18357 ++NodeIdx;
18358 IsTruncRoot = true;
18359 }
18360 bool IsSignedCmp = false;
18361 while (NodeIdx < VectorizableTree.size()) {
18362 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18363 unsigned Limit = 2;
18364 if (IsTopRoot &&
18365 ReductionBitWidth ==
18366 DL->getTypeSizeInBits(
18367 VectorizableTree.front()->Scalars.front()->getType()))
18368 Limit = 3;
18369 unsigned MaxBitWidth = ComputeMaxBitWidth(
18370 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18371 IsTruncRoot, IsSignedCmp);
18372 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18373 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18374 ReductionBitWidth = bit_ceil(MaxBitWidth);
18375 else if (MaxBitWidth == 0)
18376 ReductionBitWidth = 0;
18377 }
18378
18379 for (unsigned Idx : RootDemotes) {
18380 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18381 uint32_t OrigBitWidth =
18382 DL->getTypeSizeInBits(V->getType()->getScalarType());
18383 if (OrigBitWidth > MaxBitWidth) {
18384 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18385 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18386 }
18387 return false;
18388 }))
18389 ToDemote.push_back(Idx);
18390 }
18391 RootDemotes.clear();
18392 IsTopRoot = false;
18393 IsProfitableToDemoteRoot = true;
18394
18395 if (ExtraBitWidthNodes.empty()) {
18396 NodeIdx = VectorizableTree.size();
18397 } else {
18398 unsigned NewIdx = 0;
18399 do {
18400 NewIdx = *ExtraBitWidthNodes.begin();
18401 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18402 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18403 NodeIdx = NewIdx;
18404 IsTruncRoot =
18405 NodeIdx < VectorizableTree.size() &&
18406 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18407 [](const EdgeInfo &EI) {
18408 return EI.EdgeIdx == 0 &&
18409 EI.UserTE->getOpcode() == Instruction::Trunc &&
18410 !EI.UserTE->isAltShuffle();
18411 });
18412 IsSignedCmp =
18413 NodeIdx < VectorizableTree.size() &&
18414 any_of(
18415 VectorizableTree[NodeIdx]->UserTreeIndices,
18416 [&](const EdgeInfo &EI) {
18417 return (EI.UserTE->hasState() &&
18418 EI.UserTE->getOpcode() == Instruction::ICmp) &&
18419 any_of(EI.UserTE->Scalars, [&](Value *V) {
18420 auto *IC = dyn_cast<ICmpInst>(V);
18421 return IC &&
18422 (IC->isSigned() ||
18423 !isKnownNonNegative(IC->getOperand(0),
18424 SimplifyQuery(*DL)) ||
18425 !isKnownNonNegative(IC->getOperand(1),
18426 SimplifyQuery(*DL)));
18427 });
18428 });
18429 }
18430
18431 // If the maximum bit width we compute is less than the width of the roots'
18432 // type, we can proceed with the narrowing. Otherwise, do nothing.
18433 if (MaxBitWidth == 0 ||
18434 MaxBitWidth >=
18435 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18436 ->getBitWidth()) {
18437 if (UserIgnoreList)
18438 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18439 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18440 continue;
18441 }
18442
18443 // Finally, map the values we can demote to the maximum bit width we
18444 // computed.
18445 for (unsigned Idx : ToDemote) {
18446 TreeEntry *TE = VectorizableTree[Idx].get();
18447 if (MinBWs.contains(TE))
18448 continue;
18449 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18450 if (isa<PoisonValue>(R))
18451 return false;
18452 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18453 });
18454 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18455 }
18456 }
18457}
18458
18459PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18460 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18461 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18462 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18463 auto *AA = &AM.getResult<AAManager>(F);
18464 auto *LI = &AM.getResult<LoopAnalysis>(F);
18465 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18466 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18467 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18468 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18469
18470 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18471 if (!Changed)
18472 return PreservedAnalyses::all();
18473
18474 PreservedAnalyses PA;
18475 PA.preserveSet<CFGAnalyses>();
18476 return PA;
18477}
18478
18479bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18480 TargetTransformInfo *TTI_,
18481 TargetLibraryInfo *TLI_, AAResults *AA_,
18482 LoopInfo *LI_, DominatorTree *DT_,
18483 AssumptionCache *AC_, DemandedBits *DB_,
18484 OptimizationRemarkEmitter *ORE_) {
18485 if (!RunSLPVectorization)
18486 return false;
18487 SE = SE_;
18488 TTI = TTI_;
18489 TLI = TLI_;
18490 AA = AA_;
18491 LI = LI_;
18492 DT = DT_;
18493 AC = AC_;
18494 DB = DB_;
18495 DL = &F.getDataLayout();
18496
18497 Stores.clear();
18498 GEPs.clear();
18499 bool Changed = false;
18500
18501 // If the target claims to have no vector registers don't attempt
18502 // vectorization.
18503 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18504 LLVM_DEBUG(
18505 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18506 return false;
18507 }
18508
18509 // Don't vectorize when the attribute NoImplicitFloat is used.
18510 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18511 return false;
18512
18513 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18514
18515 // Use the bottom up slp vectorizer to construct chains that start with
18516 // store instructions.
18517 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18518
18519 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18520 // delete instructions.
18521
18522 // Update DFS numbers now so that we can use them for ordering.
18523 DT->updateDFSNumbers();
18524
18525 // Scan the blocks in the function in post order.
18526 for (auto *BB : post_order(&F.getEntryBlock())) {
18527 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18528 continue;
18529
18530 // Start new block - clear the list of reduction roots.
18531 R.clearReductionData();
18532 collectSeedInstructions(BB);
18533
18534 // Vectorize trees that end at stores.
18535 if (!Stores.empty()) {
18536 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18537 << " underlying objects.\n");
18538 Changed |= vectorizeStoreChains(R);
18539 }
18540
18541 // Vectorize trees that end at reductions.
18542 Changed |= vectorizeChainsInBlock(BB, R);
18543
18544 // Vectorize the index computations of getelementptr instructions. This
18545 // is primarily intended to catch gather-like idioms ending at
18546 // non-consecutive loads.
18547 if (!GEPs.empty()) {
18548 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18549 << " underlying objects.\n");
18550 Changed |= vectorizeGEPIndices(BB, R);
18551 }
18552 }
18553
18554 if (Changed) {
18555 R.optimizeGatherSequence();
18556 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18557 }
18558 return Changed;
18559}
18560
18561std::optional<bool>
18562SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18563 unsigned Idx, unsigned MinVF,
18564 unsigned &Size) {
18565 Size = 0;
18566 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18567 << "\n");
18568 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18569 unsigned VF = Chain.size();
18570
18571 if (!has_single_bit(Sz) ||
18572 !hasFullVectorsOrPowerOf2(
18573 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18574 VF) ||
18575 VF < 2 || VF < MinVF) {
18576 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18577 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18578 // all vector lanes are used.
18579 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18580 return false;
18581 }
18582
18583 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18584 << "\n");
18585
18586 SetVector<Value *> ValOps;
18587 for (Value *V : Chain)
18588 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18589 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
18590 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18591 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18592 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18593 bool IsAllowedSize =
18594 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18595 ValOps.size()) ||
18596 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18597 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18598 (!S.getMainOp()->isSafeToRemove() ||
18599 any_of(ValOps.getArrayRef(),
18600 [&](Value *V) {
18601 return !isa<ExtractElementInst>(V) &&
18602 (V->getNumUses() > Chain.size() ||
18603 any_of(V->users(), [&](User *U) {
18604 return !Stores.contains(U);
18605 }));
18606 }))) ||
18607 (ValOps.size() > Chain.size() / 2 && !S)) {
18608 Size = (!IsAllowedSize && S) ? 1 : 2;
18609 return false;
18610 }
18611 }
18612 if (R.isLoadCombineCandidate(Chain))
18613 return true;
18614 R.buildTree(Chain);
18615 // Check if the tree is tiny and the store itself or its stored value is not vectorized.
18616 if (R.isTreeTinyAndNotFullyVectorizable()) {
18617 if (R.isGathered(Chain.front()) ||
18618 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18619 return std::nullopt;
18620 Size = R.getCanonicalGraphSize();
18621 return false;
18622 }
18623 R.reorderTopToBottom();
18624 R.reorderBottomToTop();
18625 R.transformNodes();
18626 R.buildExternalUses();
18627
18628 R.computeMinimumValueSizes();
18629
18630 Size = R.getCanonicalGraphSize();
18631 if (S && S.getOpcode() == Instruction::Load)
18632 Size = 2; // cut off masked gather small trees
18633 InstructionCost Cost = R.getTreeCost();
18634
18635 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18636 if (Cost < -SLPCostThreshold) {
18637 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18638
18639 using namespace ore;
18640
18641 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18642 cast<StoreInst>(Chain[0]))
18643 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18644 << " and with tree size "
18645 << NV("TreeSize", R.getTreeSize()));
18646
18647 R.vectorizeTree();
18648 return true;
18649 }
18650
18651 return false;
18652}
18653
18654/// Checks if the quadratic mean deviation is less than 90% of the mean size.
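/// For illustration: with sizes {3, 4, 4, 5}, Mean = 4 and the accumulated
/// squared deviation is (1 + 0 + 0 + 1) / 4 = 0, so the check passes; with
/// {2, 4, 4, 6} it is 8 / 4 = 2 and 2 * 81 / 16 != 0, so it fails. Entries
/// equal to 1 are skipped.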
18655static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18656 bool First) {
18657 unsigned Num = 0;
18658 uint64_t Sum = std::accumulate(
18659 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18660 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18661 unsigned Size = First ? Val.first : Val.second;
18662 if (Size == 1)
18663 return V;
18664 ++Num;
18665 return V + Size;
18666 });
18667 if (Num == 0)
18668 return true;
18669 uint64_t Mean = Sum / Num;
18670 if (Mean == 0)
18671 return true;
18672 uint64_t Dev = std::accumulate(
18673 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18674 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18675 unsigned P = First ? Val.first : Val.second;
18676 if (P == 1)
18677 return V;
18678 return V + (P - Mean) * (P - Mean);
18679 }) /
18680 Num;
18681 return Dev * 81 / (Mean * Mean) == 0;
18682}
18683
18684bool SLPVectorizerPass::vectorizeStores(
18685 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18686 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18687 &Visited) {
18688 // We may run into multiple chains that merge into a single chain. We mark the
18689 // stores that we vectorized so that we don't visit the same store twice.
18690 BoUpSLP::ValueSet VectorizedStores;
18691 bool Changed = false;
18692
18693 struct StoreDistCompare {
18694 bool operator()(const std::pair<unsigned, int> &Op1,
18695 const std::pair<unsigned, int> &Op2) const {
18696 return Op1.second < Op2.second;
18697 }
18698 };
18699 // A set of pairs (index of store in Stores array ref, Distance of the store
18700 // address relative to base store address in units).
18701 using StoreIndexToDistSet =
18702 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18703 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18704 int PrevDist = -1;
18705 BoUpSLP::ValueList Operands;
18706 // Collect the chain into a list.
18707 for (auto [Idx, Data] : enumerate(Set)) {
18708 if (Operands.empty() || Data.second - PrevDist == 1) {
18709 Operands.push_back(Stores[Data.first]);
18710 PrevDist = Data.second;
18711 if (Idx != Set.size() - 1)
18712 continue;
18713 }
18714 auto E = make_scope_exit([&, &DataVar = Data]() {
18715 Operands.clear();
18716 Operands.push_back(Stores[DataVar.first]);
18717 PrevDist = DataVar.second;
18718 });
18719
18720 if (Operands.size() <= 1 ||
18721 !Visited
18722 .insert({Operands.front(),
18723 cast<StoreInst>(Operands.front())->getValueOperand(),
18724 Operands.back(),
18725 cast<StoreInst>(Operands.back())->getValueOperand(),
18726 Operands.size()})
18727 .second)
18728 continue;
18729
18730 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18731 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18732 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18733
18734 unsigned MaxVF =
18735 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18736 auto *Store = cast<StoreInst>(Operands[0]);
18737 Type *StoreTy = Store->getValueOperand()->getType();
18738 Type *ValueTy = StoreTy;
18739 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18740 ValueTy = Trunc->getSrcTy();
18741 unsigned MinVF = std::max<unsigned>(
18742 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18743 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18744 ValueTy)));
18745
18746 if (MaxVF < MinVF) {
18747 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18748 << ") < "
18749 << "MinVF (" << MinVF << ")\n");
18750 continue;
18751 }
18752
18753 unsigned NonPowerOf2VF = 0;
18754 if (VectorizeNonPowerOf2) {
18755 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18756 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18757 // lanes are used.
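// E.g., chains of 3, 7, 15 or 31 stores qualify here, since only one lane of
// the next power-of-2 VF would stay unused.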
18758 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18759 if (has_single_bit(CandVF + 1)) {
18760 NonPowerOf2VF = CandVF;
18761 assert(NonPowerOf2VF != MaxVF &&
18762 "Non-power-of-2 VF should not be equal to MaxVF");
18763 }
18764 }
18765
18766 unsigned MaxRegVF = MaxVF;
18767 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18768 if (MaxVF < MinVF) {
18769 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18770 << ") < "
18771 << "MinVF (" << MinVF << ")\n");
18772 continue;
18773 }
18774
18775 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18776 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18777 unsigned Size = MinVF;
18778 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18779 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18780 Size *= 2;
18781 });
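// E.g., with MinVF == 2, MaxVF == 16 and no non-power-of-2 candidate, the
// candidate VFs end up as {16, 8, 4, 2} and are tried from largest to
// smallest.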
18782 unsigned End = Operands.size();
18783 unsigned Repeat = 0;
18784 constexpr unsigned MaxAttempts = 4;
18785 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18786 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18787 P.first = P.second = 1;
18788 });
18789 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
18790 auto IsNotVectorized = [](bool First,
18791 const std::pair<unsigned, unsigned> &P) {
18792 return First ? P.first > 0 : P.second > 0;
18793 };
18794 auto IsVectorized = [](bool First,
18795 const std::pair<unsigned, unsigned> &P) {
18796 return First ? P.first == 0 : P.second == 0;
18797 };
18798 auto VFIsProfitable = [](bool First, unsigned Size,
18799 const std::pair<unsigned, unsigned> &P) {
18800 return First ? Size >= P.first : Size >= P.second;
18801 };
18802 auto FirstSizeSame = [](unsigned Size,
18803 const std::pair<unsigned, unsigned> &P) {
18804 return Size == P.first;
18805 };
18806 while (true) {
18807 ++Repeat;
18808 bool RepeatChanged = false;
18809 bool AnyProfitableGraph = false;
18810 for (unsigned Size : CandidateVFs) {
18811 AnyProfitableGraph = false;
18812 unsigned StartIdx = std::distance(
18813 RangeSizes.begin(),
18814 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18815 std::placeholders::_1)));
18816 while (StartIdx < End) {
18817 unsigned EndIdx =
18818 std::distance(RangeSizes.begin(),
18819 find_if(RangeSizes.drop_front(StartIdx),
18820 std::bind(IsVectorized, Size >= MaxRegVF,
18821 std::placeholders::_1)));
18822 unsigned Sz = EndIdx >= End ? End : EndIdx;
18823 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18824 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18825 Size >= MaxRegVF)) {
18826 ++Cnt;
18827 continue;
18828 }
18829 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18830 assert(all_of(Slice,
18831 [&](Value *V) {
18832 return cast<StoreInst>(V)
18833 ->getValueOperand()
18834 ->getType() ==
18835 cast<StoreInst>(Slice.front())
18836 ->getValueOperand()
18837 ->getType();
18838 }) &&
18839 "Expected all operands of same type.");
18840 if (!NonSchedulable.empty()) {
18841 auto [NonSchedSizeMax, NonSchedSizeMin] =
18842 NonSchedulable.lookup(Slice.front());
18843 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18844 Cnt += NonSchedSizeMax;
18845 continue;
18846 }
18847 }
18848 unsigned TreeSize;
18849 std::optional<bool> Res =
18850 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18851 if (!Res) {
18852 NonSchedulable
18853 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18854 .first->getSecond()
18855 .second = Size;
18856 } else if (*Res) {
18857 // Mark the vectorized stores so that we don't vectorize them
18858 // again.
18859 VectorizedStores.insert(Slice.begin(), Slice.end());
18860 // Remember that this attempt found a profitable graph and changed the
18861 // IR.
18862 AnyProfitableGraph = RepeatChanged = Changed = true;
18863 // If we vectorized initial block, no need to try to vectorize
18864 // it again.
18865 for_each(RangeSizes.slice(Cnt, Size),
18866 [](std::pair<unsigned, unsigned> &P) {
18867 P.first = P.second = 0;
18868 });
18869 if (Cnt < StartIdx + MinVF) {
18870 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18871 [](std::pair<unsigned, unsigned> &P) {
18872 P.first = P.second = 0;
18873 });
18874 StartIdx = Cnt + Size;
18875 }
18876 if (Cnt > Sz - Size - MinVF) {
18877 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18878 [](std::pair<unsigned, unsigned> &P) {
18879 P.first = P.second = 0;
18880 });
18881 if (Sz == End)
18882 End = Cnt;
18883 Sz = Cnt;
18884 }
18885 Cnt += Size;
18886 continue;
18887 }
18888 if (Size > 2 && Res &&
18889 !all_of(RangeSizes.slice(Cnt, Size),
18890 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18891 std::placeholders::_1))) {
18892 Cnt += Size;
18893 continue;
18894 }
18895 // For very large VFs, check that we are not rebuilding the same trees,
18896 // just with a larger number of elements.
18897 if (Size > MaxRegVF && TreeSize > 1 &&
18898 all_of(RangeSizes.slice(Cnt, Size),
18899 std::bind(FirstSizeSame, TreeSize,
18900 std::placeholders::_1))) {
18901 Cnt += Size;
18902 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18903 ++Cnt;
18904 continue;
18905 }
18906 if (TreeSize > 1)
18907 for_each(RangeSizes.slice(Cnt, Size),
18908 [&](std::pair<unsigned, unsigned> &P) {
18909 if (Size >= MaxRegVF)
18910 P.second = std::max(P.second, TreeSize);
18911 else
18912 P.first = std::max(P.first, TreeSize);
18913 });
18914 ++Cnt;
18915 AnyProfitableGraph = true;
18916 }
18917 if (StartIdx >= End)
18918 break;
18919 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18920 AnyProfitableGraph = true;
18921 StartIdx = std::distance(
18922 RangeSizes.begin(),
18923 find_if(RangeSizes.drop_front(Sz),
18924 std::bind(IsNotVectorized, Size >= MaxRegVF,
18925 std::placeholders::_1)));
18926 }
18927 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18928 break;
18929 }
18930 // All values vectorized - exit.
18931 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18932 return P.first == 0 && P.second == 0;
18933 }))
18934 break;
18935 // Check if we have exhausted all attempts or there is no need for further attempts.
18936 if (Repeat >= MaxAttempts ||
18937 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18938 break;
18939 constexpr unsigned StoresLimit = 64;
18940 const unsigned MaxTotalNum = std::min<unsigned>(
18941 Operands.size(),
18942 static_cast<unsigned>(
18943 End -
18944 std::distance(
18945 RangeSizes.begin(),
18946 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18947 std::placeholders::_1))) +
18948 1));
18949 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18950 unsigned Limit =
18951 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18952 CandidateVFs.clear();
18953 if (bit_floor(Limit) == VF)
18954 CandidateVFs.push_back(Limit);
18955 if (VF > MaxTotalNum || VF >= StoresLimit)
18956 break;
18957 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18958 if (P.first != 0)
18959 P.first = std::max(P.second, P.first);
18960 });
18961 // Make a last attempt to vectorize the maximum number of elements, if all
18962 // previous attempts were unsuccessful because of cost issues.
18963 CandidateVFs.push_back(VF);
18964 }
18965 }
18966 };
18967
18968 // Stores pairs (first: index of the store in the Stores array ref whose
18969 // address is taken as the base; second: sorted set of pairs {index, dist},
18970 // which are the indices of stores in the set and their distances relative to
18971 // the base address).
18972
18973 // Need to store the index of the very first store separately, since the set
18974 // may be reordered after the insertion and the first store may be moved. This
18975 // container allows reducing the number of calls to the getPointersDiff() function.
18976 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
18977 // Inserts the specified store SI with the given index Idx into the set of
18978 // stores. If a store with the same distance is already in the set - stop the
18979 // insertion and try to vectorize the stores found so far. If some stores from
18980 // this sequence were not vectorized - try to vectorize them together with the
18981 // new store later. But this logic is applied only to the stores that come
18982 // before the previous store with the same distance.
18983 // Example:
18984 // 1. store x, %p
18985 // 2. store y, %p+1
18986 // 3. store z, %p+2
18987 // 4. store a, %p
18988 // 5. store b, %p+3
18989 // - Scan this from the last to first store. The very first bunch of stores is
18990 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
18991 // vector).
18992 // - The next store in the list - #1 - has the same distance from store #5 as
18993 // the store #4.
18994 // - Try to vectorize sequence of stores 4,2,3,5.
18995 // - If all these stores are vectorized - just drop them.
18996 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
18997 // - Start new stores sequence.
18998 // The new bunch of stores is {1, {1, 0}}.
18999 // - Add the stores from previous sequence, that were not vectorized.
19000 // Here we consider the stores in reversed order, rather than the order in
19001 // which they appear in the IR (Stores is already reversed, see vectorizeStoreChains()).
19002 // Store #3 can be added -> comes after store #4 with the same distance as
19003 // store #1.
19004 // Store #5 cannot be added - comes before store #4.
19005 // This logic improves compile time: we assume that the stores that come after
19006 // a previous store with the same distance most likely have memory dependencies,
19007 // so there is no need to waste compile time trying to vectorize them.
19008 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
19009 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
19010 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19011 std::optional<int> Diff = getPointersDiff(
19012 Stores[Set.first]->getValueOperand()->getType(),
19013 Stores[Set.first]->getPointerOperand(),
19014 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
19015 /*StrictCheck=*/true);
19016 if (!Diff)
19017 continue;
19018 auto It = Set.second.find(std::make_pair(Idx, *Diff));
19019 if (It == Set.second.end()) {
19020 Set.second.emplace(Idx, *Diff);
19021 return;
19022 }
19023 // Try to vectorize the first found set to avoid duplicate analysis.
19024 TryToVectorize(Set.second);
19025 unsigned ItIdx = It->first;
19026 int ItDist = It->second;
19027 StoreIndexToDistSet PrevSet;
19028 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19029 [&](const std::pair<unsigned, int> &Pair) {
19030 return Pair.first > ItIdx;
19031 });
19032 Set.second.clear();
19033 Set.first = Idx;
19034 Set.second.emplace(Idx, 0);
19035 // Insert stores that followed previous match to try to vectorize them
19036 // with this store.
19037 unsigned StartIdx = ItIdx + 1;
19038 SmallBitVector UsedStores(Idx - StartIdx);
19039 // Distances to previously found dup store (or this store, since they
19040 // store to the same addresses).
19041 SmallVector<int> Dists(Idx - StartIdx, 0);
19042 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
19043 // Do not try to vectorize sequences, we already tried.
19044 if (VectorizedStores.contains(Stores[Pair.first]))
19045 break;
19046 unsigned BI = Pair.first - StartIdx;
19047 UsedStores.set(BI);
19048 Dists[BI] = Pair.second - ItDist;
19049 }
19050 for (unsigned I = StartIdx; I < Idx; ++I) {
19051 unsigned BI = I - StartIdx;
19052 if (UsedStores.test(BI))
19053 Set.second.emplace(I, Dists[BI]);
19054 }
19055 return;
19056 }
19057 auto &Res = SortedStores.emplace_back();
19058 Res.first = Idx;
19059 Res.second.emplace(Idx, 0);
19060 };
19061 Type *PrevValTy = nullptr;
19062 for (auto [I, SI] : enumerate(Stores)) {
19063 if (R.isDeleted(SI))
19064 continue;
19065 if (!PrevValTy)
19066 PrevValTy = SI->getValueOperand()->getType();
19067 // Check that we do not try to vectorize stores of different types.
19068 if (PrevValTy != SI->getValueOperand()->getType()) {
19069 for (auto &Set : SortedStores)
19070 TryToVectorize(Set.second);
19071 SortedStores.clear();
19072 PrevValTy = SI->getValueOperand()->getType();
19073 }
19074 FillStoresSet(I, SI);
19075 }
19076
19077 // Final vectorization attempt.
19078 for (auto &Set : SortedStores)
19079 TryToVectorize(Set.second);
19080
19081 return Changed;
19082}
19083
19084void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19085 // Initialize the collections. We will make a single pass over the block.
19086 Stores.clear();
19087 GEPs.clear();
19088
19089 // Visit the store and getelementptr instructions in BB and organize them in
19090 // Stores and GEPs according to the underlying objects of their pointer
19091 // operands.
19092 for (Instruction &I : *BB) {
19093 // Ignore store instructions that are volatile or have a pointer operand
19094 // that doesn't point to a scalar type.
19095 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19096 if (!SI->isSimple())
19097 continue;
19098 if (!isValidElementType(SI->getValueOperand()->getType()))
19099 continue;
19100 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19101 }
19102
19103 // Ignore getelementptr instructions that have more than one index, a
19104 // constant index, or a pointer operand that doesn't point to a scalar
19105 // type.
19106 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19107 if (GEP->getNumIndices() != 1)
19108 continue;
19109 Value *Idx = GEP->idx_begin()->get();
19110 if (isa<Constant>(Idx))
19111 continue;
19112 if (!isValidElementType(Idx->getType()))
19113 continue;
19114 if (GEP->getType()->isVectorTy())
19115 continue;
19116 GEPs[GEP->getPointerOperand()].push_back(GEP);
19117 }
19118 }
19119}
19120
19121bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19122 bool MaxVFOnly) {
19123 if (VL.size() < 2)
19124 return false;
19125
19126 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19127 << VL.size() << ".\n");
19128
19129 // Check that all of the parts are instructions of the same type,
19130 // we permit an alternate opcode via InstructionsState.
19131 InstructionsState S = getSameOpcode(VL, *TLI);
19132 if (!S)
19133 return false;
19134
19135 Instruction *I0 = S.getMainOp();
19136 // Make sure invalid types (including vector type) are rejected before
19137 // determining vectorization factor for scalar instructions.
19138 for (Value *V : VL) {
19139 Type *Ty = V->getType();
19140 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19141 // NOTE: the following will give user internal llvm type name, which may
19142 // not be useful.
19143 R.getORE()->emit([&]() {
19144 std::string TypeStr;
19145 llvm::raw_string_ostream rso(TypeStr);
19146 Ty->print(rso);
19147 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19148 << "Cannot SLP vectorize list: type "
19149 << TypeStr + " is unsupported by vectorizer";
19150 });
19151 return false;
19152 }
19153 }
19154
19155 Type *ScalarTy = getValueType(VL[0]);
19156 unsigned Sz = R.getVectorElementSize(I0);
19157 unsigned MinVF = R.getMinVF(Sz);
19158 unsigned MaxVF = std::max<unsigned>(
19159 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
19160 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19161 if (MaxVF < 2) {
19162 R.getORE()->emit([&]() {
19163 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19164 << "Cannot SLP vectorize list: vectorization factor "
19165 << "less than 2 is not supported";
19166 });
19167 return false;
19168 }
19169
19170 bool Changed = false;
19171 bool CandidateFound = false;
19172 InstructionCost MinCost = SLPCostThreshold.getValue();
19173
19174 unsigned NextInst = 0, MaxInst = VL.size();
19175 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19176 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
19177 // No actual vectorization should happen if the number of parts is the same
19178 // as the provided vectorization factor (i.e. the scalar type is used for
19179 // vector code during codegen).
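// For example, on a hypothetical target whose widest vector register holds a
// single i64 lane, getNumberOfParts(<4 x i64>) == 4 == VF, so that VF is
// skipped.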
19180 auto *VecTy = getWidenedType(ScalarTy, VF);
19181 if (TTI->getNumberOfParts(VecTy) == VF)
19182 continue;
19183 for (unsigned I = NextInst; I < MaxInst; ++I) {
19184 unsigned ActualVF = std::min(MaxInst - I, VF);
19185
19186 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19187 continue;
19188
19189 if (MaxVFOnly && ActualVF < MaxVF)
19190 break;
19191 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19192 break;
19193
19194 SmallVector<Value *> Ops(ActualVF, nullptr);
19195 unsigned Idx = 0;
19196 for (Value *V : VL.drop_front(I)) {
19197 // Check that a previous iteration of this loop did not delete the
19198 // Value.
19199 if (auto *Inst = dyn_cast<Instruction>(V);
19200 !Inst || !R.isDeleted(Inst)) {
19201 Ops[Idx] = V;
19202 ++Idx;
19203 if (Idx == ActualVF)
19204 break;
19205 }
19206 }
19207 // Not enough vectorizable instructions - exit.
19208 if (Idx != ActualVF)
19209 break;
19210
19211 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19212 << "\n");
19213
19214 R.buildTree(Ops);
19215 if (R.isTreeTinyAndNotFullyVectorizable())
19216 continue;
19217 R.reorderTopToBottom();
19218 R.reorderBottomToTop(
19219 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19220 !R.doesRootHaveInTreeUses());
19221 R.transformNodes();
19222 R.buildExternalUses();
19223
19224 R.computeMinimumValueSizes();
19225 InstructionCost Cost = R.getTreeCost();
19226 CandidateFound = true;
19227 MinCost = std::min(MinCost, Cost);
19228
19229 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19230 << " for VF=" << ActualVF << "\n");
19231 if (Cost < -SLPCostThreshold) {
19232 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19233 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19234 cast<Instruction>(Ops[0]))
19235 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19236 << " and with tree size "
19237 << ore::NV("TreeSize", R.getTreeSize()));
19238
19239 R.vectorizeTree();
19240 // Move to the next bundle.
19241 I += VF - 1;
19242 NextInst = I + 1;
19243 Changed = true;
19244 }
19245 }
19246 }
19247
19248 if (!Changed && CandidateFound) {
19249 R.getORE()->emit([&]() {
19250 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19251 << "List vectorization was possible but not beneficial with cost "
19252 << ore::NV("Cost", MinCost) << " >= "
19253 << ore::NV("Threshold", -SLPCostThreshold);
19254 });
19255 } else if (!Changed) {
19256 R.getORE()->emit([&]() {
19257 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19258 << "Cannot SLP vectorize list: vectorization was impossible"
19259 << " with available vectorization factors";
19260 });
19261 }
19262 return Changed;
19263}
19264
19265bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19266 if (!I)
19267 return false;
19268
19269 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19270 return false;
19271
19272 Value *P = I->getParent();
19273
19274 // Vectorize in current basic block only.
19275 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19276 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19277 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19278 R.isDeleted(Op0) || R.isDeleted(Op1))
19279 return false;
19280
19281 // First collect all possible candidates
19282 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
19283 Candidates.emplace_back(Op0, Op1);
19284
19285 auto *A = dyn_cast<BinaryOperator>(Op0);
19286 auto *B = dyn_cast<BinaryOperator>(Op1);
19287 // Try to skip B.
19288 if (A && B && B->hasOneUse()) {
19289 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19290 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19291 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19292 Candidates.emplace_back(A, B0);
19293 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19294 Candidates.emplace_back(A, B1);
19295 }
19296 // Try to skip A.
19297 if (B && A && A->hasOneUse()) {
19298 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19299 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19300 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19301 Candidates.emplace_back(A0, B);
19302 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19303 Candidates.emplace_back(A1, B);
19304 }
19305
19306 if (Candidates.size() == 1)
19307 return tryToVectorizeList({Op0, Op1}, R);
19308
19309 // We have multiple options. Try to pick the single best.
19310 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19311 if (!BestCandidate)
19312 return false;
19313 return tryToVectorizeList(
19314 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19315}
19316
19317namespace {
19318
19319/// Model horizontal reductions.
19320///
19321/// A horizontal reduction is a tree of reduction instructions that has values
19322/// that can be put into a vector as its leaves. For example:
19323///
19324/// mul mul mul mul
19325/// \ / \ /
19326/// + +
19327/// \ /
19328/// +
19329/// This tree has "mul" as its leaf values and "+" as its reduction
19330/// instructions. A reduction can feed into a store or a binary operation
19331/// feeding a phi.
19332/// ...
19333/// \ /
19334/// +
19335/// |
19336/// phi +=
19337///
19338/// Or:
19339/// ...
19340/// \ /
19341/// +
19342/// |
19343/// *p =
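/// For example (illustrative IR), a 4-wide integer add reduction may look
/// like:
///   %r0 = add i32 %x0, %x1
///   %r1 = add i32 %r0, %x2
///   %r2 = add i32 %r1, %x3
/// where %x0..%x3 are the leaf values that become the lanes of a vector.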
19344///
19345class HorizontalReduction {
19346 using ReductionOpsType = SmallVector<Value *, 16>;
19347 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19348 ReductionOpsListType ReductionOps;
19349 /// List of possibly reduced values.
19350 SmallVector<SmallVector<Value *>> ReducedVals;
19351 /// Maps reduced value to the corresponding reduction operation.
19352 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
19353 WeakTrackingVH ReductionRoot;
19354 /// The type of reduction operation.
19355 RecurKind RdxKind;
19356 /// Checks if the optimization of original scalar identity operations on
19357 /// matched horizontal reductions is enabled and allowed.
19358 bool IsSupportedHorRdxIdentityOp = false;
19359
19360 static bool isCmpSelMinMax(Instruction *I) {
19361 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19362 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
19363 }
19364
19365 // And/or are potentially poison-safe logical patterns like:
19366 // select x, y, false
19367 // select x, true, y
19368 static bool isBoolLogicOp(Instruction *I) {
19369 return isa<SelectInst>(I) &&
19370 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19371 }
19372
19373 /// Checks if instruction is associative and can be vectorized.
19374 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19375 if (Kind == RecurKind::None)
19376 return false;
19377
19378 // Integer ops that map to select instructions or intrinsics are fine.
19379 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19380 isBoolLogicOp(I))
19381 return true;
19382
19383 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19384 // FP min/max are associative except for NaN and -0.0. We do not
19385 // have to rule out -0.0 here because the intrinsic semantics do not
19386 // specify a fixed result for it.
19387 return I->getFastMathFlags().noNaNs();
19388 }
19389
19390 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19391 return true;
19392
19393 return I->isAssociative();
19394 }
19395
19396 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19397 // Poison-safe 'or' takes the form: select X, true, Y
19398 // To make that work with the normal operand processing, we skip the
19399 // true value operand.
19400 // TODO: Change the code and data structures to handle this without a hack.
19401 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19402 return I->getOperand(2);
19403 return I->getOperand(Index);
19404 }
19405
19406 /// Creates reduction operation with the current opcode.
19407 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19408 Value *RHS, const Twine &Name, bool UseSelect) {
19409 switch (Kind) {
19410 case RecurKind::Or: {
19411 if (UseSelect &&
19412 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19413 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19414 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19415 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19416 Name);
19417 }
19418 case RecurKind::And: {
19419 if (UseSelect &&
19420 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19421 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19422 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19423 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19424 Name);
19425 }
19426 case RecurKind::Add:
19427 case RecurKind::Mul:
19428 case RecurKind::Xor:
19429 case RecurKind::FAdd:
19430 case RecurKind::FMul: {
19431 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19432 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19433 Name);
19434 }
19435 case RecurKind::SMax:
19436 case RecurKind::SMin:
19437 case RecurKind::UMax:
19438 case RecurKind::UMin:
19439 if (UseSelect) {
19440 CmpInst::Predicate Pred = getMinMaxReductionPredicate(Kind);
19441 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
19442 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19443 }
19444 [[fallthrough]];
19445 case RecurKind::FMax:
19446 case RecurKind::FMin:
19447 case RecurKind::FMaximum:
19448 case RecurKind::FMinimum: {
19449 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(Kind);
19450 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
19451 }
19452 default:
19453 llvm_unreachable("Unknown reduction operation.");
19454 }
19455 }
19456
19457 /// Creates reduction operation with the current opcode with the IR flags
19458 /// from \p ReductionOps, dropping nuw/nsw flags.
19459 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19460 Value *RHS, const Twine &Name,
19461 const ReductionOpsListType &ReductionOps) {
19462 bool UseSelect = ReductionOps.size() == 2 ||
19463 // Logical or/and.
19464 (ReductionOps.size() == 1 &&
19465 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19466 assert((!UseSelect || ReductionOps.size() != 2 ||
19467 isa<SelectInst>(ReductionOps[1][0])) &&
19468 "Expected cmp + select pairs for reduction");
19469 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19470 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19471 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19472 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19473 /*IncludeWrapFlags=*/false);
19474 propagateIRFlags(Op, ReductionOps[1], nullptr,
19475 /*IncludeWrapFlags=*/false);
19476 return Op;
19477 }
19478 }
19479 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19480 return Op;
19481 }
19482
19483public:
19484 static RecurKind getRdxKind(Value *V) {
19485 auto *I = dyn_cast<Instruction>(V);
19486 if (!I)
19487 return RecurKind::None;
19488 if (match(I, m_Add(m_Value(), m_Value())))
19489 return RecurKind::Add;
19490 if (match(I, m_Mul(m_Value(), m_Value())))
19491 return RecurKind::Mul;
19492 if (match(I, m_And(m_Value(), m_Value())) ||
19493 match(I, m_LogicalAnd(m_Value(), m_Value())))
19494 return RecurKind::And;
19495 if (match(I, m_Or(m_Value(), m_Value())) ||
19496 match(I, m_LogicalOr(m_Value(), m_Value())))
19497 return RecurKind::Or;
19498 if (match(I, m_Xor(m_Value(), m_Value())))
19499 return RecurKind::Xor;
19500 if (match(I, m_FAdd(m_Value(), m_Value())))
19501 return RecurKind::FAdd;
19502 if (match(I, m_FMul(m_Value(), m_Value())))
19503 return RecurKind::FMul;
19504
19505 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19506 return RecurKind::FMax;
19507 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19508 return RecurKind::FMin;
19509
19510 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19511 return RecurKind::FMaximum;
19512 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19513 return RecurKind::FMinimum;
19514 // This matches either cmp+select or intrinsics. SLP is expected to handle
19515 // either form.
19516 // TODO: If we are canonicalizing to intrinsics, we can remove several
19517 // special-case paths that deal with selects.
19518 if (match(I, m_SMax(m_Value(), m_Value())))
19519 return RecurKind::SMax;
19520 if (match(I, m_SMin(m_Value(), m_Value())))
19521 return RecurKind::SMin;
19522 if (match(I, m_UMax(m_Value(), m_Value())))
19523 return RecurKind::UMax;
19524 if (match(I, m_UMin(m_Value(), m_Value())))
19525 return RecurKind::UMin;
19526
19527 if (auto *Select = dyn_cast<SelectInst>(I)) {
19528 // Try harder: look for min/max pattern based on instructions producing
19529 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19530 // During the intermediate stages of SLP, it's very common to have
19531 // a pattern like this (since optimizeGatherSequence is run only once
19532 // at the end):
19533 // %1 = extractelement <2 x i32> %a, i32 0
19534 // %2 = extractelement <2 x i32> %a, i32 1
19535 // %cond = icmp sgt i32 %1, %2
19536 // %3 = extractelement <2 x i32> %a, i32 0
19537 // %4 = extractelement <2 x i32> %a, i32 1
19538 // %select = select i1 %cond, i32 %3, i32 %4
19539 CmpPredicate Pred;
19540 Instruction *L1;
19541 Instruction *L2;
19542
19543 Value *LHS = Select->getTrueValue();
19544 Value *RHS = Select->getFalseValue();
19545 Value *Cond = Select->getCondition();
19546
19547 // TODO: Support inverse predicates.
19548 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19549 if (!isa<ExtractElementInst>(RHS) ||
19550 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19551 return RecurKind::None;
19552 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19553 if (!isa<ExtractElementInst>(LHS) ||
19554 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19555 return RecurKind::None;
19556 } else {
19557 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19558 return RecurKind::None;
19559 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19560 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19561 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19562 return RecurKind::None;
19563 }
19564
19565 switch (Pred) {
19566 default:
19567 return RecurKind::None;
19568 case CmpInst::ICMP_SGT:
19569 case CmpInst::ICMP_SGE:
19570 return RecurKind::SMax;
19571 case CmpInst::ICMP_SLT:
19572 case CmpInst::ICMP_SLE:
19573 return RecurKind::SMin;
19574 case CmpInst::ICMP_UGT:
19575 case CmpInst::ICMP_UGE:
19576 return RecurKind::UMax;
19577 case CmpInst::ICMP_ULT:
19578 case CmpInst::ICMP_ULE:
19579 return RecurKind::UMin;
19580 }
19581 }
19582 return RecurKind::None;
19583 }
19584
19585 /// Get the index of the first operand.
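/// For a cmp+select min/max reduction, operand 0 of the select is the compare
/// condition, so the reduced operands start at index 1; plain binary reduction
/// ops start at index 0.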
19586 static unsigned getFirstOperandIndex(Instruction *I) {
19587 return isCmpSelMinMax(I) ? 1 : 0;
19588 }
19589
19590private:
19591 /// Total number of operands in the reduction operation.
19592 static unsigned getNumberOfOperands(Instruction *I) {
19593 return isCmpSelMinMax(I) ? 3 : 2;
19594 }
19595
19596 /// Checks if the instruction is in basic block \p BB.
19597 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19598 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19599 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19600 auto *Sel = cast<SelectInst>(I);
19601 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19602 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19603 }
19604 return I->getParent() == BB;
19605 }
19606
19607 /// Expected number of uses for reduction operations/reduced values.
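/// In a cmp+select min/max chain the select feeds both the compare and the
/// select of the next reduction step (hence two expected uses), while a plain
/// binary reduction op only feeds the next reduction op (one use).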
19608 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19609 if (IsCmpSelMinMax) {
19610 // SelectInst must be used twice while the condition op must have single
19611 // use only.
19612 if (auto *Sel = dyn_cast<SelectInst>(I))
19613 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19614 return I->hasNUses(2);
19615 }
19616
19617 // Arithmetic reduction operation must be used once only.
19618 return I->hasOneUse();
19619 }
19620
19621 /// Initializes the list of reduction operations.
19622 void initReductionOps(Instruction *I) {
19623 if (isCmpSelMinMax(I))
19624 ReductionOps.assign(2, ReductionOpsType());
19625 else
19626 ReductionOps.assign(1, ReductionOpsType());
19627 }
19628
19629 /// Add all reduction operations for the reduction instruction \p I.
19630 void addReductionOps(Instruction *I) {
19631 if (isCmpSelMinMax(I)) {
19632 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19633 ReductionOps[1].emplace_back(I);
19634 } else {
19635 ReductionOps[0].emplace_back(I);
19636 }
19637 }
19638
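/// Returns true if a group of reduced values is worth keeping as its own
/// reduction sequence: either it has more than one element, or its single
/// element is a constant or a non-load instruction whose opcode is valid for
/// alternation.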
19639 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19640 int Sz = Data.size();
19641 auto *I = dyn_cast<Instruction>(Data.front());
19642 return Sz > 1 || isConstant(Data.front()) ||
19643 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19644 }
19645
19646public:
19647 HorizontalReduction() = default;
19648
19649 /// Try to find a reduction tree.
19650 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19651 ScalarEvolution &SE, const DataLayout &DL,
19652 const TargetLibraryInfo &TLI) {
19653 RdxKind = HorizontalReduction::getRdxKind(Root);
19654 if (!isVectorizable(RdxKind, Root))
19655 return false;
19656
19657 // Analyze "regular" integer/FP types for reductions - no target-specific
19658 // types or pointers.
19659 Type *Ty = Root->getType();
19660 if (!isValidElementType(Ty) || Ty->isPointerTy())
19661 return false;
19662
19663 // Though the ultimate reduction may have multiple uses, its condition must
19664 // have only a single use.
19665 if (auto *Sel = dyn_cast<SelectInst>(Root))
19666 if (!Sel->getCondition()->hasOneUse())
19667 return false;
19668
19669 ReductionRoot = Root;
19670
19671 // Iterate through all the operands of the possible reduction tree and
19672 // gather all the reduced values, sorting them by their value id.
19673 BasicBlock *BB = Root->getParent();
19674 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19675 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19676 1, std::make_pair(Root, 0));
19677 // Checks if the operands of the \p TreeN instruction are also reduction
19678 // operations or should be treated as reduced values or an extra argument,
19679 // which is not part of the reduction.
19680 auto CheckOperands = [&](Instruction *TreeN,
19681 SmallVectorImpl<Value *> &PossibleReducedVals,
19682 SmallVectorImpl<Instruction *> &ReductionOps,
19683 unsigned Level) {
19684 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19685 getNumberOfOperands(TreeN)))) {
19686 Value *EdgeVal = getRdxOperand(TreeN, I);
19687 ReducedValsToOps[EdgeVal].push_back(TreeN);
19688 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19689 // If the edge is not an instruction, differs from the main reduction
19690 // opcode, or has too many uses, treat it as a possible reduced value.
19691 // Also, do not try to reduce constant values if the operation is not
19692 // foldable.
19693 if (!EdgeInst || Level > RecursionMaxDepth ||
19694 getRdxKind(EdgeInst) != RdxKind ||
19695 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19696 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19697 !isVectorizable(RdxKind, EdgeInst) ||
19698 (R.isAnalyzedReductionRoot(EdgeInst) &&
19699 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19700 PossibleReducedVals.push_back(EdgeVal);
19701 continue;
19702 }
19703 ReductionOps.push_back(EdgeInst);
19704 }
19705 };
19706 // Try to regroup the reduced values so that reducing them becomes more
19707 // profitable. Values are grouped by their value ids, instructions by their
19708 // opcode and/or alternate opcode, with extra analysis for loads (grouping
19709 // them by the distance between pointers) and cmp instructions (grouping
19710 // them by the predicate).
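// For example, consecutive loads from %p, %p+1 and %p+2 hash to the same
// subkey via getPointersDiff, so they land in one group and can later form a
// single vectorizable sequence of reduced values.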
19711 SmallMapVector<
19712 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19713 8>
19714 PossibleReducedVals;
19715 initReductionOps(Root);
19716 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
19717 SmallSet<size_t, 2> LoadKeyUsed;
19718
19719 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19720 Key = hash_combine(hash_value(LI->getParent()), Key);
19721 Value *Ptr =
19722 getUnderlyingObject(LI->getPointerOperand());
19723 if (!LoadKeyUsed.insert(Key).second) {
19724 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19725 if (LIt != LoadsMap.end()) {
19726 for (LoadInst *RLI : LIt->second) {
19727 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19728 LI->getType(), LI->getPointerOperand(), DL, SE,
19729 /*StrictCheck=*/true))
19730 return hash_value(RLI->getPointerOperand());
19731 }
19732 for (LoadInst *RLI : LIt->second) {
19733 if (arePointersCompatible(RLI->getPointerOperand(),
19734 LI->getPointerOperand(), TLI)) {
19735 hash_code SubKey = hash_value(RLI->getPointerOperand());
19736 return SubKey;
19737 }
19738 }
19739 if (LIt->second.size() > 2) {
19740 hash_code SubKey =
19741 hash_value(LIt->second.back()->getPointerOperand());
19742 return SubKey;
19743 }
19744 }
19745 }
19746 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19747 .first->second.push_back(LI);
19748 return hash_value(LI->getPointerOperand());
19749 };
19750
19751 while (!Worklist.empty()) {
19752 auto [TreeN, Level] = Worklist.pop_back_val();
19753 SmallVector<Value *> PossibleRedVals;
19754 SmallVector<Instruction *> PossibleReductionOps;
19755 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19756 addReductionOps(TreeN);
19757 // Add reduction values. The values are sorted for better vectorization
19758 // results.
19759 for (Value *V : PossibleRedVals) {
19760 size_t Key, Idx;
19761 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19762 /*AllowAlternate=*/false);
19763 ++PossibleReducedVals[Key][Idx]
19764 .insert(std::make_pair(V, 0))
19765 .first->second;
19766 }
19767 for (Instruction *I : reverse(PossibleReductionOps))
19768 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19769 }
19770 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19771 // Sort values by the total number of value kinds so that the reduction
19772 // starts from the longest possible sequences of reduced values.
19773 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19774 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19775 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19776 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19777 It != E; ++It) {
19778 PossibleRedValsVect.emplace_back();
19779 auto RedValsVect = It->second.takeVector();
19780 stable_sort(RedValsVect, llvm::less_second());
19781 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19782 PossibleRedValsVect.back().append(Data.second, Data.first);
19783 }
19784 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19785 return P1.size() > P2.size();
19786 });
19787 int NewIdx = -1;
19788 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19789 if (NewIdx < 0 ||
19790 (!isGoodForReduction(Data) &&
19791 (!isa<LoadInst>(Data.front()) ||
19792 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19793 getUnderlyingObject(
19794 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19795 getUnderlyingObject(
19796 cast<LoadInst>(ReducedVals[NewIdx].front())
19797 ->getPointerOperand())))) {
19798 NewIdx = ReducedVals.size();
19799 ReducedVals.emplace_back();
19800 }
19801 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19802 }
19803 }
19804 // Sort the reduced values by number of same/alternate opcode and/or pointer
19805 // operand.
19806 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19807 return P1.size() > P2.size();
19808 });
19809 return true;
19810 }
19811
19812 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19813 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19814 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19815 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19816 constexpr unsigned RegMaxNumber = 4;
19817 constexpr unsigned RedValsMaxNumber = 128;
19818 // If there are a sufficient number of reduction values, reduce
19819 // to a nearby power-of-2. We can safely generate oversized
19820 // vectors and rely on the backend to split them to legal sizes.
19821 if (unsigned NumReducedVals = std::accumulate(
19822 ReducedVals.begin(), ReducedVals.end(), 0,
19823 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19824 if (!isGoodForReduction(Vals))
19825 return Num;
19826 return Num + Vals.size();
19827 });
19828 NumReducedVals < ReductionLimit &&
19829 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19830 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19831 })) {
19832 for (ReductionOpsType &RdxOps : ReductionOps)
19833 for (Value *RdxOp : RdxOps)
19834 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19835 return nullptr;
19836 }
19837
19838 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19839 TargetFolder(DL));
19840 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19841
19842 // Track the reduced values in case they are replaced by extractelement
19843 // instructions during the vectorization.
19844 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19845 ReducedVals.front().size());
19846
19847 // The compare instruction of a min/max is the insertion point for new
19848 // instructions and may be replaced with a new compare instruction.
19849 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19850 assert(isa<SelectInst>(RdxRootInst) &&
19851 "Expected min/max reduction to have select root instruction");
19852 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19853 assert(isa<Instruction>(ScalarCond) &&
19854 "Expected min/max reduction to have compare condition");
19855 return cast<Instruction>(ScalarCond);
19856 };
19857
19858 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19859 return isBoolLogicOp(cast<Instruction>(V));
19860 });
19861 // Return new VectorizedTree, based on previous value.
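// For poison-safe boolean logic (select-based and/or) the accumulated value
// becomes the left-hand operand of the new op, so it must not be poison:
// roughly, keep the current order if VectorizedTree is known non-poison (or
// already acts as the condition of one of its bool-logic reduction ops),
// otherwise swap with Res if Res satisfies that, and as a last resort freeze
// VectorizedTree.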
19862 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19863 if (VectorizedTree) {
19864 // Update the final value in the reduction.
19865 Builder.SetCurrentDebugLocation(
19866 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19867 if (AnyBoolLogicOp) {
19868 auto It = ReducedValsToOps.find(VectorizedTree);
19869 auto It1 = ReducedValsToOps.find(Res);
19870 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19871 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19872 (It != ReducedValsToOps.end() &&
19873 any_of(It->getSecond(), [&](Instruction *I) {
19874 return isBoolLogicOp(I) &&
19875 getRdxOperand(I, 0) == VectorizedTree;
19876 }))) {
19877 ;
19878 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19879 (It1 != ReducedValsToOps.end() &&
19880 any_of(It1->getSecond(), [&](Instruction *I) {
19881 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19882 }))) {
19883 std::swap(VectorizedTree, Res);
19884 } else {
19885 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19886 }
19887 }
19888
19889 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19890 ReductionOps);
19891 }
19892 // Initialize the final value in the reduction.
19893 return Res;
19894 };
19895 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19896 ReductionOps.front().size());
19897 for (ReductionOpsType &RdxOps : ReductionOps)
19898 for (Value *RdxOp : RdxOps) {
19899 if (!RdxOp)
19900 continue;
19901 IgnoreList.insert(RdxOp);
19902 }
19903 // Intersect the fast-math-flags from all reduction operations.
19904 FastMathFlags RdxFMF;
19905 RdxFMF.set();
19906 for (Value *U : IgnoreList)
19907 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19908 RdxFMF &= FPMO->getFastMathFlags();
19909 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19910
19911 // Need to track the reduced values; they may be changed during the
19912 // vectorization of subvectors.
19913 for (ArrayRef<Value *> Candidates : ReducedVals)
19914 for (Value *V : Candidates)
19915 TrackedVals.try_emplace(V, V);
19916
19917 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19918 Value *V) -> unsigned & {
19919 auto *It = MV.find(V);
19920 assert(It != MV.end() && "Unable to find given key.");
19921 return It->second;
19922 };
19923
19924 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19925 // List of the values that were reduced in other trees as part of gather
19926 // nodes and thus require an extract if they are fully vectorized there.
19927 SmallPtrSet<Value *, 4> RequiredExtract;
19928 WeakTrackingVH VectorizedTree = nullptr;
19929 bool CheckForReusedReductionOps = false;
19930 // Try to vectorize elements based on their type.
19931 SmallVector<InstructionsState> States;
19932 for (ArrayRef<Value *> RV : ReducedVals)
19933 States.push_back(getSameOpcode(RV, TLI));
19934 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19935 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19936 InstructionsState S = States[I];
19937 SmallVector<Value *> Candidates;
19938 Candidates.reserve(2 * OrigReducedVals.size());
19939 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19940 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19941 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19942 // Check if the reduction value was overridden by an extractelement
19943 // instruction because of the vectorization and exclude it if it is not
19944 // compatible with the other values.
19945 // Also check if the instruction was folded to a constant or another value.
19946 auto *Inst = dyn_cast<Instruction>(RdxVal);
19947 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19948 (!S || !S.isOpcodeOrAlt(Inst))) ||
19949 (S && !Inst))
19950 continue;
19951 Candidates.push_back(RdxVal);
19952 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19953 }
19954 bool ShuffledExtracts = false;
19955 // Try to handle shuffled extractelements.
19956 if (S && S.getOpcode() == Instruction::ExtractElement &&
19957 !S.isAltShuffle() && I + 1 < E) {
19958 SmallVector<Value *> CommonCandidates(Candidates);
19959 for (Value *RV : ReducedVals[I + 1]) {
19960 Value *RdxVal = TrackedVals.at(RV);
19961 // Check if the reduction value was overridden by the
19962 // extractelement instruction because of the vectorization and
19963 // exclude it if it is not compatible with the other values.
19964 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19965 if (!Inst)
19966 continue;
19967 CommonCandidates.push_back(RdxVal);
19968 TrackedToOrig.try_emplace(RdxVal, RV);
19969 }
19970 SmallVector<int> Mask;
19971 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19972 ++I;
19973 Candidates.swap(CommonCandidates);
19974 ShuffledExtracts = true;
19975 }
19976 }
19977
19978 // Emit code for constant values.
19979 if (Candidates.size() > 1 && allConstant(Candidates)) {
19980 Value *Res = Candidates.front();
19981 Value *OrigV = TrackedToOrig.at(Candidates.front());
19982 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19983 for (Value *VC : ArrayRef(Candidates).drop_front()) {
19984 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
19985 Value *OrigV = TrackedToOrig.at(VC);
19986 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19987 if (auto *ResI = dyn_cast<Instruction>(Res))
19988 V.analyzedReductionRoot(ResI);
19989 }
19990 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19991 continue;
19992 }
19993
19994 unsigned NumReducedVals = Candidates.size();
19995 if (NumReducedVals < ReductionLimit &&
19996 (NumReducedVals < 2 || !isSplat(Candidates)))
19997 continue;
19998
19999 // Check if we support repeated scalar values processing (optimization of
20000 // original scalar identity operations on matched horizontal reductions).
20001 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20002 RdxKind != RecurKind::FMul &&
20003 RdxKind != RecurKind::FMulAdd;
20004 // Gather same values.
20005 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20006 if (IsSupportedHorRdxIdentityOp)
20007 for (Value *V : Candidates) {
20008 Value *OrigV = TrackedToOrig.at(V);
20009 ++SameValuesCounter.try_emplace(OrigV).first->second;
20010 }
20011 // Used to check if the reduced values are used the same number of times.
20012 // In that case the compiler may produce better code. E.g. if the reduced
20013 // values are aabbccdd (8 x values), then the first node of the tree will
20014 // have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>,
20015 // and the final reduction will be performed on <8 x aabbccdd>.
20016 // Instead, the compiler may build the <4 x abcd> tree immediately and then
20017 // multiply the (4 x abcd) reduction result by 2.
20018 // Currently this only handles add/fadd/xor; and/or/min/max do not require
20019 // this analysis, while other operations may require an extra profitability
20020 // estimate.
20021 bool SameScaleFactor = false;
20022 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20023 SameValuesCounter.size() != Candidates.size();
20024 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20025 if (OptReusedScalars) {
20026 SameScaleFactor =
20027 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20028 RdxKind == RecurKind::Xor) &&
20029 all_of(drop_begin(SameValuesCounter),
20030 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20031 return P.second == SameValuesCounter.front().second;
20032 });
20033 Candidates.resize(SameValuesCounter.size());
20034 transform(SameValuesCounter, Candidates.begin(),
20035 [&](const auto &P) { return TrackedVals.at(P.first); });
20036 NumReducedVals = Candidates.size();
20037 // Have a reduction of the same element.
20038 if (NumReducedVals == 1) {
20039 Value *OrigV = TrackedToOrig.at(Candidates.front());
20040 unsigned Cnt = At(SameValuesCounter, OrigV);
20041 Value *RedVal =
20042 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20043 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20044 VectorizedVals.try_emplace(OrigV, Cnt);
20045 ExternallyUsedValues.insert(OrigV);
20046 continue;
20047 }
20048 }
20049
20050 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20051 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20052 const unsigned MaxElts = std::clamp<unsigned>(
20053 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20054 RegMaxNumber * RedValsMaxNumber);
20055
20056 unsigned ReduxWidth = NumReducedVals;
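// GetVectorFactor shrinks the candidate vector factor until the widened
// vector type does not split into more register-sized parts than there are
// suitable vector registers, and rounds it down to a power of two when the
// result would still occupy more than half of those registers.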
20057 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20058 unsigned NumParts, NumRegs;
20059 Type *ScalarTy = Candidates.front()->getType();
20060 ReduxWidth =
20061 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20062 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20063 NumParts = ::getNumberOfParts(TTI, Tp);
20064 NumRegs =
20065 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20066 while (NumParts > NumRegs) {
20067 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
20068 ReduxWidth = bit_floor(ReduxWidth - 1);
20069 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20070 NumParts = ::getNumberOfParts(TTI, Tp);
20071 NumRegs =
20072 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20073 }
20074 if (NumParts > NumRegs / 2)
20075 ReduxWidth = bit_floor(ReduxWidth);
20076 return ReduxWidth;
20077 };
20078 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20079 ReduxWidth = GetVectorFactor(ReduxWidth);
20080 ReduxWidth = std::min(ReduxWidth, MaxElts);
20081
20082 unsigned Start = 0;
20083 unsigned Pos = Start;
20084 // Restarts vectorization attempt with lower vector factor.
20085 unsigned PrevReduxWidth = ReduxWidth;
20086 bool CheckForReusedReductionOpsLocal = false;
20087 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20088 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20089 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20090 // Check if any of the reduction ops are gathered. If so, it is worth
20091 // trying again with a smaller number of reduction ops.
20092 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20093 }
20094 ++Pos;
20095 if (Pos < NumReducedVals - ReduxWidth + 1)
20096 return IsAnyRedOpGathered;
20097 Pos = Start;
20098 --ReduxWidth;
20099 if (ReduxWidth > 1)
20100 ReduxWidth = GetVectorFactor(ReduxWidth);
20101 return IsAnyRedOpGathered;
20102 };
20103 bool AnyVectorized = false;
20104 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20105 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20106 ReduxWidth >= ReductionLimit) {
20107 // Dependency in tree of the reduction ops - drop this attempt, try
20108 // later.
20109 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20110 Start == 0) {
20111 CheckForReusedReductionOps = true;
20112 break;
20113 }
20114 PrevReduxWidth = ReduxWidth;
20115 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20116 // Been analyzed already - skip.
20117 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20118 (!has_single_bit(ReduxWidth) &&
20119 (IgnoredCandidates.contains(
20120 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20121 IgnoredCandidates.contains(
20122 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20123 bit_floor(ReduxWidth))))) ||
20124 V.areAnalyzedReductionVals(VL)) {
20125 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20126 continue;
20127 }
20128 // Early exit if any of the reduction values were deleted during
20129 // previous vectorization attempts.
20130 if (any_of(VL, [&V](Value *RedVal) {
20131 auto *RedValI = dyn_cast<Instruction>(RedVal);
20132 if (!RedValI)
20133 return false;
20134 return V.isDeleted(RedValI);
20135 }))
20136 break;
20137 V.buildTree(VL, IgnoreList);
20138 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20139 if (!AdjustReducedVals())
20140 V.analyzedReductionVals(VL);
20141 continue;
20142 }
20143 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20144 if (!AdjustReducedVals())
20145 V.analyzedReductionVals(VL);
20146 continue;
20147 }
20148 V.reorderTopToBottom();
20149 // No need to reorder the root node at all.
20150 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20151 // Keep extracted other reduction values, if they are used in the
20152 // vectorization trees.
20153 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20154 ExternallyUsedValues);
20155 // The reduction root is used as the insertion point for new
20156 // instructions, so set it as externally used to prevent it from being
20157 // deleted.
20158 LocalExternallyUsedValues.insert(ReductionRoot);
20159 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20160 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20161 continue;
20162 for (Value *V : ReducedVals[Cnt])
20163 if (isa<Instruction>(V))
20164 LocalExternallyUsedValues.insert(TrackedVals[V]);
20165 }
20166 if (!IsSupportedHorRdxIdentityOp) {
20167 // Number of uses of the candidates in the vector of values.
20168 assert(SameValuesCounter.empty() &&
20169 "Reused values counter map is not empty");
20170 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20171 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20172 continue;
20173 Value *V = Candidates[Cnt];
20174 Value *OrigV = TrackedToOrig.at(V);
20175 ++SameValuesCounter.try_emplace(OrigV).first->second;
20176 }
20177 }
20178 V.transformNodes();
20179 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20180 // Gather externally used values.
20181 SmallPtrSet<Value *, 4> Visited;
20182 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20183 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20184 continue;
20185 Value *RdxVal = Candidates[Cnt];
20186 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20187 RdxVal = It->second;
20188 if (!Visited.insert(RdxVal).second)
20189 continue;
20190 // Check if the scalar was vectorized as part of the vectorization
20191 // tree but not the top node.
20192 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20193 LocalExternallyUsedValues.insert(RdxVal);
20194 continue;
20195 }
20196 Value *OrigV = TrackedToOrig.at(RdxVal);
20197 unsigned NumOps =
20198 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20199 if (NumOps != ReducedValsToOps.at(OrigV).size())
20200 LocalExternallyUsedValues.insert(RdxVal);
20201 }
20202 // Do not need the list of reused scalars in regular mode anymore.
20203 if (!IsSupportedHorRdxIdentityOp)
20204 SameValuesCounter.clear();
20205 for (Value *RdxVal : VL)
20206 if (RequiredExtract.contains(RdxVal))
20207 LocalExternallyUsedValues.insert(RdxVal);
20208 V.buildExternalUses(LocalExternallyUsedValues);
20209
20210 V.computeMinimumValueSizes();
20211
20212 // Estimate cost.
20213 InstructionCost TreeCost = V.getTreeCost(VL);
20214 InstructionCost ReductionCost =
20215 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20216 InstructionCost Cost = TreeCost + ReductionCost;
20217 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20218 << " for reduction\n");
20219 if (!Cost.isValid())
20220 break;
20221 if (Cost >= -SLPCostThreshold) {
20222 V.getORE()->emit([&]() {
20223 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20224 ReducedValsToOps.at(VL[0]).front())
20225 << "Vectorizing horizontal reduction is possible "
20226 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20227 << " and threshold "
20228 << ore::NV("Threshold", -SLPCostThreshold);
20229 });
20230 if (!AdjustReducedVals()) {
20231 V.analyzedReductionVals(VL);
20232 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20233 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20234 // Add subvectors of VL to the list of the analyzed values.
20235 for (unsigned VF = getFloorFullVectorNumberOfElements(
20236 *TTI, VL.front()->getType(), ReduxWidth - 1);
20237 VF >= ReductionLimit;
20238 VF = getFloorFullVectorNumberOfElements(
20239 *TTI, VL.front()->getType(), VF - 1)) {
20240 if (has_single_bit(VF) &&
20241 V.getCanonicalGraphSize() != V.getTreeSize())
20242 continue;
20243 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20244 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20245 }
20246 }
20247 }
20248 continue;
20249 }
20250
20251 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20252 << Cost << ". (HorRdx)\n");
20253 V.getORE()->emit([&]() {
20254 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20255 ReducedValsToOps.at(VL[0]).front())
20256 << "Vectorized horizontal reduction with cost "
20257 << ore::NV("Cost", Cost) << " and with tree size "
20258 << ore::NV("TreeSize", V.getTreeSize());
20259 });
20260
20261 Builder.setFastMathFlags(RdxFMF);
20262
20263 // Emit a reduction. If the root is a select (min/max idiom), the insert
20264 // point is the compare condition of that select.
20265 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20266 Instruction *InsertPt = RdxRootInst;
20267 if (IsCmpSelMinMax)
20268 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20269
20270 // Vectorize a tree.
20271 Value *VectorizedRoot =
20272 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20273 // Update TrackedToOrig mapping, since the tracked values might be
20274 // updated.
20275 for (Value *RdxVal : Candidates) {
20276 Value *OrigVal = TrackedToOrig.at(RdxVal);
20277 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20278 if (TransformedRdxVal != RdxVal)
20279 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20280 }
20281
20282 Builder.SetInsertPoint(InsertPt);
20283
20284 // To prevent poison from leaking across what used to be sequential,
20285 // safe, scalar boolean logic operations, the reduction operand must be
20286 // frozen.
20287 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20288 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20289
20290 // Emit code to correctly handle reused reduced values, if required.
20291 if (OptReusedScalars && !SameScaleFactor) {
20292 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20293 SameValuesCounter, TrackedToOrig);
20294 }
20295
20296 Value *ReducedSubTree;
20297 Type *ScalarTy = VL.front()->getType();
20298 if (isa<FixedVectorType>(ScalarTy)) {
20299 assert(SLPReVec && "FixedVectorType is not expected.");
20300 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20301 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20302 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20303 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20304 // Do reduction for each lane.
20305 // e.g., do reduce add for
20306 // VL[0] = <4 x Ty> <a, b, c, d>
20307 // VL[1] = <4 x Ty> <e, f, g, h>
20308 // Lane[0] = <2 x Ty> <a, e>
20309 // Lane[1] = <2 x Ty> <b, f>
20310 // Lane[2] = <2 x Ty> <c, g>
20311 // Lane[3] = <2 x Ty> <d, h>
20312 // result[0] = reduce add Lane[0]
20313 // result[1] = reduce add Lane[1]
20314 // result[2] = reduce add Lane[2]
20315 // result[3] = reduce add Lane[3]
20316 SmallVector<int, 16> Mask =
20317 createStrideMask(I, ScalarTyNumElements, VL.size());
20318 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20319 ReducedSubTree = Builder.CreateInsertElement(
20320 ReducedSubTree,
20321 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20322 }
20323 } else {
20324 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20325 RdxRootInst->getType());
20326 }
20327 if (ReducedSubTree->getType() != VL.front()->getType()) {
20328 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20329 "Expected different reduction type.");
20330 ReducedSubTree =
20331 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20332 V.isSignedMinBitwidthRootNode());
20333 }
20334
20335 // Improved analysis for add/fadd/xor reductions with same scale factor
20336 // for all operands of reductions. We can emit scalar ops for them
20337 // instead.
20338 if (OptReusedScalars && SameScaleFactor)
20339 ReducedSubTree = emitScaleForReusedOps(
20340 ReducedSubTree, Builder, SameValuesCounter.front().second);
20341
20342 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20343 // Count vectorized reduced values to exclude them from final reduction.
20344 for (Value *RdxVal : VL) {
20345 Value *OrigV = TrackedToOrig.at(RdxVal);
20346 if (IsSupportedHorRdxIdentityOp) {
20347 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20348 continue;
20349 }
20350 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20351 if (!V.isVectorized(RdxVal))
20352 RequiredExtract.insert(RdxVal);
20353 }
20354 Pos += ReduxWidth;
20355 Start = Pos;
20356 ReduxWidth = NumReducedVals - Pos;
20357 if (ReduxWidth > 1)
20358 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20359 AnyVectorized = true;
20360 }
20361 if (OptReusedScalars && !AnyVectorized) {
20362 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20363 Value *RdxVal = TrackedVals.at(P.first);
20364 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20365 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20366 VectorizedVals.try_emplace(P.first, P.second);
20367 }
20368 continue;
20369 }
20370 }
20371 if (VectorizedTree) {
20372 // Reorder operands of bool logical op in the natural order to avoid
20373 // possible problem with poison propagation. If not possible to reorder
20374 // (both operands are originally RHS), emit an extra freeze instruction
20375 // for the LHS operand.
20376 // I.e., if we have original code like this:
20377 // RedOp1 = select i1 ?, i1 LHS, i1 false
20378 // RedOp2 = select i1 RHS, i1 ?, i1 false
20379
20380 // Then, we swap LHS/RHS to create a new op that matches the poison
20381 // semantics of the original code.
20382
20383 // If we have original code like this and both values could be poison:
20384 // RedOp1 = select i1 ?, i1 LHS, i1 false
20385 // RedOp2 = select i1 ?, i1 RHS, i1 false
20386
20387 // Then, we must freeze LHS in the new op.
20388 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20389 Instruction *RedOp1,
20390 Instruction *RedOp2,
20391 bool InitStep) {
20392 if (!AnyBoolLogicOp)
20393 return;
20394 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20395 getRdxOperand(RedOp1, 0) == LHS ||
20396 isGuaranteedNotToBePoison(LHS, AC)))
20397 return;
20398 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20399 getRdxOperand(RedOp2, 0) == RHS ||
20400 isGuaranteedNotToBePoison(RHS, AC))) {
20401 std::swap(LHS, RHS);
20402 return;
20403 }
20404 if (LHS != VectorizedTree)
20405 LHS = Builder.CreateFreeze(LHS);
20406 };
20407 // Finish the reduction.
20408 // Need to add extra arguments and not vectorized possible reduction
20409 // values.
20410 // Try to avoid dependencies between the scalar remainders after
20411 // reductions.
20412 auto FinalGen =
20413 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20414 bool InitStep) {
20415 unsigned Sz = InstVals.size();
20416 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20417 Sz % 2);
20418 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20419 Instruction *RedOp = InstVals[I + 1].first;
20420 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20421 Value *RdxVal1 = InstVals[I].second;
20422 Value *StableRdxVal1 = RdxVal1;
20423 auto It1 = TrackedVals.find(RdxVal1);
20424 if (It1 != TrackedVals.end())
20425 StableRdxVal1 = It1->second;
20426 Value *RdxVal2 = InstVals[I + 1].second;
20427 Value *StableRdxVal2 = RdxVal2;
20428 auto It2 = TrackedVals.find(RdxVal2);
20429 if (It2 != TrackedVals.end())
20430 StableRdxVal2 = It2->second;
20431 // To prevent poison from leaking across what used to be
20432 // sequential, safe, scalar boolean logic operations, the
20433 // reduction operand must be frozen.
20434 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20435 RedOp, InitStep);
20436 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20437 StableRdxVal2, "op.rdx", ReductionOps);
20438 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20439 }
20440 if (Sz % 2 == 1)
20441 ExtraReds[Sz / 2] = InstVals.back();
20442 return ExtraReds;
20443 };
20444 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20445 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20446 VectorizedTree);
20447 SmallPtrSet<Value *, 8> Visited;
20448 for (ArrayRef<Value *> Candidates : ReducedVals) {
20449 for (Value *RdxVal : Candidates) {
20450 if (!Visited.insert(RdxVal).second)
20451 continue;
20452 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20453 for (Instruction *RedOp :
20454 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20455 ExtraReductions.emplace_back(RedOp, RdxVal);
20456 }
20457 }
20458 // Iterate through all not-vectorized reduction values/extra arguments.
20459 bool InitStep = true;
20460 while (ExtraReductions.size() > 1) {
20461 SmallVector<std::pair<Instruction *, Value *>> NewReds =
20462 FinalGen(ExtraReductions, InitStep);
20463 ExtraReductions.swap(NewReds);
20464 InitStep = false;
20465 }
20466 VectorizedTree = ExtraReductions.front().second;
20467
20468 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20469
20470 // The original scalar reduction is expected to have no remaining
20471 // uses outside the reduction tree itself. Assert that we got this
20472 // correct, replace internal uses with poison, and mark for eventual
20473 // deletion.
20474#ifndef NDEBUG
20475 SmallSet<Value *, 4> IgnoreSet;
20476 for (ArrayRef<Value *> RdxOps : ReductionOps)
20477 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20478#endif
20479 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20480 for (Value *Ignore : RdxOps) {
20481 if (!Ignore)
20482 continue;
20483#ifndef NDEBUG
20484 for (auto *U : Ignore->users()) {
20485 assert(IgnoreSet.count(U) &&
20486 "All users must be in the reduction ops list.");
20487 }
20488#endif
20489 if (!Ignore->use_empty()) {
20490 Value *P = PoisonValue::get(Ignore->getType());
20491 Ignore->replaceAllUsesWith(P);
20492 }
20493 }
20494 V.removeInstructionsAndOperands(RdxOps);
20495 }
20496 } else if (!CheckForReusedReductionOps) {
20497 for (ReductionOpsType &RdxOps : ReductionOps)
20498 for (Value *RdxOp : RdxOps)
20499 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20500 }
20501 return VectorizedTree;
20502 }
20503
20504private:
20505 /// Calculate the cost of a reduction.
20506 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20507 ArrayRef<Value *> ReducedVals,
20508 bool IsCmpSelMinMax, FastMathFlags FMF,
20509 const BoUpSLP &R) {
20510 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20511 Type *ScalarTy = ReducedVals.front()->getType();
20512 unsigned ReduxWidth = ReducedVals.size();
20513 FixedVectorType *VectorTy = R.getReductionType();
20514 InstructionCost VectorCost = 0, ScalarCost;
20515 // If all of the reduced values are constant, the vector cost is 0, since
20516 // the reduction value can be calculated at compile time.
20517 bool AllConsts = allConstant(ReducedVals);
20518 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20519 InstructionCost Cost = 0;
20520 // Scalar cost is repeated for N-1 elements.
20521 int Cnt = ReducedVals.size();
20522 for (Value *RdxVal : ReducedVals) {
20523 if (Cnt == 1)
20524 break;
20525 --Cnt;
20526 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20527 Cost += GenCostFn();
20528 continue;
20529 }
20530 InstructionCost ScalarCost = 0;
20531 for (User *U : RdxVal->users()) {
20532 auto *RdxOp = cast<Instruction>(U);
20533 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20534 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20535 continue;
20536 }
20537 ScalarCost = InstructionCost::getInvalid();
20538 break;
20539 }
20540 if (ScalarCost.isValid())
20541 Cost += ScalarCost;
20542 else
20543 Cost += GenCostFn();
20544 }
20545 return Cost;
20546 };
20547 switch (RdxKind) {
20548 case RecurKind::Add:
20549 case RecurKind::Mul:
20550 case RecurKind::Or:
20551 case RecurKind::And:
20552 case RecurKind::Xor:
20553 case RecurKind::FAdd:
20554 case RecurKind::FMul: {
20555 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20556 if (!AllConsts) {
20557 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20558 assert(SLPReVec && "FixedVectorType is not expected.");
20559 unsigned ScalarTyNumElements = VecTy->getNumElements();
20560 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20561 VectorCost += TTI->getShuffleCost(
20562 TTI::SK_PermuteSingleSrc, VectorTy,
20563 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20564 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20565 CostKind);
20566 }
20567 VectorCost += TTI->getScalarizationOverhead(
20568 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20569 /*Extract*/ false, TTI::TCK_RecipThroughput);
20570 } else {
20571 Type *RedTy = VectorTy->getElementType();
20572 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20573 std::make_pair(RedTy, true));
20574 if (RType == RedTy) {
20575 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20576 FMF, CostKind);
20577 } else {
20578 VectorCost = TTI->getExtendedReductionCost(
20579 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20580 FMF, CostKind);
20581 }
20582 }
20583 }
20584 ScalarCost = EvaluateScalarCost([&]() {
20585 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20586 });
20587 break;
20588 }
20589 case RecurKind::FMax:
20590 case RecurKind::FMin:
20591 case RecurKind::FMaximum:
20592 case RecurKind::FMinimum:
20593 case RecurKind::SMax:
20594 case RecurKind::SMin:
20595 case RecurKind::UMax:
20596 case RecurKind::UMin: {
20597 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
20598 if (!AllConsts)
20599 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20600 ScalarCost = EvaluateScalarCost([&]() {
20601 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20602 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20603 });
20604 break;
20605 }
20606 default:
20607 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20608 }
20609
20610 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20611 << " for reduction of " << shortBundleName(ReducedVals)
20612 << " (It is a splitting reduction)\n");
20613 return VectorCost - ScalarCost;
20614 }
20615
20616 /// Emit a horizontal reduction of the vectorized value.
20617 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20618 const TargetTransformInfo *TTI, Type *DestTy) {
20619 assert(VectorizedValue && "Need to have a vectorized tree node");
20620 assert(RdxKind != RecurKind::FMulAdd &&
20621 "A call to the llvm.fmuladd intrinsic is not handled yet");
20622
20623 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20624 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20625 RdxKind == RecurKind::Add &&
20626 DestTy->getScalarType() != FTy->getScalarType()) {
20627 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20628 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
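// A sketch of the emitted IR for an <8 x i1> input (the final zext/trunc to
// the destination type is done by the caller):
//   %int = bitcast <8 x i1> %vec to i8
//   %res = call i8 @llvm.ctpop.i8(i8 %int)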
20629 Value *V = Builder.CreateBitCast(
20630 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20631 ++NumVectorInstructions;
20632 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20633 }
20634 ++NumVectorInstructions;
20635 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20636 }
20637
20638 /// Emits optimized code for unique scalar value reused \p Cnt times.
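/// E.g. for an add reduction where the same scalar x occurs Cnt times, the
/// chain x + x + ... + x is emitted as a single multiply: mul x, Cnt.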
20639 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20640 unsigned Cnt) {
20641 assert(IsSupportedHorRdxIdentityOp &&
20642 "The optimization of matched scalar identity horizontal reductions "
20643 "must be supported.");
20644 if (Cnt == 1)
20645 return VectorizedValue;
20646 switch (RdxKind) {
20647 case RecurKind::Add: {
20648 // res = mul vv, n
20649 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20650 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20651 << VectorizedValue << ". (HorRdx)\n");
20652 return Builder.CreateMul(VectorizedValue, Scale);
20653 }
20654 case RecurKind::Xor: {
20655 // res = n % 2 ? 0 : vv
20656 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20657 << ". (HorRdx)\n");
20658 if (Cnt % 2 == 0)
20659 return Constant::getNullValue(VectorizedValue->getType());
20660 return VectorizedValue;
20661 }
20662 case RecurKind::FAdd: {
20663 // res = fmul v, n
20664 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20665 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20666 << VectorizedValue << ". (HorRdx)\n");
20667 return Builder.CreateFMul(VectorizedValue, Scale);
20668 }
20669 case RecurKind::And:
20670 case RecurKind::Or:
20671 case RecurKind::SMax:
20672 case RecurKind::SMin:
20673 case RecurKind::UMax:
20674 case RecurKind::UMin:
20675 case RecurKind::FMax:
20676 case RecurKind::FMin:
20677 case RecurKind::FMaximum:
20678 case RecurKind::FMinimum:
20679 // res = vv
20680 return VectorizedValue;
20681 case RecurKind::Mul:
20682 case RecurKind::FMul:
20683 case RecurKind::FMulAdd:
20684 case RecurKind::IAnyOf:
20685 case RecurKind::FAnyOf:
20686 case RecurKind::IFindLastIV:
20687 case RecurKind::FFindLastIV:
20688 case RecurKind::None:
20689 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20690 }
20691 return nullptr;
20692 }
20693
20694 /// Emits actual operation for the scalar identity values, found during
20695 /// horizontal reduction analysis.
20696 Value *
20697 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20698 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20699 const DenseMap<Value *, Value *> &TrackedToOrig) {
20700 assert(IsSupportedHorRdxIdentityOp &&
20701 "The optimization of matched scalar identity horizontal reductions "
20702 "must be supported.");
20703 ArrayRef<Value *> VL = R.getRootNodeScalars();
20704 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20705 if (VTy->getElementType() != VL.front()->getType()) {
20706 VectorizedValue = Builder.CreateIntCast(
20707 VectorizedValue,
20708 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20709 R.isSignedMinBitwidthRootNode());
20710 }
20711 switch (RdxKind) {
20712 case RecurKind::Add: {
20713 // root = mul prev_root, <1, 1, n, 1>
20714 SmallVector<Constant *> Vals;
20715 for (Value *V : VL) {
20716 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20717 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20718 }
20719 auto *Scale = ConstantVector::get(Vals);
20720 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20721 << VectorizedValue << ". (HorRdx)\n");
20722 return Builder.CreateMul(VectorizedValue, Scale);
20723 }
20724 case RecurKind::And:
20725 case RecurKind::Or:
20726 // No need for multiple or/and(s).
20727 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20728 << ". (HorRdx)\n");
20729 return VectorizedValue;
20730 case RecurKind::SMax:
20731 case RecurKind::SMin:
20732 case RecurKind::UMax:
20733 case RecurKind::UMin:
20734 case RecurKind::FMax:
20735 case RecurKind::FMin:
20736 case RecurKind::FMaximum:
20737 case RecurKind::FMinimum:
20738 // No need for multiple min/max(s) of the same value.
20739 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20740 << ". (HorRdx)\n");
20741 return VectorizedValue;
20742 case RecurKind::Xor: {
20743 // Replace values that have an even number of repeats with 0, since
20744 // x xor x = 0.
20745 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20746 // 7>, if the 4th and 6th elements have an even number of repeats.
20747 SmallVector<int> Mask(
20748 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20749 0);
20750 std::iota(Mask.begin(), Mask.end(), 0);
20751 bool NeedShuffle = false;
20752 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20753 Value *V = VL[I];
20754 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20755 if (Cnt % 2 == 0) {
20756 Mask[I] = VF;
20757 NeedShuffle = true;
20758 }
20759 }
20760 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20761 : Mask) dbgs()
20762 << I << " ";
20763 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20764 if (NeedShuffle)
20765 VectorizedValue = Builder.CreateShuffleVector(
20766 VectorizedValue,
20767 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20768 return VectorizedValue;
20769 }
20770 case RecurKind::FAdd: {
20771 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20772 SmallVector<Constant *> Vals;
20773 for (Value *V : VL) {
20774 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20775 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20776 }
20777 auto *Scale = ConstantVector::get(Vals);
20778 return Builder.CreateFMul(VectorizedValue, Scale);
20779 }
20780 case RecurKind::Mul:
20781 case RecurKind::FMul:
20782 case RecurKind::FMulAdd:
20783 case RecurKind::IAnyOf:
20784 case RecurKind::FAnyOf:
20785 case RecurKind::IFindLastIV:
20786 case RecurKind::FFindLastIV:
20787 case RecurKind::None:
20788 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20789 }
20790 return nullptr;
20791 }
20792};
20793} // end anonymous namespace
20794
20795/// Gets recurrence kind from the specified value.
20796static RecurKind getRdxKind(Value *V) {
20797 return HorizontalReduction::getRdxKind(V);
20798}
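/// Returns the total number of scalar elements in a homogeneous aggregate
/// built by \p InsertInst, e.g. 4 for {<2 x float>, <2 x float>}, or
/// std::nullopt if the aggregate is not homogeneous or not supported.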
20799static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20800 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20801 return cast<FixedVectorType>(IE->getType())->getNumElements();
20802
20803 unsigned AggregateSize = 1;
20804 auto *IV = cast<InsertValueInst>(InsertInst);
20805 Type *CurrentType = IV->getType();
20806 do {
20807 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20808 for (auto *Elt : ST->elements())
20809 if (Elt != ST->getElementType(0)) // check homogeneity
20810 return std::nullopt;
20811 AggregateSize *= ST->getNumElements();
20812 CurrentType = ST->getElementType(0);
20813 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20814 AggregateSize *= AT->getNumElements();
20815 CurrentType = AT->getElementType();
20816 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20817 AggregateSize *= VT->getNumElements();
20818 return AggregateSize;
20819 } else if (CurrentType->isSingleValueType()) {
20820 return AggregateSize;
20821 } else {
20822 return std::nullopt;
20823 }
20824 } while (true);
20825}
20826
20827static void findBuildAggregate_rec(Instruction *LastInsertInst,
20828 TargetTransformInfo *TTI,
20829 SmallVectorImpl<Value *> &BuildVectorOpds,
20830 SmallVectorImpl<Value *> &InsertElts,
20831 unsigned OperandOffset, const BoUpSLP &R) {
20832 do {
20833 Value *InsertedOperand = LastInsertInst->getOperand(1);
20834 std::optional<unsigned> OperandIndex =
20835 getElementIndex(LastInsertInst, OperandOffset);
20836 if (!OperandIndex || R.isDeleted(LastInsertInst))
20837 return;
20838 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20839 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20840 BuildVectorOpds, InsertElts, *OperandIndex, R);
20841
20842 } else {
20843 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20844 InsertElts[*OperandIndex] = LastInsertInst;
20845 }
20846 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20847 } while (LastInsertInst != nullptr &&
20848 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20849 LastInsertInst->hasOneUse());
20850}
20851
20852/// Recognize construction of vectors like
20853/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20854/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20855/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20856/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20857/// starting from the last insertelement or insertvalue instruction.
20858///
20859/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20860/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20861/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20862///
20863/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20864///
20865/// \return true if it matches.
20866static bool findBuildAggregate(Instruction *LastInsertInst,
20867 TargetTransformInfo *TTI,
20868 SmallVectorImpl<Value *> &BuildVectorOpds,
20869 SmallVectorImpl<Value *> &InsertElts,
20870 const BoUpSLP &R) {
20871
20872 assert((isa<InsertElementInst>(LastInsertInst) ||
20873 isa<InsertValueInst>(LastInsertInst)) &&
20874 "Expected insertelement or insertvalue instruction!");
20875
20876 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20877 "Expected empty result vectors!");
20878
20879 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20880 if (!AggregateSize)
20881 return false;
20882 BuildVectorOpds.resize(*AggregateSize);
20883 InsertElts.resize(*AggregateSize);
20884
20885 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20886 R);
20887 llvm::erase(BuildVectorOpds, nullptr);
20888 llvm::erase(InsertElts, nullptr);
20889 if (BuildVectorOpds.size() >= 2)
20890 return true;
20891
20892 return false;
20893}
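// For illustration, a homogeneous-aggregate build that findBuildAggregate also
// matches (hypothetical IR, in the spirit of pr42022.ll):
//   %i0 = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %a, 0
//   %i1 = insertvalue { <2 x float>, <2 x float> } %i0, <2 x float> %b, 1
// Starting from %i1, getAggregateSize returns 4 and the recursion collects
// %a and %b as the build operands handed on to the list vectorizer.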
20894
20895/// Try and get a reduction instruction from a phi node.
20896///
20897/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20898/// if they come from either \p ParentBB or a containing loop latch.
20899///
20900/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20901/// if not possible.
20902static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
20903 BasicBlock *ParentBB, LoopInfo *LI) {
20904 // There are situations where the reduction value is not dominated by the
20905 // reduction phi. Vectorizing such cases has been reported to cause
20906 // miscompiles. See PR25787.
20907 auto DominatedReduxValue = [&](Value *R) {
20908 return isa<Instruction>(R) &&
20909 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20910 };
20911
20912 Instruction *Rdx = nullptr;
20913
20914 // Return the incoming value if it comes from the same BB as the phi node.
20915 if (P->getIncomingBlock(0) == ParentBB) {
20916 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20917 } else if (P->getIncomingBlock(1) == ParentBB) {
20918 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20919 }
20920
20921 if (Rdx && DominatedReduxValue(Rdx))
20922 return Rdx;
20923
20924 // Otherwise, check whether we have a loop latch to look at.
20925 Loop *BBL = LI->getLoopFor(ParentBB);
20926 if (!BBL)
20927 return nullptr;
20928 BasicBlock *BBLatch = BBL->getLoopLatch();
20929 if (!BBLatch)
20930 return nullptr;
20931
20932 // There is a loop latch, return the incoming value if it comes from
20933 // that. This reduction pattern occasionally turns up.
20934 if (P->getIncomingBlock(0) == BBLatch) {
20935 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20936 } else if (P->getIncomingBlock(1) == BBLatch) {
20937 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20938 }
20939
20940 if (Rdx && DominatedReduxValue(Rdx))
20941 return Rdx;
20942
20943 return nullptr;
20944}
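// For illustration, a minimal sketch of the pattern handled above
// (hypothetical IR):
//   loop:
//     %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
//     ...
//     %sum.next = fadd fast float %sum, %x
// With P = %sum and ParentBB = %loop (or the loop latch), the incoming value
// %sum.next is returned as the reduction candidate, since its block is
// dominated by the PHI's block.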
20945
20946static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20947 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20948 return true;
20949 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20950 return true;
20951 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20952 return true;
20953 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20954 return true;
20955 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20956 return true;
20957 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20958 return true;
20959 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20960 return true;
20961 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20962 return true;
20963 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20964 return true;
20965 return false;
20966}
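// For illustration, besides plain binary operators the matcher above also
// accepts the min/max intrinsic forms of a reduction operation, e.g.
// (hypothetical IR):
//   %op = call float @llvm.maxnum.f32(float %a, float %b)
// for which V0 and V1 are bound to %a and %b.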
20967
20968/// We could have an initial reduction that is not an add.
20969/// r *= v1 + v2 + v3 + v4
20970/// In such a case start looking for a tree rooted in the first '+'.
20971/// \Returns the new root if found, which may be nullptr if not an instruction.
20972static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
20973 Instruction *Root) {
20974 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20975 isa<IntrinsicInst>(Root)) &&
20976 "Expected binop, select, or intrinsic for reduction matching");
20977 Value *LHS =
20978 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20979 Value *RHS =
20980 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20981 if (LHS == Phi)
20982 return dyn_cast<Instruction>(RHS);
20983 if (RHS == Phi)
20984 return dyn_cast<Instruction>(LHS);
20985 return nullptr;
20986}
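// For illustration, a minimal sketch of the "r *= v1 + v2 + ..." case above
// (hypothetical IR):
//   %phi = phi float [ ..., %entry ], [ %mul, %loop ]
//   %add = fadd fast float %v12, %v34
//   %mul = fmul fast float %phi, %add
// With Phi = %phi and Root = %mul, the LHS matches the phi, so %add is
// returned as the new reduction root to analyze.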
20987
20988/// \Returns the first operand of \p I that does not match \p Phi. If the
20989/// operand is not an instruction, it returns nullptr.
20990static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
20991 Value *Op0 = nullptr;
20992 Value *Op1 = nullptr;
20993 if (!matchRdxBop(I, Op0, Op1))
20994 return nullptr;
20995 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
20996}
20997
20998/// \Returns true if \p I is a candidate instruction for reduction vectorization.
20999static bool isReductionCandidate(Instruction *I) {
21000 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
21001 Value *B0 = nullptr, *B1 = nullptr;
21002 bool IsBinop = matchRdxBop(I, B0, B1);
21003 return IsBinop || IsSelect;
21004}
21005
21006bool SLPVectorizerPass::vectorizeHorReduction(
21007 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
21008 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21009 if (!ShouldVectorizeHor)
21010 return false;
21011 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21012
21013 if (Root->getParent() != BB || isa<PHINode>(Root))
21014 return false;
21015
21016 // If we can find a secondary reduction root, use that instead.
21017 auto SelectRoot = [&]() {
21018 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
21019 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
21020 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
21021 return NewRoot;
21022 return Root;
21023 };
21024
21025 // Start the analysis from the Root instruction. If a horizontal reduction is
21026 // found, try to vectorize it. If it is not a horizontal reduction, or
21027 // vectorization is not possible or not effective, and the currently analyzed
21028 // instruction is a binary operation, try to vectorize the operands using a
21029 // pre-order DFS traversal. If the operands were not vectorized, repeat the
21030 // same procedure, considering each operand as a possible root of a
21031 // horizontal reduction.
21032 // Interrupt the process if the Root instruction itself was vectorized or all
21033 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
21034 // If a horizontal reduction was not matched or vectorized, we collect the
21035 // instructions for possible later vectorization attempts.
21036 std::queue<std::pair<Instruction *, unsigned>> Stack;
21037 Stack.emplace(SelectRoot(), 0);
21038 SmallPtrSet<Value *, 8> VisitedInstrs;
21039 bool Res = false;
21040 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21041 if (R.isAnalyzedReductionRoot(Inst))
21042 return nullptr;
21043 if (!isReductionCandidate(Inst))
21044 return nullptr;
21045 HorizontalReduction HorRdx;
21046 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21047 return nullptr;
21048 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21049 };
21050 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21051 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21052 FutureSeed = getNonPhiOperand(Root, P);
21053 if (!FutureSeed)
21054 return false;
21055 }
21056 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21057 // analysis is done separately.
21058 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21059 PostponedInsts.push_back(FutureSeed);
21060 return true;
21061 };
21062
21063 while (!Stack.empty()) {
21064 Instruction *Inst;
21065 unsigned Level;
21066 std::tie(Inst, Level) = Stack.front();
21067 Stack.pop();
21068 // Do not try to analyze an instruction that has already been vectorized.
21069 // This may happen when we vectorize instruction operands on a previous
21070 // iteration, while the stack was populated before that happened.
21071 if (R.isDeleted(Inst))
21072 continue;
21073 if (Value *VectorizedV = TryToReduce(Inst)) {
21074 Res = true;
21075 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21076 // Try to find another reduction.
21077 Stack.emplace(I, Level);
21078 continue;
21079 }
21080 if (R.isDeleted(Inst))
21081 continue;
21082 } else {
21083 // We could not vectorize `Inst` so try to use it as a future seed.
21084 if (!TryAppendToPostponedInsts(Inst)) {
21085 assert(Stack.empty() && "Expected empty stack");
21086 break;
21087 }
21088 }
21089
21090 // Try to vectorize operands.
21091 // Continue analysis for the instruction from the same basic block only to
21092 // save compile time.
21093 if (++Level < RecursionMaxDepth)
21094 for (auto *Op : Inst->operand_values())
21095 if (VisitedInstrs.insert(Op).second)
21096 if (auto *I = dyn_cast<Instruction>(Op))
21097 // Do not try to vectorize CmpInst operands, this is done
21098 // separately.
21099 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21100 !R.isDeleted(I) && I->getParent() == BB)
21101 Stack.emplace(I, Level);
21102 }
21103 return Res;
21104}
21105
21106bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21107 BasicBlock *BB, BoUpSLP &R) {
21108 SmallVector<WeakTrackingVH> PostponedInsts;
21109 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21110 Res |= tryToVectorize(PostponedInsts, R);
21111 return Res;
21112}
21113
21114bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21115 BoUpSLP &R) {
21116 bool Res = false;
21117 for (Value *V : Insts)
21118 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21119 Res |= tryToVectorize(Inst, R);
21120 return Res;
21121}
21122
21123bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21124 BasicBlock *BB, BoUpSLP &R,
21125 bool MaxVFOnly) {
21126 if (!R.canMapToVector(IVI->getType()))
21127 return false;
21128
21129 SmallVector<Value *, 16> BuildVectorOpds;
21130 SmallVector<Value *, 16> BuildVectorInsts;
21131 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21132 return false;
21133
21134 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21135 R.getORE()->emit([&]() {
21136 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21137 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21138 "trying reduction first.";
21139 });
21140 return false;
21141 }
21142 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21143 // The aggregate value is unlikely to be processed in a vector register.
21144 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21145}
21146
21147bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21148 BasicBlock *BB, BoUpSLP &R,
21149 bool MaxVFOnly) {
21150 SmallVector<Value *, 16> BuildVectorInsts;
21151 SmallVector<Value *, 16> BuildVectorOpds;
21152 SmallVector<int> Mask;
21153 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21154 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21155 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21156 return false;
21157
21158 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21159 R.getORE()->emit([&]() {
21160 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21161 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21162 "trying reduction first.";
21163 });
21164 return false;
21165 }
21166 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21167 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21168}
21169
21170template <typename T>
21171static bool tryToVectorizeSequence(
21172 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21173 function_ref<bool(T *, T *)> AreCompatible,
21174 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21175 bool MaxVFOnly, BoUpSLP &R) {
21176 bool Changed = false;
21177 // Sort by type, parent, operands.
21178 stable_sort(Incoming, Comparator);
21179
21180 // Try to vectorize elements based on their type.
21181 SmallVector<T *> Candidates;
21182 SmallVector<T *> VL;
21183 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21184 VL.clear()) {
21185 // Look for the next elements with the same type, parent and operand
21186 // kinds.
21187 auto *I = dyn_cast<Instruction>(*IncIt);
21188 if (!I || R.isDeleted(I)) {
21189 ++IncIt;
21190 continue;
21191 }
21192 auto *SameTypeIt = IncIt;
21193 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21194 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21195 AreCompatible(*SameTypeIt, *IncIt))) {
21196 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21197 ++SameTypeIt;
21198 if (I && !R.isDeleted(I))
21199 VL.push_back(cast<T>(I));
21200 }
21201
21202 // Try to vectorize them.
21203 unsigned NumElts = VL.size();
21204 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21205 << NumElts << ")\n");
21206 // The vectorization is a 3-stage attempt:
21207 // 1. Try to vectorize instructions with the same/alternate opcodes, limited
21208 // to the size of the maximal register at first.
21209 // 2. Try to vectorize the remaining instructions with the same type, if
21210 // possible. This may give better results than vectorizing only
21211 // instructions with the same/alternate opcodes.
21212 // 3. Make a final attempt to vectorize all instructions with the
21213 // same/alternate ops only; this may result in some extra final
21214 // vectorization.
21215 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21216 // Success, start over because instructions might have been changed.
21217 Changed = true;
21218 VL.swap(Candidates);
21219 Candidates.clear();
21220 for (T *V : VL) {
21221 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21222 Candidates.push_back(V);
21223 }
21224 } else {
21225 /// \Returns the minimum number of elements that we will attempt to
21226 /// vectorize.
21227 auto GetMinNumElements = [&R](Value *V) {
21228 unsigned EltSize = R.getVectorElementSize(V);
21229 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21230 };
21231 if (NumElts < GetMinNumElements(*IncIt) &&
21232 (Candidates.empty() ||
21233 Candidates.front()->getType() == (*IncIt)->getType())) {
21234 for (T *V : VL) {
21235 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21236 Candidates.push_back(V);
21237 }
21238 }
21239 }
21240 // Final attempt to vectorize instructions with the same types.
21241 if (Candidates.size() > 1 &&
21242 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21243 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21244 // Success, start over because instructions might have been changed.
21245 Changed = true;
21246 } else if (MaxVFOnly) {
21247 // Try to vectorize using small vectors.
21248 SmallVector<T *> VL;
21249 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21250 VL.clear()) {
21251 auto *I = dyn_cast<Instruction>(*It);
21252 if (!I || R.isDeleted(I)) {
21253 ++It;
21254 continue;
21255 }
21256 auto *SameTypeIt = It;
21257 while (SameTypeIt != End &&
21258 (!isa<Instruction>(*SameTypeIt) ||
21259 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21260 AreCompatible(*SameTypeIt, *It))) {
21261 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21262 ++SameTypeIt;
21263 if (I && !R.isDeleted(I))
21264 VL.push_back(cast<T>(I));
21265 }
21266 unsigned NumElts = VL.size();
21267 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21268 /*MaxVFOnly=*/false))
21269 Changed = true;
21270 It = SameTypeIt;
21271 }
21272 }
21273 Candidates.clear();
21274 }
21275
21276 // Start over at the next instruction of a different type (or the end).
21277 IncIt = SameTypeIt;
21278 }
21279 return Changed;
21280}
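// For illustration, a typical instantiation of the helper above (a sketch
// mirroring the compare/PHI/store callers later in this file; Vals and Sorter
// are placeholder names):
//   tryToVectorizeSequence<Value>(
//       Vals, Sorter, AreCompatible,
//       [&](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
//         return tryToVectorizeList(Candidates, R, MaxVFOnly);
//       },
//       /*MaxVFOnly=*/true, R);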
21281
21282/// Compare two cmp instructions. If IsCompatibility is true, the function
21283/// returns true if the 2 cmps have same/swapped predicates and most compatible
21284/// corresponding operands. If IsCompatibility is false, the function implements
21285/// a strict weak ordering relation between two cmp instructions, returning true
21286/// if the first instruction is "less" than the second, i.e. its predicate is
21287/// less than the predicate of the second or the operand IDs are less than the
21288/// operand IDs of the second cmp instruction.
21289template <bool IsCompatibility>
21290static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21291 const DominatorTree &DT) {
21292 assert(isValidElementType(V->getType()) &&
21293 isValidElementType(V2->getType()) &&
21294 "Expected valid element types only.");
21295 if (V == V2)
21296 return IsCompatibility;
21297 auto *CI1 = cast<CmpInst>(V);
21298 auto *CI2 = cast<CmpInst>(V2);
21299 if (CI1->getOperand(0)->getType()->getTypeID() <
21300 CI2->getOperand(0)->getType()->getTypeID())
21301 return !IsCompatibility;
21302 if (CI1->getOperand(0)->getType()->getTypeID() >
21303 CI2->getOperand(0)->getType()->getTypeID())
21304 return false;
21305 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21306 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21307 return !IsCompatibility;
21308 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21309 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21310 return false;
21311 CmpInst::Predicate Pred1 = CI1->getPredicate();
21312 CmpInst::Predicate Pred2 = CI2->getPredicate();
21313 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21314 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21315 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21316 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21317 if (BasePred1 < BasePred2)
21318 return !IsCompatibility;
21319 if (BasePred1 > BasePred2)
21320 return false;
21321 // Compare operands.
21322 bool CI1Preds = Pred1 == BasePred1;
21323 bool CI2Preds = Pred2 == BasePred1;
21324 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21325 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21326 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21327 if (Op1 == Op2)
21328 continue;
21329 if (Op1->getValueID() < Op2->getValueID())
21330 return !IsCompatibility;
21331 if (Op1->getValueID() > Op2->getValueID())
21332 return false;
21333 if (auto *I1 = dyn_cast<Instruction>(Op1))
21334 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21335 if (IsCompatibility) {
21336 if (I1->getParent() != I2->getParent())
21337 return false;
21338 } else {
21339 // Try to compare nodes with same parent.
21340 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21341 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21342 if (!NodeI1)
21343 return NodeI2 != nullptr;
21344 if (!NodeI2)
21345 return false;
21346 assert((NodeI1 == NodeI2) ==
21347 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21348 "Different nodes should have different DFS numbers");
21349 if (NodeI1 != NodeI2)
21350 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21351 }
21352 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21353 if (S && (IsCompatibility || !S.isAltShuffle()))
21354 continue;
21355 if (IsCompatibility)
21356 return false;
21357 if (I1->getOpcode() != I2->getOpcode())
21358 return I1->getOpcode() < I2->getOpcode();
21359 }
21360 }
21361 return IsCompatibility;
21362}
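// For illustration (hypothetical IR): the compares
//   %c1 = icmp sgt i32 %a, %b
//   %c2 = icmp slt i32 %b, %a
// share a common base predicate after swapping, and their operands line up
// once one side is reversed, so compareCmp<true> treats them as compatible,
// while compareCmp<false> orders them consistently for the stable sort in
// vectorizeCmpInsts.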
21363
21364template <typename ItT>
21365bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21366 BasicBlock *BB, BoUpSLP &R) {
21367 bool Changed = false;
21368 // Try to find reductions first.
21369 for (CmpInst *I : CmpInsts) {
21370 if (R.isDeleted(I))
21371 continue;
21372 for (Value *Op : I->operands())
21373 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21374 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21375 if (R.isDeleted(I))
21376 break;
21377 }
21378 }
21379 // Try to vectorize operands as vector bundles.
21380 for (CmpInst *I : CmpInsts) {
21381 if (R.isDeleted(I))
21382 continue;
21383 Changed |= tryToVectorize(I, R);
21384 }
21385 // Try to vectorize list of compares.
21386 // Sort by type, compare predicate, etc.
21387 auto CompareSorter = [&](Value *V, Value *V2) {
21388 if (V == V2)
21389 return false;
21390 return compareCmp<false>(V, V2, *TLI, *DT);
21391 };
21392
21393 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21394 if (V1 == V2)
21395 return true;
21396 return compareCmp<true>(V1, V2, *TLI, *DT);
21397 };
21398
21399 SmallVector<Value *> Vals;
21400 for (Instruction *V : CmpInsts)
21401 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21402 Vals.push_back(V);
21403 if (Vals.size() <= 1)
21404 return Changed;
21405 Changed |= tryToVectorizeSequence<Value>(
21406 Vals, CompareSorter, AreCompatibleCompares,
21407 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21408 // Exclude possible reductions from other blocks.
21409 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21410 return any_of(V->users(), [V](User *U) {
21411 auto *Select = dyn_cast<SelectInst>(U);
21412 return Select &&
21413 Select->getParent() != cast<Instruction>(V)->getParent();
21414 });
21415 });
21416 if (ArePossiblyReducedInOtherBlock)
21417 return false;
21418 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21419 },
21420 /*MaxVFOnly=*/true, R);
21421 return Changed;
21422}
21423
21424bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21425 BasicBlock *BB, BoUpSLP &R) {
21426 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21427 "This function only accepts Insert instructions");
21428 bool OpsChanged = false;
21429 SmallVector<WeakTrackingVH> PostponedInsts;
21430 for (auto *I : reverse(Instructions)) {
21431 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21432 if (R.isDeleted(I) || isa<CmpInst>(I))
21433 continue;
21434 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21435 OpsChanged |=
21436 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21437 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21438 OpsChanged |=
21439 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21440 }
21441 // pass2 - try to vectorize reductions only
21442 if (R.isDeleted(I))
21443 continue;
21444 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21445 if (R.isDeleted(I) || isa<CmpInst>(I))
21446 continue;
21447 // pass3 - try to match and vectorize a buildvector sequence.
21448 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21449 OpsChanged |=
21450 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21451 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21452 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21453 /*MaxVFOnly=*/false);
21454 }
21455 }
21456 // Now try to vectorize postponed instructions.
21457 OpsChanged |= tryToVectorize(PostponedInsts, R);
21458
21459 Instructions.clear();
21460 return OpsChanged;
21461}
21462
21463bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21464 bool Changed = false;
21465 SmallVector<Value *, 4> Incoming;
21466 SmallPtrSet<Value *, 16> VisitedInstrs;
21467 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21468 // node. This makes it easier to identify the chains that can be vectorized
21469 // in a better way.
21470 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21471 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21472 assert(isValidElementType(V1->getType()) &&
21473 isValidElementType(V2->getType()) &&
21474 "Expected vectorizable types only.");
21475 // It is fine to compare type IDs here, since we expect only vectorizable
21476 // types, like ints, floats and pointers; we don't care about other types.
21477 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21478 return true;
21479 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21480 return false;
21481 if (V1->getType()->getScalarSizeInBits() <
21482 V2->getType()->getScalarSizeInBits())
21483 return true;
21484 if (V1->getType()->getScalarSizeInBits() >
21485 V2->getType()->getScalarSizeInBits())
21486 return false;
21487 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21488 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21489 if (Opcodes1.size() < Opcodes2.size())
21490 return true;
21491 if (Opcodes1.size() > Opcodes2.size())
21492 return false;
21493 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21494 {
21495 // Instructions come first.
21496 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21497 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21498 if (I1 && I2) {
21499 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21500 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21501 if (!NodeI1)
21502 return NodeI2 != nullptr;
21503 if (!NodeI2)
21504 return false;
21505 assert((NodeI1 == NodeI2) ==
21506 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21507 "Different nodes should have different DFS numbers");
21508 if (NodeI1 != NodeI2)
21509 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21510 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21511 if (S && !S.isAltShuffle())
21512 continue;
21513 return I1->getOpcode() < I2->getOpcode();
21514 }
21515 if (I1)
21516 return true;
21517 if (I2)
21518 return false;
21519 }
21520 {
21521 // Non-undef constants come next.
21522 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21523 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21524 if (C1 && C2)
21525 continue;
21526 if (C1)
21527 return true;
21528 if (C2)
21529 return false;
21530 }
21531 bool U1 = isa<UndefValue>(Opcodes1[I]);
21532 bool U2 = isa<UndefValue>(Opcodes2[I]);
21533 {
21534 // Non-constant non-instructions come next.
21535 if (!U1 && !U2) {
21536 auto ValID1 = Opcodes1[I]->getValueID();
21537 auto ValID2 = Opcodes2[I]->getValueID();
21538 if (ValID1 == ValID2)
21539 continue;
21540 if (ValID1 < ValID2)
21541 return true;
21542 if (ValID1 > ValID2)
21543 return false;
21544 }
21545 if (!U1)
21546 return true;
21547 if (!U2)
21548 return false;
21549 }
21550 // Undefs come last.
21551 assert(U1 && U2 && "The only thing left should be undef & undef.");
21552 }
21553 return false;
21554 };
21555 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21556 if (V1 == V2)
21557 return true;
21558 if (V1->getType() != V2->getType())
21559 return false;
21560 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21561 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21562 if (Opcodes1.size() != Opcodes2.size())
21563 return false;
21564 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21565 // Undefs are compatible with any other value.
21566 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21567 continue;
21568 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21569 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21570 if (R.isDeleted(I1) || R.isDeleted(I2))
21571 return false;
21572 if (I1->getParent() != I2->getParent())
21573 return false;
21574 if (getSameOpcode({I1, I2}, *TLI))
21575 continue;
21576 return false;
21577 }
21578 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21579 continue;
21580 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21581 return false;
21582 }
21583 return true;
21584 };
21585
21586 bool HaveVectorizedPhiNodes = false;
21587 do {
21588 // Collect the incoming values from the PHIs.
21589 Incoming.clear();
21590 for (Instruction &I : *BB) {
21591 auto *P = dyn_cast<PHINode>(&I);
21592 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21593 break;
21594
21595 // No need to analyze deleted, vectorized and non-vectorizable
21596 // instructions.
21597 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21598 isValidElementType(P->getType()))
21599 Incoming.push_back(P);
21600 }
21601
21602 if (Incoming.size() <= 1)
21603 break;
21604
21605 // Find the corresponding non-phi nodes for better matching when trying to
21606 // build the tree.
21607 for (Value *V : Incoming) {
21608 SmallVectorImpl<Value *> &Opcodes =
21609 PHIToOpcodes.try_emplace(V).first->getSecond();
21610 if (!Opcodes.empty())
21611 continue;
21612 SmallVector<Value *, 4> Nodes(1, V);
21613 SmallPtrSet<Value *, 4> Visited;
21614 while (!Nodes.empty()) {
21615 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21616 if (!Visited.insert(PHI).second)
21617 continue;
21618 for (Value *V : PHI->incoming_values()) {
21619 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21620 Nodes.push_back(PHI1);
21621 continue;
21622 }
21623 Opcodes.emplace_back(V);
21624 }
21625 }
21626 }
21627
21628 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21629 Incoming, PHICompare, AreCompatiblePHIs,
21630 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21631 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21632 },
21633 /*MaxVFOnly=*/true, R);
21634 Changed |= HaveVectorizedPhiNodes;
21635 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21636 auto *PHI = dyn_cast<PHINode>(P.first);
21637 return !PHI || R.isDeleted(PHI);
21638 }))
21639 PHIToOpcodes.clear();
21640 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21641 } while (HaveVectorizedPhiNodes);
21642
21643 VisitedInstrs.clear();
21644
21645 InstSetVector PostProcessInserts;
21646 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21647 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
21648 // also vectorizes `PostProcessCmps`.
21649 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21650 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21651 if (VectorizeCmps) {
21652 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21653 PostProcessCmps.clear();
21654 }
21655 PostProcessInserts.clear();
21656 return Changed;
21657 };
21658 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21659 auto IsInPostProcessInstrs = [&](Instruction *I) {
21660 if (auto *Cmp = dyn_cast<CmpInst>(I))
21661 return PostProcessCmps.contains(Cmp);
21662 return isa<InsertElementInst, InsertValueInst>(I) &&
21663 PostProcessInserts.contains(I);
21664 };
21665 // Returns true if `I` is an instruction without users, like a terminator, a
21666 // store, or a function call with an ignored return value. Unused instructions
21667 // are detected based on the instruction type, except for CallInst and InvokeInst.
21668 auto HasNoUsers = [](Instruction *I) {
21669 return I->use_empty() &&
21670 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21671 };
21672 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21673 // Skip instructions with scalable type. The number of elements is unknown at
21674 // compile time for scalable types.
21675 if (isa<ScalableVectorType>(It->getType()))
21676 continue;
21677
21678 // Skip instructions marked for deletion.
21679 if (R.isDeleted(&*It))
21680 continue;
21681 // We may go through BB multiple times, so skip the ones we have already checked.
21682 if (!VisitedInstrs.insert(&*It).second) {
21683 if (HasNoUsers(&*It) &&
21684 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21685 // We would like to start over since some instructions are deleted
21686 // and the iterator may become invalid.
21687 Changed = true;
21688 It = BB->begin();
21689 E = BB->end();
21690 }
21691 continue;
21692 }
21693
21694 if (isa<DbgInfoIntrinsic>(It))
21695 continue;
21696
21697 // Try to vectorize reductions that use PHINodes.
21698 if (PHINode *P = dyn_cast<PHINode>(It)) {
21699 // Check that the PHI is a reduction PHI.
21700 if (P->getNumIncomingValues() == 2) {
21701 // Try to match and vectorize a horizontal reduction.
21702 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21703 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21704 Changed = true;
21705 It = BB->begin();
21706 E = BB->end();
21707 continue;
21708 }
21709 }
21710 // Try to vectorize the incoming values of the PHI, to catch reductions
21711 // that feed into PHIs.
21712 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21713 // Skip if the incoming block is the current BB for now. Also, bypass
21714 // unreachable IR for efficiency and to avoid crashing.
21715 // TODO: Collect the skipped incoming values and try to vectorize them
21716 // after processing BB.
21717 if (BB == P->getIncomingBlock(I) ||
21718 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21719 continue;
21720
21721 // Postponed instructions should not be vectorized here, delay their
21722 // vectorization.
21723 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21724 PI && !IsInPostProcessInstrs(PI)) {
21725 bool Res =
21726 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21727 Changed |= Res;
21728 if (Res && R.isDeleted(P)) {
21729 It = BB->begin();
21730 E = BB->end();
21731 break;
21732 }
21733 }
21734 }
21735 continue;
21736 }
21737
21738 if (HasNoUsers(&*It)) {
21739 bool OpsChanged = false;
21740 auto *SI = dyn_cast<StoreInst>(It);
21741 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21742 if (SI) {
21743 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21744 // Try to vectorize the chain in the store, if this is the only store to the
21745 // address in the block.
21746 // TODO: This is just a temporary solution to save compile time. Need
21747 // to investigate if we can safely turn on slp-vectorize-hor-store
21748 // instead to allow lookup for reduction chains in all non-vectorized
21749 // stores (need to check side effects and compile time).
21750 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21751 SI->getValueOperand()->hasOneUse();
21752 }
21753 if (TryToVectorizeRoot) {
21754 for (auto *V : It->operand_values()) {
21755 // Postponed instructions should not be vectorized here, delay their
21756 // vectorization.
21757 if (auto *VI = dyn_cast<Instruction>(V);
21758 VI && !IsInPostProcessInstrs(VI))
21759 // Try to match and vectorize a horizontal reduction.
21760 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21761 }
21762 }
21763 // Start vectorization of post-process list of instructions from the
21764 // top-tree instructions to try to vectorize as many instructions as
21765 // possible.
21766 OpsChanged |=
21767 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21768 if (OpsChanged) {
21769 // We would like to start over since some instructions are deleted
21770 // and the iterator may become invalid.
21771 Changed = true;
21772 It = BB->begin();
21773 E = BB->end();
21774 continue;
21775 }
21776 }
21777
21778 if (isa<InsertElementInst, InsertValueInst>(It))
21779 PostProcessInserts.insert(&*It);
21780 else if (isa<CmpInst>(It))
21781 PostProcessCmps.insert(cast<CmpInst>(&*It));
21782 }
21783
21784 return Changed;
21785}
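// For illustration of the PHI grouping above (hypothetical, simplified IR):
//   %p1 = phi i32 [ %a.load, %bb1 ], [ %x.add, %bb2 ]
//   %p2 = phi i32 [ %b.load, %bb1 ], [ %y.add, %bb2 ]
// PHIToOpcodes records the non-PHI leaves of each phi; AreCompatiblePHIs sees
// matching opcodes in matching blocks (load/load in %bb1, add/add in %bb2), so
// both PHIs end up in one candidate run passed to tryToVectorizeList.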
21786
21787bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21788 auto Changed = false;
21789 for (auto &Entry : GEPs) {
21790 // If the getelementptr list has fewer than two elements, there's nothing
21791 // to do.
21792 if (Entry.second.size() < 2)
21793 continue;
21794
21795 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21796 << Entry.second.size() << ".\n");
21797
21798 // Process the GEP list in chunks suitable for the target's supported
21799 // vector size. If a vector register can't hold 1 element, we are done. We
21800 // are trying to vectorize the index computations, so the maximum number of
21801 // elements is based on the size of the index expression, rather than the
21802 // size of the GEP itself (the target's pointer size).
21803 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21804 return !R.isDeleted(GEP);
21805 });
21806 if (It == Entry.second.end())
21807 continue;
21808 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21809 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21810 if (MaxVecRegSize < EltSize)
21811 continue;
21812
21813 unsigned MaxElts = MaxVecRegSize / EltSize;
21814 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21815 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21816 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21817
21818 // Initialize a set of candidate getelementptrs. Note that we use a
21819 // SetVector here to preserve program order. If the index computations
21820 // are vectorizable and begin with loads, we want to minimize the chance
21821 // of having to reorder them later.
21822 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21823
21824 // Some of the candidates may have already been vectorized after we
21825 // initially collected them, or their index is optimized to a constant value.
21826 // If so, they are marked as deleted, so remove them from the set of
21827 // candidates.
21828 Candidates.remove_if([&R](Value *I) {
21829 return R.isDeleted(cast<Instruction>(I)) ||
21830 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21831 });
21832
21833 // Remove from the set of candidates all pairs of getelementptrs with
21834 // constant differences. Such getelementptrs are likely not good
21835 // candidates for vectorization in a bottom-up phase since one can be
21836 // computed from the other. We also ensure all candidate getelementptr
21837 // indices are unique.
21838 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21839 auto *GEPI = GEPList[I];
21840 if (!Candidates.count(GEPI))
21841 continue;
21842 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21843 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21844 auto *GEPJ = GEPList[J];
21845 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21846 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21847 Candidates.remove(GEPI);
21848 Candidates.remove(GEPJ);
21849 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21850 Candidates.remove(GEPJ);
21851 }
21852 }
21853 }
21854
21855 // We break out of the above computation as soon as we know there are
21856 // fewer than two candidates remaining.
21857 if (Candidates.size() < 2)
21858 continue;
21859
21860 // Add the single, non-constant index of each candidate to the bundle. We
21861 // ensured the indices met these constraints when we originally collected
21862 // the getelementptrs.
21863 SmallVector<Value *, 16> Bundle(Candidates.size());
21864 auto BundleIndex = 0u;
21865 for (auto *V : Candidates) {
21866 auto *GEP = cast<GetElementPtrInst>(V);
21867 auto *GEPIdx = GEP->idx_begin()->get();
21868 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21869 Bundle[BundleIndex++] = GEPIdx;
21870 }
21871
21872 // Try and vectorize the indices. We are currently only interested in
21873 // gather-like cases of the form:
21874 //
21875 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21876 //
21877 // where the loads of "a", the loads of "b", and the subtractions can be
21878 // performed in parallel. It's likely that detecting this pattern in a
21879 // bottom-up phase will be simpler and less costly than building a
21880 // full-blown top-down phase beginning at the consecutive loads.
21881 Changed |= tryToVectorizeList(Bundle, R);
21882 }
21883 }
21884 return Changed;
21885}
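// For illustration of the constant-difference filtering above (hypothetical
// IR):
//   %g0 = getelementptr inbounds i32, ptr %p, i64 %i
//   %i1 = add i64 %i, 1
//   %g1 = getelementptr inbounds i32, ptr %p, i64 %i1
// The SCEV difference of %g1 and %g0 is the constant 4, so both are removed:
// one address is trivially computable from the other, and bundling their index
// computations would not pay off. GEPs whose indices differ by a non-constant
// amount are kept as candidates.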
21886
21887bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21888 bool Changed = false;
21889 // Sort by type, base pointers and values operand. Value operands must be
21890 // compatible (have the same opcode, same parent), otherwise it is
21891 // definitely not profitable to try to vectorize them.
21892 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21893 if (V->getValueOperand()->getType()->getTypeID() <
21894 V2->getValueOperand()->getType()->getTypeID())
21895 return true;
21896 if (V->getValueOperand()->getType()->getTypeID() >
21897 V2->getValueOperand()->getType()->getTypeID())
21898 return false;
21899 if (V->getPointerOperandType()->getTypeID() <
21900 V2->getPointerOperandType()->getTypeID())
21901 return true;
21902 if (V->getPointerOperandType()->getTypeID() >
21903 V2->getPointerOperandType()->getTypeID())
21904 return false;
21905 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21906 V2->getValueOperand()->getType()->getScalarSizeInBits())
21907 return true;
21908 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21909 V2->getValueOperand()->getType()->getScalarSizeInBits())
21910 return false;
21911 // UndefValues are compatible with all other values.
21912 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21913 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21914 DomTreeNodeBase<BasicBlock> *NodeI1 =
21915 DT->getNode(I1->getParent());
21916 DomTreeNodeBase<BasicBlock> *NodeI2 =
21917 DT->getNode(I2->getParent());
21918 assert(NodeI1 && "Should only process reachable instructions");
21919 assert(NodeI2 && "Should only process reachable instructions");
21920 assert((NodeI1 == NodeI2) ==
21921 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21922 "Different nodes should have different DFS numbers");
21923 if (NodeI1 != NodeI2)
21924 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21925 return I1->getOpcode() < I2->getOpcode();
21926 }
21927 return V->getValueOperand()->getValueID() <
21928 V2->getValueOperand()->getValueID();
21929 };
21930
21931 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21932 if (V1 == V2)
21933 return true;
21934 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21935 return false;
21936 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21937 return false;
21938 // Undefs are compatible with any other value.
21939 if (isa<UndefValue>(V1->getValueOperand()) ||
21940 isa<UndefValue>(V2->getValueOperand()))
21941 return true;
21942 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21943 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21944 if (I1->getParent() != I2->getParent())
21945 return false;
21946 return getSameOpcode({I1, I2}, *TLI).valid();
21947 }
21948 if (isa<Constant>(V1->getValueOperand()) &&
21949 isa<Constant>(V2->getValueOperand()))
21950 return true;
21951 return V1->getValueOperand()->getValueID() ==
21952 V2->getValueOperand()->getValueID();
21953 };
21954
21955 // Attempt to sort and vectorize each of the store-groups.
21956 DenseSet<std::pair<Value *, Value *>> Attempted;
21957 for (auto &Pair : Stores) {
21958 if (Pair.second.size() < 2)
21959 continue;
21960
21961 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21962 << Pair.second.size() << ".\n");
21963
21964 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21965 continue;
21966
21967 // Reverse stores to do bottom-to-top analysis. This is important if the
21968 // values are stored to the same addresses several times, in which case we
21969 // need to follow the store order (reversed to meet the memory dependencies).
21970 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21971 Pair.second.rend());
21972 Changed |= tryToVectorizeSequence<StoreInst>(
21973 ReversedStores, StoreSorter, AreCompatibleStores,
21974 [&](ArrayRef<StoreInst *> Candidates, bool) {
21975 return vectorizeStores(Candidates, R, Attempted);
21976 },
21977 /*MaxVFOnly=*/false, R);
21978 }
21979 return Changed;
21980}
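// For illustration of the store grouping above (hypothetical IR, with %add1
// and %add2 defined in the same block):
//   store i32 %add1, ptr %p0
//   store i32 %add2, ptr %p1
// AreCompatibleStores accepts the pair (same value and pointer types, same
// value opcode), so both stores land in one run that vectorizeStores can turn
// into a single <2 x i32> store when %p0 and %p1 are consecutive.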
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1504
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is a multiple of the subvector's length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
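For illustration only (not code from the pass), a minimal sketch of how the APInt bit-manipulation entries above are typically combined, here to track which lanes of a bundle still need work; the 8-lane width and the cleared lanes are arbitrary assumptions:
#include "llvm/ADT/APInt.h"
using namespace llvm;

// Track demanded lanes of an 8-wide bundle as a bit mask: start with every
// lane demanded, clear the lanes that were already handled, then query what
// remains.
static void demandedLanesDemo() {
  APInt Demanded = APInt::getAllOnes(/*numBits=*/8); // 0b11111111
  Demanded.clearBit(0);                              // lane 0 handled
  Demanded.clearBit(3);                              // lane 3 handled
  bool Done = Demanded.isZero();                     // false: lanes remain
  bool Untouched = Demanded.isAllOnes();             // false: two lanes cleared
  (void)Done;
  (void)Untouched;
}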
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
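As a small, hedged usage sketch for the ArrayRef slicing entries above (the 8-lane mask and the splitMask helper are assumptions, not code from the pass):
#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

// Split an 8-element shuffle mask into views without copying any data.
static void splitMask(ArrayRef<int> Mask) {
  assert(Mask.size() == 8 && "expected 8 lanes");
  ArrayRef<int> Lo = Mask.take_front(4); // lanes [0, 4)
  ArrayRef<int> Hi = Mask.drop_front(4); // lanes [4, 8)
  ArrayRef<int> Mid = Mask.slice(2, 4);  // lanes [2, 6)
  (void)Lo;
  (void)Hi;
  (void)Mid;
}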
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:461
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
reverse_iterator rend()
Definition: BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:688
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:240
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1986
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1881
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2123
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1980
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1977
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
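A brief, hedged example of the predicate helpers above; the distinction matters when comparison operands are commuted across lanes (predicateForCommutedOperands is a hypothetical helper):
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// If the operands of Cmp are swapped, the predicate must be swapped as well
// (e.g. SLT becomes SGT) to keep the comparison equivalent; by contrast,
// getInversePredicate() negates the result (SLT becomes SGE).
static CmpInst::Predicate predicateForCommutedOperands(const CmpInst *Cmp) {
  return Cmp->getSwappedPredicate();
}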
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
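A short sketch, under assumed key/value types, of the DenseMap entries above (try_emplace and iterator access), as one might map scalar indices to lanes; laneOf is a hypothetical helper:
#include "llvm/ADT/DenseMap.h"
using namespace llvm;

// Assign each index the next free lane on first sight; later queries return
// the lane that was recorded. try_emplace only inserts when the key is not
// present yet.
static unsigned laneOf(DenseMap<unsigned, unsigned> &Lanes, unsigned Idx) {
  auto [It, Inserted] = Lanes.try_emplace(Idx, /*Lane=*/Lanes.size());
  (void)Inserted;
  return It->second;
}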
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:871
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1072
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:530
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2574
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2186
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:867
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1761
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:889
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2404
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:881
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
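To ground the IRBuilder entries above, a hedged sketch of building a small vector from scalars and shuffling it, the kind of IR that gather/shuffle emission produces; buildReversedPair and its arguments are hypothetical:
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Build a <2 x Ty> vector from scalars A and B, then reverse its lanes.
// The new instructions are inserted right before InsertPt.
static Value *buildReversedPair(Instruction *InsertPt, Value *A, Value *B) {
  IRBuilder<> Builder(InsertPt);
  auto *VecTy = FixedVectorType::get(A->getType(), 2);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, A, Builder.getInt32(0));
  Vec = Builder.CreateInsertElement(Vec, B, Builder.getInt32(1));
  SmallVector<int, 2> ReverseMask = {1, 0};
  return Builder.CreateShuffleVector(Vec, ReverseMask, "rev");
}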
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:319
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:799
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:511
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:315
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:310
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:316
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:168
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
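As a hedged illustration of the ScalarEvolution queries above, the usual SCEV-based test for two pointers being a fixed number of bytes apart; the pass itself goes through shared load-analysis helpers, so this only mirrors the idea, and isConsecutive plus the Size parameter (an assumed element size in bytes) are hypothetical:
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include <cstdint>
using namespace llvm;

// Returns true when PtrB is exactly Size bytes past PtrA according to SCEV.
static bool isConsecutive(ScalarEvolution &SE, Value *PtrA, Value *PtrB,
                          uint64_t Size) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  const auto *C = dyn_cast<SCEVConstant>(Diff);
  return C && C->getAPInt() == Size;
}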
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
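A brief sketch of how the static mask predicates above can classify a shuffle mask before deciding on a lowering; classifyMask is a hypothetical helper:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Classify a shuffle mask over NumSrcElts source lanes: identity masks need
// no shuffle at all, while reverse and splat masks usually map to cheaper
// target permutes than a generic two-source shuffle.
static const char *classifyMask(ArrayRef<int> Mask, int NumSrcElts) {
  if (ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
    return "identity";
  if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
    return "reverse";
  if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
    return "splat of element 0";
  return "generic permute";
}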
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
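A minimal sketch of the SmallBitVector API above, e.g. marking which lanes of a bundle use an alternate opcode; the 8-lane width and countAltLanes are assumptions:
#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

// Mark two lanes and walk the set bits with find_first()/find_next().
static unsigned countAltLanes() {
  SmallBitVector AltLanes(8);
  AltLanes.set(1);
  AltLanes.set(5);
  unsigned N = 0;
  for (int I = AltLanes.find_first(); I != -1; I = AltLanes.find_next(I))
    ++N;
  return N; // same as AltLanes.count(), i.e. 2
}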
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
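To make the cost interface above concrete, a hedged sketch of the basic question the cost model keeps asking, one vector op versus NumElements scalar ops; vectorAddIsCheaper is a hypothetical helper and the caller is assumed to supply TTI and the vector type:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Compare the cost of one vector add against NumElements scalar adds at the
// reciprocal-throughput cost kind.
static bool vectorAddIsCheaper(const TargetTransformInfo &TTI,
                               FixedVectorType *VecTy) {
  const auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy->getElementType(),
                                 CostKind) *
      VecTy->getNumElements();
  return VecCost < ScalarCost;
}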
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given sequence of loads is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter-vectorized, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
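The interface above is typically driven in a build / cost / codegen sequence. The following is a minimal, hedged sketch only (the variable names, the Threshold value, and the exact call order are illustrative assumptions, not the pass's actual driver):

  // Hedged sketch: the analysis pointers (SE, TTI, TLI, AA, LI, DT, AC, DB,
  // DL, ORE), the function F, the candidate bundle Roots and an integer
  // Threshold are assumed to be available in the surrounding code.
  BoUpSLP R(F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE);
  SmallDenseSet<Value *> UserIgnore;          // values to skip as users
  R.buildTree(Roots, UserIgnore);             // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                             // too small to be worth it
  R.reorderTopToBottom();                     // pick profitable orders
  R.reorderBottomToTop();
  R.buildExternalUses();                      // record out-of-tree users
  R.computeMinimumValueSizes();               // minimum-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost.isValid() && Cost < -Threshold) {  // negative cost == profitable
    R.vectorizeTree();                        // emit the vector code
    return true;
  }
  return false;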
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:732
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
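To illustrate how the PatternMatch helpers above compose, here is a hedged, standalone snippet (V is an assumed Value *; none of this is taken from the pass itself):

  using namespace llvm::PatternMatch;
  Value *X = nullptr, *Y = nullptr;
  const APInt *ShAmt = nullptr;
  if (match(V, m_OneUse(m_Add(m_Value(X), m_Value(Y))))) {
    // V is a single-use integer 'add X, Y'; X and Y are now bound.
  }
  if (match(V, m_Shl(m_Value(X), m_APInt(ShAmt)))) {
    // V is 'shl X, C' where C is a ConstantInt or a splatted constant vector.
  }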
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
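For instance, a consecutive-access check can be phrased in terms of getPointersDiff as in this hedged sketch (LA and LB are assumed LoadInst *, DL and SE the usual analyses):

  // Two loads of the same element type are consecutive when their pointer
  // operands differ by exactly one element.
  std::optional<int> Diff =
      getPointersDiff(LA->getType(), LA->getPointerOperand(),
                      LB->getType(), LB->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  bool Consecutive = Diff && *Diff == 1;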
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
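A hedged usage example (Builder is an assumed IRBuilder<> positioned at the insertion point, VecVal an assumed <N x i32> value):

  // Collapse the vector to a single i32 by integer addition; for
  // RecurKind::Add this lowers to an llvm.vector.reduce.add call.
  Value *Sum = createSimpleReduction(Builder, VecVal, RecurKind::Add);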
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
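The range helpers above (zip, all_of, enumerate, ...) replace explicit begin/end iteration throughout the pass; a hedged usage sketch:

  SmallVector<Value *, 8> Scalars;  // assume this holds a candidate bundle
  for (auto [Idx, V] : enumerate(Scalars))
    dbgs() << "lane " << Idx << ": " << *V << "\n";
  bool AllLoads = all_of(Scalars, [](Value *V) { return isa<LoadInst>(V); });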
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7301
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1683
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:556
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
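The rounding/width helpers above behave as in this hedged numeric sketch:

  unsigned A = alignDown(13u, 4u); // 12: largest multiple of 4 that is <= 13
  unsigned B = bit_ceil(5u);       // 8: smallest power of two >= 5
  uint64_t C = PowerOf2Ceil(5);    // 8: 64-bit variant of the same rounding
  unsigned L = Log2_32(8);         // 3: floor of log2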
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
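A hedged illustration of the two mask builders above:

  // {0, 2, 4, 6}: take every second lane starting at lane 0.
  SmallVector<int, 16> Strided =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // {0, 0, 1, 1, 2, 2}: replicate each of 3 lanes twice.
  SmallVector<int, 16> Replicated =
      createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);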
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:256
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1054
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
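A hedged sketch tying the reduction helpers above together for a signed-max reduction (the commented results reflect my reading of the LoopUtils helpers, not this file):

  // A RecurKind::SMax reduction expands via the smax intrinsic, or an
  // icmp sgt + select pair when expanded manually.
  Intrinsic::ID MinMaxID =
      getMinMaxReductionIntrinsicOp(Intrinsic::vector_reduce_smax); // smax
  CmpInst::Predicate Pred =
      getMinMaxReductionPredicate(RecurKind::SMax); // ICmpInst::ICMP_SGT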
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
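As a hedged example, the two value-tracking queries above (MaskedValueIsZero, ComputeNumSignBits) support narrowing decisions along these lines (V is an assumed i32 Value *, DL a DataLayout):

  // Sign-based check: 25+ sign bits in a 32-bit value means it fits in i8
  // when truncated and later sign-extended back.
  unsigned SignBits = ComputeNumSignBits(V, DL);
  bool FitsI8Signed = SignBits >= 25;
  // Zero-based check: the top 24 bits are known zero, so a zero-extend back
  // from i8 is lossless.
  APInt HighBits = APInt::getBitsSetFrom(32, 8);
  bool FitsI8Unsigned = MaskedValueIsZero(V, HighBits, SimplifyQuery(DL));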
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
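Hedged example of combining the hashing helpers above, e.g. to key a map on an instruction's shape (I is an assumed Instruction *):

  SmallVector<Value *, 4> Ops(I->operands());
  hash_code Key = hash_combine(I->getOpcode(), I->getType(),
                               hash_combine_range(Ops.begin(), Ops.end()));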
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2144
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
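With these GraphTraits/DOTGraphTraits specializations in place, the generic GraphWriter machinery can render the SLP graph; a hedged one-liner (R is an assumed BoUpSLP instance, and graph viewing requires a Graphviz installation):

  ViewGraph(&R, "slp-graph", /*ShortNames=*/false, "SLP vectorization graph");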
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
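For completeness, a hedged sketch of scheduling the pass through the new pass manager (equivalently, opt -passes=slp-vectorizer on the command line):

  FunctionPassManager FPM;
  FPM.addPass(SLPVectorizerPass());
  // PreservedAnalyses PA = FPM.run(F, FAM);  // F and FAM assumed available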
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.