1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
127 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead, but unlike
186// the similar limit for operand ordering it is used less frequently, so the
187// impact of a higher value is less noticeable.
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit of the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important thing to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
254 if (SLPReVec && isa<FixedVectorType>(Ty))
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand; for Cmp - the type of the compare
262/// operands; and for insertelement - the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
264static Type *getValueType(Value *V) {
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
288
289/// Returns the number of elements of the given type \p Ty, not less than \p
290/// Sz, which forms a type that \p TTI splits into whole vector types during
291/// legalization.
292static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
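// Illustrative worked example (not part of the original source), assuming a
// target whose vector registers hold four i32 elements, so TTI reports 2
// parts for a <6 x i32> vector:
//   getFullVectorNumberOfElements(TTI, i32, /*Sz=*/6)
//     NumParts = 2, divideCeil(6, 2) = 3, bit_ceil(3) = 4
//     => 4 * 2 = 8 elements, i.e. two whole <4 x i32> registers.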
302
303/// Returns the number of elements of the given type \p Ty, not greater than \p
304/// Sz, which forms a type that \p TTI splits into whole vector types during
305/// legalization.
306static unsigned
307getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
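// Illustrative worked example (not part of the original source), under the
// same assumption of four i32 elements per register (2 parts for <6 x i32>):
//   getFloorFullVectorNumberOfElements(TTI, i32, /*Sz=*/6)
//     RegVF = bit_ceil(divideCeil(6, 2)) = 4, (6 / 4) * 4 = 4
//     => 4 elements, the largest whole-register count not exceeding 6.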
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
324 // But an element has a different meaning for SLP (scalar) and REVEC
325 // (vector). We need to expand Mask into masks which shufflevector can use
326 // directly.
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
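// Illustrative example (not part of the original source): if each REVEC
// "element" is itself a <2 x i8> vector (VecTyNumElements == 2), the scalar
// mask {1, 0} expands to the shufflevector mask {2, 3, 0, 1}, and a poison
// scalar mask element expands to two poison mask elements.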
335
336/// \returns the number of groups of shufflevectors.
337/// A group has the following features:
338/// 1. All values in a group are shufflevectors.
339/// 2. The mask of each shufflevector is an extract-subvector mask.
340/// 3. Together, the masks of the shufflevectors use all elements of the source.
341/// e.g., it is 1 group (%0)
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// it is 2 groups (%3 and %4)
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// it is 0 groups
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
361 if (VL.empty())
362 return 0;
363 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
412static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
432 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
438static bool isVectorLikeInstWithConstOps(Value *V) {
439 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
440 !isa<ExtractValueInst, UndefValue>(V))
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
447 if (isa<ExtractElementInst>(I))
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns correct remaining number of elements, considering total amount \p
461/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
462/// and current register (part) \p Part.
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
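// Illustrative example (not part of the original source): for Size == 7 and
// NumParts == 2, getPartNumElems returns min(7, bit_ceil(divideCeil(7, 2))) == 4,
// so part 0 covers getNumElems(7, 4, 0) == 4 elements and part 1 covers
// getNumElems(7, 4, 1) == 3 elements.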
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
482static bool allSameBlock(ArrayRef<Value *> VL) {
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
486 Instruction *I0 = cast<Instruction>(*It);
487 if (all_of(VL, isVectorLikeInstWithConstOps))
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions).
506static bool allConstant(ArrayRef<Value *> VL) {
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
540static bool isCommutative(Instruction *I, Value *ValWithUses) {
541 if (auto *Cmp = dyn_cast<CmpInst>(I))
542 return Cmp->isCommutative();
543 if (auto *BO = dyn_cast<BinaryOperator>(I))
544 return BO->isCommutative() ||
545 (BO->getOpcode() == Instruction::Sub &&
546 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
547 all_of(
548 ValWithUses->uses(),
549 [](const Use &U) {
550 // Commutative, if icmp eq/ne sub, 0
551 CmpPredicate Pred;
552 if (match(U.getUser(),
553 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
554 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
555 return true;
556 // Commutative, if abs(sub nsw, true) or abs(sub, false).
557 ConstantInt *Flag;
558 return match(U.getUser(),
559 m_Intrinsic<Intrinsic::abs>(
560 m_Specific(U.get()), m_ConstantInt(Flag))) &&
561 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
562 Flag->isOne());
563 })) ||
564 (BO->getOpcode() == Instruction::FSub &&
565 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
566 all_of(ValWithUses->uses(), [](const Use &U) {
567 return match(U.getUser(),
568 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
569 }));
570 return I->isCommutative();
571}
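// Illustrative IR example (not part of the original source): the subtraction
// below is treated as effectively commutative because its only use is an
// equality comparison with zero, which yields the same result for %a - %b and
// %b - %a:
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0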
572
573/// This is a helper function to check whether \p I is commutative.
574/// This is a convenience wrapper that calls the two-parameter version of
575/// isCommutative with the same instruction for both parameters. This is
576/// the common case where the instruction being checked for commutativity
577/// is the same as the instruction whose uses are analyzed for special
578/// patterns (see the two-parameter version above for details).
579/// \param I The instruction to check for commutativity
580/// \returns true if the instruction is commutative, false otherwise
581static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
582
583/// \returns number of operands of \p I, considering commutativity. Returns 2
584/// for commutative intrinsics.
585/// \param I The instruction to check for commutativity
586static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
587 if (isa<IntrinsicInst>(I) && isCommutative(I)) {
588 // IntrinsicInst::isCommutative returns true if swapping the first "two"
589 // arguments to the intrinsic produces the same result.
590 constexpr unsigned IntrinsicNumOperands = 2;
591 return IntrinsicNumOperands;
592 }
593 return I->getNumOperands();
594}
595
596template <typename T>
597static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
598 unsigned Offset) {
599 static_assert(std::is_same_v<T, InsertElementInst> ||
600 std::is_same_v<T, ExtractElementInst>,
601 "unsupported T");
602 int Index = Offset;
603 if (const auto *IE = dyn_cast<T>(Inst)) {
604 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
605 if (!VT)
606 return std::nullopt;
607 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
608 if (!CI)
609 return std::nullopt;
610 if (CI->getValue().uge(VT->getNumElements()))
611 return std::nullopt;
612 Index *= VT->getNumElements();
613 Index += CI->getZExtValue();
614 return Index;
615 }
616 return std::nullopt;
617}
618
619/// \returns inserting or extracting index of InsertElement, ExtractElement or
620/// InsertValue instruction, using Offset as base offset for index.
621/// \returns std::nullopt if the index is not an immediate.
622static std::optional<unsigned> getElementIndex(const Value *Inst,
623 unsigned Offset = 0) {
624 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
625 return Index;
626 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
627 return Index;
628
629 int Index = Offset;
630
631 const auto *IV = dyn_cast<InsertValueInst>(Inst);
632 if (!IV)
633 return std::nullopt;
634
635 Type *CurrentType = IV->getType();
636 for (unsigned I : IV->indices()) {
637 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
638 Index *= ST->getNumElements();
639 CurrentType = ST->getElementType(I);
640 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
641 Index *= AT->getNumElements();
642 CurrentType = AT->getElementType();
643 } else {
644 return std::nullopt;
645 }
646 Index += I;
647 }
648 return Index;
649}
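// Illustrative example (not part of the original source): for the aggregate
// type { [2 x i32], i32 }, the instruction
//   %r = insertvalue { [2 x i32], i32 } %agg, i32 %v, 0, 1
// is linearized as Index = (0 * 2 + 0) * 2 + 1 = 1, so getElementIndex
// returns 1.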
650
651/// \returns true if all of the values in \p VL use the same opcode.
652/// For comparison instructions, also checks if predicates match.
653/// PoisonValues are considered matching.
654/// Interchangeable instructions are not considered.
655static bool allSameOpcode(ArrayRef<Value *> VL) {
656 auto *It = find_if(VL, IsaPred<Instruction>);
657 if (It == VL.end())
658 return true;
659 Instruction *MainOp = cast<Instruction>(*It);
660 unsigned Opcode = MainOp->getOpcode();
661 bool IsCmpOp = isa<CmpInst>(MainOp);
662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
663 : CmpInst::BAD_ICMP_PREDICATE;
664 return std::all_of(It, VL.end(), [&](Value *V) {
665 if (auto *CI = dyn_cast<CmpInst>(V))
666 return BasePred == CI->getPredicate();
667 if (auto *I = dyn_cast<Instruction>(V))
668 return I->getOpcode() == Opcode;
669 return isa<PoisonValue>(V);
670 });
671}
672
673namespace {
674/// Specifies the way the mask should be analyzed for undefs/poisonous elements
675/// in the shuffle mask.
676enum class UseMask {
677 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
678 ///< check for the mask elements for the first argument (mask
679 ///< indices are in range [0:VF)).
680 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
681 ///< for the mask elements for the second argument (mask indices
682 ///< are in range [VF:2*VF))
683 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
684 ///< future shuffle elements and mark them as ones as being used
685 ///< in future. Non-undef elements are considered as unused since
686 ///< they're already marked as used in the mask.
687};
688} // namespace
689
690/// Prepares a use bitset for the given mask either for the first argument or
691/// for the second.
692static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
693 UseMask MaskArg) {
694 SmallBitVector UseMask(VF, true);
695 for (auto [Idx, Value] : enumerate(Mask)) {
696 if (Value == PoisonMaskElem) {
697 if (MaskArg == UseMask::UndefsAsMask)
698 UseMask.reset(Idx);
699 continue;
700 }
701 if (MaskArg == UseMask::FirstArg && Value < VF)
702 UseMask.reset(Value);
703 else if (MaskArg == UseMask::SecondArg && Value >= VF)
704 UseMask.reset(Value - VF);
705 }
706 return UseMask;
707}
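// Illustrative example (not part of the original source): for VF == 4 and
// Mask == {0, 5, poison, 2} with UseMask::FirstArg, the bits for elements 0
// and 2 of the first argument are cleared (those lanes are consumed by the
// mask), leaving the bits for elements 1 and 3 set.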
708
709/// Checks if the given value is actually an undefined constant vector.
710/// Also, if the \p UseMask is not empty, tries to check if the non-masked
711/// elements actually mask the insertelement buildvector, if any.
712template <bool IsPoisonOnly = false>
713static SmallBitVector isUndefVector(const Value *V,
714 const SmallBitVector &UseMask = {}) {
715 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
716 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
717 if (isa<T>(V))
718 return Res;
719 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
720 if (!VecTy)
721 return Res.reset();
722 auto *C = dyn_cast<Constant>(V);
723 if (!C) {
724 if (!UseMask.empty()) {
725 const Value *Base = V;
726 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
727 Base = II->getOperand(0);
728 if (isa<T>(II->getOperand(1)))
729 continue;
730 std::optional<unsigned> Idx = getElementIndex(II);
731 if (!Idx) {
732 Res.reset();
733 return Res;
734 }
735 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
736 Res.reset(*Idx);
737 }
738 // TODO: Add analysis for shuffles here too.
739 if (V == Base) {
740 Res.reset();
741 } else {
742 SmallBitVector SubMask(UseMask.size(), false);
743 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
744 }
745 } else {
746 Res.reset();
747 }
748 return Res;
749 }
750 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
751 if (Constant *Elem = C->getAggregateElement(I))
752 if (!isa<T>(Elem) &&
753 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
754 Res.reset(I);
755 }
756 return Res;
757}
758
759/// Checks if the vector of instructions can be represented as a shuffle, like:
760/// %x0 = extractelement <4 x i8> %x, i32 0
761/// %x3 = extractelement <4 x i8> %x, i32 3
762/// %y1 = extractelement <4 x i8> %y, i32 1
763/// %y2 = extractelement <4 x i8> %y, i32 2
764/// %x0x0 = mul i8 %x0, %x0
765/// %x3x3 = mul i8 %x3, %x3
766/// %y1y1 = mul i8 %y1, %y1
767/// %y2y2 = mul i8 %y2, %y2
768/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
769/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
770/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
771/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
772/// ret <4 x i8> %ins4
773/// can be transformed into:
774/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
775/// i32 6>
776/// %2 = mul <4 x i8> %1, %1
777/// ret <4 x i8> %2
778/// Mask will return the Shuffle Mask equivalent to the extracted elements.
779/// TODO: Can we split off and reuse the shuffle mask detection from
780/// ShuffleVectorInst/getShuffleCost?
781static std::optional<TargetTransformInfo::ShuffleKind>
782isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
783 AssumptionCache *AC) {
784 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
785 if (It == VL.end())
786 return std::nullopt;
787 unsigned Size =
788 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
789 auto *EI = dyn_cast<ExtractElementInst>(V);
790 if (!EI)
791 return S;
792 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
793 if (!VTy)
794 return S;
795 return std::max(S, VTy->getNumElements());
796 });
797
798 Value *Vec1 = nullptr;
799 Value *Vec2 = nullptr;
800 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
801 auto *EE = dyn_cast<ExtractElementInst>(V);
802 if (!EE)
803 return false;
804 Value *Vec = EE->getVectorOperand();
805 if (isa<UndefValue>(Vec))
806 return false;
807 return isGuaranteedNotToBePoison(Vec, AC);
808 });
809 enum ShuffleMode { Unknown, Select, Permute };
810 ShuffleMode CommonShuffleMode = Unknown;
811 Mask.assign(VL.size(), PoisonMaskElem);
812 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
813 // Undef can be represented as an undef element in a vector.
814 if (isa<UndefValue>(VL[I]))
815 continue;
816 auto *EI = cast<ExtractElementInst>(VL[I]);
817 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
818 return std::nullopt;
819 auto *Vec = EI->getVectorOperand();
820 // We can extractelement from undef or poison vector.
821 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
822 continue;
823 // All vector operands must have the same number of vector elements.
824 if (isa<UndefValue>(Vec)) {
825 Mask[I] = I;
826 } else {
827 if (isa<UndefValue>(EI->getIndexOperand()))
828 continue;
829 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
830 if (!Idx)
831 return std::nullopt;
832 // Undefined behavior if Idx is negative or >= Size.
833 if (Idx->getValue().uge(Size))
834 continue;
835 unsigned IntIdx = Idx->getValue().getZExtValue();
836 Mask[I] = IntIdx;
837 }
838 if (isUndefVector(Vec).all() && HasNonUndefVec)
839 continue;
840 // For correct shuffling we have to have at most 2 different vector operands
841 // in all extractelement instructions.
842 if (!Vec1 || Vec1 == Vec) {
843 Vec1 = Vec;
844 } else if (!Vec2 || Vec2 == Vec) {
845 Vec2 = Vec;
846 Mask[I] += Size;
847 } else {
848 return std::nullopt;
849 }
850 if (CommonShuffleMode == Permute)
851 continue;
852 // If the extract index is not the same as the operation number, it is a
853 // permutation.
854 if (Mask[I] % Size != I) {
855 CommonShuffleMode = Permute;
856 continue;
857 }
858 CommonShuffleMode = Select;
859 }
860 // If we're not crossing lanes in different vectors, consider it as blending.
861 if (CommonShuffleMode == Select && Vec2)
862 return TargetTransformInfo::SK_Select;
863 // If Vec2 was never used, we have a permutation of a single vector, otherwise
864 // we have a permutation of 2 vectors.
865 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
866 : TargetTransformInfo::SK_PermuteSingleSrc;
867}
868
869/// \returns True if Extract{Value,Element} instruction extracts element Idx.
870static std::optional<unsigned> getExtractIndex(const Instruction *E) {
871 unsigned Opcode = E->getOpcode();
872 assert((Opcode == Instruction::ExtractElement ||
873 Opcode == Instruction::ExtractValue) &&
874 "Expected extractelement or extractvalue instruction.");
875 if (Opcode == Instruction::ExtractElement) {
876 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
877 if (!CI)
878 return std::nullopt;
879 return CI->getZExtValue();
880 }
881 auto *EI = cast<ExtractValueInst>(E);
882 if (EI->getNumIndices() != 1)
883 return std::nullopt;
884 return *EI->idx_begin();
885}
886
887namespace llvm {
888/// Checks if the provided value does not require scheduling. It does not
889/// require scheduling if this is not an instruction or it is an instruction
890/// that does not read/write memory and all operands are either not instructions
891/// or phi nodes or instructions from different blocks.
892static bool areAllOperandsNonInsts(Value *V);
893/// Checks if the provided value does not require scheduling. It does not
894/// require scheduling if this is not an instruction or it is an instruction
895/// that does not read/write memory and all users are phi nodes or instructions
896/// from the different blocks.
897static bool isUsedOutsideBlock(Value *V);
898/// Checks if the specified value does not require scheduling. It does not
899/// require scheduling if all operands and all users do not need to be scheduled
900/// in the current basic block.
901static bool doesNotNeedToBeScheduled(Value *V);
902} // namespace llvm
903
904namespace {
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914/// Helper class that determines whether VL can use the same opcode.
915/// Alternate instructions are supported. In addition, it supports
916/// interchangeable instructions. An interchangeable instruction is an
917/// instruction that can be converted to another instruction with the same
918/// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
919class BinOpSameOpcodeHelper {
920 using MaskType = std::uint_fast16_t;
921 /// Sort SupportedOp because it is used by binary_search.
922 constexpr static std::initializer_list<unsigned> SupportedOp = {
923 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
924 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
925 enum : MaskType {
926 ShlBIT = 0b1,
927 AShrBIT = 0b10,
928 MulBIT = 0b100,
929 AddBIT = 0b1000,
930 SubBIT = 0b10000,
931 AndBIT = 0b100000,
932 OrBIT = 0b1000000,
933 XorBIT = 0b10000000,
934 MainOpBIT = 0b100000000,
936 };
937 /// Return a non-nullptr if either operand of I is a ConstantInt.
938 /// The second return value represents the operand position. We check the
939 /// right-hand side first (1). If the right hand side is not a ConstantInt and
940 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
941 /// side (0).
942 static std::pair<ConstantInt *, unsigned>
943 isBinOpWithConstantInt(const Instruction *I) {
944 unsigned Opcode = I->getOpcode();
945 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
946 (void)SupportedOp;
947 auto *BinOp = cast<BinaryOperator>(I);
948 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
949 return {CI, 1};
950 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
951 Opcode == Instruction::AShr)
952 return {nullptr, 0};
953 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
954 return {CI, 0};
955 return {nullptr, 0};
956 }
957 struct InterchangeableInfo {
958 const Instruction *I = nullptr;
959 /// Each set bit represents an opcode that MainOp can be converted to.
960 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
961 MulBIT | AShrBIT | ShlBIT;
962 /// We cannot create an interchangeable instruction that does not exist in
963 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
964 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
965 /// 1]. SeenBefore is used to know what operations have been seen before.
966 MaskType SeenBefore = 0;
967 InterchangeableInfo(const Instruction *I) : I(I) {}
968 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
969 /// instruction. Directly setting the mask would destroy the mask state,
970 /// preventing us from determining which instruction it should convert to.
971 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
972 if (Mask & InterchangeableMask) {
973 SeenBefore |= OpcodeInMaskForm;
974 Mask &= InterchangeableMask;
975 return true;
976 }
977 return false;
978 }
979 bool equal(unsigned Opcode) {
980 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
981 }
982 unsigned getOpcode() const {
983 MaskType Candidate = Mask & SeenBefore;
984 if (Candidate & MainOpBIT)
985 return I->getOpcode();
986 if (Candidate & ShlBIT)
987 return Instruction::Shl;
988 if (Candidate & AShrBIT)
989 return Instruction::AShr;
990 if (Candidate & MulBIT)
991 return Instruction::Mul;
992 if (Candidate & AddBIT)
993 return Instruction::Add;
994 if (Candidate & SubBIT)
995 return Instruction::Sub;
996 if (Candidate & AndBIT)
997 return Instruction::And;
998 if (Candidate & OrBIT)
999 return Instruction::Or;
1000 if (Candidate & XorBIT)
1001 return Instruction::Xor;
1002 llvm_unreachable("Cannot find interchangeable instruction.");
1003 }
1004
1005 /// Return true if the instruction can be converted to \p Opcode.
1006 bool hasCandidateOpcode(unsigned Opcode) const {
1007 MaskType Candidate = Mask & SeenBefore;
1008 switch (Opcode) {
1009 case Instruction::Shl:
1010 return Candidate & ShlBIT;
1011 case Instruction::AShr:
1012 return Candidate & AShrBIT;
1013 case Instruction::Mul:
1014 return Candidate & MulBIT;
1015 case Instruction::Add:
1016 return Candidate & AddBIT;
1017 case Instruction::Sub:
1018 return Candidate & SubBIT;
1019 case Instruction::And:
1020 return Candidate & AndBIT;
1021 case Instruction::Or:
1022 return Candidate & OrBIT;
1023 case Instruction::Xor:
1024 return Candidate & XorBIT;
1025 case Instruction::LShr:
1026 case Instruction::FAdd:
1027 case Instruction::FSub:
1028 case Instruction::FMul:
1029 case Instruction::SDiv:
1030 case Instruction::UDiv:
1031 case Instruction::FDiv:
1032 case Instruction::SRem:
1033 case Instruction::URem:
1034 case Instruction::FRem:
1035 return false;
1036 default:
1037 break;
1038 }
1039 llvm_unreachable("Cannot find interchangeable instruction.");
1040 }
1041
1042 SmallVector<Value *> getOperand(const Instruction *To) const {
1043 unsigned ToOpcode = To->getOpcode();
1044 unsigned FromOpcode = I->getOpcode();
1045 if (FromOpcode == ToOpcode)
1046 return SmallVector<Value *>(I->operands());
1047 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1048 auto [CI, Pos] = isBinOpWithConstantInt(I);
1049 const APInt &FromCIValue = CI->getValue();
1050 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1051 APInt ToCIValue;
1052 switch (FromOpcode) {
1053 case Instruction::Shl:
1054 if (ToOpcode == Instruction::Mul) {
1055 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1056 FromCIValue.getZExtValue());
1057 } else {
1058 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1059 ToCIValue = ToOpcode == Instruction::And
1060 ? APInt::getAllOnes(FromCIValueBitWidth)
1061 : APInt::getZero(FromCIValueBitWidth);
1062 }
1063 break;
1064 case Instruction::Mul:
1065 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1066 if (ToOpcode == Instruction::Shl) {
1067 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1068 } else {
1069 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1070 ToCIValue = ToOpcode == Instruction::And
1071 ? APInt::getAllOnes(FromCIValueBitWidth)
1072 : APInt::getZero(FromCIValueBitWidth);
1073 }
1074 break;
1075 case Instruction::Add:
1076 case Instruction::Sub:
1077 if (FromCIValue.isZero()) {
1078 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1079 } else {
1080 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1081 "Cannot convert the instruction.");
1082 ToCIValue = FromCIValue;
1083 ToCIValue.negate();
1084 }
1085 break;
1086 case Instruction::And:
1087 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1088 ToCIValue = ToOpcode == Instruction::Mul
1089 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1090 : APInt::getZero(FromCIValueBitWidth);
1091 break;
1092 default:
1093 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1094 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1095 break;
1096 }
1097 Value *LHS = I->getOperand(1 - Pos);
1098 Constant *RHS =
1099 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1100 // constant + x cannot be -constant - x
1101 // instead, it should be x - -constant
1102 if (Pos == 1 ||
1103 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1104 FromOpcode == Instruction::Xor) &&
1105 ToOpcode == Instruction::Sub))
1106 return SmallVector<Value *>({LHS, RHS});
1107 return SmallVector<Value *>({RHS, LHS});
1108 }
1109 };
1110 InterchangeableInfo MainOp;
1111 InterchangeableInfo AltOp;
1112 bool isValidForAlternation(const Instruction *I) const {
1113 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1114 ::isValidForAlternation(I->getOpcode());
1115 }
1116 bool initializeAltOp(const Instruction *I) {
1117 if (AltOp.I)
1118 return true;
1119 if (!isValidForAlternation(I))
1120 return false;
1121 AltOp.I = I;
1122 return true;
1123 }
1124
1125public:
1126 BinOpSameOpcodeHelper(const Instruction *MainOp,
1127 const Instruction *AltOp = nullptr)
1128 : MainOp(MainOp), AltOp(AltOp) {
1129 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1130 }
1131 bool add(const Instruction *I) {
1133 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1134 unsigned Opcode = I->getOpcode();
1135 MaskType OpcodeInMaskForm;
1136 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1137 switch (Opcode) {
1138 case Instruction::Shl:
1139 OpcodeInMaskForm = ShlBIT;
1140 break;
1141 case Instruction::AShr:
1142 OpcodeInMaskForm = AShrBIT;
1143 break;
1144 case Instruction::Mul:
1145 OpcodeInMaskForm = MulBIT;
1146 break;
1147 case Instruction::Add:
1148 OpcodeInMaskForm = AddBIT;
1149 break;
1150 case Instruction::Sub:
1151 OpcodeInMaskForm = SubBIT;
1152 break;
1153 case Instruction::And:
1154 OpcodeInMaskForm = AndBIT;
1155 break;
1156 case Instruction::Or:
1157 OpcodeInMaskForm = OrBIT;
1158 break;
1159 case Instruction::Xor:
1160 OpcodeInMaskForm = XorBIT;
1161 break;
1162 default:
1163 return MainOp.equal(Opcode) ||
1164 (initializeAltOp(I) && AltOp.equal(Opcode));
1165 }
1166 MaskType InterchangeableMask = OpcodeInMaskForm;
1167 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1168 if (CI) {
1169 constexpr MaskType CanBeAll =
1170 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1171 const APInt &CIValue = CI->getValue();
1172 switch (Opcode) {
1173 case Instruction::Shl:
1174 if (CIValue.ult(CIValue.getBitWidth()))
1175 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1176 break;
1177 case Instruction::Mul:
1178 if (CIValue.isOne()) {
1179 InterchangeableMask = CanBeAll;
1180 break;
1181 }
1182 if (CIValue.isPowerOf2())
1183 InterchangeableMask = MulBIT | ShlBIT;
1184 break;
1185 case Instruction::Add:
1186 case Instruction::Sub:
1187 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1188 break;
1189 case Instruction::And:
1190 if (CIValue.isAllOnes())
1191 InterchangeableMask = CanBeAll;
1192 break;
1193 case Instruction::Xor:
1194 if (CIValue.isZero())
1195 InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
1196 break;
1197 default:
1198 if (CIValue.isZero())
1199 InterchangeableMask = CanBeAll;
1200 break;
1201 }
1202 }
1203 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1204 (initializeAltOp(I) &&
1205 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1206 }
1207 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1208 /// Checks if the list of potential opcodes includes \p Opcode.
1209 bool hasCandidateOpcode(unsigned Opcode) const {
1210 return MainOp.hasCandidateOpcode(Opcode);
1211 }
1212 bool hasAltOp() const { return AltOp.I; }
1213 unsigned getAltOpcode() const {
1214 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1215 }
1216 SmallVector<Value *> getOperand(const Instruction *I) const {
1217 return MainOp.getOperand(I);
1218 }
1219};
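// Illustrative example (not part of the original source): for
//   %a = shl i32 %x, 1
//   %b = mul i32 %y, 3
// a BinOpSameOpcodeHelper seeded with %a accepts %b and reports Mul as the
// common opcode; converting %a to that form yields the operands {%x, i32 2},
// since x << 1 is equivalent to x * 2.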
1220
1221/// Main data required for vectorization of instructions.
1222class InstructionsState {
1223 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1224 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1225 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1226 /// isAltShuffle).
1227 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1228 /// from getMainAltOpsNoStateVL.
1229 /// For those InstructionsState that use alternate instructions, the resulting
1230 /// vectorized output ultimately comes from a shufflevector. For example,
1231 /// given a vector list (VL):
1232 /// VL[0] = add i32 a, e
1233 /// VL[1] = sub i32 b, f
1234 /// VL[2] = add i32 c, g
1235 /// VL[3] = sub i32 d, h
1236 /// The vectorized result would be:
1237 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1238 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1239 /// result = shufflevector <4 x i32> intermediated_0,
1240 /// <4 x i32> intermediated_1,
1241 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1242 /// Since shufflevector is used in the final result, when calculating the cost
1243 /// (getEntryCost), we must account for the usage of shufflevector in
1244 /// GetVectorCost.
1245 Instruction *MainOp = nullptr;
1246 Instruction *AltOp = nullptr;
1247 /// Whether the instruction state represents copyable instructions.
1248 bool HasCopyables = false;
1249
1250public:
1251 Instruction *getMainOp() const {
1252 assert(valid() && "InstructionsState is invalid.");
1253 return MainOp;
1254 }
1255
1256 Instruction *getAltOp() const {
1257 assert(valid() && "InstructionsState is invalid.");
1258 return AltOp;
1259 }
1260
1261 /// The main/alternate opcodes for the list of instructions.
1262 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1263
1264 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1265
1266 /// Some of the instructions in the list have alternate opcodes.
1267 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1268
1269 /// Checks if the instruction matches either the main or alternate opcode.
1270 /// \returns
1271 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1272 /// to it
1273 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1274 /// it
1275 /// - nullptr if \param I cannot be matched or converted to either opcode
1276 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1277 assert(MainOp && "MainOp cannot be nullptr.");
1278 if (I->getOpcode() == MainOp->getOpcode())
1279 return MainOp;
1280 // Prefer AltOp instead of interchangeable instruction of MainOp.
1281 assert(AltOp && "AltOp cannot be nullptr.");
1282 if (I->getOpcode() == AltOp->getOpcode())
1283 return AltOp;
1284 if (!I->isBinaryOp())
1285 return nullptr;
1286 BinOpSameOpcodeHelper Converter(MainOp);
1287 if (!Converter.add(I) || !Converter.add(MainOp))
1288 return nullptr;
1289 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1290 BinOpSameOpcodeHelper AltConverter(AltOp);
1291 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1292 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1293 return AltOp;
1294 }
1295 if (Converter.hasAltOp() && !isAltShuffle())
1296 return nullptr;
1297 return Converter.hasAltOp() ? AltOp : MainOp;
1298 }
1299
1300 /// Checks if main/alt instructions are shift operations.
1301 bool isShiftOp() const {
1302 return getMainOp()->isShift() && getAltOp()->isShift();
1303 }
1304
1305 /// Checks if main/alt instructions are bitwise logic operations.
1306 bool isBitwiseLogicOp() const {
1307 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1308 }
1309
1310 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1311 bool isMulDivLikeOp() const {
1312 constexpr std::array<unsigned, 8> MulDiv = {
1313 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1314 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1315 Instruction::URem, Instruction::FRem};
1316 return is_contained(MulDiv, getOpcode()) &&
1317 is_contained(MulDiv, getAltOpcode());
1318 }
1319
1320 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1321 bool isAddSubLikeOp() const {
1322 constexpr std::array<unsigned, 4> AddSub = {
1323 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1324 Instruction::FSub};
1325 return is_contained(AddSub, getOpcode()) &&
1326 is_contained(AddSub, getAltOpcode());
1327 }
1328
1329 /// Checks if main/alt instructions are cmp operations.
1330 bool isCmpOp() const {
1331 return (getOpcode() == Instruction::ICmp ||
1332 getOpcode() == Instruction::FCmp) &&
1333 getAltOpcode() == getOpcode();
1334 }
1335
1336 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1337 bool valid() const { return MainOp && AltOp; }
1338
1339 explicit operator bool() const { return valid(); }
1340
1341 InstructionsState() = delete;
1342 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1343 bool HasCopyables = false)
1344 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1345 static InstructionsState invalid() { return {nullptr, nullptr}; }
1346
1347 /// Checks if the value is a copyable element.
1348 bool isCopyableElement(Value *V) const {
1349 assert(valid() && "InstructionsState is invalid.");
1350 if (!HasCopyables)
1351 return false;
1352 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1353 return false;
1354 auto *I = dyn_cast<Instruction>(V);
1355 if (!I)
1356 return !isa<PoisonValue>(V);
1357 if (I->getParent() != MainOp->getParent() &&
1358 (!isVectorLikeInstWithConstOps(I) ||
1359 !isVectorLikeInstWithConstOps(MainOp)))
1360 return true;
1361 if (I->getOpcode() == MainOp->getOpcode())
1362 return false;
1363 if (!I->isBinaryOp())
1364 return true;
1365 BinOpSameOpcodeHelper Converter(MainOp);
1366 return !Converter.add(I) || !Converter.add(MainOp) ||
1367 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1368 }
1369
1370 /// Checks if the value is non-schedulable.
1371 bool isNonSchedulable(Value *V) const {
1372 assert(valid() && "InstructionsState is invalid.");
1373 auto *I = dyn_cast<Instruction>(V);
1374 if (!HasCopyables)
1375 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1376 doesNotNeedToBeScheduled(V);
1377 // MainOp for copyables is always schedulable to correctly identify
1378 // non-schedulable copyables.
1379 if (getMainOp() == V)
1380 return false;
1381 if (isCopyableElement(V)) {
1382 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1383 auto *I = dyn_cast<Instruction>(V);
1384 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1385 (doesNotNeedToBeScheduled(V) &&
1386 // If the copyable instruction comes after MainOp
1387 // (non-schedulable, but used in the block) - cannot vectorize
1388 // it, will possibly generate use before def.
1389 !MainOp->comesBefore(I));
1390 };
1391
1392 return IsNonSchedulableCopyableElement(V);
1393 }
1394 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1395 doesNotNeedToBeScheduled(V);
1396 }
1397
1398 /// Checks if the state represents copyable instructions.
1399 bool areInstructionsWithCopyableElements() const {
1400 assert(valid() && "InstructionsState is invalid.");
1401 return HasCopyables;
1402 }
1403};
1404
1405std::pair<Instruction *, SmallVector<Value *>>
1406convertTo(Instruction *I, const InstructionsState &S) {
1407 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1408 assert(SelectedOp && "Cannot convert the instruction.");
1409 if (I->isBinaryOp()) {
1410 BinOpSameOpcodeHelper Converter(I);
1411 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1412 }
1413 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1414}
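// Illustrative usage sketch (not part of the original source): if the
// InstructionsState S was built for {%a = add i32 %x, 0, %b = sub i32 %y, %z},
// the common opcode is Sub, and convertTo(%a, S) returns the Sub MainOp
// together with the operand list {%x, i32 0}, i.e. %a is vectorized as
// "%x - 0".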
1415
1416} // end anonymous namespace
1417
1418static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1419 const TargetLibraryInfo &TLI);
1420
1421/// Find an instruction with a specific opcode in VL.
1422/// \param VL Array of values to search through. Must contain only Instructions
1423/// and PoisonValues.
1424/// \param Opcode The instruction opcode to search for
1425/// \returns
1426/// - The first instruction found with matching opcode
1427/// - nullptr if no matching instruction is found
1428static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1429 unsigned Opcode) {
1430 for (Value *V : VL) {
1431 if (isa<PoisonValue>(V))
1432 continue;
1433 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1434 auto *Inst = cast<Instruction>(V);
1435 if (Inst->getOpcode() == Opcode)
1436 return Inst;
1437 }
1438 return nullptr;
1439}
1440
1441/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1442/// compatible instructions or constants, or just some other regular values.
1443static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1444 Value *Op1, const TargetLibraryInfo &TLI) {
1445 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1446 (isConstant(BaseOp1) && isConstant(Op1)) ||
1447 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1448 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1449 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1450 getSameOpcode({BaseOp0, Op0}, TLI) ||
1451 getSameOpcode({BaseOp1, Op1}, TLI);
1452}
1453
1454/// \returns true if a compare instruction \p CI has similar "look" and
1455/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1456/// swapped, false otherwise.
1457static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1458 const TargetLibraryInfo &TLI) {
1459 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1460 "Assessing comparisons of different types?");
1461 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1462 CmpInst::Predicate Pred = CI->getPredicate();
1463 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1464
1465 Value *BaseOp0 = BaseCI->getOperand(0);
1466 Value *BaseOp1 = BaseCI->getOperand(1);
1467 Value *Op0 = CI->getOperand(0);
1468 Value *Op1 = CI->getOperand(1);
1469
1470 return (BasePred == Pred &&
1471 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1472 (BasePred == SwappedPred &&
1473 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1474}
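// Illustrative example (not part of the original source):
//   %c0 = icmp slt i32 %a, %b
//   %c1 = icmp sgt i32 %b, %a
// are considered "same or swapped": the predicate of %c1 is the swapped form
// of %c0's predicate, and its operands are swapped accordingly.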
1475
1476/// \returns analysis of the Instructions in \p VL described in
1477/// InstructionsState, i.e. the Opcode with which we suppose the whole list
1478/// could be vectorized even if its structure is diverse.
1479static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1480 const TargetLibraryInfo &TLI) {
1481 // Make sure these are all Instructions.
1482 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1483 return InstructionsState::invalid();
1484
1485 auto *It = find_if(VL, IsaPred<Instruction>);
1486 if (It == VL.end())
1487 return InstructionsState::invalid();
1488
1489 Instruction *MainOp = cast<Instruction>(*It);
1490 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1491 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1492 (VL.size() == 2 && InstCnt < 2))
1493 return InstructionsState::invalid();
1494
1495 bool IsCastOp = isa<CastInst>(MainOp);
1496 bool IsBinOp = isa<BinaryOperator>(MainOp);
1497 bool IsCmpOp = isa<CmpInst>(MainOp);
1498 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1499 : CmpInst::BAD_ICMP_PREDICATE;
1500 Instruction *AltOp = MainOp;
1501 unsigned Opcode = MainOp->getOpcode();
1502 unsigned AltOpcode = Opcode;
1503
1504 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1505 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1506 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1507 UniquePreds.insert(BasePred);
1508 UniqueNonSwappedPreds.insert(BasePred);
1509 for (Value *V : VL) {
1510 auto *I = dyn_cast<CmpInst>(V);
1511 if (!I)
1512 return false;
1513 CmpInst::Predicate CurrentPred = I->getPredicate();
1514 CmpInst::Predicate SwappedCurrentPred =
1515 CmpInst::getSwappedPredicate(CurrentPred);
1516 UniqueNonSwappedPreds.insert(CurrentPred);
1517 if (!UniquePreds.contains(CurrentPred) &&
1518 !UniquePreds.contains(SwappedCurrentPred))
1519 UniquePreds.insert(CurrentPred);
1520 }
1521 // If the total number of predicates is > 2, but only 2 remain when swapped
1522 // predicates are treated as equal, consider swappable predicates as
1523 // compatible opcodes rather than alternates.
1524 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1525 }();
1526 // Check for one alternate opcode from another BinaryOperator.
1527 // TODO - generalize to support all operators (types, calls etc.).
1528 Intrinsic::ID BaseID = 0;
1529 SmallVector<VFInfo> BaseMappings;
1530 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1531 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1532 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1533 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1534 return InstructionsState::invalid();
1535 }
1536 bool AnyPoison = InstCnt != VL.size();
1537 // Check MainOp too to be sure that it matches the requirements for the
1538 // instructions.
1539 for (Value *V : iterator_range(It, VL.end())) {
1540 auto *I = dyn_cast<Instruction>(V);
1541 if (!I)
1542 continue;
1543
1544 // Cannot combine poison and divisions.
1545 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1546 // intrinsics/functions only.
1547 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1548 return InstructionsState::invalid();
1549 unsigned InstOpcode = I->getOpcode();
1550 if (IsBinOp && isa<BinaryOperator>(I)) {
1551 if (BinOpHelper.add(I))
1552 continue;
1553 } else if (IsCastOp && isa<CastInst>(I)) {
1554 Value *Op0 = MainOp->getOperand(0);
1555 Type *Ty0 = Op0->getType();
1556 Value *Op1 = I->getOperand(0);
1557 Type *Ty1 = Op1->getType();
1558 if (Ty0 == Ty1) {
1559 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1560 continue;
1561 if (Opcode == AltOpcode) {
1562 assert(isValidForAlternation(Opcode) &&
1563 isValidForAlternation(InstOpcode) &&
1564 "Cast isn't safe for alternation, logic needs to be updated!");
1565 AltOpcode = InstOpcode;
1566 AltOp = I;
1567 continue;
1568 }
1569 }
1570 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1571 auto *BaseInst = cast<CmpInst>(MainOp);
1572 Type *Ty0 = BaseInst->getOperand(0)->getType();
1573 Type *Ty1 = Inst->getOperand(0)->getType();
1574 if (Ty0 == Ty1) {
1575 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1576 assert(InstOpcode == AltOpcode &&
1577 "Alternate instructions are only supported by BinaryOperator "
1578 "and CastInst.");
1579 // Check for compatible operands. If the corresponding operands are not
1580 // compatible, we need to perform alternate vectorization.
1581 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1582 CmpInst::Predicate SwappedCurrentPred =
1583 CmpInst::getSwappedPredicate(CurrentPred);
1584
1585 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1586 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1587 continue;
1588
1589 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1590 continue;
1591 auto *AltInst = cast<CmpInst>(AltOp);
1592 if (MainOp != AltOp) {
1593 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1594 continue;
1595 } else if (BasePred != CurrentPred) {
1596 assert(
1597 isValidForAlternation(InstOpcode) &&
1598 "CmpInst isn't safe for alternation, logic needs to be updated!");
1599 AltOp = I;
1600 continue;
1601 }
1602 CmpInst::Predicate AltPred = AltInst->getPredicate();
1603 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1604 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1605 continue;
1606 }
1607 } else if (InstOpcode == Opcode) {
1608 assert(InstOpcode == AltOpcode &&
1609 "Alternate instructions are only supported by BinaryOperator and "
1610 "CastInst.");
1611 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1612 if (Gep->getNumOperands() != 2 ||
1613 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1614 return InstructionsState::invalid();
1615 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1617 return InstructionsState::invalid();
1618 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1619 auto *BaseLI = cast<LoadInst>(MainOp);
1620 if (!LI->isSimple() || !BaseLI->isSimple())
1621 return InstructionsState::invalid();
1622 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1623 auto *CallBase = cast<CallInst>(MainOp);
1624 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1625 return InstructionsState::invalid();
1626 if (Call->hasOperandBundles() &&
1627 (!CallBase->hasOperandBundles() ||
1628 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1629 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1630 CallBase->op_begin() +
1631 CallBase->getBundleOperandsStartIndex())))
1632 return InstructionsState::invalid();
1633 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1634 if (ID != BaseID)
1635 return InstructionsState::invalid();
1636 if (!ID) {
1637 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1638 if (Mappings.size() != BaseMappings.size() ||
1639 Mappings.front().ISA != BaseMappings.front().ISA ||
1640 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1641 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1642 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1643 Mappings.front().Shape.Parameters !=
1644 BaseMappings.front().Shape.Parameters)
1645 return InstructionsState::invalid();
1646 }
1647 }
1648 continue;
1649 }
1650 return InstructionsState::invalid();
1651 }
1652
1653 if (IsBinOp) {
1654 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1655 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1656 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1657 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1658 }
1659 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1660 "Incorrect implementation of allSameOpcode.");
1661 InstructionsState S(MainOp, AltOp);
1662 assert(all_of(VL,
1663 [&](Value *V) {
1664 return isa<PoisonValue>(V) ||
1665 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1666 }) &&
1667 "Invalid InstructionsState.");
1668 return S;
1669}
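// For illustration: a bundle VL = {add, sub, add, sub} yields MainOp = add and
// AltOp = sub (an alternate-opcode shuffle), while a uniform bundle such as
// {add, add, add, add} ends up with MainOp == AltOp.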
1670
1671/// \returns true if all of the values in \p VL have the same type or false
1672/// otherwise.
1673 static bool allSameType(ArrayRef<Value *> VL) {
1674 Type *Ty = VL.consume_front()->getType();
1675 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1676}
1677
1678/// \returns True if in-tree use also needs extract. This refers to
1679/// possible scalar operand in vectorized instruction.
1680static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1681 TargetLibraryInfo *TLI,
1682 const TargetTransformInfo *TTI) {
1683 if (!UserInst)
1684 return false;
1685 unsigned Opcode = UserInst->getOpcode();
1686 switch (Opcode) {
1687 case Instruction::Load: {
1688 LoadInst *LI = cast<LoadInst>(UserInst);
1689 return (LI->getPointerOperand() == Scalar);
1690 }
1691 case Instruction::Store: {
1692 StoreInst *SI = cast<StoreInst>(UserInst);
1693 return (SI->getPointerOperand() == Scalar);
1694 }
1695 case Instruction::Call: {
1696 CallInst *CI = cast<CallInst>(UserInst);
1697 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1698 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1699 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1700 Arg.value().get() == Scalar;
1701 });
1702 }
1703 default:
1704 return false;
1705 }
1706}
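// E.g., if the vectorized in-tree user is a store and Scalar is its pointer
// operand, the address is still consumed as a scalar, so Scalar must be
// extracted from the vector even though the stored value was vectorized.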
1707
1708 /// \returns the AA location that is being accessed by the instruction.
1709 static MemoryLocation getLocation(Instruction *I) {
1710 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1711 return MemoryLocation::get(SI);
1712 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1713 return MemoryLocation::get(LI);
1714 return MemoryLocation();
1715}
1716
1717/// \returns True if the instruction is not a volatile or atomic load/store.
1718static bool isSimple(Instruction *I) {
1719 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1720 return LI->isSimple();
1721 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1722 return SI->isSimple();
1723 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1724 return !MI->isVolatile();
1725 return true;
1726}
1727
1728/// Shuffles \p Mask in accordance with the given \p SubMask.
1729/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1730/// one but two input vectors.
1731static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1732 bool ExtendingManyInputs = false) {
1733 if (SubMask.empty())
1734 return;
1735 assert(
1736 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1737 // Check if input scalars were extended to match the size of other node.
1738 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1739 "SubMask with many inputs support must be larger than the mask.");
1740 if (Mask.empty()) {
1741 Mask.append(SubMask.begin(), SubMask.end());
1742 return;
1743 }
1744 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1745 int TermValue = std::min(Mask.size(), SubMask.size());
1746 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1747 if (SubMask[I] == PoisonMaskElem ||
1748 (!ExtendingManyInputs &&
1749 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1750 continue;
1751 NewMask[I] = Mask[SubMask[I]];
1752 }
1753 Mask.swap(NewMask);
1754}
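// Illustrative composition (values assumed): with Mask = {3, 2, 1, 0} and
// SubMask = {1, PoisonMaskElem, 3, 0}, each defined SubMask element selects
// Mask[SubMask[I]], producing {2, PoisonMaskElem, 0, 3}.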
1755
1756/// Order may have elements assigned special value (size) which is out of
1757/// bounds. Such indices only appear on places which correspond to undef values
1758 /// (see canReuseExtract for details) and are used to avoid undef values
1759 /// affecting the operand ordering.
1760/// The first loop below simply finds all unused indices and then the next loop
1761/// nest assigns these indices for undef values positions.
1762/// As an example below Order has two undef positions and they have assigned
1763/// values 3 and 7 respectively:
1764/// before: 6 9 5 4 9 2 1 0
1765/// after: 6 3 5 4 7 2 1 0
1766 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1767 const size_t Sz = Order.size();
1768 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1769 SmallBitVector MaskedIndices(Sz);
1770 for (unsigned I = 0; I < Sz; ++I) {
1771 if (Order[I] < Sz)
1772 UnusedIndices.reset(Order[I]);
1773 else
1774 MaskedIndices.set(I);
1775 }
1776 if (MaskedIndices.none())
1777 return;
1778 assert(UnusedIndices.count() == MaskedIndices.count() &&
1779 "Non-synced masked/available indices.");
1780 int Idx = UnusedIndices.find_first();
1781 int MIdx = MaskedIndices.find_first();
1782 while (MIdx >= 0) {
1783 assert(Idx >= 0 && "Indices must be synced.");
1784 Order[MIdx] = Idx;
1785 Idx = UnusedIndices.find_next(Idx);
1786 MIdx = MaskedIndices.find_next(MIdx);
1787 }
1788}
1789
1790/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1791/// Opcode1.
1792 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1793 unsigned Opcode0, unsigned Opcode1) {
1794 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1795 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1796 for (unsigned Lane : seq<unsigned>(VL.size())) {
1797 if (isa<PoisonValue>(VL[Lane]))
1798 continue;
1799 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1800 OpcodeMask.set(Lane * ScalarTyNumElements,
1801 Lane * ScalarTyNumElements + ScalarTyNumElements);
1802 }
1803 return OpcodeMask;
1804}
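// E.g., for a scalar element type and VL = {add, sub, add, sub} with
// Opcode0 = Add and Opcode1 = Sub, getAltInstrMask returns the bitset
// {0, 1, 0, 1}; with a vector scalar type (revec), each lane occupies
// ScalarTyNumElements consecutive bits instead of one.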
1805
1806/// Replicates the given \p Val \p VF times.
1808 unsigned VF) {
1809 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1810 "Expected scalar constants.");
1811 SmallVector<Constant *> NewVal(Val.size() * VF);
1812 for (auto [I, V] : enumerate(Val))
1813 std::fill_n(NewVal.begin() + I * VF, VF, V);
1814 return NewVal;
1815}
1816
1817namespace llvm {
1818
1819 static void inversePermutation(ArrayRef<unsigned> Indices,
1820 SmallVectorImpl<int> &Mask) {
1821 Mask.clear();
1822 const unsigned E = Indices.size();
1823 Mask.resize(E, PoisonMaskElem);
1824 for (unsigned I = 0; I < E; ++I)
1825 Mask[Indices[I]] = I;
1826}
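// E.g., Indices = {2, 0, 1} produces Mask = {1, 2, 0}, since Mask[Indices[I]]
// is set to I for every position I.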
1827
1828/// Reorders the list of scalars in accordance with the given \p Mask.
1829 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1830 ArrayRef<int> Mask) {
1831 assert(!Mask.empty() && "Expected non-empty mask.");
1832 SmallVector<Value *> Prev(Scalars.size(),
1833 PoisonValue::get(Scalars.front()->getType()));
1834 Prev.swap(Scalars);
1835 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1836 if (Mask[I] != PoisonMaskElem)
1837 Scalars[Mask[I]] = Prev[I];
1838}
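// E.g., Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a}, because
// each element Prev[I] is moved to position Mask[I].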
1839
1840/// Checks if the provided value does not require scheduling. It does not
1841/// require scheduling if this is not an instruction or it is an instruction
1842/// that does not read/write memory and all operands are either not instructions
1843/// or phi nodes or instructions from different blocks.
1844 static bool areAllOperandsNonInsts(Value *V) {
1845 auto *I = dyn_cast<Instruction>(V);
1846 if (!I)
1847 return true;
1848 return !mayHaveNonDefUseDependency(*I) &&
1849 all_of(I->operands(), [I](Value *V) {
1850 auto *IO = dyn_cast<Instruction>(V);
1851 if (!IO)
1852 return true;
1853 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1854 });
1855}
1856
1857/// Checks if the provided value does not require scheduling. It does not
1858/// require scheduling if this is not an instruction or it is an instruction
1859/// that does not read/write memory and all users are phi nodes or instructions
1860 /// from different blocks.
1861static bool isUsedOutsideBlock(Value *V) {
1862 auto *I = dyn_cast<Instruction>(V);
1863 if (!I)
1864 return true;
1865 // Limits the number of uses to save compile time.
1866 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1867 all_of(I->users(), [I](User *U) {
1868 auto *IU = dyn_cast<Instruction>(U);
1869 if (!IU)
1870 return true;
1871 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1872 });
1873}
1874
1875/// Checks if the specified value does not require scheduling. It does not
1876/// require scheduling if all operands and all users do not need to be scheduled
1877/// in the current basic block.
1880}
1881
1882/// Checks if the specified array of instructions does not require scheduling.
1883 /// This is the case if either all instructions have operands that do not
1884 /// require scheduling, or all their users do not require scheduling because
1885 /// they are phis or live in other basic blocks.
1886 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1887 return !VL.empty() &&
1888 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1889 }
1890
1891/// Returns true if widened type of \p Ty elements with size \p Sz represents
1892/// full vector type, i.e. adding extra element results in extra parts upon type
1893/// legalization.
1894 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1895 unsigned Sz) {
1896 if (Sz <= 1)
1897 return false;
1898 if (!isValidElementType(Ty))
1899 return false;
1900 if (has_single_bit(Sz))
1901 return true;
1902 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1903 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1904 Sz % NumParts == 0;
1905}
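// Assuming a target with 128-bit vector registers: Sz = 12 i32 elements
// legalizes to 3 full <4 x i32> parts, so this returns true, whereas Sz = 6
// splits into parts of 3 elements (not a power of two) and returns false.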
1906
1907 /// Returns the number of parts the type \p VecTy will be split into at the
1908 /// codegen phase. If the type is going to be scalarized or does not use whole
1909 /// registers, returns 1.
1910 static unsigned
1911 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1912 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1913 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1914 if (NumParts == 0 || NumParts >= Limit)
1915 return 1;
1916 unsigned Sz = getNumElements(VecTy);
1917 if (NumParts >= Sz || Sz % NumParts != 0 ||
1918 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1919 return 1;
1920 return NumParts;
1921}
1922
1923namespace slpvectorizer {
1924
1925/// Bottom Up SLP Vectorizer.
1926class BoUpSLP {
1927 class TreeEntry;
1928 class ScheduleEntity;
1929 class ScheduleData;
1930 class ScheduleCopyableData;
1931 class ScheduleBundle;
1934
1935 /// If we decide to generate strided load / store, this struct contains all
1936 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1937 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1938 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1939 /// StrideVal (or value obtained from StrideSCEV) has to by multiplied by the
1940 /// size of element of FixedVectorType.
1941 struct StridedPtrInfo {
1942 Value *StrideVal = nullptr;
1943 const SCEV *StrideSCEV = nullptr;
1944 FixedVectorType *Ty = nullptr;
1945 };
1946 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1947
1948public:
1949 /// Tracks the state we can represent the loads in the given sequence.
1957
1964
1965 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1966 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1967 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1968 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1969 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1970 AC(AC), DB(DB), DL(DL), ORE(ORE),
1971 Builder(Se->getContext(), TargetFolder(*DL)) {
1972 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1973 // Use the vector register size specified by the target unless overridden
1974 // by a command-line option.
1975 // TODO: It would be better to limit the vectorization factor based on
1976 // data type rather than just register size. For example, x86 AVX has
1977 // 256-bit registers, but it does not support integer operations
1978 // at that width (that requires AVX2).
1979 if (MaxVectorRegSizeOption.getNumOccurrences())
1980 MaxVecRegSize = MaxVectorRegSizeOption;
1981 else
1982 MaxVecRegSize =
1983 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1984 .getFixedValue();
1985
1986 if (MinVectorRegSizeOption.getNumOccurrences())
1987 MinVecRegSize = MinVectorRegSizeOption;
1988 else
1989 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1990 }
1991
1992 /// Vectorize the tree that starts with the elements in \p VL.
1993 /// Returns the vectorized root.
1994 Value *vectorizeTree();
1995
1996 /// Vectorize the tree but with the list of externally used values \p
1997 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1998 /// generated extractelement instructions.
1999 Value *vectorizeTree(
2000 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2001 Instruction *ReductionRoot = nullptr,
2002 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2003
2004 /// \returns the cost incurred by unwanted spills and fills, caused by
2005 /// holding live values over call sites.
2007
2008 /// \returns the vectorization cost of the subtree that starts at \p VL.
2009 /// A negative number means that this is profitable.
2010 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2011 InstructionCost ReductionCost = TTI::TCC_Free);
2012
2013 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2014 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2015 void buildTree(ArrayRef<Value *> Roots,
2016 const SmallDenseSet<Value *> &UserIgnoreLst);
2017
2018 /// Construct a vectorizable tree that starts at \p Roots.
2019 void buildTree(ArrayRef<Value *> Roots);
2020
2021 /// Return the scalars of the root node.
2023 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2024 return VectorizableTree.front()->Scalars;
2025 }
2026
2027 /// Returns the type/is-signed info for the root node in the graph without
2028 /// casting.
2029 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2030 const TreeEntry &Root = *VectorizableTree.front();
2031 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2032 !Root.Scalars.front()->getType()->isIntegerTy())
2033 return std::nullopt;
2034 auto It = MinBWs.find(&Root);
2035 if (It != MinBWs.end())
2036 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2037 It->second.first),
2038 It->second.second);
2039 if (Root.getOpcode() == Instruction::ZExt ||
2040 Root.getOpcode() == Instruction::SExt)
2041 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2042 Root.getOpcode() == Instruction::SExt);
2043 return std::nullopt;
2044 }
2045
2046 /// Checks if the root graph node can be emitted with narrower bitwidth at
2047 /// codegen and returns its signedness, if so.
2048 bool isSignedMinBitwidthRootNode() const {
2049 return MinBWs.at(VectorizableTree.front().get()).second;
2050 }
2051
2052 /// Returns the reduction type after minbitwidth analysis.
2053 FixedVectorType *getReductionType() const {
2054 if (ReductionBitWidth == 0 ||
2055 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2056 ReductionBitWidth >=
2057 DL->getTypeSizeInBits(
2058 VectorizableTree.front()->Scalars.front()->getType()))
2059 return getWidenedType(
2060 VectorizableTree.front()->Scalars.front()->getType(),
2061 VectorizableTree.front()->getVectorFactor());
2062 return getWidenedType(
2063 IntegerType::get(
2064 VectorizableTree.front()->Scalars.front()->getContext(),
2065 ReductionBitWidth),
2066 VectorizableTree.front()->getVectorFactor());
2067 }
2068
2069 /// Builds external uses of the vectorized scalars, i.e. the list of
2070 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2071 /// ExternallyUsedValues contains an additional list of external uses to handle
2072 /// vectorization of reductions.
2073 void
2074 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2075
2076 /// Transforms graph nodes to target specific representations, if profitable.
2077 void transformNodes();
2078
2079 /// Clear the internal data structures that are created by 'buildTree'.
2080 void deleteTree() {
2081 VectorizableTree.clear();
2082 ScalarToTreeEntries.clear();
2083 OperandsToTreeEntry.clear();
2084 ScalarsInSplitNodes.clear();
2085 MustGather.clear();
2086 NonScheduledFirst.clear();
2087 EntryToLastInstruction.clear();
2088 LoadEntriesToVectorize.clear();
2089 IsGraphTransformMode = false;
2090 GatheredLoadsEntriesFirst.reset();
2091 CompressEntryToData.clear();
2092 ExternalUses.clear();
2093 ExternalUsesAsOriginalScalar.clear();
2094 ExternalUsesWithNonUsers.clear();
2095 for (auto &Iter : BlocksSchedules) {
2096 BlockScheduling *BS = Iter.second.get();
2097 BS->clear();
2098 }
2099 MinBWs.clear();
2100 ReductionBitWidth = 0;
2101 BaseGraphSize = 1;
2102 CastMaxMinBWSizes.reset();
2103 ExtraBitWidthNodes.clear();
2104 InstrElementSize.clear();
2105 UserIgnoreList = nullptr;
2106 PostponedGathers.clear();
2107 ValueToGatherNodes.clear();
2108 TreeEntryToStridedPtrInfoMap.clear();
2109 }
2110
2111 unsigned getTreeSize() const { return VectorizableTree.size(); }
2112
2113 /// Returns the base graph size, before any transformations.
2114 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2115
2116 /// Perform LICM and CSE on the newly generated gather sequences.
2117 void optimizeGatherSequence();
2118
2119 /// Does this non-empty order represent an identity order? Identity
2120 /// should be represented as an empty order, so this is used to
2121 /// decide if we can canonicalize a computed order. Undef elements
2122 /// (represented as size) are ignored.
2124 assert(!Order.empty() && "expected non-empty order");
2125 const unsigned Sz = Order.size();
2126 return all_of(enumerate(Order), [&](const auto &P) {
2127 return P.value() == P.index() || P.value() == Sz;
2128 });
2129 }
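// E.g., with Sz = 4, both {0, 1, 2, 3} and {0, Sz, 2, Sz} count as identity,
// because out-of-bounds entries (== Sz) mark undef positions and are ignored.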
2130
2131 /// Checks if the specified gather tree entry \p TE can be represented as a
2132 /// shuffled vector entry + (possibly) permutation with other gathers. It
2133 /// implements the checks only for possibly ordered scalars (Loads,
2134 /// ExtractElement, ExtractValue), which can be part of the graph.
2135 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2136 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2137 /// node might be ignored.
2138 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2139 bool TopToBottom,
2140 bool IgnoreReorder);
2141
2142 /// Sort loads into increasing pointers offsets to allow greater clustering.
2143 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2144
2145 /// Gets reordering data for the given tree entry. If the entry is vectorized
2146 /// - just return ReorderIndices, otherwise check if the scalars can be
2147 /// reordered and return the most optimal order.
2148 /// \return std::nullopt if ordering is not important, empty order, if
2149 /// identity order is important, or the actual order.
2150 /// \param TopToBottom If true, include the order of vectorized stores and
2151 /// insertelement nodes, otherwise skip them.
2152 /// \param IgnoreReorder true, if the root node order can be ignored.
2153 std::optional<OrdersType>
2154 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2155
2156 /// Checks if it is profitable to reorder the current tree.
2157 /// If the tree does not contain many profitable reordable nodes, better to
2158 /// skip it to save compile time.
2159 bool isProfitableToReorder() const;
2160
2161 /// Reorders the current graph to the most profitable order starting from the
2162 /// root node to the leaf nodes. The best order is chosen only from the nodes
2163 /// of the same size (vectorization factor). Smaller nodes are considered
2164 /// parts of subgraph with smaller VF and they are reordered independently. We
2165 /// can make it because we still need to extend smaller nodes to the wider VF
2166 /// and we can merge reordering shuffles with the widening shuffles.
2167 void reorderTopToBottom();
2168
2169 /// Reorders the current graph to the most profitable order starting from
2170 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2171 /// number of reshuffles if the leaf nodes use the same order. In this case we
2172 /// can merge the orders and just shuffle user node instead of shuffling its
2173 /// operands. Plus, even the leaf nodes have different orders, it allows to
2174 /// sink reordering in the graph closer to the root node and merge it later
2175 /// during analysis.
2176 void reorderBottomToTop(bool IgnoreReorder = false);
2177
2178 /// \return The vector element size in bits to use when vectorizing the
2179 /// expression tree ending at \p V. If V is a store, the size is the width of
2180 /// the stored value. Otherwise, the size is the width of the largest loaded
2181 /// value reaching V. This method is used by the vectorizer to calculate
2182 /// vectorization factors.
2183 unsigned getVectorElementSize(Value *V);
2184
2185 /// Compute the minimum type sizes required to represent the entries in a
2186 /// vectorizable tree.
2188
2189 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2190 unsigned getMaxVecRegSize() const {
2191 return MaxVecRegSize;
2192 }
2193
2194 // \returns minimum vector register size as set by cl::opt.
2195 unsigned getMinVecRegSize() const {
2196 return MinVecRegSize;
2197 }
2198
2199 unsigned getMinVF(unsigned Sz) const {
2200 return std::max(2U, getMinVecRegSize() / Sz);
2201 }
2202
2203 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2204 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2205 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2206 return MaxVF ? MaxVF : UINT_MAX;
2207 }
2208
2209 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2210 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2211 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2212 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2213 ///
2214 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2215 unsigned canMapToVector(Type *T) const;
2216
2217 /// \returns True if the VectorizableTree is both tiny and not fully
2218 /// vectorizable. We do not vectorize such trees.
2219 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2220
2221 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2222 /// It may happen, if all gather nodes are loads and they cannot be
2223 /// "clusterized". In this case even subgraphs cannot be vectorized more
2224 /// effectively than the base graph.
2225 bool isTreeNotExtendable() const;
2226
2227 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2228 /// can be load combined in the backend. Load combining may not be allowed in
2229 /// the IR optimizer, so we do not want to alter the pattern. For example,
2230 /// partially transforming a scalar bswap() pattern into vector code is
2231 /// effectively impossible for the backend to undo.
2232 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2233 /// may not be necessary.
2234 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2235
2236 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2237 /// can be load combined in the backend. Load combining may not be allowed in
2238 /// the IR optimizer, so we do not want to alter the pattern. For example,
2239 /// partially transforming a scalar bswap() pattern into vector code is
2240 /// effectively impossible for the backend to undo.
2241 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2242 /// may not be necessary.
2243 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2244 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2245 Align Alignment, const int64_t Diff, Value *Ptr0,
2246 Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2247
2248 /// Return true if an array of scalar loads can be replaced with a strided
2249 /// load (with run-time stride).
2250 /// \param PointerOps list of pointer arguments of loads.
2251 /// \param ScalarTy type of loads.
2252 /// \param CommonAlignment common alignment of loads as computed by
2253 /// `computeCommonAlignment<LoadInst>`.
2254 /// \param SortedIndices is a list of indices computed by this function such
2255 /// that the sequence `PointerOps[SortedIndices[0]],
2256 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2257 /// ordered by the coefficient of the stride. For example, if PointerOps is
2258 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2259 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` would be
2260 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2261 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2262 /// of `SPtrInfo` necessary to generate the strided load later.
2263 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2264 Align CommonAlignment,
2265 SmallVectorImpl<unsigned> &SortedIndices,
2266 StridedPtrInfo &SPtrInfo) const;
2267
2268 /// Checks if the given array of loads can be represented as a vectorized,
2269 /// scatter or just simple gather.
2270 /// \param VL list of loads.
2271 /// \param VL0 main load value.
2272 /// \param Order returned order of load instructions.
2273 /// \param PointerOps returned list of pointer operands.
2274 /// \param BestVF return best vector factor, if recursive check found better
2275 /// vectorization sequences rather than masked gather.
2276 /// \param TryRecursiveCheck used to check if long masked gather can be
2277 /// represented as a series of loads/insert subvector, if profitable.
2278 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2279 SmallVectorImpl<unsigned> &Order,
2280 SmallVectorImpl<Value *> &PointerOps,
2281 StridedPtrInfo &SPtrInfo,
2282 unsigned *BestVF = nullptr,
2283 bool TryRecursiveCheck = true) const;
2284
2285 /// Registers non-vectorizable sequence of loads
2286 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2287 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2288 }
2289
2290 /// Checks if the given sequence of loads is known to be non-vectorizable.
2291 template <typename T>
2293 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2294 }
2295
2297
2298 /// This structure holds any data we need about the edges being traversed
2299 /// during buildTreeRec(). We keep track of:
2300 /// (i) the user TreeEntry index, and
2301 /// (ii) the index of the edge.
2302 struct EdgeInfo {
2303 EdgeInfo() = default;
2304 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2305 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2306 /// The user TreeEntry.
2307 TreeEntry *UserTE = nullptr;
2308 /// The operand index of the use.
2309 unsigned EdgeIdx = UINT_MAX;
2310#ifndef NDEBUG
2311 friend inline raw_ostream &operator<<(raw_ostream &OS,
2312 const BoUpSLP::EdgeInfo &EI) {
2313 EI.dump(OS);
2314 return OS;
2315 }
2316 /// Debug print.
2317 void dump(raw_ostream &OS) const {
2318 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2319 << " EdgeIdx:" << EdgeIdx << "}";
2320 }
2321 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2322#endif
2323 bool operator == (const EdgeInfo &Other) const {
2324 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2325 }
2326
2327 operator bool() const { return UserTE != nullptr; }
2328 };
2329 friend struct DenseMapInfo<EdgeInfo>;
2330
2331 /// A helper class used for scoring candidates for two consecutive lanes.
2332 class LookAheadHeuristics {
2333 const TargetLibraryInfo &TLI;
2334 const DataLayout &DL;
2335 ScalarEvolution &SE;
2336 const BoUpSLP &R;
2337 int NumLanes; // Total number of lanes (aka vectorization factor).
2338 int MaxLevel; // The maximum recursion depth for accumulating score.
2339
2340 public:
2341 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2342 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2343 int MaxLevel)
2344 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2345 MaxLevel(MaxLevel) {}
2346
2347 // The hard-coded scores listed here are not very important, though it shall
2348 // be higher for better matches to improve the resulting cost. When
2349 // computing the scores of matching one sub-tree with another, we are
2350 // basically counting the number of values that are matching. So even if all
2351 // scores are set to 1, we would still get a decent matching result.
2352 // However, sometimes we have to break ties. For example we may have to
2353 // choose between matching loads vs matching opcodes. This is what these
2354 // scores are helping us with: they provide the order of preference. Also,
2355 // this is important if the scalar is externally used or used in another
2356 // tree entry node in the different lane.
2357
2358 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2359 static const int ScoreConsecutiveLoads = 4;
2360 /// The same load multiple times. This should have a better score than
2361 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2362 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2363 /// for a vector load plus 1.0 for a broadcast.
2364 static const int ScoreSplatLoads = 3;
2365 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2366 static const int ScoreReversedLoads = 3;
2367 /// A load candidate for masked gather.
2368 static const int ScoreMaskedGatherCandidate = 1;
2369 /// ExtractElementInst from same vector and consecutive indexes.
2370 static const int ScoreConsecutiveExtracts = 4;
2371 /// ExtractElementInst from same vector and reversed indices.
2372 static const int ScoreReversedExtracts = 3;
2373 /// Constants.
2374 static const int ScoreConstants = 2;
2375 /// Instructions with the same opcode.
2376 static const int ScoreSameOpcode = 2;
2377 /// Instructions with alt opcodes (e.g, add + sub).
2378 static const int ScoreAltOpcodes = 1;
2379 /// Identical instructions (a.k.a. splat or broadcast).
2380 static const int ScoreSplat = 1;
2381 /// Matching with an undef is preferable to failing.
2382 static const int ScoreUndef = 1;
2383 /// Score for failing to find a decent match.
2384 static const int ScoreFail = 0;
2385 /// Score if all users are vectorized.
2386 static const int ScoreAllUserVectorized = 1;
2387
2388 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2389 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2390 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2391 /// MainAltOps.
2392 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2393 ArrayRef<Value *> MainAltOps) const {
2394 if (!isValidElementType(V1->getType()) ||
2395 !isValidElementType(V2->getType()))
2396 return LookAheadHeuristics::ScoreFail;
2397
2398 if (V1 == V2) {
2399 if (isa<LoadInst>(V1)) {
2400 // Returns true if the users of V1 and V2 won't need to be extracted.
2401 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2402 // Bail out if we have too many uses to save compilation time.
2403 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2404 return false;
2405
2406 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2407 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2408 return U == U1 || U == U2 || R.isVectorized(U);
2409 });
2410 };
2411 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2412 };
2413 // A broadcast of a load can be cheaper on some targets.
2414 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2415 ElementCount::getFixed(NumLanes)) &&
2416 ((int)V1->getNumUses() == NumLanes ||
2417 AllUsersAreInternal(V1, V2)))
2418 return LookAheadHeuristics::ScoreSplatLoads;
2419 }
2420 return LookAheadHeuristics::ScoreSplat;
2421 }
2422
2423 auto CheckSameEntryOrFail = [&]() {
2424 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2425 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2426 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2427 !TEs2.empty() &&
2428 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2429 return LookAheadHeuristics::ScoreSplatLoads;
2430 }
2431 return LookAheadHeuristics::ScoreFail;
2432 };
2433
2434 auto *LI1 = dyn_cast<LoadInst>(V1);
2435 auto *LI2 = dyn_cast<LoadInst>(V2);
2436 if (LI1 && LI2) {
2437 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2438 !LI2->isSimple())
2439 return CheckSameEntryOrFail();
2440
2441 std::optional<int64_t> Dist = getPointersDiff(
2442 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2443 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2444 if (!Dist || *Dist == 0) {
2445 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2446 getUnderlyingObject(LI2->getPointerOperand()) &&
2447 R.TTI->isLegalMaskedGather(
2448 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2449 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2450 return CheckSameEntryOrFail();
2451 }
2452 // The distance is too large - still may be profitable to use masked
2453 // loads/gathers.
2454 if (std::abs(*Dist) > NumLanes / 2)
2455 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2456 // This still will detect consecutive loads, but we might have "holes"
2457 // in some cases. It is ok for non-power-2 vectorization and may produce
2458 // better results. It should not affect current vectorization.
2459 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2460 : LookAheadHeuristics::ScoreReversedLoads;
2461 }
2462
2463 auto *C1 = dyn_cast<Constant>(V1);
2464 auto *C2 = dyn_cast<Constant>(V2);
2465 if (C1 && C2)
2466 return LookAheadHeuristics::ScoreConstants;
2467
2468 // Consider constants and buildvector compatible.
2469 if ((C1 && isa<InsertElementInst>(V2)) ||
2470 (C2 && isa<InsertElementInst>(V1)))
2471 return LookAheadHeuristics::ScoreConstants;
2472
2473 // Extracts from consecutive indexes of the same vector better score as
2474 // the extracts could be optimized away.
2475 Value *EV1;
2476 ConstantInt *Ex1Idx;
2477 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2478 // Undefs are always profitable for extractelements.
2479 // Compiler can easily combine poison and extractelement <non-poison> or
2480 // undef and extractelement <poison>. But combining undef +
2481 // extractelement <non-poison-but-may-produce-poison> requires some
2482 // extra operations.
2483 if (isa<UndefValue>(V2))
2484 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2485 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2486 : LookAheadHeuristics::ScoreSameOpcode;
2487 Value *EV2 = nullptr;
2488 ConstantInt *Ex2Idx = nullptr;
2489 if (match(V2,
2490 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2491 m_Undef())))) {
2492 // Undefs are always profitable for extractelements.
2493 if (!Ex2Idx)
2494 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2495 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2496 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2497 if (EV2 == EV1) {
2498 int Idx1 = Ex1Idx->getZExtValue();
2499 int Idx2 = Ex2Idx->getZExtValue();
2500 int Dist = Idx2 - Idx1;
2501 // The distance is too large - still may be profitable to use
2502 // shuffles.
2503 if (std::abs(Dist) == 0)
2504 return LookAheadHeuristics::ScoreSplat;
2505 if (std::abs(Dist) > NumLanes / 2)
2506 return LookAheadHeuristics::ScoreSameOpcode;
2507 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2508 : LookAheadHeuristics::ScoreReversedExtracts;
2509 }
2510 return LookAheadHeuristics::ScoreAltOpcodes;
2511 }
2512 return CheckSameEntryOrFail();
2513 }
2514
2515 auto *I1 = dyn_cast<Instruction>(V1);
2516 auto *I2 = dyn_cast<Instruction>(V2);
2517 if (I1 && I2) {
2518 if (I1->getParent() != I2->getParent())
2519 return CheckSameEntryOrFail();
2520 SmallVector<Value *, 4> Ops(MainAltOps);
2521 Ops.push_back(I1);
2522 Ops.push_back(I2);
2523 InstructionsState S = getSameOpcode(Ops, TLI);
2524 // Note: Only consider instructions with <= 2 operands to avoid
2525 // complexity explosion.
2526 if (S &&
2527 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2528 !S.isAltShuffle()) &&
2529 all_of(Ops, [&S](Value *V) {
2530 return isa<PoisonValue>(V) ||
2531 cast<Instruction>(V)->getNumOperands() ==
2532 S.getMainOp()->getNumOperands();
2533 }))
2534 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2535 : LookAheadHeuristics::ScoreSameOpcode;
2536 }
2537
2538 if (I1 && isa<PoisonValue>(V2))
2539 return LookAheadHeuristics::ScoreSameOpcode;
2540
2541 if (isa<UndefValue>(V2))
2542 return LookAheadHeuristics::ScoreUndef;
2543
2544 return CheckSameEntryOrFail();
2545 }
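// E.g., loads of A[i] and A[i+1] placed in adjacent lanes score
// ScoreConsecutiveLoads, a pair of constants scores ScoreConstants, and two
// different instructions sharing an opcode score ScoreSameOpcode; values that
// cannot be paired fall back to CheckSameEntryOrFail().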
2546
2547 /// Go through the operands of \p LHS and \p RHS recursively until
2548 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2549 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2550 /// of \p U1 and \p U2), except at the beginning of the recursion where
2551 /// these are set to nullptr.
2552 ///
2553 /// For example:
2554 /// \verbatim
2555 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2556 /// \ / \ / \ / \ /
2557 /// + + + +
2558 /// G1 G2 G3 G4
2559 /// \endverbatim
2560 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2561 /// each level recursively, accumulating the score. It starts from matching
2562 /// the additions at level 0, then moves on to the loads (level 1). The
2563 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2564 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2565 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2566 /// Please note that the order of the operands does not matter, as we
2567 /// evaluate the score of all profitable combinations of operands. In
2568 /// other words the score of G1 and G4 is the same as G1 and G2. This
2569 /// heuristic is based on ideas described in:
2570 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2571 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2572 /// Luís F. W. Góes
2573 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2574 Instruction *U2, int CurrLevel,
2575 ArrayRef<Value *> MainAltOps) const {
2576
2577 // Get the shallow score of V1 and V2.
2578 int ShallowScoreAtThisLevel =
2579 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2580
2581 // If reached MaxLevel,
2582 // or if V1 and V2 are not instructions,
2583 // or if they are SPLAT,
2584 // or if they are not consecutive,
2585 // or if profitable to vectorize loads or extractelements, early return
2586 // the current cost.
2587 auto *I1 = dyn_cast<Instruction>(LHS);
2588 auto *I2 = dyn_cast<Instruction>(RHS);
2589 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2590 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2591 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2592 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2593 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2594 ShallowScoreAtThisLevel))
2595 return ShallowScoreAtThisLevel;
2596 assert(I1 && I2 && "Should have early exited.");
2597
2598 // Contains the I2 operand indexes that got matched with I1 operands.
2599 SmallSet<unsigned, 4> Op2Used;
2600
2601 // Recursion towards the operands of I1 and I2. We are trying all possible
2602 // operand pairs, and keeping track of the best score.
2603 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2604 OpIdx1 != NumOperands1; ++OpIdx1) {
2605 // Try to pair op1I with the best operand of I2.
2606 int MaxTmpScore = 0;
2607 unsigned MaxOpIdx2 = 0;
2608 bool FoundBest = false;
2609 // If I2 is commutative try all combinations.
2610 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2611 unsigned ToIdx = isCommutative(I2)
2612 ? I2->getNumOperands()
2613 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2614 assert(FromIdx <= ToIdx && "Bad index");
2615 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2616 // Skip operands already paired with OpIdx1.
2617 if (Op2Used.count(OpIdx2))
2618 continue;
2619 // Recursively calculate the cost at each level
2620 int TmpScore =
2621 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2622 I1, I2, CurrLevel + 1, {});
2623 // Look for the best score.
2624 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2625 TmpScore > MaxTmpScore) {
2626 MaxTmpScore = TmpScore;
2627 MaxOpIdx2 = OpIdx2;
2628 FoundBest = true;
2629 }
2630 }
2631 if (FoundBest) {
2632 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2633 Op2Used.insert(MaxOpIdx2);
2634 ShallowScoreAtThisLevel += MaxTmpScore;
2635 }
2636 }
2637 return ShallowScoreAtThisLevel;
2638 }
2639 };
2640 /// A helper data structure to hold the operands of a vector of instructions.
2641 /// This supports a fixed vector length for all operand vectors.
2643 /// For each operand we need (i) the value, and (ii) the opcode that it
2644 /// would be attached to if the expression was in a left-linearized form.
2645 /// This is required to avoid illegal operand reordering.
2646 /// For example:
2647 /// \verbatim
2648 /// 0 Op1
2649 /// |/
2650 /// Op1 Op2 Linearized + Op2
2651 /// \ / ----------> |/
2652 /// - -
2653 ///
2654 /// Op1 - Op2 (0 + Op1) - Op2
2655 /// \endverbatim
2656 ///
2657 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2658 ///
2659 /// Another way to think of this is to track all the operations across the
2660 /// path from the operand all the way to the root of the tree and to
2661 /// calculate the operation that corresponds to this path. For example, the
2662 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2663 /// corresponding operation is a '-' (which matches the one in the
2664 /// linearized tree, as shown above).
2665 ///
2666 /// For lack of a better term, we refer to this operation as Accumulated
2667 /// Path Operation (APO).
2668 struct OperandData {
2669 OperandData() = default;
2670 OperandData(Value *V, bool APO, bool IsUsed)
2671 : V(V), APO(APO), IsUsed(IsUsed) {}
2672 /// The operand value.
2673 Value *V = nullptr;
2674 /// TreeEntries only allow a single opcode, or an alternate sequence of
2675 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2676 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2677 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2678 /// (e.g., Add/Mul)
2679 bool APO = false;
2680 /// Helper data for the reordering function.
2681 bool IsUsed = false;
2682 };
2683
2684 /// During operand reordering, we are trying to select the operand at lane
2685 /// that matches best with the operand at the neighboring lane. Our
2686 /// selection is based on the type of value we are looking for. For example,
2687 /// if the neighboring lane has a load, we need to look for a load that is
2688 /// accessing a consecutive address. These strategies are summarized in the
2689 /// 'ReorderingMode' enumerator.
2690 enum class ReorderingMode {
2691 Load, ///< Matching loads to consecutive memory addresses
2692 Opcode, ///< Matching instructions based on opcode (same or alternate)
2693 Constant, ///< Matching constants
2694 Splat, ///< Matching the same instruction multiple times (broadcast)
2695 Failed, ///< We failed to create a vectorizable group
2696 };
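// For example, an operand index whose values are loads is reordered in Load
// mode (preferring loads from consecutive addresses in neighboring lanes),
// while an operand index holding constants uses Constant mode.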
2697
2698 using OperandDataVec = SmallVector<OperandData, 2>;
2699
2700 /// A vector of operand vectors.
2702 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2703 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2704 unsigned ArgSize = 0;
2705
2706 const TargetLibraryInfo &TLI;
2707 const DataLayout &DL;
2708 ScalarEvolution &SE;
2709 const BoUpSLP &R;
2710 const Loop *L = nullptr;
2711
2712 /// \returns the operand data at \p OpIdx and \p Lane.
2713 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2714 return OpsVec[OpIdx][Lane];
2715 }
2716
2717 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2718 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2719 return OpsVec[OpIdx][Lane];
2720 }
2721
2722 /// Clears the used flag for all entries.
2723 void clearUsed() {
2724 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2725 OpIdx != NumOperands; ++OpIdx)
2726 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2727 ++Lane)
2728 OpsVec[OpIdx][Lane].IsUsed = false;
2729 }
2730
2731 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2732 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2733 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2734 }
2735
2736 /// \param Lane lane of the operands under analysis.
2737 /// \param OpIdx operand index in \p Lane lane we're looking the best
2738 /// candidate for.
2739 /// \param Idx operand index of the current candidate value.
2740 /// \returns The additional score due to possible broadcasting of the
2741 /// elements in the lane. It is more profitable to have a power-of-2 number of
2742 /// unique elements in the lane, since they will be vectorized with higher
2743 /// probability after removing duplicates. Currently the SLP vectorizer
2744 /// supports only vectorization of a power-of-2 number of unique scalars.
2745 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2746 const SmallBitVector &UsedLanes) const {
2747 Value *IdxLaneV = getData(Idx, Lane).V;
2748 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2749 isa<ExtractElementInst>(IdxLaneV))
2750 return 0;
2752 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2753 if (Ln == Lane)
2754 continue;
2755 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2756 if (!isa<Instruction>(OpIdxLnV))
2757 return 0;
2758 Uniques.try_emplace(OpIdxLnV, Ln);
2759 }
2760 unsigned UniquesCount = Uniques.size();
2761 auto IdxIt = Uniques.find(IdxLaneV);
2762 unsigned UniquesCntWithIdxLaneV =
2763 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2764 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2765 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2766 unsigned UniquesCntWithOpIdxLaneV =
2767 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2768 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2769 return 0;
2770 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2771 UniquesCntWithOpIdxLaneV,
2772 UniquesCntWithOpIdxLaneV -
2773 bit_floor(UniquesCntWithOpIdxLaneV)) -
2774 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2775 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2776 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2777 }
2778
2779 /// \param Lane lane of the operands under analysis.
2780 /// \param OpIdx operand index in \p Lane lane we're looking the best
2781 /// candidate for.
2782 /// \param Idx operand index of the current candidate value.
2783 /// \returns The additional score for the scalar which users are all
2784 /// vectorized.
2785 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2786 Value *IdxLaneV = getData(Idx, Lane).V;
2787 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2788 // Do not care about number of uses for vector-like instructions
2789 // (extractelement/extractvalue with constant indices), they are extracts
2790 // themselves and already externally used. Vectorization of such
2791 // instructions does not add extra extractelement instruction, just may
2792 // remove it.
2793 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2794 isVectorLikeInstWithConstOps(OpIdxLaneV))
2796 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2797 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2798 return 0;
2799 return R.areAllUsersVectorized(IdxLaneI)
2801 : 0;
2802 }
2803
2804 /// Score scaling factor for fully compatible instructions but with
2805 /// different number of external uses. Allows better selection of the
2806 /// instructions with less external uses.
2807 static const int ScoreScaleFactor = 10;
2808
2809 /// \Returns the look-ahead score, which tells us how much the sub-trees
2810 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2811 /// score. This helps break ties in an informed way when we cannot decide on
2812 /// the order of the operands by just considering the immediate
2813 /// predecessors.
2814 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2815 int Lane, unsigned OpIdx, unsigned Idx,
2816 bool &IsUsed, const SmallBitVector &UsedLanes) {
2817 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2818 LookAheadMaxDepth);
2819 // Keep track of the instruction stack as we recurse into the operands
2820 // during the look-ahead score exploration.
2821 int Score =
2822 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2823 /*CurrLevel=*/1, MainAltOps);
2824 if (Score) {
2825 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2826 if (Score <= -SplatScore) {
2827 // Failed score.
2828 Score = 0;
2829 } else {
2830 Score += SplatScore;
2831 // Scale score to see the difference between different operands
2832 // and similar operands but all vectorized/not all vectorized
2833 // uses. It does not affect actual selection of the best
2834 // compatible operand in general, just allows to select the
2835 // operand with all vectorized uses.
2836 Score *= ScoreScaleFactor;
2837 Score += getExternalUseScore(Lane, OpIdx, Idx);
2838 IsUsed = true;
2839 }
2840 }
2841 return Score;
2842 }
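// E.g., a raw look-ahead score of 4 with a splat bonus of 1 becomes
// (4 + 1) * ScoreScaleFactor = 50, plus 1 more when all users of the candidate
// are already vectorized (see getExternalUseScore).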
2843
2844 /// Best defined scores per lanes between the passes. Used to choose the
2845 /// best operand (with the highest score) between the passes.
2846 /// The key - {Operand Index, Lane}.
2847 /// The value - the best score between the passes for the lane and the
2848 /// operand.
2850 BestScoresPerLanes;
2851
2852 // Search all operands in Ops[*][Lane] for the one that matches best
2853 // Ops[OpIdx][LastLane] and return its operand index.
2854 // If no good match can be found, return std::nullopt.
2855 std::optional<unsigned>
2856 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2857 ArrayRef<ReorderingMode> ReorderingModes,
2858 ArrayRef<Value *> MainAltOps,
2859 const SmallBitVector &UsedLanes) {
2860 unsigned NumOperands = getNumOperands();
2861
2862 // The operand of the previous lane at OpIdx.
2863 Value *OpLastLane = getData(OpIdx, LastLane).V;
2864
2865 // Our strategy mode for OpIdx.
2866 ReorderingMode RMode = ReorderingModes[OpIdx];
2867 if (RMode == ReorderingMode::Failed)
2868 return std::nullopt;
2869
2870 // The linearized opcode of the operand at OpIdx, Lane.
2871 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2872
2873 // The best operand index and its score.
2874 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2875 // are using the score to differentiate between the two.
2876 struct BestOpData {
2877 std::optional<unsigned> Idx;
2878 unsigned Score = 0;
2879 } BestOp;
2880 BestOp.Score =
2881 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2882 .first->second;
2883
2884 // Track if the operand must be marked as used. If the operand is set to
2885 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2886 // want to reestimate the operands again on the following iterations).
2887 bool IsUsed = RMode == ReorderingMode::Splat ||
2888 RMode == ReorderingMode::Constant ||
2889 RMode == ReorderingMode::Load;
2890 // Iterate through all unused operands and look for the best.
2891 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2892 // Get the operand at Idx and Lane.
2893 OperandData &OpData = getData(Idx, Lane);
2894 Value *Op = OpData.V;
2895 bool OpAPO = OpData.APO;
2896
2897 // Skip already selected operands.
2898 if (OpData.IsUsed)
2899 continue;
2900
2901 // Skip if we are trying to move the operand to a position with a
2902 // different opcode in the linearized tree form. This would break the
2903 // semantics.
2904 if (OpAPO != OpIdxAPO)
2905 continue;
2906
2907 // Look for an operand that matches the current mode.
2908 switch (RMode) {
2909 case ReorderingMode::Load:
2910 case ReorderingMode::Opcode: {
2911 bool LeftToRight = Lane > LastLane;
2912 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2913 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2914 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2915 OpIdx, Idx, IsUsed, UsedLanes);
2916 if (Score > static_cast<int>(BestOp.Score) ||
2917 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2918 Idx == OpIdx)) {
2919 BestOp.Idx = Idx;
2920 BestOp.Score = Score;
2921 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2922 }
2923 break;
2924 }
2925 case ReorderingMode::Constant:
2926 if (isa<Constant>(Op) ||
2927 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2928 BestOp.Idx = Idx;
2929 if (isa<Constant>(Op)) {
2930 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2931 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2932 LookAheadHeuristics::ScoreConstants;
2933 }
2934 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2935 IsUsed = false;
2936 }
2937 break;
2937 break;
2938 case ReorderingMode::Splat:
2939 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2940 IsUsed = Op == OpLastLane;
2941 if (Op == OpLastLane) {
2942 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2943 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2944 LookAheadHeuristics::ScoreSplat;
2945 }
2946 BestOp.Idx = Idx;
2947 }
2948 break;
2949 case ReorderingMode::Failed:
2950 llvm_unreachable("Not expected Failed reordering mode.");
2951 }
2952 }
2953
2954 if (BestOp.Idx) {
2955 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2956 return BestOp.Idx;
2957 }
2958 // If we could not find a good match return std::nullopt.
2959 return std::nullopt;
2960 }
2961
2962 /// Helper for reorderOperandVecs.
2963 /// \returns the lane that we should start reordering from. This is the one
2964 /// which has the least number of operands that can freely move about, or is
2965 /// least profitable because it already has the most optimal set of operands.
2966 unsigned getBestLaneToStartReordering() const {
2967 unsigned Min = UINT_MAX;
2968 unsigned SameOpNumber = 0;
2969 // std::pair<unsigned, unsigned> is used to implement a simple voting
2970 // algorithm and choose the lane with the least number of operands that
2971 // can freely move about, or is less profitable because it already has the
2972 // most optimal set of operands. The first unsigned is a counter for
2973 // voting, the second unsigned is the counter of lanes with instructions
2974 // with same/alternate opcodes and same parent basic block.
2976 // Try to be closer to the original results, if we have multiple lanes
2977 // with same cost. If 2 lanes have the same cost, use the one with the
2978 // highest index.
2979 for (int I = getNumLanes(); I > 0; --I) {
2980 unsigned Lane = I - 1;
2981 OperandsOrderData NumFreeOpsHash =
2982 getMaxNumOperandsThatCanBeReordered(Lane);
2983 // Compare the number of operands that can move and choose the one with
2984 // the least number.
2985 if (NumFreeOpsHash.NumOfAPOs < Min) {
2986 Min = NumFreeOpsHash.NumOfAPOs;
2987 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2988 HashMap.clear();
2989 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2990 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2991 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2992 // Select the most optimal lane in terms of number of operands that
2993 // should be moved around.
2994 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2995 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2996 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2997 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2998 auto [It, Inserted] =
2999 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3000 if (!Inserted)
3001 ++It->second.first;
3002 }
3003 }
3004 // Select the lane with the minimum counter.
3005 unsigned BestLane = 0;
3006 unsigned CntMin = UINT_MAX;
3007 for (const auto &Data : reverse(HashMap)) {
3008 if (Data.second.first < CntMin) {
3009 CntMin = Data.second.first;
3010 BestLane = Data.second.second;
3011 }
3012 }
3013 return BestLane;
3014 }
3015
3016 /// Data structure that helps to reorder operands.
3017 struct OperandsOrderData {
3018 /// The best number of operands with the same APOs, which can be
3019 /// reordered.
3020 unsigned NumOfAPOs = UINT_MAX;
3021 /// Number of operands with the same/alternate instruction opcode and
3022 /// parent.
3023 unsigned NumOpsWithSameOpcodeParent = 0;
3024 /// Hash for the actual operands ordering.
3025 /// Used to count operands, actually their position id and opcode
3026 /// value. It is used in the voting mechanism to find the lane with the
3027     /// least number of operands that can freely move about, or is less
3028     /// profitable because it already has the most optimal set of operands. It
3029     /// could be replaced with a SmallVector<unsigned>, but a hash code is
3030     /// faster and requires less memory.
3031 unsigned Hash = 0;
3032 };
3033 /// \returns the maximum number of operands that are allowed to be reordered
3034   /// for \p Lane and the number of compatible instructions (with the same
3035 /// parent/opcode). This is used as a heuristic for selecting the first lane
3036 /// to start operand reordering.
3037 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3038 unsigned CntTrue = 0;
3039 unsigned NumOperands = getNumOperands();
3040 // Operands with the same APO can be reordered. We therefore need to count
3041 // how many of them we have for each APO, like this: Cnt[APO] = x.
3042 // Since we only have two APOs, namely true and false, we can avoid using
3043 // a map. Instead we can simply count the number of operands that
3044 // correspond to one of them (in this case the 'true' APO), and calculate
3045 // the other by subtracting it from the total number of operands.
3046 // Operands with the same instruction opcode and parent are more
3047 // profitable since we don't need to move them in many cases, with a high
3048 // probability such lane already can be vectorized effectively.
3049 bool AllUndefs = true;
3050 unsigned NumOpsWithSameOpcodeParent = 0;
3051 Instruction *OpcodeI = nullptr;
3052 BasicBlock *Parent = nullptr;
3053 unsigned Hash = 0;
3054 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3055 const OperandData &OpData = getData(OpIdx, Lane);
3056 if (OpData.APO)
3057 ++CntTrue;
3058 // Use Boyer-Moore majority voting for finding the majority opcode and
3059 // the number of times it occurs.
3060 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3061 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3062 I->getParent() != Parent) {
3063 if (NumOpsWithSameOpcodeParent == 0) {
3064 NumOpsWithSameOpcodeParent = 1;
3065 OpcodeI = I;
3066 Parent = I->getParent();
3067 } else {
3068 --NumOpsWithSameOpcodeParent;
3069 }
3070 } else {
3071 ++NumOpsWithSameOpcodeParent;
3072 }
3073 }
3074 Hash = hash_combine(
3075 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3076 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3077 }
3078 if (AllUndefs)
3079 return {};
3080 OperandsOrderData Data;
3081 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3082 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3083 Data.Hash = Hash;
3084 return Data;
3085 }
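  // A minimal standalone sketch of the Boyer-Moore majority vote used above to
  // find the dominant key (here an (opcode, parent) pair collapsed to an int);
  // not part of this file, majorityCandidate is a hypothetical name:
#if 0
  static int majorityCandidate(ArrayRef<int> Keys) {
    int Candidate = 0;
    unsigned Count = 0;
    for (int K : Keys) {
      if (Count == 0) {
        // No current candidate - adopt this key.
        Candidate = K;
        Count = 1;
      } else if (K == Candidate) {
        ++Count; // a vote for the current candidate
      } else {
        --Count; // a vote against it
      }
    }
    // If a strict majority exists, this is it; the running counter mirrors
    // NumOpsWithSameOpcodeParent above.
    return Candidate;
  }
#endif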
3086
3087 /// Go through the instructions in VL and append their operands.
3088 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3089 const InstructionsState &S) {
3090 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3091 assert((empty() || all_of(Operands,
3092 [this](const ValueList &VL) {
3093 return VL.size() == getNumLanes();
3094 })) &&
3095 "Expected same number of lanes");
3096 assert(S.valid() && "InstructionsState is invalid.");
3097 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3098 // arguments to the intrinsic produces the same result.
3099 Instruction *MainOp = S.getMainOp();
3100 unsigned NumOperands = MainOp->getNumOperands();
3101     ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3102     OpsVec.resize(ArgSize);
3103 unsigned NumLanes = VL.size();
3104 for (OperandDataVec &Ops : OpsVec)
3105 Ops.resize(NumLanes);
3106 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3107 // Our tree has just 3 nodes: the root and two operands.
3108 // It is therefore trivial to get the APO. We only need to check the
3109 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3110 // operand. The LHS operand of both add and sub is never attached to an
3111       // inverse operation in the linearized form, therefore its APO is
3112 // false. The RHS is true only if V is an inverse operation.
3113
3114 // Since operand reordering is performed on groups of commutative
3115 // operations or alternating sequences (e.g., +, -), we can safely tell
3116 // the inverse operations by checking commutativity.
3117 auto *I = dyn_cast<Instruction>(VL[Lane]);
3118 if (!I && isa<PoisonValue>(VL[Lane])) {
3119 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3120 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3121 continue;
3122 }
3123 bool IsInverseOperation = false;
3124 if (S.isCopyableElement(VL[Lane])) {
3125 // The value is a copyable element.
3126 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3127 } else {
3128 assert(I && "Expected instruction");
3129 auto [SelectedOp, Ops] = convertTo(I, S);
3130 // We cannot check commutativity by the converted instruction
3131 // (SelectedOp) because isCommutative also examines def-use
3132 // relationships.
3133 IsInverseOperation = !isCommutative(SelectedOp, I);
3134 }
3135 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3136 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3137 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3138 }
3139 }
3140 }
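  // A minimal standalone sketch of how the APO flag above is derived for the
  // linearized (+, -) form; not part of this file, operandAPO is a
  // hypothetical name and only models the add/sub case:
#if 0
  static bool operandAPO(const Instruction &I, unsigned OpIdx) {
    // The LHS (operand 0) is never attached to an inverse operation, so its
    // APO is always false; the RHS inherits the inversion of 'sub'/'fsub'.
    bool IsInverse = I.getOpcode() == Instruction::Sub ||
                     I.getOpcode() == Instruction::FSub;
    return OpIdx == 0 ? false : IsInverse;
  }
#endif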
3141
3142 /// \returns the number of operands.
3143 unsigned getNumOperands() const { return ArgSize; }
3144
3145 /// \returns the number of lanes.
3146 unsigned getNumLanes() const { return OpsVec[0].size(); }
3147
3148 /// \returns the operand value at \p OpIdx and \p Lane.
3149 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3150 return getData(OpIdx, Lane).V;
3151 }
3152
3153 /// \returns true if the data structure is empty.
3154 bool empty() const { return OpsVec.empty(); }
3155
3156 /// Clears the data.
3157 void clear() { OpsVec.clear(); }
3158
3159 /// \Returns true if there are enough operands identical to \p Op to fill
3160   /// the whole vector (possibly mixed with constants or loop-invariant values).
3161 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3162 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3163 assert(Op == getValue(OpIdx, Lane) &&
3164 "Op is expected to be getValue(OpIdx, Lane).");
3165 // Small number of loads - try load matching.
3166 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3167 return false;
3168 bool OpAPO = getData(OpIdx, Lane).APO;
3169 bool IsInvariant = L && L->isLoopInvariant(Op);
3170 unsigned Cnt = 0;
3171 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3172 if (Ln == Lane)
3173 continue;
3174 // This is set to true if we found a candidate for broadcast at Lane.
3175 bool FoundCandidate = false;
3176 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3177 OperandData &Data = getData(OpI, Ln);
3178 if (Data.APO != OpAPO || Data.IsUsed)
3179 continue;
3180 Value *OpILane = getValue(OpI, Lane);
3181 bool IsConstantOp = isa<Constant>(OpILane);
3182 // Consider the broadcast candidate if:
3183 // 1. Same value is found in one of the operands.
3184 if (Data.V == Op ||
3185 // 2. The operand in the given lane is not constant but there is a
3186 // constant operand in another lane (which can be moved to the
3187 // given lane). In this case we can represent it as a simple
3188 // permutation of constant and broadcast.
3189 (!IsConstantOp &&
3190 ((Lns > 2 && isa<Constant>(Data.V)) ||
3191 // 2.1. If we have only 2 lanes, need to check that value in the
3192 // next lane does not build same opcode sequence.
3193 (Lns == 2 &&
3194 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3195 isa<Constant>(Data.V)))) ||
3196 // 3. The operand in the current lane is loop invariant (can be
3197 // hoisted out) and another operand is also a loop invariant
3198 // (though not a constant). In this case the whole vector can be
3199 // hoisted out.
3200 // FIXME: need to teach the cost model about this case for better
3201 // estimation.
3202 (IsInvariant && !isa<Constant>(Data.V) &&
3203 !getSameOpcode({Op, Data.V}, TLI) &&
3204 L->isLoopInvariant(Data.V))) {
3205 FoundCandidate = true;
3206 Data.IsUsed = Data.V == Op;
3207 if (Data.V == Op)
3208 ++Cnt;
3209 break;
3210 }
3211 }
3212 if (!FoundCandidate)
3213 return false;
3214 }
3215 return getNumLanes() == 2 || Cnt > 1;
3216 }
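  // A minimal standalone sketch of the broadcast test above, reduced to one
  // candidate value per other lane; not part of this file, worthSplatting is
  // a hypothetical name:
#if 0
  static bool worthSplatting(Value *Op, ArrayRef<Value *> OtherLaneValues) {
    unsigned Same = 0;
    for (Value *V : OtherLaneValues) {
      if (V == Op) {
        ++Same;
        continue;
      }
      // A constant can be permuted into the lane on top of the splat;
      // anything else defeats the broadcast.
      if (!isa<Constant>(V))
        return false;
    }
    return OtherLaneValues.size() == 1 || Same > 1;
  }
#endif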
3217
3218   /// Checks if there is at least one operand in a lane other than \p Lane
3219   /// that is compatible with the operand \p Op.
3220 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3221 assert(Op == getValue(OpIdx, Lane) &&
3222 "Op is expected to be getValue(OpIdx, Lane).");
3223 bool OpAPO = getData(OpIdx, Lane).APO;
3224 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3225 if (Ln == Lane)
3226 continue;
3227 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3228 const OperandData &Data = getData(OpI, Ln);
3229 if (Data.APO != OpAPO || Data.IsUsed)
3230 return true;
3231 Value *OpILn = getValue(OpI, Ln);
3232 return (L && L->isLoopInvariant(OpILn)) ||
3233 (getSameOpcode({Op, OpILn}, TLI) &&
3234 allSameBlock({Op, OpILn}));
3235 }))
3236 return true;
3237 }
3238 return false;
3239 }
3240
3241 public:
3242 /// Initialize with all the operands of the instruction vector \p RootVL.
3243   VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3244              const InstructionsState &S, const BoUpSLP &R)
3245 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3246 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3247 // Append all the operands of RootVL.
3248 appendOperands(RootVL, Operands, S);
3249 }
3250
3251 /// \Returns a value vector with the operands across all lanes for the
3252   /// operand at \p OpIdx.
3253 ValueList getVL(unsigned OpIdx) const {
3254 ValueList OpVL(OpsVec[OpIdx].size());
3255 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3256 "Expected same num of lanes across all operands");
3257 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3258 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3259 return OpVL;
3260 }
3261
3262 // Performs operand reordering for 2 or more operands.
3263 // The original operands are in OrigOps[OpIdx][Lane].
3264 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3265 void reorder() {
3266 unsigned NumOperands = getNumOperands();
3267 unsigned NumLanes = getNumLanes();
3268 // Each operand has its own mode. We are using this mode to help us select
3269 // the instructions for each lane, so that they match best with the ones
3270 // we have selected so far.
3271 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3272
3273 // This is a greedy single-pass algorithm. We are going over each lane
3274 // once and deciding on the best order right away with no back-tracking.
3275 // However, in order to increase its effectiveness, we start with the lane
3276 // that has operands that can move the least. For example, given the
3277 // following lanes:
3278 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3279 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3280 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3281 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3282 // we will start at Lane 1, since the operands of the subtraction cannot
3283 // be reordered. Then we will visit the rest of the lanes in a circular
3284 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3285
3286 // Find the first lane that we will start our search from.
3287 unsigned FirstLane = getBestLaneToStartReordering();
3288
3289 // Initialize the modes.
3290 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3291 Value *OpLane0 = getValue(OpIdx, FirstLane);
3292 // Keep track if we have instructions with all the same opcode on one
3293 // side.
3294 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3295 // Check if OpLane0 should be broadcast.
3296 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3297 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3298 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3299 else if (isa<LoadInst>(OpILane0))
3300 ReorderingModes[OpIdx] = ReorderingMode::Load;
3301 else
3302 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3303 } else if (isa<Constant>(OpLane0)) {
3304 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3305 } else if (isa<Argument>(OpLane0)) {
3306 // Our best hope is a Splat. It may save some cost in some cases.
3307 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3308 } else {
3309 llvm_unreachable("Unexpected value kind.");
3310 }
3311 }
3312
3313     // Check that we don't have the same operands. There is no need to reorder
3314     // if the operands are just a perfect or shuffled diamond match. Do not do
3315     // this only for possible broadcasts or a non-power-of-2 number of scalars
3316     // (just for now).
3317 auto &&SkipReordering = [this]() {
3318 SmallPtrSet<Value *, 4> UniqueValues;
3319 ArrayRef<OperandData> Op0 = OpsVec.front();
3320 for (const OperandData &Data : Op0)
3321 UniqueValues.insert(Data.V);
3322       for (ArrayRef<OperandData> Op :
3323            ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3324 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3325 return !UniqueValues.contains(Data.V);
3326 }))
3327 return false;
3328 }
3329 // TODO: Check if we can remove a check for non-power-2 number of
3330 // scalars after full support of non-power-2 vectorization.
3331 return UniqueValues.size() != 2 &&
3332 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3333 UniqueValues.size());
3334 };
3335
3336 // If the initial strategy fails for any of the operand indexes, then we
3337 // perform reordering again in a second pass. This helps avoid assigning
3338 // high priority to the failed strategy, and should improve reordering for
3339 // the non-failed operand indexes.
3340 for (int Pass = 0; Pass != 2; ++Pass) {
3341       // Check if there is no need to reorder operands since they are a perfect
3342       // or shuffled diamond match.
3343 // Need to do it to avoid extra external use cost counting for
3344 // shuffled matches, which may cause regressions.
3345 if (SkipReordering())
3346 break;
3347 // Skip the second pass if the first pass did not fail.
3348 bool StrategyFailed = false;
3349 // Mark all operand data as free to use.
3350 clearUsed();
3351 // We keep the original operand order for the FirstLane, so reorder the
3352 // rest of the lanes. We are visiting the nodes in a circular fashion,
3353 // using FirstLane as the center point and increasing the radius
3354 // distance.
3355 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3356 for (unsigned I = 0; I < NumOperands; ++I)
3357 MainAltOps[I].push_back(getData(I, FirstLane).V);
3358
3359 SmallBitVector UsedLanes(NumLanes);
3360 UsedLanes.set(FirstLane);
3361 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3362 // Visit the lane on the right and then the lane on the left.
3363 for (int Direction : {+1, -1}) {
3364 int Lane = FirstLane + Direction * Distance;
3365 if (Lane < 0 || Lane >= (int)NumLanes)
3366 continue;
3367 UsedLanes.set(Lane);
3368 int LastLane = Lane - Direction;
3369 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3370 "Out of bounds");
3371 // Look for a good match for each operand.
3372 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3373 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3374 std::optional<unsigned> BestIdx =
3375 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3376 MainAltOps[OpIdx], UsedLanes);
3377 // By not selecting a value, we allow the operands that follow to
3378 // select a better matching value. We will get a non-null value in
3379 // the next run of getBestOperand().
3380 if (BestIdx) {
3381 // Swap the current operand with the one returned by
3382 // getBestOperand().
3383 swap(OpIdx, *BestIdx, Lane);
3384 } else {
3385 // Enable the second pass.
3386 StrategyFailed = true;
3387 }
3388 // Try to get the alternate opcode and follow it during analysis.
3389 if (MainAltOps[OpIdx].size() != 2) {
3390 OperandData &AltOp = getData(OpIdx, Lane);
3391 InstructionsState OpS =
3392 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3393 if (OpS && OpS.isAltShuffle())
3394 MainAltOps[OpIdx].push_back(AltOp.V);
3395 }
3396 }
3397 }
3398 }
3399 // Skip second pass if the strategy did not fail.
3400 if (!StrategyFailed)
3401 break;
3402 }
3403 }
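    // A minimal standalone sketch of the circular visitation order used above
    // (not part of this file, visitOrder is a hypothetical name): starting
    // from FirstLane, lanes are visited at increasing distance, right side
    // before left side, e.g. FirstLane=1, NumLanes=4 -> {1, 2, 0, 3}.
#if 0
    static SmallVector<int, 8> visitOrder(int FirstLane, int NumLanes) {
      SmallVector<int, 8> Order{FirstLane};
      for (int Distance = 1; Distance != NumLanes; ++Distance)
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane >= 0 && Lane < NumLanes)
            Order.push_back(Lane);
        }
      return Order;
    }
#endif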
3404
3405#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3406 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3407 switch (RMode) {
3408 case ReorderingMode::Load:
3409 return "Load";
3410 case ReorderingMode::Opcode:
3411 return "Opcode";
3412 case ReorderingMode::Constant:
3413 return "Constant";
3414 case ReorderingMode::Splat:
3415 return "Splat";
3416 case ReorderingMode::Failed:
3417 return "Failed";
3418 }
3419 llvm_unreachable("Unimplemented Reordering Type");
3420 }
3421
3422 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3423 raw_ostream &OS) {
3424 return OS << getModeStr(RMode);
3425 }
3426
3427 /// Debug print.
3428 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3429 printMode(RMode, dbgs());
3430 }
3431
3432 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3433 return printMode(RMode, OS);
3434 }
3435
3436   LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3437     const unsigned Indent = 2;
3438 unsigned Cnt = 0;
3439 for (const OperandDataVec &OpDataVec : OpsVec) {
3440 OS << "Operand " << Cnt++ << "\n";
3441 for (const OperandData &OpData : OpDataVec) {
3442 OS.indent(Indent) << "{";
3443 if (Value *V = OpData.V)
3444 OS << *V;
3445 else
3446 OS << "null";
3447 OS << ", APO:" << OpData.APO << "}\n";
3448 }
3449 OS << "\n";
3450 }
3451 return OS;
3452 }
3453
3454 /// Debug print.
3455 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3456#endif
3457 };
3458
3459 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3460 /// for a pair which have highest score deemed to have best chance to form
3461 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3462 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
3463 /// of the cost, considered to be good enough score.
3464 std::optional<int>
3465 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3466 int Limit = LookAheadHeuristics::ScoreFail) const {
3467 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3468                                   RootLookAheadMaxDepth);
3469     int BestScore = Limit;
3470 std::optional<int> Index;
3471 for (int I : seq<int>(0, Candidates.size())) {
3472 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3473 Candidates[I].second,
3474 /*U1=*/nullptr, /*U2=*/nullptr,
3475 /*CurrLevel=*/1, {});
3476 if (Score > BestScore) {
3477 BestScore = Score;
3478 Index = I;
3479 }
3480 }
3481 return Index;
3482 }
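  // A minimal standalone sketch of the selection above (not part of this file,
  // bestIndexAbove is a hypothetical name): keep the index of the highest
  // score that beats the given limit, or std::nullopt if none does.
#if 0
  static std::optional<int> bestIndexAbove(ArrayRef<int> Scores, int Limit) {
    std::optional<int> Index;
    int Best = Limit;
    for (int I = 0, E = Scores.size(); I != E; ++I)
      if (Scores[I] > Best) {
        Best = Scores[I];
        Index = I;
      }
    return Index;
  }
#endif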
3483
3484 /// Checks if the instruction is marked for deletion.
3485 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3486
3487 /// Removes an instruction from its block and eventually deletes it.
3488 /// It's like Instruction::eraseFromParent() except that the actual deletion
3489 /// is delayed until BoUpSLP is destructed.
3490   void eraseInstruction(Instruction *I) {
3491     DeletedInstructions.insert(I);
3492 }
3493
3494 /// Remove instructions from the parent function and clear the operands of \p
3495 /// DeadVals instructions, marking for deletion trivially dead operands.
3496 template <typename T>
3497   void removeInstructionsAndOperands(
3498       ArrayRef<T *> DeadVals,
3499 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3500     SmallVector<WeakTrackingVH> DeadInsts;
3501     for (T *V : DeadVals) {
3502 auto *I = cast<Instruction>(V);
3503       DeletedInstructions.insert(I);
3504     }
3505 DenseSet<Value *> Processed;
3506 for (T *V : DeadVals) {
3507 if (!V || !Processed.insert(V).second)
3508 continue;
3509 auto *I = cast<Instruction>(V);
3510       salvageDebugInfo(*I);
3511       ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3512 for (Use &U : I->operands()) {
3513 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3514 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3515           wouldInstructionBeTriviallyDead(OpI, TLI) &&
3516           (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3517 return Entry->VectorizedValue == OpI;
3518 })))
3519 DeadInsts.push_back(OpI);
3520 }
3521 I->dropAllReferences();
3522 }
3523 for (T *V : DeadVals) {
3524 auto *I = cast<Instruction>(V);
3525 if (!I->getParent())
3526 continue;
3527 assert((I->use_empty() || all_of(I->uses(),
3528 [&](Use &U) {
3529 return isDeleted(
3530 cast<Instruction>(U.getUser()));
3531 })) &&
3532 "trying to erase instruction with users.");
3533 I->removeFromParent();
3534 SE->forgetValue(I);
3535 }
3536 // Process the dead instruction list until empty.
3537 while (!DeadInsts.empty()) {
3538 Value *V = DeadInsts.pop_back_val();
3539       Instruction *VI = cast_or_null<Instruction>(V);
3540       if (!VI || !VI->getParent())
3541 continue;
3542       assert(isInstructionTriviallyDead(VI, TLI) &&
3543              "Live instruction found in dead worklist!");
3544 assert(VI->use_empty() && "Instructions with uses are not dead.");
3545
3546 // Don't lose the debug info while deleting the instructions.
3547 salvageDebugInfo(*VI);
3548
3549 // Null out all of the instruction's operands to see if any operand
3550 // becomes dead as we go.
3551 for (Use &OpU : VI->operands()) {
3552 Value *OpV = OpU.get();
3553 if (!OpV)
3554 continue;
3555 OpU.set(nullptr);
3556
3557 if (!OpV->use_empty())
3558 continue;
3559
3560 // If the operand is an instruction that became dead as we nulled out
3561 // the operand, and if it is 'trivially' dead, delete it in a future
3562 // loop iteration.
3563 if (auto *OpI = dyn_cast<Instruction>(OpV))
3564 if (!DeletedInstructions.contains(OpI) &&
3565 (!OpI->getType()->isVectorTy() ||
3566 none_of(VectorValuesAndScales,
3567 [&](const std::tuple<Value *, unsigned, bool> &V) {
3568 return std::get<0>(V) == OpI;
3569 })) &&
3570               isInstructionTriviallyDead(OpI, TLI))
3571             DeadInsts.push_back(OpI);
3572 }
3573
3574 VI->removeFromParent();
3575 eraseInstruction(VI);
3576 SE->forgetValue(VI);
3577 }
3578 }
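  // A minimal standalone sketch of the worklist pattern above (not part of
  // this file, eraseDeadChain is a hypothetical name): nulling an
  // instruction's operands may make them trivially dead, so they are queued
  // and processed until the worklist drains. It assumes the initial worklist
  // entries are already dead and uses isInstructionTriviallyDead() from
  // llvm/Transforms/Utils/Local.h.
#if 0
  static void eraseDeadChain(SmallVectorImpl<Instruction *> &Worklist) {
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      for (Use &U : I->operands()) {
        auto *OpI = dyn_cast_or_null<Instruction>(U.get());
        U.set(nullptr);
        // If dropping this use made the operand dead, clean it up next.
        if (OpI && OpI->use_empty() && isInstructionTriviallyDead(OpI))
          Worklist.push_back(OpI);
      }
      I->eraseFromParent();
    }
  }
#endif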
3579
3580 /// Checks if the instruction was already analyzed for being possible
3581 /// reduction root.
3582   bool isAnalyzedReductionRoot(Instruction *I) const {
3583     return AnalyzedReductionsRoots.count(I);
3584 }
3585 /// Register given instruction as already analyzed for being possible
3586 /// reduction root.
3587   void analyzedReductionRoot(Instruction *I) {
3588     AnalyzedReductionsRoots.insert(I);
3589 }
3590 /// Checks if the provided list of reduced values was checked already for
3591 /// vectorization.
3592   bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3593     return AnalyzedReductionVals.contains(hash_value(VL));
3594 }
3595 /// Adds the list of reduced values to list of already checked values for the
3596 /// vectorization.
3597   void analyzedReductionVals(ArrayRef<Value *> VL) {
3598     AnalyzedReductionVals.insert(hash_value(VL));
3599 }
3600 /// Clear the list of the analyzed reduction root instructions.
3601   void clearReductionData() {
3602     AnalyzedReductionsRoots.clear();
3603 AnalyzedReductionVals.clear();
3604 AnalyzedMinBWVals.clear();
3605 }
3606 /// Checks if the given value is gathered in one of the nodes.
3607 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3608 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3609 }
3610 /// Checks if the given value is gathered in one of the nodes.
3611 bool isGathered(const Value *V) const {
3612 return MustGather.contains(V);
3613 }
3614   /// Checks if the specified value was not scheduled.
3615 bool isNotScheduled(const Value *V) const {
3616 return NonScheduledFirst.contains(V);
3617 }
3618
3619 /// Check if the value is vectorized in the tree.
3620 bool isVectorized(const Value *V) const {
3621 assert(V && "V cannot be nullptr.");
3622 return ScalarToTreeEntries.contains(V);
3623 }
3624
3625 ~BoUpSLP();
3626
3627private:
3628   /// Determine if a node \p E can be demoted to a smaller type with a
3629 /// truncation. We collect the entries that will be demoted in ToDemote.
3630 /// \param E Node for analysis
3631 /// \param ToDemote indices of the nodes to be demoted.
3632 bool collectValuesToDemote(
3633 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3634       SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3635       const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3636 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3637
3638 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3639 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3640   /// they have only one user and are reorderable).
3641 /// \param ReorderableGathers List of all gather nodes that require reordering
3642   /// (e.g., gather of extractelements or partially vectorizable loads).
3643 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3644 /// reordering, subset of \p NonVectorized.
3645 void buildReorderableOperands(
3646 TreeEntry *UserTE,
3647 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3648 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3649 SmallVectorImpl<TreeEntry *> &GatherOps);
3650
3651 /// Checks if the given \p TE is a gather node with clustered reused scalars
3652 /// and reorders it per given \p Mask.
3653 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3654
3655   /// Checks if all users of \p I are part of the vectorization tree.
3656 bool areAllUsersVectorized(
3657 Instruction *I,
3658 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3659
3660 /// Return information about the vector formed for the specified index
3661 /// of a vector of (the same) instruction.
3662   TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3663
3664 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3665 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3666 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3667 return const_cast<TreeEntry *>(
3668 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3669 }
3670
3671 /// Gets the root instruction for the given node. If the node is a strided
3672 /// load/store node with the reverse order, the root instruction is the last
3673 /// one.
3674 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3675
3676 /// \returns Cast context for the given graph node.
3677   TargetTransformInfo::CastContextHint
3678   getCastContextHint(const TreeEntry &TE) const;
3679
3680 /// \returns the cost of the vectorizable entry.
3681 InstructionCost getEntryCost(const TreeEntry *E,
3682 ArrayRef<Value *> VectorizedVals,
3683 SmallPtrSetImpl<Value *> &CheckedExtracts);
3684
3685 /// Checks if it is legal and profitable to build SplitVectorize node for the
3686 /// given \p VL.
3687 /// \param Op1 first homogeneous scalars.
3688 /// \param Op2 second homogeneous scalars.
3689 /// \param ReorderIndices indices to reorder the scalars.
3690 /// \returns true if the node was successfully built.
3691 bool canBuildSplitNode(ArrayRef<Value *> VL,
3692 const InstructionsState &LocalState,
3693                          SmallVectorImpl<Value *> &Op1,
3694                          SmallVectorImpl<Value *> &Op2,
3695                          OrdersType &ReorderIndices) const;
3696
3697 /// This is the recursive part of buildTree.
3698 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3699 unsigned InterleaveFactor = 0);
3700
3701 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3702 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3703 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3704 /// returns false, setting \p CurrentOrder to either an empty vector or a
3705 /// non-identity permutation that allows to reuse extract instructions.
3706 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3707 /// extract order.
3708 bool canReuseExtract(ArrayRef<Value *> VL,
3709 SmallVectorImpl<unsigned> &CurrentOrder,
3710 bool ResizeAllowed = false) const;
3711
3712 /// Vectorize a single entry in the tree.
3713 Value *vectorizeTree(TreeEntry *E);
3714
3715 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3716 /// \p E.
3717 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3718
3719 /// Create a new vector from a list of scalar values. Produces a sequence
3720 /// which exploits values reused across lanes, and arranges the inserts
3721 /// for ease of later optimization.
3722 template <typename BVTy, typename ResTy, typename... Args>
3723 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3724
3725 /// Create a new vector from a list of scalar values. Produces a sequence
3726 /// which exploits values reused across lanes, and arranges the inserts
3727 /// for ease of later optimization.
3728 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3729
3730 /// Returns the instruction in the bundle, which can be used as a base point
3731 /// for scheduling. Usually it is the last instruction in the bundle, except
3732 /// for the case when all operands are external (in this case, it is the first
3733 /// instruction in the list).
3734 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3735
3736 /// Tries to find extractelement instructions with constant indices from fixed
3737   /// vector type and gather such instructions into a bunch, which can most
3738   /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3739 /// was successful, the matched scalars are replaced by poison values in \p VL
3740 /// for future analysis.
3741 std::optional<TargetTransformInfo::ShuffleKind>
3742 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3743 SmallVectorImpl<int> &Mask) const;
3744
3745 /// Tries to find extractelement instructions with constant indices from fixed
3746   /// vector type and gather such instructions into a bunch, which can most
3747   /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3748 /// was successful, the matched scalars are replaced by poison values in \p VL
3749 /// for future analysis.
3750   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3751   tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3752                              SmallVectorImpl<int> &Mask,
3753                              unsigned NumParts) const;
3754
3755 /// Checks if the gathered \p VL can be represented as a single register
3756 /// shuffle(s) of previous tree entries.
3757 /// \param TE Tree entry checked for permutation.
3758 /// \param VL List of scalars (a subset of the TE scalar), checked for
3759 /// permutations. Must form single-register vector.
3760 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3761 /// commands to build the mask using the original vector value, without
3762 /// relying on the potential reordering.
3763 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3764 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3765 std::optional<TargetTransformInfo::ShuffleKind>
3766 isGatherShuffledSingleRegisterEntry(
3767 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3768 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3769 bool ForOrder);
3770
3771 /// Checks if the gathered \p VL can be represented as multi-register
3772 /// shuffle(s) of previous tree entries.
3773 /// \param TE Tree entry checked for permutation.
3774 /// \param VL List of scalars (a subset of the TE scalar), checked for
3775 /// permutations.
3776 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3777 /// commands to build the mask using the original vector value, without
3778 /// relying on the potential reordering.
3779 /// \returns per-register series of ShuffleKind, if gathered values can be
3780 /// represented as shuffles of previous tree entries. \p Mask is filled with
3781 /// the shuffle mask (also on per-register base).
3782   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3783   isGatherShuffledEntry(
3784 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3785       SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3786       unsigned NumParts, bool ForOrder = false);
3787
3788 /// \returns the cost of gathering (inserting) the values in \p VL into a
3789 /// vector.
3790 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3791 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3792 Type *ScalarTy) const;
3793
3794 /// Set the Builder insert point to one after the last instruction in
3795 /// the bundle
3796 void setInsertPointAfterBundle(const TreeEntry *E);
3797
3798 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3799 /// specified, the starting vector value is poison.
3800 Value *
3801 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3802 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3803
3804 /// \returns whether the VectorizableTree is fully vectorizable and will
3805   /// be beneficial even if the tree height is tiny.
3806 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3807
3808 /// Run through the list of all gathered loads in the graph and try to find
3809 /// vector loads/masked gathers instead of regular gathers. Later these loads
3810   /// are reshuffled to build the final gathered nodes.
3811 void tryToVectorizeGatheredLoads(
3812 const SmallMapVector<
3813 std::tuple<BasicBlock *, Value *, Type *>,
3814 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3815 &GatheredLoads);
3816
3817 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3818 /// users of \p TE and collects the stores. It returns the map from the store
3819 /// pointers to the collected stores.
3820   DenseMap<Value *, SmallVector<StoreInst *>>
3821   collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3822
3823 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3824 /// stores in \p StoresVec can form a vector instruction. If so it returns
3825 /// true and populates \p ReorderIndices with the shuffle indices of the
3826 /// stores when compared to the sorted vector.
3827 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3828 OrdersType &ReorderIndices) const;
3829
3830 /// Iterates through the users of \p TE, looking for scalar stores that can be
3831 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3832 /// their order and builds an order index vector for each store bundle. It
3833 /// returns all these order vectors found.
3834 /// We run this after the tree has formed, otherwise we may come across user
3835 /// instructions that are not yet in the tree.
3836   SmallVector<OrdersType, 1>
3837   findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3838
3839 /// Tries to reorder the gathering node for better vectorization
3840 /// opportunities.
3841 void reorderGatherNode(TreeEntry &TE);
3842
3843 class TreeEntry {
3844 public:
3845 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3846 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3847
3848 /// \returns Common mask for reorder indices and reused scalars.
3849 SmallVector<int> getCommonMask() const {
3850 if (State == TreeEntry::SplitVectorize)
3851 return {};
3852 SmallVector<int> Mask;
3853 inversePermutation(ReorderIndices, Mask);
3854 ::addMask(Mask, ReuseShuffleIndices);
3855 return Mask;
3856 }
3857
3858 /// \returns The mask for split nodes.
3859 SmallVector<int> getSplitMask() const {
3860 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3861 "Expected only split vectorize node.");
3862 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3863 unsigned CommonVF = std::max<unsigned>(
3864 CombinedEntriesWithIndices.back().second,
3865 Scalars.size() - CombinedEntriesWithIndices.back().second);
3866 for (auto [Idx, I] : enumerate(ReorderIndices))
3867 Mask[I] =
3868 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3869 ? CommonVF - CombinedEntriesWithIndices.back().second
3870 : 0);
3871 return Mask;
3872 }
3873
3874 /// Updates (reorders) SplitVectorize node according to the given mask \p
3875 /// Mask and order \p MaskOrder.
3876 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3877 ArrayRef<int> MaskOrder);
3878
3879 /// \returns true if the scalars in VL are equal to this entry.
3880 bool isSame(ArrayRef<Value *> VL) const {
3881 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3882 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3883 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3884 return VL.size() == Mask.size() &&
3885 std::equal(VL.begin(), VL.end(), Mask.begin(),
3886 [Scalars](Value *V, int Idx) {
3887 return (isa<UndefValue>(V) &&
3888 Idx == PoisonMaskElem) ||
3889 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3890 });
3891 };
3892 if (!ReorderIndices.empty()) {
3893 // TODO: implement matching if the nodes are just reordered, still can
3894 // treat the vector as the same if the list of scalars matches VL
3895 // directly, without reordering.
3896 SmallVector<int> Mask;
3897 inversePermutation(ReorderIndices, Mask);
3898 if (VL.size() == Scalars.size())
3899 return IsSame(Scalars, Mask);
3900 if (VL.size() == ReuseShuffleIndices.size()) {
3901 ::addMask(Mask, ReuseShuffleIndices);
3902 return IsSame(Scalars, Mask);
3903 }
3904 return false;
3905 }
3906 return IsSame(Scalars, ReuseShuffleIndices);
3907 }
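    // A minimal standalone sketch of the masked comparison above (not part of
    // this file, sameThroughMask is a hypothetical name): VL[I] must equal
    // Scalars[Mask[I]], and an undef scalar matches a poison mask element.
#if 0
    static bool sameThroughMask(ArrayRef<Value *> VL,
                                ArrayRef<Value *> Scalars,
                                ArrayRef<int> Mask) {
      if (VL.size() != Mask.size())
        return false;
      for (unsigned I = 0, E = VL.size(); I != E; ++I) {
        if (Mask[I] == PoisonMaskElem) {
          if (!isa<UndefValue>(VL[I]))
            return false;
          continue;
        }
        if (VL[I] != Scalars[Mask[I]])
          return false;
      }
      return true;
    }
#endif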
3908
3909 /// \returns true if current entry has same operands as \p TE.
3910 bool hasEqualOperands(const TreeEntry &TE) const {
3911 if (TE.getNumOperands() != getNumOperands())
3912 return false;
3913 SmallBitVector Used(getNumOperands());
3914 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3915 unsigned PrevCount = Used.count();
3916 for (unsigned K = 0; K < E; ++K) {
3917 if (Used.test(K))
3918 continue;
3919 if (getOperand(K) == TE.getOperand(I)) {
3920 Used.set(K);
3921 break;
3922 }
3923 }
3924 // Check if we actually found the matching operand.
3925 if (PrevCount == Used.count())
3926 return false;
3927 }
3928 return true;
3929 }
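    // A minimal standalone sketch of the unordered operand matching above (not
    // part of this file, sameMultiset is a hypothetical name): every element
    // of A must claim a distinct, equal element of B, tracked with a bit
    // vector exactly like the Used set above.
#if 0
    static bool sameMultiset(ArrayRef<int> A, ArrayRef<int> B) {
      if (A.size() != B.size())
        return false;
      SmallBitVector Used(B.size());
      for (int X : A) {
        bool Found = false;
        for (unsigned K = 0, E = B.size(); K != E; ++K) {
          if (Used.test(K) || B[K] != X)
            continue;
          Used.set(K);
          Found = true;
          break;
        }
        if (!Found)
          return false;
      }
      return true;
    }
#endif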
3930
3931 /// \return Final vectorization factor for the node. Defined by the total
3932     /// number of vectorized scalars, including those used several times in the
3933 /// entry and counted in the \a ReuseShuffleIndices, if any.
3934 unsigned getVectorFactor() const {
3935 if (!ReuseShuffleIndices.empty())
3936 return ReuseShuffleIndices.size();
3937 return Scalars.size();
3938 };
3939
3940 /// Checks if the current node is a gather node.
3941 bool isGather() const { return State == NeedToGather; }
3942
3943 /// A vector of scalars.
3944 ValueList Scalars;
3945
3946 /// The Scalars are vectorized into this value. It is initialized to Null.
3947 WeakTrackingVH VectorizedValue = nullptr;
3948
3949 /// Do we need to gather this sequence or vectorize it
3950 /// (either with vector instruction or with scatter/gather
3951 /// intrinsics for store/load)?
3952 enum EntryState {
3953 Vectorize, ///< The node is regularly vectorized.
3954 ScatterVectorize, ///< Masked scatter/gather node.
3955 StridedVectorize, ///< Strided loads (and stores)
3956 CompressVectorize, ///< (Masked) load with compress.
3957 NeedToGather, ///< Gather/buildvector node.
3958 CombinedVectorize, ///< Vectorized node, combined with its user into more
3959 ///< complex node like select/cmp to minmax, mul/add to
3960 ///< fma, etc. Must be used for the following nodes in
3961 ///< the pattern, not the very first one.
3962 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3963 ///< independently and then combines back.
3964 };
3965 EntryState State;
3966
3967 /// List of combined opcodes supported by the vectorizer.
3968 enum CombinedOpcode {
3969 NotCombinedOp = -1,
3970 MinMax = Instruction::OtherOpsEnd + 1,
3971 FMulAdd,
3972 };
3973 CombinedOpcode CombinedOp = NotCombinedOp;
3974
3975 /// Does this sequence require some shuffling?
3976 SmallVector<int, 4> ReuseShuffleIndices;
3977
3978 /// Does this entry require reordering?
3979 SmallVector<unsigned, 4> ReorderIndices;
3980
3981 /// Points back to the VectorizableTree.
3982 ///
3983 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3984 /// to be a pointer and needs to be able to initialize the child iterator.
3985 /// Thus we need a reference back to the container to translate the indices
3986 /// to entries.
3987 VecTreeTy &Container;
3988
3989 /// The TreeEntry index containing the user of this entry.
3990 EdgeInfo UserTreeIndex;
3991
3992 /// The index of this treeEntry in VectorizableTree.
3993 unsigned Idx = 0;
3994
3995 /// For gather/buildvector/alt opcode nodes, which are combined from
3996 /// other nodes as a series of insertvector instructions.
3997 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3998
3999 private:
4000 /// The operands of each instruction in each lane Operands[op_index][lane].
4001 /// Note: This helps avoid the replication of the code that performs the
4002 /// reordering of operands during buildTreeRec() and vectorizeTree().
4003     SmallVector<ValueList, 2> Operands;
4004
4005 /// Copyable elements of the entry node.
4006 SmallPtrSet<const Value *, 4> CopyableElements;
4007
4008 /// MainOp and AltOp are recorded inside. S should be obtained from
4009 /// newTreeEntry.
4010 InstructionsState S = InstructionsState::invalid();
4011
4012 /// Interleaving factor for interleaved loads Vectorize nodes.
4013 unsigned InterleaveFactor = 0;
4014
4015 /// True if the node does not require scheduling.
4016 bool DoesNotNeedToSchedule = false;
4017
4018 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4019 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4020 if (Operands.size() < OpIdx + 1)
4021 Operands.resize(OpIdx + 1);
4022 assert(Operands[OpIdx].empty() && "Already resized?");
4023 assert(OpVL.size() <= Scalars.size() &&
4024 "Number of operands is greater than the number of scalars.");
4025 Operands[OpIdx].resize(OpVL.size());
4026 copy(OpVL, Operands[OpIdx].begin());
4027 }
4028
4029 public:
4030 /// Returns interleave factor for interleave nodes.
4031 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4032 /// Sets interleaving factor for the interleaving nodes.
4033 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4034
4035 /// Marks the node as one that does not require scheduling.
4036 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4037 /// Returns true if the node is marked as one that does not require
4038 /// scheduling.
4039 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4040
4041 /// Set this bundle's operands from \p Operands.
4042 void setOperands(ArrayRef<ValueList> Operands) {
4043 for (unsigned I : seq<unsigned>(Operands.size()))
4044 setOperand(I, Operands[I]);
4045 }
4046
4047 /// Reorders operands of the node to the given mask \p Mask.
4048 void reorderOperands(ArrayRef<int> Mask) {
4049 for (ValueList &Operand : Operands)
4050 reorderScalars(Operand, Mask);
4051 }
4052
4053 /// \returns the \p OpIdx operand of this TreeEntry.
4054 ValueList &getOperand(unsigned OpIdx) {
4055 assert(OpIdx < Operands.size() && "Off bounds");
4056 return Operands[OpIdx];
4057 }
4058
4059 /// \returns the \p OpIdx operand of this TreeEntry.
4060 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4061 assert(OpIdx < Operands.size() && "Off bounds");
4062 return Operands[OpIdx];
4063 }
4064
4065 /// \returns the number of operands.
4066 unsigned getNumOperands() const { return Operands.size(); }
4067
4068 /// \return the single \p OpIdx operand.
4069 Value *getSingleOperand(unsigned OpIdx) const {
4070 assert(OpIdx < Operands.size() && "Off bounds");
4071 assert(!Operands[OpIdx].empty() && "No operand available");
4072 return Operands[OpIdx][0];
4073 }
4074
4075 /// Some of the instructions in the list have alternate opcodes.
4076 bool isAltShuffle() const { return S.isAltShuffle(); }
4077
4078 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4079 return S.getMatchingMainOpOrAltOp(I);
4080 }
4081
4082     /// Chooses the correct key for scheduling data. If \p Op has the same (or
4083     /// alternate) opcode as the main operation of this entry, the key is \p
4084     /// Op. Otherwise the key is the main operation.
4085 Value *isOneOf(Value *Op) const {
4086 auto *I = dyn_cast<Instruction>(Op);
4087 if (I && getMatchingMainOpOrAltOp(I))
4088 return Op;
4089 return S.getMainOp();
4090 }
4091
4092 void setOperations(const InstructionsState &S) {
4093 assert(S && "InstructionsState is invalid.");
4094 this->S = S;
4095 }
4096
4097 Instruction *getMainOp() const { return S.getMainOp(); }
4098
4099 Instruction *getAltOp() const { return S.getAltOp(); }
4100
4101 /// The main/alternate opcodes for the list of instructions.
4102 unsigned getOpcode() const { return S.getOpcode(); }
4103
4104 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4105
4106 bool hasState() const { return S.valid(); }
4107
4108 /// Add \p V to the list of copyable elements.
4109 void addCopyableElement(Value *V) {
4110 assert(S.isCopyableElement(V) && "Not a copyable element.");
4111 CopyableElements.insert(V);
4112 }
4113
4114 /// Returns true if \p V is a copyable element.
4115 bool isCopyableElement(Value *V) const {
4116 return CopyableElements.contains(V);
4117 }
4118
4119 /// Returns true if any scalar in the list is a copyable element.
4120 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4121
4122 /// Returns the state of the operations.
4123 const InstructionsState &getOperations() const { return S; }
4124
4125     /// When ReuseShuffleIndices is empty it just returns the position of \p V
4126     /// within the vector of Scalars. Otherwise, tries to remap it via its reuse index.
4127 unsigned findLaneForValue(Value *V) const {
4128 unsigned FoundLane = getVectorFactor();
4129 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4130 std::advance(It, 1)) {
4131 if (*It != V)
4132 continue;
4133 FoundLane = std::distance(Scalars.begin(), It);
4134 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4135 if (!ReorderIndices.empty())
4136 FoundLane = ReorderIndices[FoundLane];
4137 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4138 if (ReuseShuffleIndices.empty())
4139 break;
4140 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4141 RIt != ReuseShuffleIndices.end()) {
4142 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4143 break;
4144 }
4145 }
4146 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4147 return FoundLane;
4148 }
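    // A minimal standalone sketch of the lane remapping above (not part of
    // this file, remapLane is a hypothetical name): the position is first
    // translated through the reorder indices, then located inside the
    // reuse-shuffle indices when those are present.
#if 0
    static unsigned remapLane(unsigned Lane, ArrayRef<unsigned> Reorder,
                              ArrayRef<int> Reuse) {
      if (!Reorder.empty())
        Lane = Reorder[Lane];
      if (!Reuse.empty()) {
        const int *It = find(Reuse, int(Lane));
        if (It != Reuse.end())
          Lane = std::distance(Reuse.begin(), It);
      }
      return Lane;
    }
#endif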
4149
4150 /// Build a shuffle mask for graph entry which represents a merge of main
4151 /// and alternate operations.
4152 void
4153 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4154 SmallVectorImpl<int> &Mask,
4155 SmallVectorImpl<Value *> *OpScalars = nullptr,
4156 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4157
4158 /// Return true if this is a non-power-of-2 node.
4159 bool isNonPowOf2Vec() const {
4160 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4161 return IsNonPowerOf2;
4162 }
4163
4164     /// Return true if this node vectorizes a number of elements that neither
4165     /// fills whole vector registers nor is a power of 2.
4166 bool
4167 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4168 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4169 TTI, getValueType(Scalars.front()), Scalars.size());
4170 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4171 "Reshuffling not supported with non-power-of-2 vectors yet.");
4172 return IsNonPowerOf2;
4173 }
4174
4175 Value *getOrdered(unsigned Idx) const {
4176 assert(isGather() && "Must be used only for buildvectors/gathers.");
4177 if (ReorderIndices.empty())
4178 return Scalars[Idx];
4179 SmallVector<int> Mask;
4180 inversePermutation(ReorderIndices, Mask);
4181 return Scalars[Mask[Idx]];
4182 }
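    // A minimal standalone sketch of the order inversion used by getOrdered()
    // and getCommonMask() above (not part of this file, invertOrder is a
    // hypothetical stand-in for inversePermutation): if Order[I] == J then
    // Mask[J] == I, so Scalars[Mask[Idx]] yields the element that lands at
    // position Idx after reordering.
#if 0
    static SmallVector<int> invertOrder(ArrayRef<unsigned> Order) {
      SmallVector<int> Mask(Order.size(), PoisonMaskElem);
      for (unsigned I = 0, E = Order.size(); I != E; ++I)
        Mask[Order[I]] = I;
      return Mask;
    }
#endif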
4183
4184#ifndef NDEBUG
4185 /// Debug printer.
4186 LLVM_DUMP_METHOD void dump() const {
4187 dbgs() << Idx << ".\n";
4188 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4189 dbgs() << "Operand " << OpI << ":\n";
4190 for (const Value *V : Operands[OpI])
4191 dbgs().indent(2) << *V << "\n";
4192 }
4193 dbgs() << "Scalars: \n";
4194 for (Value *V : Scalars)
4195 dbgs().indent(2) << *V << "\n";
4196 dbgs() << "State: ";
4197 if (S && hasCopyableElements())
4198 dbgs() << "[[Copyable]] ";
4199 switch (State) {
4200 case Vectorize:
4201 if (InterleaveFactor > 0) {
4202 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4203 << "\n";
4204 } else {
4205 dbgs() << "Vectorize\n";
4206 }
4207 break;
4208 case ScatterVectorize:
4209 dbgs() << "ScatterVectorize\n";
4210 break;
4211 case StridedVectorize:
4212 dbgs() << "StridedVectorize\n";
4213 break;
4214 case CompressVectorize:
4215 dbgs() << "CompressVectorize\n";
4216 break;
4217 case NeedToGather:
4218 dbgs() << "NeedToGather\n";
4219 break;
4220 case CombinedVectorize:
4221 dbgs() << "CombinedVectorize\n";
4222 break;
4223 case SplitVectorize:
4224 dbgs() << "SplitVectorize\n";
4225 break;
4226 }
4227 if (S) {
4228 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4229 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4230 } else {
4231 dbgs() << "MainOp: NULL\n";
4232 dbgs() << "AltOp: NULL\n";
4233 }
4234 dbgs() << "VectorizedValue: ";
4235 if (VectorizedValue)
4236 dbgs() << *VectorizedValue << "\n";
4237 else
4238 dbgs() << "NULL\n";
4239 dbgs() << "ReuseShuffleIndices: ";
4240 if (ReuseShuffleIndices.empty())
4241 dbgs() << "Empty";
4242 else
4243 for (int ReuseIdx : ReuseShuffleIndices)
4244 dbgs() << ReuseIdx << ", ";
4245 dbgs() << "\n";
4246 dbgs() << "ReorderIndices: ";
4247 for (unsigned ReorderIdx : ReorderIndices)
4248 dbgs() << ReorderIdx << ", ";
4249 dbgs() << "\n";
4250 dbgs() << "UserTreeIndex: ";
4251 if (UserTreeIndex)
4252 dbgs() << UserTreeIndex;
4253 else
4254 dbgs() << "<invalid>";
4255 dbgs() << "\n";
4256 if (!CombinedEntriesWithIndices.empty()) {
4257 dbgs() << "Combined entries: ";
4258 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4259 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4260 });
4261 dbgs() << "\n";
4262 }
4263 }
4264#endif
4265 };
4266
4267#ifndef NDEBUG
4268 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4269 InstructionCost VecCost, InstructionCost ScalarCost,
4270 StringRef Banner) const {
4271 dbgs() << "SLP: " << Banner << ":\n";
4272 E->dump();
4273 dbgs() << "SLP: Costs:\n";
4274 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4275 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4276 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4277 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4278 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4279 }
4280#endif
4281
4282 /// Create a new gather TreeEntry
4283 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4284 const InstructionsState &S,
4285 const EdgeInfo &UserTreeIdx,
4286 ArrayRef<int> ReuseShuffleIndices = {}) {
4287 auto Invalid = ScheduleBundle::invalid();
4288 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4289 }
4290
4291 /// Create a new VectorizableTree entry.
4292 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4293 const InstructionsState &S,
4294 const EdgeInfo &UserTreeIdx,
4295 ArrayRef<int> ReuseShuffleIndices = {},
4296 ArrayRef<unsigned> ReorderIndices = {},
4297 unsigned InterleaveFactor = 0) {
4298 TreeEntry::EntryState EntryState =
4299 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4300 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4301 ReuseShuffleIndices, ReorderIndices);
4302 if (E && InterleaveFactor > 0)
4303 E->setInterleave(InterleaveFactor);
4304 return E;
4305 }
4306
4307 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4308 TreeEntry::EntryState EntryState,
4309 ScheduleBundle &Bundle, const InstructionsState &S,
4310 const EdgeInfo &UserTreeIdx,
4311 ArrayRef<int> ReuseShuffleIndices = {},
4312 ArrayRef<unsigned> ReorderIndices = {}) {
4313 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4314 EntryState == TreeEntry::SplitVectorize)) ||
4315 (Bundle && EntryState != TreeEntry::NeedToGather &&
4316 EntryState != TreeEntry::SplitVectorize)) &&
4317 "Need to vectorize gather entry?");
4318 // Gathered loads still gathered? Do not create entry, use the original one.
4319 if (GatheredLoadsEntriesFirst.has_value() &&
4320 EntryState == TreeEntry::NeedToGather && S &&
4321 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4322 !UserTreeIdx.UserTE)
4323 return nullptr;
4324 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4325 TreeEntry *Last = VectorizableTree.back().get();
4326 Last->Idx = VectorizableTree.size() - 1;
4327 Last->State = EntryState;
4328 if (UserTreeIdx.UserTE)
4329 OperandsToTreeEntry.try_emplace(
4330 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4331 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4332 // for non-power-of-two vectors.
4333 assert(
4334 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4335 ReuseShuffleIndices.empty()) &&
4336 "Reshuffling scalars not yet supported for nodes with padding");
4337 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4338 ReuseShuffleIndices.end());
4339 if (ReorderIndices.empty()) {
4340 Last->Scalars.assign(VL.begin(), VL.end());
4341 if (S)
4342 Last->setOperations(S);
4343 } else {
4344 // Reorder scalars and build final mask.
4345 Last->Scalars.assign(VL.size(), nullptr);
4346 transform(ReorderIndices, Last->Scalars.begin(),
4347 [VL](unsigned Idx) -> Value * {
4348 if (Idx >= VL.size())
4349 return UndefValue::get(VL.front()->getType());
4350 return VL[Idx];
4351 });
4352 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4353 if (S)
4354 Last->setOperations(S);
4355 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4356 }
4357 if (EntryState == TreeEntry::SplitVectorize) {
4358 assert(S && "Split nodes must have operations.");
4359 Last->setOperations(S);
4360 SmallPtrSet<Value *, 4> Processed;
4361 for (Value *V : VL) {
4362 auto *I = dyn_cast<Instruction>(V);
4363 if (!I)
4364 continue;
4365 auto It = ScalarsInSplitNodes.find(V);
4366 if (It == ScalarsInSplitNodes.end()) {
4367 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4368 (void)Processed.insert(V);
4369 } else if (Processed.insert(V).second) {
4370 assert(!is_contained(It->getSecond(), Last) &&
4371 "Value already associated with the node.");
4372 It->getSecond().push_back(Last);
4373 }
4374 }
4375 } else if (!Last->isGather()) {
4376 if (isa<PHINode>(S.getMainOp()) ||
4377 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4378 (!S.areInstructionsWithCopyableElements() &&
4379 doesNotNeedToSchedule(VL)) ||
4380 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4381 Last->setDoesNotNeedToSchedule();
4382 SmallPtrSet<Value *, 4> Processed;
4383 for (Value *V : VL) {
4384 if (isa<PoisonValue>(V))
4385 continue;
4386 if (S.isCopyableElement(V)) {
4387 Last->addCopyableElement(V);
4388 continue;
4389 }
4390 auto It = ScalarToTreeEntries.find(V);
4391 if (It == ScalarToTreeEntries.end()) {
4392 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4393 (void)Processed.insert(V);
4394 } else if (Processed.insert(V).second) {
4395 assert(!is_contained(It->getSecond(), Last) &&
4396 "Value already associated with the node.");
4397 It->getSecond().push_back(Last);
4398 }
4399 }
4400 // Update the scheduler bundle to point to this TreeEntry.
4401 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4402 "Bundle and VL out of sync");
4403 if (!Bundle.getBundle().empty()) {
4404#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4405 auto *BundleMember = Bundle.getBundle().begin();
4406 SmallPtrSet<Value *, 4> Processed;
4407 for (Value *V : VL) {
4408 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4409 continue;
4410 ++BundleMember;
4411 }
4412 assert(BundleMember == Bundle.getBundle().end() &&
4413 "Bundle and VL out of sync");
4414#endif
4415 Bundle.setTreeEntry(Last);
4416 }
4417 } else {
4418 // Build a map for gathered scalars to the nodes where they are used.
4419 bool AllConstsOrCasts = true;
4420 for (Value *V : VL) {
4421 if (S && S.areInstructionsWithCopyableElements() &&
4422 S.isCopyableElement(V))
4423 Last->addCopyableElement(V);
4424 if (!isConstant(V)) {
4425 auto *I = dyn_cast<CastInst>(V);
4426 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4427 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4428 !UserTreeIdx.UserTE->isGather())
4429 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4430 }
4431 }
4432 if (AllConstsOrCasts)
4433 CastMaxMinBWSizes =
4434 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4435 MustGather.insert_range(VL);
4436 }
4437
4438 if (UserTreeIdx.UserTE)
4439 Last->UserTreeIndex = UserTreeIdx;
4440 return Last;
4441 }
4442
4443 /// -- Vectorization State --
4444 /// Holds all of the tree entries.
4445 TreeEntry::VecTreeTy VectorizableTree;
4446
4447#ifndef NDEBUG
4448 /// Debug printer.
4449 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4450 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4451 VectorizableTree[Id]->dump();
4452 dbgs() << "\n";
4453 }
4454 }
4455#endif
4456
4457 /// Get list of vector entries, associated with the value \p V.
4458 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4459 assert(V && "V cannot be nullptr.");
4460 auto It = ScalarToTreeEntries.find(V);
4461 if (It == ScalarToTreeEntries.end())
4462 return {};
4463 return It->getSecond();
4464 }
4465
4466 /// Get list of split vector entries, associated with the value \p V.
4467 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4468 assert(V && "V cannot be nullptr.");
4469 auto It = ScalarsInSplitNodes.find(V);
4470 if (It == ScalarsInSplitNodes.end())
4471 return {};
4472 return It->getSecond();
4473 }
4474
4475 /// Returns first vector node for value \p V, matching values \p VL.
4476 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4477 bool SameVF = false) const {
4478 assert(V && "V cannot be nullptr.");
4479 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4480 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4481 return TE;
4482 return nullptr;
4483 }
4484
4485 /// Checks that the operand node of an alternate node does not generate a
4486 /// buildvector sequence. If it does, it is likely not worth building an
4487 /// alternate shuffle when the number of buildvector operands plus the
4488 /// alternate instruction exceeds the number of buildvector instructions.
4489 /// \param S the instructions state of the analyzed values.
4490 /// \param VL list of the instructions with alternate opcodes.
4491 bool areAltOperandsProfitable(const InstructionsState &S,
4492 ArrayRef<Value *> VL) const;
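  // Illustrative sketch (hypothetical IR, not from this file): for an
  // alternate <add/sub> node built from VL = {add, sub}:
  //   %a0 = add i32 %x0, %y0
  //   %s1 = sub i32 %x1, %y1
  // if both operand lists {%x0, %x1} and {%y0, %y1} would themselves have to
  // be gathered with insertelement sequences, the two buildvectors plus the
  // alternate shuffle can cost more than a single buildvector of VL, so the
  // alternate vectorization is rejected as unprofitable.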
4493
4494 /// Contains all the outputs of legality analysis for a list of values to
4495 /// vectorize.
4496 class ScalarsVectorizationLegality {
4497 InstructionsState S;
4498 bool IsLegal;
4499 bool TryToFindDuplicates;
4500 bool TrySplitVectorize;
4501
4502 public:
4503 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4504 bool TryToFindDuplicates = true,
4505 bool TrySplitVectorize = false)
4506 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4507 TrySplitVectorize(TrySplitVectorize) {
4508 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4509 "Inconsistent state");
4510 }
4511 const InstructionsState &getInstructionsState() const { return S; };
4512 bool isLegal() const { return IsLegal; }
4513 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4514 bool trySplitVectorize() const { return TrySplitVectorize; }
4515 };
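  // Illustrative use (simplified sketch, not the exact call sites in this
  // file):
  //   ScalarsVectorizationLegality SVL = getScalarsVectorizationLegality(
  //       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  //   if (!SVL.isLegal()) {
  //     if (SVL.trySplitVectorize())
  //       ; // attempt a SplitVectorize node
  //     else if (SVL.tryToFindDuplicates())
  //       ; // reshuffle duplicates and fall back to a gather node
  //   }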
4516
4517 /// Checks if the specified list of the instructions/values can be vectorized
4518 /// in general.
4519 ScalarsVectorizationLegality
4520 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4521 const EdgeInfo &UserTreeIdx,
4522 bool TryCopyableElementsVectorization) const;
4523
4524 /// Checks if the specified list of the instructions/values can be vectorized
4525 /// and fills required data before actual scheduling of the instructions.
4526 TreeEntry::EntryState getScalarsVectorizationState(
4527 const InstructionsState &S, ArrayRef<Value *> VL,
4528 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4529 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4530
4531 /// Maps a specific scalar to its tree entry(ies).
4532 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4533
4534 /// Maps the operand index and entry to the corresponding tree entry.
4535 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4536 OperandsToTreeEntry;
4537
4538 /// Scalars, used in split vectorize nodes.
4539 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4540
4541 /// Maps a value to the proposed vectorizable size.
4542 SmallDenseMap<Value *, unsigned> InstrElementSize;
4543
4544 /// A list of scalars that we found that we need to keep as scalars.
4545 ValueSet MustGather;
4546
4547 /// A set of first non-schedulable values.
4548 ValueSet NonScheduledFirst;
4549
4550 /// A map between the vectorized entries and the last instructions in the
4551 /// bundles. The bundles are built in use order, not in the def order of the
4552 /// instructions, so we cannot rely on the last instruction in the bundle
4553 /// being the last instruction in program order during the vectorization
4554 /// process: the basic blocks are modified, so the last instructions need to
4555 /// be pre-gathered beforehand.
4556 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4557
4558 /// List of gather nodes that depend on other gather/vector nodes and should
4559 /// be emitted after the vector instruction emission process to correctly
4560 /// handle the order of the vector instructions and shuffles.
4561 SetVector<const TreeEntry *> PostponedGathers;
4562
4563 using ValueToGatherNodesMap =
4564 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4565 ValueToGatherNodesMap ValueToGatherNodes;
4566
4567 /// A list of the load entries (node indices) that can be vectorized using a
4568 /// strided or masked-gather approach, but which we first attempt to
4569 /// represent as contiguous loads.
4570 SetVector<unsigned> LoadEntriesToVectorize;
4571
4572 /// True if the graph-node transforming mode is on.
4573 bool IsGraphTransformMode = false;
4574
4575 /// The index of the first gathered load entry in the VectorizableTree.
4576 std::optional<unsigned> GatheredLoadsEntriesFirst;
4577
4578 /// Maps compress entries to their mask data for the final codegen.
4579 SmallDenseMap<const TreeEntry *,
4580 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4581 CompressEntryToData;
4582
4583 /// This POD struct describes one external user in the vectorized tree.
4584 struct ExternalUser {
4585 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4586 : Scalar(S), User(U), E(E), Lane(L) {}
4587
4588 /// Which scalar in our function.
4589 Value *Scalar = nullptr;
4590
4591 /// Which user that uses the scalar.
4592 llvm::User *User = nullptr;
4593
4594 /// Vector node, the value is part of.
4595 const TreeEntry &E;
4596
4597 /// Which lane does the scalar belong to.
4598 unsigned Lane;
4599 };
4600 using UserList = SmallVector<ExternalUser, 16>;
4601
4602 /// Checks if two instructions may access the same memory.
4603 ///
4604 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4605 /// is invariant in the calling loop.
4606 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4607 Instruction *Inst2) {
4608 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4609 // First check if the result is already in the cache.
4610 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4611 auto Res = AliasCache.try_emplace(Key);
4612 if (!Res.second)
4613 return Res.first->second;
4614 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4615 // Store the result in the cache.
4616 Res.first->getSecond() = Aliased;
4617 return Aliased;
4618 }
4619
4620 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4621
4622 /// Cache for alias results.
4623 /// TODO: consider moving this to the AliasAnalysis itself.
4624 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4625
4626 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4627 // globally through SLP because we don't perform any action which
4628 // invalidates capture results.
4629 BatchAAResults BatchAA;
4630
4631 /// Temporary store for deleted instructions. Instructions will be deleted
4632 /// eventually when the BoUpSLP is destructed. The deferral is required to
4633 /// ensure that there are no incorrect collisions in the AliasCache, which
4634 /// can happen if a new instruction is allocated at the same address as a
4635 /// previously deleted instruction.
4636 DenseSet<Instruction *> DeletedInstructions;
4637
4638 /// Set of the instructions already analyzed for reductions.
4639 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4640
4641 /// Set of hashes for the list of reduction values already being analyzed.
4642 DenseSet<size_t> AnalyzedReductionVals;
4643
4644 /// Values that have already been analyzed for minimal bitwidth and found to
4645 /// be non-profitable.
4646 DenseSet<Value *> AnalyzedMinBWVals;
4647
4648 /// A list of values that need to be extracted out of the tree.
4649 /// This list holds pairs of (Internal Scalar : External User). External User
4650 /// can be nullptr, it means that this Internal Scalar will be used later,
4651 /// after vectorization.
4652 UserList ExternalUses;
4653
4654 /// A list of GEPs which can be replaced by scalar GEPs instead of
4655 /// extractelement instructions.
4656 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4657
4658 /// A list of scalars to be extracted without a specific user because of too
4659 /// many uses.
4660 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4661
4662 /// Values used only by @llvm.assume calls.
4663 SmallPtrSet<const Value *, 32> EphValues;
4664
4665 /// Holds all of the instructions that we gathered, shuffle instructions and
4666 /// extractelements.
4667 SetVector<Instruction *> GatherShuffleExtractSeq;
4668
4669 /// A list of blocks that we are going to CSE.
4670 DenseSet<BasicBlock *> CSEBlocks;
4671
4672 /// List of hashes of vectors of loads, which are known to be non-vectorizable.
4673 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4674
4675 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4676 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4677 /// single instruction, while ScheduleBundle represents a batch of
4678 /// instructions that are going to be grouped together. ScheduleCopyableData
4679 /// models an extra user for "copyable" instructions.
4680 class ScheduleEntity {
4681 friend class ScheduleBundle;
4682 friend class ScheduleData;
4683 friend class ScheduleCopyableData;
4684
4685 protected:
4686 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4687 Kind getKind() const { return K; }
4688 ScheduleEntity(Kind K) : K(K) {}
4689
4690 private:
4691 /// Used for getting a "good" final ordering of instructions.
4692 int SchedulingPriority = 0;
4693 /// True if this instruction (or bundle) is scheduled (or considered as
4694 /// scheduled in the dry-run).
4695 bool IsScheduled = false;
4696 /// The kind of the ScheduleEntity.
4697 const Kind K = Kind::ScheduleData;
4698
4699 public:
4700 ScheduleEntity() = delete;
4701 /// Gets/sets the scheduling priority.
4702 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4703 int getSchedulingPriority() const { return SchedulingPriority; }
4704 bool isReady() const {
4705 if (const auto *SD = dyn_cast<ScheduleData>(this))
4706 return SD->isReady();
4707 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4708 return CD->isReady();
4709 return cast<ScheduleBundle>(this)->isReady();
4710 }
4711 /// Returns true if the dependency information has been calculated.
4712 /// Note that dependency validity can vary between instructions within
4713 /// a single bundle.
4714 bool hasValidDependencies() const {
4715 if (const auto *SD = dyn_cast<ScheduleData>(this))
4716 return SD->hasValidDependencies();
4717 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4718 return CD->hasValidDependencies();
4719 return cast<ScheduleBundle>(this)->hasValidDependencies();
4720 }
4721 /// Gets the number of unscheduled dependencies.
4722 int getUnscheduledDeps() const {
4723 if (const auto *SD = dyn_cast<ScheduleData>(this))
4724 return SD->getUnscheduledDeps();
4725 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4726 return CD->getUnscheduledDeps();
4727 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4728 }
4729 /// Increments the number of unscheduled dependencies.
4730 int incrementUnscheduledDeps(int Incr) {
4731 if (auto *SD = dyn_cast<ScheduleData>(this))
4732 return SD->incrementUnscheduledDeps(Incr);
4733 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4734 }
4735 /// Gets the number of dependencies.
4736 int getDependencies() const {
4737 if (const auto *SD = dyn_cast<ScheduleData>(this))
4738 return SD->getDependencies();
4739 return cast<ScheduleCopyableData>(this)->getDependencies();
4740 }
4741 /// Gets the instruction.
4742 Instruction *getInst() const {
4743 if (const auto *SD = dyn_cast<ScheduleData>(this))
4744 return SD->getInst();
4745 return cast<ScheduleCopyableData>(this)->getInst();
4746 }
4747
4748 /// Gets/sets if the bundle is scheduled.
4749 bool isScheduled() const { return IsScheduled; }
4750 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4751
4752 static bool classof(const ScheduleEntity *) { return true; }
4753
4754#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4755 void dump(raw_ostream &OS) const {
4756 if (const auto *SD = dyn_cast<ScheduleData>(this))
4757 return SD->dump(OS);
4758 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4759 return CD->dump(OS);
4760 return cast<ScheduleBundle>(this)->dump(OS);
4761 }
4762
4763 LLVM_DUMP_METHOD void dump() const {
4764 dump(dbgs());
4765 dbgs() << '\n';
4766 }
4767#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4768 };
4769
4770#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4771 friend inline raw_ostream &operator<<(raw_ostream &OS,
4772 const BoUpSLP::ScheduleEntity &SE) {
4773 SE.dump(OS);
4774 return OS;
4775 }
4776#endif
4777
4778 /// Contains all scheduling relevant data for an instruction.
4779 /// A ScheduleData either represents a single instruction or a member of an
4780 /// instruction bundle (= a group of instructions which is combined into a
4781 /// vector instruction).
4782 class ScheduleData final : public ScheduleEntity {
4783 public:
4784 // The initial value for the dependency counters. It means that the
4785 // dependencies are not calculated yet.
4786 enum { InvalidDeps = -1 };
4787
4788 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4789 static bool classof(const ScheduleEntity *Entity) {
4790 return Entity->getKind() == Kind::ScheduleData;
4791 }
4792
4793 void init(int BlockSchedulingRegionID, Instruction *I) {
4794 NextLoadStore = nullptr;
4795 IsScheduled = false;
4796 SchedulingRegionID = BlockSchedulingRegionID;
4797 clearDependencies();
4798 Inst = I;
4799 }
4800
4801 /// Verify basic self consistency properties
4802 void verify() {
4803 if (hasValidDependencies()) {
4804 assert(UnscheduledDeps <= Dependencies && "invariant");
4805 } else {
4806 assert(UnscheduledDeps == Dependencies && "invariant");
4807 }
4808
4809 if (IsScheduled) {
4810 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4811 "unexpected scheduled state");
4812 }
4813 }
4814
4815 /// Returns true if the dependency information has been calculated.
4816 /// Note that dependency validity can vary between instructions within
4817 /// a single bundle.
4818 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4819
4820 /// Returns true if it is ready for scheduling, i.e. it has no more
4821 /// unscheduled depending instructions/bundles.
4822 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4823
4824 /// Modifies the number of unscheduled dependencies for this instruction,
4825 /// and returns the number of remaining dependencies for the containing
4826 /// bundle.
4827 int incrementUnscheduledDeps(int Incr) {
4828 assert(hasValidDependencies() &&
4829 "increment of unscheduled deps would be meaningless");
4830 UnscheduledDeps += Incr;
4831 return UnscheduledDeps;
4832 }
4833
4834 /// Sets the number of unscheduled dependencies to the number of
4835 /// dependencies.
4836 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4837
4838 /// Clears all dependency information.
4839 void clearDependencies() {
4840 clearDirectDependencies();
4841 MemoryDependencies.clear();
4842 ControlDependencies.clear();
4843 }
4844
4845 /// Clears all direct dependencies only, except for control and memory
4846 /// dependencies.
4847 /// Required for copyable elements to correctly handle control/memory deps
4848 /// and avoid extra recalculation of such deps.
4849 void clearDirectDependencies() {
4850 Dependencies = InvalidDeps;
4851 resetUnscheduledDeps();
4852 IsScheduled = false;
4853 }
4854
4855 /// Gets the number of unscheduled dependencies.
4856 int getUnscheduledDeps() const { return UnscheduledDeps; }
4857 /// Gets the number of dependencies.
4858 int getDependencies() const { return Dependencies; }
4859 /// Initializes the number of dependencies.
4860 void initDependencies() { Dependencies = 0; }
4861 /// Increments the number of dependencies.
4862 void incDependencies() { Dependencies++; }
4863
4864 /// Gets scheduling region ID.
4865 int getSchedulingRegionID() const { return SchedulingRegionID; }
4866
4867 /// Gets the instruction.
4868 Instruction *getInst() const { return Inst; }
4869
4870 /// Gets the list of memory dependencies.
4871 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4872 return MemoryDependencies;
4873 }
4874 /// Adds a memory dependency.
4875 void addMemoryDependency(ScheduleData *Dep) {
4876 MemoryDependencies.push_back(Dep);
4877 }
4878 /// Gets the list of control dependencies.
4879 ArrayRef<ScheduleData *> getControlDependencies() const {
4880 return ControlDependencies;
4881 }
4882 /// Adds a control dependency.
4883 void addControlDependency(ScheduleData *Dep) {
4884 ControlDependencies.push_back(Dep);
4885 }
4886 /// Gets/sets the next load/store instruction in the block.
4887 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4888 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4889
4890 void dump(raw_ostream &OS) const { OS << *Inst; }
4891
4892 LLVM_DUMP_METHOD void dump() const {
4893 dump(dbgs());
4894 dbgs() << '\n';
4895 }
4896
4897 private:
4898 Instruction *Inst = nullptr;
4899
4900 /// Singly linked list of all memory instructions (e.g. load, store, call)
4901 /// in the block - until the end of the scheduling region.
4902 ScheduleData *NextLoadStore = nullptr;
4903
4904 /// The dependent memory instructions.
4905 /// This list is derived on demand in calculateDependencies().
4906 SmallVector<ScheduleData *> MemoryDependencies;
4907
4908 /// List of instructions which this instruction could be control dependent
4909 /// on. Allowing such nodes to be scheduled below this one could introduce
4910 /// a runtime fault which didn't exist in the original program.
4911 /// e.g. this is a load or udiv following a readonly call which infinitely loops.
4912 SmallVector<ScheduleData *> ControlDependencies;
4913
4914 /// This ScheduleData is in the current scheduling region if this matches
4915 /// the current SchedulingRegionID of BlockScheduling.
4916 int SchedulingRegionID = 0;
4917
4918 /// The number of dependencies. Consists of the number of users of the
4919 /// instruction plus the number of dependent memory instructions (if any).
4920 /// This value is calculated on demand.
4921 /// If InvalidDeps, the number of dependencies is not calculated yet.
4922 int Dependencies = InvalidDeps;
4923
4924 /// The number of dependencies minus the number of dependencies of scheduled
4925 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4926 /// for scheduling.
4927 /// Note that this is negative as long as Dependencies is not calculated.
4928 int UnscheduledDeps = InvalidDeps;
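    // Worked example (illustrative): an instruction with two users inside the
    // scheduling region and one dependent memory instruction gets
    // Dependencies == 3 and UnscheduledDeps == 3 once its dependencies have
    // been calculated; every time one of those dependents is scheduled,
    // UnscheduledDeps is decremented, and at 0 the instruction (or its
    // bundle) becomes ready.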
4929 };
4930
4931#ifndef NDEBUG
4932 friend inline raw_ostream &operator<<(raw_ostream &OS,
4933 const BoUpSLP::ScheduleData &SD) {
4934 SD.dump(OS);
4935 return OS;
4936 }
4937#endif
4938
4939 class ScheduleBundle final : public ScheduleEntity {
4940 /// The schedule data for the instructions in the bundle.
4941 SmallVector<ScheduleEntity *> Bundle;
4942 /// True if this bundle is valid.
4943 bool IsValid = true;
4944 /// The TreeEntry that this instruction corresponds to.
4945 TreeEntry *TE = nullptr;
4946 ScheduleBundle(bool IsValid)
4947 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4948
4949 public:
4950 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4951 static bool classof(const ScheduleEntity *Entity) {
4952 return Entity->getKind() == Kind::ScheduleBundle;
4953 }
4954
4955 /// Verify basic self consistency properties
4956 void verify() const {
4957 for (const ScheduleEntity *SD : Bundle) {
4958 if (SD->hasValidDependencies()) {
4959 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4960 "invariant");
4961 } else {
4962 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4963 "invariant");
4964 }
4965
4966 if (isScheduled()) {
4967 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4968 "unexpected scheduled state");
4969 }
4970 }
4971 }
4972
4973 /// Returns the number of unscheduled dependencies in the bundle.
4974 int unscheduledDepsInBundle() const {
4975 assert(*this && "bundle must not be empty");
4976 int Sum = 0;
4977 for (const ScheduleEntity *BundleMember : Bundle) {
4978 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4979 return ScheduleData::InvalidDeps;
4980 Sum += BundleMember->getUnscheduledDeps();
4981 }
4982 return Sum;
4983 }
4984
4985 /// Returns true if the dependency information has been calculated.
4986 /// Note that dependency validity can vary between instructions within
4987 /// a single bundle.
4988 bool hasValidDependencies() const {
4989 return all_of(Bundle, [](const ScheduleEntity *SD) {
4990 return SD->hasValidDependencies();
4991 });
4992 }
4993
4994 /// Returns true if it is ready for scheduling, i.e. it has no more
4995 /// unscheduled depending instructions/bundles.
4996 bool isReady() const {
4997 assert(*this && "bundle must not be empty");
4998 return unscheduledDepsInBundle() == 0 && !isScheduled();
4999 }
5000
5001 /// Returns the bundle of scheduling data, associated with the current
5002 /// instruction.
5003 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5004 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5005 /// Adds an instruction to the bundle.
5006 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5007
5008 /// Gets/sets the associated tree entry.
5009 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5010 TreeEntry *getTreeEntry() const { return TE; }
5011
5012 static ScheduleBundle invalid() { return {false}; }
5013
5014 operator bool() const { return IsValid; }
5015
5016#ifndef NDEBUG
5017 void dump(raw_ostream &OS) const {
5018 if (!*this) {
5019 OS << "[]";
5020 return;
5021 }
5022 OS << '[';
5023 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5024 if (isa<ScheduleCopyableData>(SD))
5025 OS << "<Copyable>";
5026 OS << *SD->getInst();
5027 });
5028 OS << ']';
5029 }
5030
5031 LLVM_DUMP_METHOD void dump() const {
5032 dump(dbgs());
5033 dbgs() << '\n';
5034 }
5035#endif // NDEBUG
5036 };
5037
5038#ifndef NDEBUG
5039 friend inline raw_ostream &operator<<(raw_ostream &OS,
5040 const BoUpSLP::ScheduleBundle &Bundle) {
5041 Bundle.dump(OS);
5042 return OS;
5043 }
5044#endif
5045
5046 /// Contains all scheduling relevant data for the copyable instruction.
5047 /// It models the virtual instructions, supposed to replace the original
5048 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5049 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5050 /// instruction %virt = add %0, 0.
5051 class ScheduleCopyableData final : public ScheduleEntity {
5052 /// The source schedule data for the instruction.
5053 Instruction *Inst = nullptr;
5054 /// The edge information for the instruction.
5055 const EdgeInfo EI;
5056 /// This ScheduleData is in the current scheduling region if this matches
5057 /// the current SchedulingRegionID of BlockScheduling.
5058 int SchedulingRegionID = 0;
5059 /// Bundle, this data is part of.
5060 ScheduleBundle &Bundle;
5061
5062 public:
5063 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5064 const EdgeInfo &EI, ScheduleBundle &Bundle)
5065 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5066 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5067 static bool classof(const ScheduleEntity *Entity) {
5068 return Entity->getKind() == Kind::ScheduleCopyableData;
5069 }
5070
5071 /// Verify basic self consistency properties
5072 void verify() {
5073 if (hasValidDependencies()) {
5074 assert(UnscheduledDeps <= Dependencies && "invariant");
5075 } else {
5076 assert(UnscheduledDeps == Dependencies && "invariant");
5077 }
5078
5079 if (IsScheduled) {
5080 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5081 "unexpected scheduled state");
5082 }
5083 }
5084
5085 /// Returns true if the dependency information has been calculated.
5086 /// Note that depenendency validity can vary between instructions within
5087 /// a single bundle.
5088 bool hasValidDependencies() const {
5089 return Dependencies != ScheduleData::InvalidDeps;
5090 }
5091
5092 /// Returns true if it is ready for scheduling, i.e. it has no more
5093 /// unscheduled depending instructions/bundles.
5094 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5095
5096 /// Modifies the number of unscheduled dependencies for this instruction,
5097 /// and returns the number of remaining dependencies for the containing
5098 /// bundle.
5099 int incrementUnscheduledDeps(int Incr) {
5100 assert(hasValidDependencies() &&
5101 "increment of unscheduled deps would be meaningless");
5102 UnscheduledDeps += Incr;
5103 assert(UnscheduledDeps >= 0 && "invariant");
5104 return UnscheduledDeps;
5105 }
5106
5107 /// Sets the number of unscheduled dependencies to the number of
5108 /// dependencies.
5109 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5110
5111 /// Gets the number of unscheduled dependencies.
5112 int getUnscheduledDeps() const { return UnscheduledDeps; }
5113 /// Gets the number of dependencies.
5114 int getDependencies() const { return Dependencies; }
5115 /// Initializes the number of dependencies.
5116 void initDependencies() { Dependencies = 0; }
5117 /// Increments the number of dependencies.
5118 void incDependencies() { Dependencies++; }
5119
5120 /// Gets scheduling region ID.
5121 int getSchedulingRegionID() const { return SchedulingRegionID; }
5122
5123 /// Gets the instruction.
5124 Instruction *getInst() const { return Inst; }
5125
5126 /// Clears all dependency information.
5127 void clearDependencies() {
5128 Dependencies = ScheduleData::InvalidDeps;
5129 UnscheduledDeps = ScheduleData::InvalidDeps;
5130 IsScheduled = false;
5131 }
5132
5133 /// Gets the edge information.
5134 const EdgeInfo &getEdgeInfo() const { return EI; }
5135
5136 /// Gets the bundle.
5137 ScheduleBundle &getBundle() { return Bundle; }
5138 const ScheduleBundle &getBundle() const { return Bundle; }
5139
5140#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5141 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5142
5143 LLVM_DUMP_METHOD void dump() const {
5144 dump(dbgs());
5145 dbgs() << '\n';
5146 }
5147#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5148
5149 private:
5150 /// The number of dependencies. If not InvalidDeps, the dependency
5151 /// information is valid. These nodes always have only a single dependency.
5152 int Dependencies = ScheduleData::InvalidDeps;
5153
5154 /// The number of dependencies minus the number of dependencies of scheduled
5155 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5156 /// for scheduling.
5157 /// Note that this is negative as long as Dependencies is not calculated.
5158 int UnscheduledDeps = ScheduleData::InvalidDeps;
5159 };
5160
5161#ifndef NDEBUG
5162 friend inline raw_ostream &
5163 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5164 SD.dump(OS);
5165 return OS;
5166 }
5167#endif
5168
5169 friend struct GraphTraits<BoUpSLP *>;
5170 friend struct DOTGraphTraits<BoUpSLP *>;
5171
5172 /// Contains all scheduling data for a basic block.
5173 /// It does not schedule instructions which are not memory read/write
5174 /// instructions and whose operands are either constants, or arguments, or
5175 /// phis, or instructions from other blocks, or whose users are phis or from
5176 /// other blocks. The resulting vector instructions can be placed at the
5177 /// beginning of the basic block without scheduling (if the operands do not
5178 /// need to be scheduled) or at the end of the block (if the users are outside
5179 /// of the block). This saves some compile time and memory used by the
5180 /// compiler.
5181 /// ScheduleData is assigned to each instruction between the boundaries of
5182 /// the tree entry, even to those which are not part of the graph. This is
5183 /// required to correctly follow the dependencies between the instructions
5184 /// and to schedule them correctly. ScheduleData is not allocated for
5185 /// instructions which do not require scheduling, like phis, nodes with only
5186 /// extractelements/insertelements, or nodes whose instructions have
5187 /// uses/operands outside of the block.
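 /// A minimal illustration (hypothetical IR, assuming all values below belong
 /// to the same block):
 ///   %phi = phi i32 ...        ; requires no scheduling, no ScheduleData
 ///   %a0 = load i32, ptr %p0   ; gets ScheduleData (memory access in region)
 ///   %a1 = load i32, ptr %p1   ; gets ScheduleData
 ///   %s0 = add i32 %a0, %x     ; gets ScheduleData (in-block operand %a0)
 ///   %s1 = add i32 %a1, %y     ; gets ScheduleData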
5188 struct BlockScheduling {
5189 BlockScheduling(BasicBlock *BB)
5190 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5191
5192 void clear() {
5193 ScheduledBundles.clear();
5194 ScheduledBundlesList.clear();
5195 ScheduleCopyableDataMap.clear();
5196 ScheduleCopyableDataMapByInst.clear();
5197 ScheduleCopyableDataMapByInstUser.clear();
5198 ScheduleCopyableDataMapByUsers.clear();
5199 ReadyInsts.clear();
5200 ScheduleStart = nullptr;
5201 ScheduleEnd = nullptr;
5202 FirstLoadStoreInRegion = nullptr;
5203 LastLoadStoreInRegion = nullptr;
5204 RegionHasStackSave = false;
5205
5206 // Reduce the maximum schedule region size by the size of the
5207 // previous scheduling run.
5208 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5209 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5210 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5211 ScheduleRegionSize = 0;
5212
5213 // Make a new scheduling region, i.e. all existing ScheduleData is not
5214 // in the new region yet.
5215 ++SchedulingRegionID;
5216 }
5217
5218 ScheduleData *getScheduleData(Instruction *I) {
5219 if (!I)
5220 return nullptr;
5221 if (BB != I->getParent())
5222 // Avoid lookup if can't possibly be in map.
5223 return nullptr;
5224 ScheduleData *SD = ScheduleDataMap.lookup(I);
5225 if (SD && isInSchedulingRegion(*SD))
5226 return SD;
5227 return nullptr;
5228 }
5229
5230 ScheduleData *getScheduleData(Value *V) {
5231 return getScheduleData(dyn_cast<Instruction>(V));
5232 }
5233
5234 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5235 /// operand number) and value.
5236 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5237 const Value *V) const {
5238 if (ScheduleCopyableDataMap.empty())
5239 return nullptr;
5240 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5241 if (It == ScheduleCopyableDataMap.end())
5242 return nullptr;
5243 ScheduleCopyableData *SD = It->getSecond().get();
5244 if (!isInSchedulingRegion(*SD))
5245 return nullptr;
5246 return SD;
5247 }
5248
5249 /// Returns the ScheduleCopyableData for the given user \p User, operand
5250 /// number and operand \p V.
5251 SmallVector<ScheduleCopyableData *>
5252 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5253 const Value *V) {
5254 if (ScheduleCopyableDataMapByInstUser.empty())
5255 return {};
5256 const auto It = ScheduleCopyableDataMapByInstUser.find(
5257 std::make_pair(std::make_pair(User, OperandIdx), V));
5258 if (It == ScheduleCopyableDataMapByInstUser.end())
5259 return {};
5260 SmallVector<ScheduleCopyableData *> Res;
5261 for (ScheduleCopyableData *SD : It->getSecond()) {
5262 if (isInSchedulingRegion(*SD))
5263 Res.push_back(SD);
5264 }
5265 return Res;
5266 }
5267
5268 /// Returns true if all operands of the given instruction \p User are
5269 /// replaced by copyable data.
5270 /// \param User The user instruction.
5271 /// \param Op The operand, which might be replaced by the copyable data.
5272 /// \param SLP The SLP tree.
5273 /// \param NumOps The number of operands used. If the instruction uses the
5274 /// same operand several times, check for the first use, then the second,
5275 /// etc.
5276 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5277 Instruction *Op, BoUpSLP &SLP,
5278 unsigned NumOps) const {
5279 assert(NumOps > 0 && "No operands");
5280 if (ScheduleCopyableDataMap.empty())
5281 return false;
5282 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5283 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5284 for (const Use &U : User->operands()) {
5285 if (U.get() != Op)
5286 continue;
5287 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5288 if (Entries.empty())
5289 return false;
5290 // Check all tree entries, if they have operands replaced by copyable
5291 // data.
5292 for (TreeEntry *TE : Entries) {
5293 // Check if the user is commutative.
5294 // The commutatives are handled later, as their operands can be
5295 // reordered.
5296 // The same applies even for non-commutative cmps, because we can
5297 // potentially invert their predicate and, thus, reorder the operands.
5298 bool IsCommutativeUser =
5299 ::isCommutative(User) ||
5300 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5301 EdgeInfo EI(TE, U.getOperandNo());
5302 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5303 unsigned &OpCnt =
5304 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5305 if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
5306 return false;
5307 // Found copyable operand - continue.
5308 ++OpCnt;
5309 continue;
5310 }
5311 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5312 .first->getSecond();
5313 }
5314 }
5315 // Check the commutative/cmp entries.
5316 if (!PotentiallyReorderedEntriesCount.empty()) {
5317 for (auto &P : PotentiallyReorderedEntriesCount) {
5318 auto *It = find(P.first->Scalars, User);
5319 assert(It != P.first->Scalars.end() &&
5320 "User is not in the tree entry");
5321 int Lane = std::distance(P.first->Scalars.begin(), It);
5322 assert(Lane >= 0 && "Lane is not found");
5323 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5324 Lane = P.first->ReorderIndices[Lane];
5325 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5326 "Couldn't find extract lane");
5327 SmallVector<unsigned> OpIndices;
5328 for (unsigned OpIdx :
5330 P.first->getMainOp()))) {
5331 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5332 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5333 --P.getSecond();
5334 }
5335 }
5336 return all_of(PotentiallyReorderedEntriesCount,
5337 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5338 return P.second == NumOps - 1;
5339 });
5340 }
5341 return true;
5342 }
5343
5344 SmallVector<ScheduleCopyableData *>
5345 getScheduleCopyableData(const Instruction *I) const {
5346 if (ScheduleCopyableDataMapByInst.empty())
5347 return {};
5348 const auto It = ScheduleCopyableDataMapByInst.find(I);
5349 if (It == ScheduleCopyableDataMapByInst.end())
5350 return {};
5351 SmallVector<ScheduleCopyableData *> Res;
5352 for (ScheduleCopyableData *SD : It->getSecond()) {
5353 if (isInSchedulingRegion(*SD))
5354 Res.push_back(SD);
5355 }
5356 return Res;
5357 }
5358
5359 SmallVector<ScheduleCopyableData *>
5360 getScheduleCopyableDataUsers(const Instruction *User) const {
5361 if (ScheduleCopyableDataMapByUsers.empty())
5362 return {};
5363 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5364 if (It == ScheduleCopyableDataMapByUsers.end())
5365 return {};
5366 SmallVector<ScheduleCopyableData *> Res;
5367 for (ScheduleCopyableData *SD : It->getSecond()) {
5368 if (isInSchedulingRegion(*SD))
5369 Res.push_back(SD);
5370 }
5371 return Res;
5372 }
5373
5374 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5375 Instruction *I,
5376 int SchedulingRegionID,
5377 ScheduleBundle &Bundle) {
5378 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5379 ScheduleCopyableData *CD =
5380 ScheduleCopyableDataMap
5381 .try_emplace(std::make_pair(EI, I),
5382 std::make_unique<ScheduleCopyableData>(
5383 SchedulingRegionID, I, EI, Bundle))
5384 .first->getSecond()
5385 .get();
5386 ScheduleCopyableDataMapByInst[I].push_back(CD);
5387 if (EI.UserTE) {
5388 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5389 const auto *It = find(Op, I);
5390 assert(It != Op.end() && "Lane not set");
5391 SmallPtrSet<Instruction *, 4> Visited;
5392 do {
5393 int Lane = std::distance(Op.begin(), It);
5394 assert(Lane >= 0 && "Lane not set");
5395 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5396 !EI.UserTE->ReorderIndices.empty())
5397 Lane = EI.UserTE->ReorderIndices[Lane];
5398 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5399 "Couldn't find extract lane");
5400 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5401 if (!Visited.insert(In).second) {
5402 It = find(make_range(std::next(It), Op.end()), I);
5403 continue;
5404 }
5405 ScheduleCopyableDataMapByInstUser
5406 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5407 .first->getSecond()
5408 .push_back(CD);
5409 ScheduleCopyableDataMapByUsers.try_emplace(I)
5410 .first->getSecond()
5411 .insert(CD);
5412 // Remove extra deps for users that become non-immediate users of the
5413 // instruction. This may happen if a chain of the same copyable elements
5414 // appears in the tree.
5415 if (In == I) {
5416 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5417 if (ScheduleCopyableData *UserCD =
5418 getScheduleCopyableData(UserEI, In))
5419 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5420 }
5421 It = find(make_range(std::next(It), Op.end()), I);
5422 } while (It != Op.end());
5423 } else {
5424 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5425 CD);
5426 }
5427 return *CD;
5428 }
5429
5430 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5431 auto *I = dyn_cast<Instruction>(V);
5432 if (!I)
5433 return {};
5434 auto It = ScheduledBundles.find(I);
5435 if (It == ScheduledBundles.end())
5436 return {};
5437 return It->getSecond();
5438 }
5439
5440 /// Returns true if the entity is in the scheduling region.
5441 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5442 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5443 return Data->getSchedulingRegionID() == SchedulingRegionID;
5444 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5445 return CD->getSchedulingRegionID() == SchedulingRegionID;
5446 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5447 [&](const ScheduleEntity *BundleMember) {
5448 return isInSchedulingRegion(*BundleMember);
5449 });
5450 }
5451
5452 /// Marks an instruction as scheduled and puts all dependent ready
5453 /// instructions into the ready-list.
5454 template <typename ReadyListType>
5455 void schedule(const BoUpSLP &R, const InstructionsState &S,
5456 const EdgeInfo &EI, ScheduleEntity *Data,
5457 ReadyListType &ReadyList) {
5458 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5459 ArrayRef<ScheduleBundle *> Bundles) {
5460 // Handle the def-use chain dependencies.
5461
5462 // Decrement the unscheduled counter and insert to ready list if ready.
5463 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5464 if ((IsControl || Data->hasValidDependencies()) &&
5465 Data->incrementUnscheduledDeps(-1) == 0) {
5466 // There are no more unscheduled dependencies after
5467 // decrementing, so we can put the dependent instruction
5468 // into the ready list.
5469 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5470 ArrayRef<ScheduleBundle *> Bundles;
5471 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5472 CopyableBundle.push_back(&CD->getBundle());
5473 Bundles = CopyableBundle;
5474 } else {
5475 Bundles = getScheduleBundles(Data->getInst());
5476 }
5477 if (!Bundles.empty()) {
5478 for (ScheduleBundle *Bundle : Bundles) {
5479 if (Bundle->unscheduledDepsInBundle() == 0) {
5480 assert(!Bundle->isScheduled() &&
5481 "already scheduled bundle gets ready");
5482 ReadyList.insert(Bundle);
5483 LLVM_DEBUG(dbgs()
5484 << "SLP: gets ready: " << *Bundle << "\n");
5485 }
5486 }
5487 return;
5488 }
5489 assert(!Data->isScheduled() &&
5490 "already scheduled bundle gets ready");
5492 "Expected non-copyable data");
5493 ReadyList.insert(Data);
5494 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5495 }
5496 };
5497
5498 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5499 Instruction *I) {
5500 if (!ScheduleCopyableDataMap.empty()) {
5501 SmallVector<ScheduleCopyableData *> CopyableData =
5502 getScheduleCopyableData(User, OpIdx, I);
5503 for (ScheduleCopyableData *CD : CopyableData)
5504 DecrUnsched(CD, /*IsControl=*/false);
5505 if (!CopyableData.empty())
5506 return;
5507 }
5508 if (ScheduleData *OpSD = getScheduleData(I))
5509 DecrUnsched(OpSD, /*IsControl=*/false);
5510 };
5511
5512 // If BundleMember is a vector bundle, its operands may have been
5513 // reordered during buildTree(). We therefore need to get its operands
5514 // through the TreeEntry.
5515 if (!Bundles.empty()) {
5516 auto *In = BundleMember->getInst();
5517 // Count uses of each instruction operand.
5518 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5519 unsigned TotalOpCount = 0;
5520 if (isa<ScheduleCopyableData>(BundleMember)) {
5521 // Copyable data is used only once (uses itself).
5522 TotalOpCount = OperandsUses[In] = 1;
5523 } else {
5524 for (const Use &U : In->operands()) {
5525 if (auto *I = dyn_cast<Instruction>(U.get())) {
5526 auto Res = OperandsUses.try_emplace(I, 0);
5527 ++Res.first->getSecond();
5528 ++TotalOpCount;
5529 }
5530 }
5531 }
5532 // Decrement the unscheduled counter and insert to ready list if
5533 // ready.
5534 auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
5535 unsigned OpIdx) {
5536 if (!ScheduleCopyableDataMap.empty()) {
5537 const EdgeInfo EI = {UserTE, OpIdx};
5538 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
5539 DecrUnsched(CD, /*IsControl=*/false);
5540 return;
5541 }
5542 }
5543 auto It = OperandsUses.find(I);
5544 assert(It != OperandsUses.end() && "Operand not found");
5545 if (It->second > 0) {
5546 --It->getSecond();
5547 assert(TotalOpCount > 0 && "No more operands to decrement");
5548 --TotalOpCount;
5549 if (ScheduleData *OpSD = getScheduleData(I))
5550 DecrUnsched(OpSD, /*IsControl=*/false);
5551 }
5552 };
5553
5554 for (ScheduleBundle *Bundle : Bundles) {
5555 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5556 break;
5557 // Need to search for the lane since the tree entry can be
5558 // reordered.
5559 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5560 find(Bundle->getTreeEntry()->Scalars, In));
5561 assert(Lane >= 0 && "Lane not set");
5562 if (isa<StoreInst>(In) &&
5563 !Bundle->getTreeEntry()->ReorderIndices.empty())
5564 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5565 assert(Lane < static_cast<int>(
5566 Bundle->getTreeEntry()->Scalars.size()) &&
5567 "Couldn't find extract lane");
5568
5569 // Since the vectorization tree is built recursively, this
5570 // assertion ensures that the tree entry has all operands set before
5571 // reaching this code. A couple of exceptions known at the moment are
5572 // extracts where their second (immediate) operand is not added.
5573 // Since immediates do not affect scheduler behavior this is
5574 // considered okay.
5575 assert(In &&
5576 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
5577 In->getNumOperands() ==
5578 Bundle->getTreeEntry()->getNumOperands() ||
5579 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5580 "Missed TreeEntry operands?");
5581
5582 for (unsigned OpIdx :
5583 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5584 if (auto *I = dyn_cast<Instruction>(
5585 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5586 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
5587 << "\n");
5588 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
5589 }
5590 }
5591 } else {
5592 // If BundleMember is a stand-alone instruction, no operand reordering
5593 // has taken place, so we directly access its operands.
5594 for (Use &U : BundleMember->getInst()->operands()) {
5595 if (auto *I = dyn_cast<Instruction>(U.get())) {
5596 LLVM_DEBUG(dbgs()
5597 << "SLP: check for readiness (def): " << *I << "\n");
5598 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5599 }
5600 }
5601 }
5602 // Handle the memory dependencies.
5603 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5604 if (!SD)
5605 return;
5606 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5607 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5608 if (!VisitedMemory.insert(MemoryDep).second)
5609 continue;
5610 // There are no more unscheduled dependencies after decrementing,
5611 // so we can put the dependent instruction into the ready list.
5612 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5613 << *MemoryDep << "\n");
5614 DecrUnsched(MemoryDep);
5615 }
5616 // Handle the control dependencies.
5617 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5618 for (ScheduleData *Dep : SD->getControlDependencies()) {
5619 if (!VisitedControl.insert(Dep).second)
5620 continue;
5621 // There are no more unscheduled dependencies after decrementing,
5622 // so we can put the dependent instruction into the ready list.
5623 LLVM_DEBUG(dbgs()
5624 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5625 DecrUnsched(Dep, /*IsControl=*/true);
5626 }
5627 };
5628 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5629 SD->setScheduled(/*Scheduled=*/true);
5630 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5631 SmallVector<ScheduleBundle *> Bundles;
5632 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5633 Instruction *In = SD->getInst();
5634 if (R.isVectorized(In)) {
5635 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5636 for (TreeEntry *TE : Entries) {
5638 In->getNumOperands() != TE->getNumOperands())
5639 continue;
5640 auto &BundlePtr =
5641 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5642 BundlePtr->setTreeEntry(TE);
5643 BundlePtr->add(SD);
5644 Bundles.push_back(BundlePtr.get());
5645 }
5646 }
5647 ProcessBundleMember(SD, Bundles);
5648 } else {
5649 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5650 Bundle.setScheduled(/*Scheduled=*/true);
5651 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5652 auto AreAllBundlesScheduled =
5653 [&](const ScheduleEntity *SD,
5654 ArrayRef<ScheduleBundle *> SDBundles) {
5655 if (isa<ScheduleCopyableData>(SD))
5656 return true;
5657 return !SDBundles.empty() &&
5658 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5659 return SDBundle->isScheduled();
5660 });
5661 };
5662 for (ScheduleEntity *SD : Bundle.getBundle()) {
5663 ArrayRef<ScheduleBundle *> SDBundles;
5664 if (!isa<ScheduleCopyableData>(SD))
5665 SDBundles = getScheduleBundles(SD->getInst());
5666 if (AreAllBundlesScheduled(SD, SDBundles)) {
5667 SD->setScheduled(/*Scheduled=*/true);
5668 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5669 : SDBundles);
5670 }
5671 }
5672 }
5673 }
5674
5675 /// Verify basic self consistency properties of the data structure.
5676 void verify() {
5677 if (!ScheduleStart)
5678 return;
5679
5680 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5681 ScheduleStart->comesBefore(ScheduleEnd) &&
5682 "Not a valid scheduling region?");
5683
5684 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5685 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5686 if (!Bundles.empty()) {
5687 for (ScheduleBundle *Bundle : Bundles) {
5688 assert(isInSchedulingRegion(*Bundle) &&
5689 "primary schedule data not in window?");
5690 Bundle->verify();
5691 }
5692 continue;
5693 }
5694 auto *SD = getScheduleData(I);
5695 if (!SD)
5696 continue;
5697 assert(isInSchedulingRegion(*SD) &&
5698 "primary schedule data not in window?");
5699 SD->verify();
5700 }
5701
5702 assert(all_of(ReadyInsts,
5703 [](const ScheduleEntity *Bundle) {
5704 return Bundle->isReady();
5705 }) &&
5706 "item in ready list not ready?");
5707 }
5708
5709 /// Put all instructions into the ReadyList which are ready for scheduling.
5710 template <typename ReadyListType>
5711 void initialFillReadyList(ReadyListType &ReadyList) {
5712 SmallPtrSet<ScheduleBundle *, 16> Visited;
5713 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5714 ScheduleData *SD = getScheduleData(I);
5715 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5716 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5717 !Bundles.empty()) {
5718 for (ScheduleBundle *Bundle : Bundles) {
5719 if (!Visited.insert(Bundle).second)
5720 continue;
5721 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5722 ReadyList.insert(Bundle);
5723 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5724 << *Bundle << "\n");
5725 }
5726 }
5727 continue;
5728 }
5729 ReadyList.insert(SD);
5730 LLVM_DEBUG(dbgs()
5731 << "SLP: initially in ready list: " << *SD << "\n");
5732 }
5733 }
5734 }
5735
5736 /// Build a bundle from the ScheduleData nodes corresponding to the
5737 /// scalar instruction for each lane.
5738 /// \param VL The list of scalar instructions.
5739 /// \param S The state of the instructions.
5740 /// \param EI The edge in the SLP graph or the user node/operand number.
5741 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5742 const InstructionsState &S, const EdgeInfo &EI);
5743
5744 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5745 /// cyclic dependencies. This is only a dry-run, no instructions are
5746 /// actually moved at this stage.
5747 /// \returns the scheduling bundle. The returned Optional value is not
5748 /// std::nullopt if \p VL is allowed to be scheduled.
5749 std::optional<ScheduleBundle *>
5750 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5751 const InstructionsState &S, const EdgeInfo &EI);
5752
5753 /// Allocates schedule data chunk.
5754 ScheduleData *allocateScheduleDataChunks();
5755
5756 /// Extends the scheduling region so that V is inside the region.
5757 /// \returns true if the region size is within the limit.
5758 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5759
5760 /// Initialize the ScheduleData structures for new instructions in the
5761 /// scheduling region.
5762 void initScheduleData(Instruction *FromI, Instruction *ToI,
5763 ScheduleData *PrevLoadStore,
5764 ScheduleData *NextLoadStore);
5765
5766 /// Updates the dependency information of a bundle and of all instructions/
5767 /// bundles which depend on the original bundle.
5768 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5769 BoUpSLP *SLP,
5770 ArrayRef<ScheduleData *> ControlDeps = {});
5771
5772 /// Sets all instructions in the scheduling region to un-scheduled.
5773 void resetSchedule();
5774
5775 BasicBlock *BB;
5776
5777 /// Simple memory allocation for ScheduleData.
5778 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5779
5780 /// The size of a ScheduleData array in ScheduleDataChunks.
5781 int ChunkSize;
5782
5783 /// The allocator position in the current chunk, which is the last entry
5784 /// of ScheduleDataChunks.
5785 int ChunkPos;
5786
5787 /// Attaches ScheduleData to Instruction.
5788 /// Note that the mapping survives during all vectorization iterations, i.e.
5789 /// ScheduleData structures are recycled.
5790 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5791
5792 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5793 /// number) and the operand instruction, represented as copyable element.
5794 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5795 std::unique_ptr<ScheduleCopyableData>>
5796 ScheduleCopyableDataMap;
5797
5798 /// Represents the mapping between an instruction and all related
5799 /// ScheduleCopyableData (for all uses in the tree represented as copyable
5800 /// elements). The SLP tree may contain several representations of the same
5801 /// instruction.
5802 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5803 ScheduleCopyableDataMapByInst;
5804
5805 /// Represents the mapping between a user value and operand number, the
5806 /// operand value, and all related ScheduleCopyableData. The relation is 1:n,
5807 /// because the same user may reference the same operand in different tree
5808 /// entries and the operand may be modelled by different copyable data elements.
5809 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5810 SmallVector<ScheduleCopyableData *>>
5811 ScheduleCopyableDataMapByInstUser;
5812
5813 /// Represents mapping between instruction and all related
5814 /// ScheduleCopyableData. It represents the mapping between the actual
5815 /// instruction and the last copyable data element in the chain. E.g., if
5816 /// the graph models the following instructions:
5817 /// %0 = non-add instruction ...
5818 /// ...
5819 /// %4 = add %3, 1
5820 /// %5 = add %4, 1
5821 /// %6 = insertelement poison, %0, 0
5822 /// %7 = insertelement %6, %5, 1
5823 /// And the graph is modeled as:
5824 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5825 /// -> [1, 0] -> [%1, 0]
5826 ///
5827 /// this map will map %0 only to the copyable element <1>, which is the last
5828 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5829 /// keep the map to <0>, not the %0.
5830 SmallDenseMap<const Instruction *,
5831 SmallSetVector<ScheduleCopyableData *, 4>>
5832 ScheduleCopyableDataMapByUsers;
5833
5834 /// Attaches ScheduleBundle to Instruction.
5835 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5836 ScheduledBundles;
5837 /// The list of ScheduleBundles.
5838 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5839
5840 /// The ready-list for scheduling (only used for the dry-run).
5841 SetVector<ScheduleEntity *> ReadyInsts;
5842
5843 /// The first instruction of the scheduling region.
5844 Instruction *ScheduleStart = nullptr;
5845
5846 /// The first instruction _after_ the scheduling region.
5847 Instruction *ScheduleEnd = nullptr;
5848
5849 /// The first memory accessing instruction in the scheduling region
5850 /// (can be null).
5851 ScheduleData *FirstLoadStoreInRegion = nullptr;
5852
5853 /// The last memory accessing instruction in the scheduling region
5854 /// (can be null).
5855 ScheduleData *LastLoadStoreInRegion = nullptr;
5856
5857 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5858 /// region? Used to optimize the dependence calculation for the
5859 /// common case where there isn't.
5860 bool RegionHasStackSave = false;
5861
5862 /// The current size of the scheduling region.
5863 int ScheduleRegionSize = 0;
5864
5865 /// The maximum size allowed for the scheduling region.
5866 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5867
5868 /// The ID of the scheduling region. For a new vectorization iteration this
5869 /// is incremented which "removes" all ScheduleData from the region.
5870 /// Make sure that the initial SchedulingRegionID is greater than the
5871 /// initial SchedulingRegionID in ScheduleData (which is 0).
5872 int SchedulingRegionID = 1;
5873 };
5874
5875 /// Attaches the BlockScheduling structures to basic blocks.
5876 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5877
5878 /// Performs the "real" scheduling. Done before vectorization is actually
5879 /// performed in a basic block.
5880 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5881
5882 /// List of users to ignore during scheduling and that don't need extracting.
5883 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5884
5885 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5886 /// sorted SmallVectors of unsigned.
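 /// Illustrative use (simplified): counting how often a particular order is
 /// requested, keyed by the order itself:
 ///   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
 ///   ++OrdersUses.try_emplace(OrdersType{1, 0, 3, 2}, 0).first->second;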
5887 struct OrdersTypeDenseMapInfo {
5888 static OrdersType getEmptyKey() {
5889 OrdersType V;
5890 V.push_back(~1U);
5891 return V;
5892 }
5893
5894 static OrdersType getTombstoneKey() {
5895 OrdersType V;
5896 V.push_back(~2U);
5897 return V;
5898 }
5899
5900 static unsigned getHashValue(const OrdersType &V) {
5901 return static_cast<unsigned>(hash_combine_range(V));
5902 }
5903
5904 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5905 return LHS == RHS;
5906 }
5907 };
5908
5909 // Analysis and block reference.
5910 Function *F;
5911 ScalarEvolution *SE;
5912 TargetTransformInfo *TTI;
5913 TargetLibraryInfo *TLI;
5914 LoopInfo *LI;
5915 DominatorTree *DT;
5916 AssumptionCache *AC;
5917 DemandedBits *DB;
5918 const DataLayout *DL;
5919 OptimizationRemarkEmitter *ORE;
5920
5921 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5922 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5923
5924 /// Instruction builder to construct the vectorized tree.
5925 IRBuilder<TargetFolder> Builder;
5926
5927 /// A map of scalar integer values to the smallest bit width with which they
5928 /// can legally be represented. The values map to (width, signed) pairs,
5929 /// where "width" indicates the minimum bit width and "signed" is True if the
5930 /// value must be signed-extended, rather than zero-extended, back to its
5931 /// original width.
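 /// For example (illustrative): if every scalar of a node is an i32 known to
 /// carry only 8 meaningful bits and must be sign-extended when used, the node
 /// maps to {8, /*IsSigned=*/true}, so it can be emitted on i8 lanes and
 /// sign-extended back to i32 afterwards.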
5932 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5933
5934 /// Final size of the reduced vector, if the current graph represents the
5935 /// input for the reduction and it was possible to narrow the size of the
5936 /// reduction.
5937 unsigned ReductionBitWidth = 0;
5938
5939 /// Canonical graph size before the transformations.
5940 unsigned BaseGraphSize = 1;
5941
5942 /// If the tree contains any zext/sext/trunc nodes, this contains the max-min
5943 /// pair of the type sizes used in the tree.
5944 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5945
5946 /// Indices of the vectorized nodes, which are supposed to be the roots of the
5947 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
5948 DenseSet<unsigned> ExtraBitWidthNodes;
5949};
5950
5951} // end namespace slpvectorizer
5952
5953template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
5954 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
5955 using SecondInfo = DenseMapInfo<unsigned>;
5956 static BoUpSLP::EdgeInfo getEmptyKey() {
5957 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
5958 SecondInfo::getEmptyKey());
5959 }
5960
5961 static BoUpSLP::EdgeInfo getTombstoneKey() {
5962 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
5963 SecondInfo::getTombstoneKey());
5964 }
5965
5966 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
5967 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
5968 SecondInfo::getHashValue(Val.EdgeIdx));
5969 }
5970
5971 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
5972 const BoUpSLP::EdgeInfo &RHS) {
5973 return LHS == RHS;
5974 }
5975};
5976
5977template <> struct GraphTraits<BoUpSLP *> {
5978 using TreeEntry = BoUpSLP::TreeEntry;
5979
5980 /// NodeRef has to be a pointer per the GraphWriter.
5981 using NodeRef = TreeEntry *;
5982
5983 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
5984
5985 /// Add the VectorizableTree to the index iterator to be able to return
5986 /// TreeEntry pointers.
5987 struct ChildIteratorType
5988 : public iterator_adaptor_base<
5989 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5990 ContainerTy &VectorizableTree;
5991
5992 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
5993 ContainerTy &VT)
5994 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
5995
5996 NodeRef operator*() { return I->UserTE; }
5997 };
5998
5999 static NodeRef getEntryNode(BoUpSLP &R) {
6000 return R.VectorizableTree[0].get();
6001 }
6002
6003 static ChildIteratorType child_begin(NodeRef N) {
6004 return {&N->UserTreeIndex, N->Container};
6005 }
6006
6007 static ChildIteratorType child_end(NodeRef N) {
6008 return {&N->UserTreeIndex + 1, N->Container};
6009 }
6010
6011 /// For the node iterator we just need to turn the TreeEntry iterator into a
6012 /// TreeEntry* iterator so that it dereferences to NodeRef.
6013 class nodes_iterator {
6014 using ItTy = ContainerTy::iterator;
6015 ItTy It;
6016
6017 public:
6018 nodes_iterator(const ItTy &It2) : It(It2) {}
6019 NodeRef operator*() { return It->get(); }
6020 nodes_iterator operator++() {
6021 ++It;
6022 return *this;
6023 }
6024 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6025 };
6026
6027 static nodes_iterator nodes_begin(BoUpSLP *R) {
6028 return nodes_iterator(R->VectorizableTree.begin());
6029 }
6030
6031 static nodes_iterator nodes_end(BoUpSLP *R) {
6032 return nodes_iterator(R->VectorizableTree.end());
6033 }
6034
6035 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6036};
6037
6038template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6039 using TreeEntry = BoUpSLP::TreeEntry;
6040
6041 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6042
6043 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6044 std::string Str;
6045 raw_string_ostream OS(Str);
6046 OS << Entry->Idx << ".\n";
6047 if (isSplat(Entry->Scalars))
6048 OS << "<splat> ";
6049 for (auto *V : Entry->Scalars) {
6050 OS << *V;
6051 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6052 return EU.Scalar == V;
6053 }))
6054 OS << " <extract>";
6055 OS << "\n";
6056 }
6057 return Str;
6058 }
6059
6060 static std::string getNodeAttributes(const TreeEntry *Entry,
6061 const BoUpSLP *) {
6062 if (Entry->isGather())
6063 return "color=red";
6064 if (Entry->State == TreeEntry::ScatterVectorize ||
6065 Entry->State == TreeEntry::StridedVectorize ||
6066 Entry->State == TreeEntry::CompressVectorize)
6067 return "color=blue";
6068 return "";
6069 }
6070};
6071
6072} // end namespace llvm
6073
6074BoUpSLP::~BoUpSLP() {
6075 SmallVector<WeakTrackingVH> DeadInsts;
6076 for (auto *I : DeletedInstructions) {
6077 if (!I->getParent()) {
6078 // Temporarily insert instructions back to erase them from their parent
6079 // and from memory later.
6080 if (isa<PHINode>(I))
6081 // Phi nodes must be the very first instructions in the block.
6082 I->insertBefore(F->getEntryBlock(),
6083 F->getEntryBlock().getFirstNonPHIIt());
6084 else
6085 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6086 continue;
6087 }
6088 for (Use &U : I->operands()) {
6089 auto *Op = dyn_cast<Instruction>(U.get());
6090 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6091 wouldInstructionBeTriviallyDead(Op, TLI))
6092 DeadInsts.emplace_back(Op);
6093 }
6094 I->dropAllReferences();
6095 }
6096 for (auto *I : DeletedInstructions) {
6097 assert(I->use_empty() &&
6098 "trying to erase instruction with users.");
6099 I->eraseFromParent();
6100 }
6101
6102 // Cleanup any dead scalar code feeding the vectorized instructions
6103 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6104
6105#ifdef EXPENSIVE_CHECKS
6106 // If we could guarantee that this call is not extremely slow, we could
6107 // remove the ifdef limitation (see PR47712).
6108 assert(!verifyFunction(*F, &dbgs()));
6109#endif
6110}
6111
6112/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6113/// contains the original mask for the scalars reused in the node. The procedure
6114/// transforms this mask in accordance with the given \p Mask.
6115static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6116 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6117 "Expected non-empty mask.");
6118 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6119 Prev.swap(Reuses);
6120 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6121 if (Mask[I] != PoisonMaskElem)
6122 Reuses[Mask[I]] = Prev[I];
6123}
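// Illustrative example (values chosen arbitrarily, not from the original
// source): with Reuses = {0, 0, 1, 1} and Mask = {2, 3, 0, 1}, each Prev[I]
// is moved to position Mask[I], giving Reuses = {1, 1, 0, 0}.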
6124
6125/// Reorders the given \p Order according to the given \p Mask. \p Order - is
6126/// the original order of the scalars. Procedure transforms the provided order
6127/// in accordance with the given \p Mask. If the resulting \p Order is just an
6128/// identity order, \p Order is cleared.
6129static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6130 bool BottomOrder = false) {
6131 assert(!Mask.empty() && "Expected non-empty mask.");
6132 unsigned Sz = Mask.size();
6133 if (BottomOrder) {
6134 SmallVector<unsigned> PrevOrder;
6135 if (Order.empty()) {
6136 PrevOrder.resize(Sz);
6137 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6138 } else {
6139 PrevOrder.swap(Order);
6140 }
6141 Order.assign(Sz, Sz);
6142 for (unsigned I = 0; I < Sz; ++I)
6143 if (Mask[I] != PoisonMaskElem)
6144 Order[I] = PrevOrder[Mask[I]];
6145 if (all_of(enumerate(Order), [&](const auto &Data) {
6146 return Data.value() == Sz || Data.index() == Data.value();
6147 })) {
6148 Order.clear();
6149 return;
6150 }
6151 fixupOrderingIndices(Order);
6152 return;
6153 }
6154 SmallVector<int> MaskOrder;
6155 if (Order.empty()) {
6156 MaskOrder.resize(Sz);
6157 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6158 } else {
6159 inversePermutation(Order, MaskOrder);
6160 }
6161 reorderReuses(MaskOrder, Mask);
6162 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6163 Order.clear();
6164 return;
6165 }
6166 Order.assign(Sz, Sz);
6167 for (unsigned I = 0; I < Sz; ++I)
6168 if (MaskOrder[I] != PoisonMaskElem)
6169 Order[MaskOrder[I]] = I;
6170 fixupOrderingIndices(Order);
6171}
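// Illustrative example (top-down case, arbitrary values): with an empty
// Order (treated as the identity {0, 1, 2, 3}) and Mask = {1, 0, 3, 2}, the
// reordered mask stays {1, 0, 3, 2}, which is not an identity, so the
// resulting Order becomes {1, 0, 3, 2}. A mask that maps back to the identity
// clears Order instead.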
6172
6173std::optional<BoUpSLP::OrdersType>
6174BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6175 bool TopToBottom, bool IgnoreReorder) {
6176 assert(TE.isGather() && "Expected gather node only.");
6177 // Try to find subvector extract/insert patterns and reorder only such
6178 // patterns.
6179 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6180 Type *ScalarTy = GatheredScalars.front()->getType();
6181 size_t NumScalars = GatheredScalars.size();
6182 if (!isValidElementType(ScalarTy))
6183 return std::nullopt;
6184 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6185 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6186 SmallVector<int> ExtractMask;
6187 SmallVector<int> Mask;
6188 SmallVector<SmallVector<const TreeEntry *>> Entries;
6189 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6190 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6191 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6192 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6193 /*ForOrder=*/true);
6194 // No shuffled operands - ignore.
6195 if (GatherShuffles.empty() && ExtractShuffles.empty())
6196 return std::nullopt;
6197 OrdersType CurrentOrder(NumScalars, NumScalars);
6198 if (GatherShuffles.size() == 1 &&
6199 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6200 Entries.front().front()->isSame(TE.Scalars)) {
6201 // If the fully matched node is part of a whole-tree rotation, there is no
6202 // need to consider the matching order; the whole tree is rotated instead.
6203 if (TopToBottom)
6204 return std::nullopt;
6205 // No need to keep the order for the same user node.
6206 if (Entries.front().front()->UserTreeIndex.UserTE ==
6207 TE.UserTreeIndex.UserTE)
6208 return std::nullopt;
6209 // No need to keep the order for the matched root node, if it can be freely
6210 // reordered.
6211 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6212 return std::nullopt;
6213 // If only 2 elements are shuffled and the matching node has reversed
6214 // reuses, there is no need to count the order; both orders work fine.
6215 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6216 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6217 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6218 [](const auto &P) {
6219 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6220 }))
6221 return std::nullopt;
6222
6223 // Perfect match in the graph, will reuse the previously vectorized
6224 // node. Cost is 0.
6225 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6226 return CurrentOrder;
6227 }
6228 auto IsSplatMask = [](ArrayRef<int> Mask) {
6229 int SingleElt = PoisonMaskElem;
6230 return all_of(Mask, [&](int I) {
6231 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6232 SingleElt = I;
6233 return I == PoisonMaskElem || I == SingleElt;
6234 });
6235 };
6236 // Exclusive broadcast mask - ignore.
6237 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6238 (Entries.size() != 1 ||
6239 Entries.front().front()->ReorderIndices.empty())) ||
6240 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6241 return std::nullopt;
6242 SmallBitVector ShuffledSubMasks(NumParts);
6243 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6244 ArrayRef<int> Mask, int PartSz, int NumParts,
6245 function_ref<unsigned(unsigned)> GetVF) {
6246 for (int I : seq<int>(0, NumParts)) {
6247 if (ShuffledSubMasks.test(I))
6248 continue;
6249 const int VF = GetVF(I);
6250 if (VF == 0)
6251 continue;
6252 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6253 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6254 // Shuffle of at least 2 vectors - ignore.
6255 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6256 llvm::fill(Slice, NumScalars);
6257 ShuffledSubMasks.set(I);
6258 continue;
6259 }
6260 // Try to include as many elements from the mask as possible.
6261 int FirstMin = INT_MAX;
6262 bool SecondVecFound = false;
6263 for (int K : seq<int>(Limit)) {
6264 int Idx = Mask[I * PartSz + K];
6265 if (Idx == PoisonMaskElem) {
6266 Value *V = GatheredScalars[I * PartSz + K];
6267 if (isConstant(V) && !isa<PoisonValue>(V)) {
6268 SecondVecFound = true;
6269 break;
6270 }
6271 continue;
6272 }
6273 if (Idx < VF) {
6274 if (FirstMin > Idx)
6275 FirstMin = Idx;
6276 } else {
6277 SecondVecFound = true;
6278 break;
6279 }
6280 }
6281 FirstMin = (FirstMin / PartSz) * PartSz;
6282 // Shuffle of at least 2 vectors - ignore.
6283 if (SecondVecFound) {
6284 llvm::fill(Slice, NumScalars);
6285 ShuffledSubMasks.set(I);
6286 continue;
6287 }
6288 for (int K : seq<int>(Limit)) {
6289 int Idx = Mask[I * PartSz + K];
6290 if (Idx == PoisonMaskElem)
6291 continue;
6292 Idx -= FirstMin;
6293 if (Idx >= PartSz) {
6294 SecondVecFound = true;
6295 break;
6296 }
6297 if (CurrentOrder[I * PartSz + Idx] >
6298 static_cast<unsigned>(I * PartSz + K) &&
6299 CurrentOrder[I * PartSz + Idx] !=
6300 static_cast<unsigned>(I * PartSz + Idx))
6301 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6302 }
6303 // Shuffle of at least 2 vectors - ignore.
6304 if (SecondVecFound) {
6305 llvm::fill(Slice, NumScalars);
6306 ShuffledSubMasks.set(I);
6307 continue;
6308 }
6309 }
6310 };
6311 int PartSz = getPartNumElems(NumScalars, NumParts);
6312 if (!ExtractShuffles.empty())
6313 TransformMaskToOrder(
6314 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6315 if (!ExtractShuffles[I])
6316 return 0U;
6317 unsigned VF = 0;
6318 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6319 for (unsigned Idx : seq<unsigned>(Sz)) {
6320 int K = I * PartSz + Idx;
6321 if (ExtractMask[K] == PoisonMaskElem)
6322 continue;
6323 if (!TE.ReuseShuffleIndices.empty())
6324 K = TE.ReuseShuffleIndices[K];
6325 if (K == PoisonMaskElem)
6326 continue;
6327 if (!TE.ReorderIndices.empty())
6328 K = std::distance(TE.ReorderIndices.begin(),
6329 find(TE.ReorderIndices, K));
6330 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6331 if (!EI)
6332 continue;
6333 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6334 ->getElementCount()
6335 .getKnownMinValue());
6336 }
6337 return VF;
6338 });
6339 // Check special corner case - single shuffle of the same entry.
6340 if (GatherShuffles.size() == 1 && NumParts != 1) {
6341 if (ShuffledSubMasks.any())
6342 return std::nullopt;
6343 PartSz = NumScalars;
6344 NumParts = 1;
6345 }
6346 if (!Entries.empty())
6347 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6348 if (!GatherShuffles[I])
6349 return 0U;
6350 return std::max(Entries[I].front()->getVectorFactor(),
6351 Entries[I].back()->getVectorFactor());
6352 });
6353 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6354 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6355 return std::nullopt;
6356 return std::move(CurrentOrder);
6357}
6358
6359static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6360 const TargetLibraryInfo &TLI,
6361 bool CompareOpcodes = true) {
6362 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6363 getUnderlyingObject(Ptr2, RecursionMaxDepth))
6364 return false;
6365 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6366 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6367 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6368 (!GEP2 || GEP2->getNumOperands() == 2) &&
6369 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6370 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6371 !CompareOpcodes ||
6372 (GEP1 && GEP2 &&
6373 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6374}
6375
6376/// Calculates minimal alignment as a common alignment.
6377template <typename T>
6378static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6379 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6380 for (Value *V : VL)
6381 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6382 return CommonAlignment;
6383}
6384
6385/// Check if \p Order represents reverse order.
6386static bool isReverseOrder(ArrayRef<unsigned> Order) {
6387 assert(!Order.empty() &&
6388 "Order is empty. Please check it before using isReverseOrder.");
6389 unsigned Sz = Order.size();
6390 return all_of(enumerate(Order), [&](const auto &Pair) {
6391 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6392 });
6393}
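// Illustrative example: Order = {3, 2, 1, 0} is a reverse order; an element
// equal to Sz is treated as "unset" and does not break the property.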
6394
6395/// Checks if the provided list of pointers \p PointerOps represents strided
6396/// pointers for type ElemTy. If they do not, nullptr is returned.
6397/// Otherwise, the SCEV* of the stride value is returned.
6398static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6399 const DataLayout &DL, ScalarEvolution &SE,
6400 SmallVectorImpl<unsigned> &SortedIndices) {
6401 SmallVector<const SCEV *> SCEVs;
6402 const SCEV *PtrSCEVLowest = nullptr;
6403 const SCEV *PtrSCEVHighest = nullptr;
6404 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6405 // addresses).
6406 for (Value *Ptr : PointerOps) {
6407 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6408 if (!PtrSCEV)
6409 return nullptr;
6410 SCEVs.push_back(PtrSCEV);
6411 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6412 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6413 continue;
6414 }
6415 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6416 if (isa<SCEVCouldNotCompute>(Diff))
6417 return nullptr;
6418 if (Diff->isNonConstantNegative()) {
6419 PtrSCEVLowest = PtrSCEV;
6420 continue;
6421 }
6422 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6423 if (isa<SCEVCouldNotCompute>(Diff1))
6424 return nullptr;
6425 if (Diff1->isNonConstantNegative()) {
6426 PtrSCEVHighest = PtrSCEV;
6427 continue;
6428 }
6429 }
6430 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6431 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6432 if (isa<SCEVCouldNotCompute>(Dist))
6433 return nullptr;
6434 int Size = DL.getTypeStoreSize(ElemTy);
6435 auto TryGetStride = [&](const SCEV *Dist,
6436 const SCEV *Multiplier) -> const SCEV * {
6437 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6438 if (M->getOperand(0) == Multiplier)
6439 return M->getOperand(1);
6440 if (M->getOperand(1) == Multiplier)
6441 return M->getOperand(0);
6442 return nullptr;
6443 }
6444 if (Multiplier == Dist)
6445 return SE.getConstant(Dist->getType(), 1);
6446 return SE.getUDivExactExpr(Dist, Multiplier);
6447 };
6448 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6449 const SCEV *Stride = nullptr;
6450 if (Size != 1 || SCEVs.size() > 2) {
6451 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6452 Stride = TryGetStride(Dist, Sz);
6453 if (!Stride)
6454 return nullptr;
6455 }
6456 if (!Stride || isa<SCEVConstant>(Stride))
6457 return nullptr;
6458 // Iterate through all pointers and check if all distances are
6459 // unique multiples of Stride.
6460 using DistOrdPair = std::pair<int64_t, int>;
6461 auto Compare = llvm::less_first();
6462 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6463 int Cnt = 0;
6464 bool IsConsecutive = true;
6465 for (const SCEV *PtrSCEV : SCEVs) {
6466 unsigned Dist = 0;
6467 if (PtrSCEV != PtrSCEVLowest) {
6468 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6469 const SCEV *Coeff = TryGetStride(Diff, Stride);
6470 if (!Coeff)
6471 return nullptr;
6472 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6473 if (!SC || isa<SCEVCouldNotCompute>(SC))
6474 return nullptr;
6475 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6476 SE.getMulExpr(Stride, SC)))
6477 ->isZero())
6478 return nullptr;
6479 Dist = SC->getAPInt().getZExtValue();
6480 }
6481 // If the strides are not the same or repeated, we can't vectorize.
6482 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6483 return nullptr;
6484 auto Res = Offsets.emplace(Dist, Cnt);
6485 if (!Res.second)
6486 return nullptr;
6487 // Consecutive order if the inserted element is the last one.
6488 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6489 ++Cnt;
6490 }
6491 if (Offsets.size() != SCEVs.size())
6492 return nullptr;
6493 SortedIndices.clear();
6494 if (!IsConsecutive) {
6495 // Fill SortedIndices array only if it is non-consecutive.
6496 SortedIndices.resize(PointerOps.size());
6497 Cnt = 0;
6498 for (const std::pair<int64_t, int> &Pair : Offsets) {
6499 SortedIndices[Cnt] = Pair.second;
6500 ++Cnt;
6501 }
6502 }
6503 return Stride;
6504}
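// Illustrative sketch (hypothetical IR names, not from the original source):
// for four i8 pointers %p, %p + %s, %p + 2 * %s, %p + 3 * %s, the distance
// between the lowest and highest pointer is 3 * %s, so the returned stride
// SCEV is %s; SortedIndices stays empty because the pointers are already
// listed in consecutive (offset-increasing) order.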
6505
6506static std::pair<InstructionCost, InstructionCost>
6507getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6508 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6509 Type *ScalarTy, VectorType *VecTy);
6510
6511/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6512/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6513/// subvector pattern.
6514static InstructionCost
6515getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6516 VectorType *Tp, ArrayRef<int> Mask = {},
6517 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6518 int Index = 0, VectorType *SubTp = nullptr,
6519 ArrayRef<const Value *> Args = {}) {
6520 VectorType *DstTy = Tp;
6521 if (!Mask.empty())
6522 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6523
6524 if (Kind != TTI::SK_PermuteTwoSrc)
6525 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6526 Args);
6527 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6528 int NumSubElts;
6529 if (ShuffleVectorInst::isInsertSubvectorMask(
6530 Mask, NumSrcElts, NumSubElts, Index)) {
6531 if (Index + NumSubElts > NumSrcElts &&
6532 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6533 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6534 TTI::TCK_RecipThroughput, Index, Tp);
6535 }
6536 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6537 Args);
6538}
6539
6540/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6541/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6542/// instead of a scalar.
6543static InstructionCost
6544getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6545 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6546 bool Extract, TTI::TargetCostKind CostKind,
6547 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6549 "ScalableVectorType is not supported.");
6550 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6551 getNumElements(Ty) &&
6552 "Incorrect usage.");
6553 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6554 assert(SLPReVec && "Only supported by REVEC.");
6555 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6556 // of CreateInsertElement.
6557 unsigned ScalarTyNumElements = VecTy->getNumElements();
6558 InstructionCost Cost = 0;
6559 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6560 if (!DemandedElts[I])
6561 continue;
6562 if (Insert)
6564 I * ScalarTyNumElements, VecTy);
6565 if (Extract)
6567 I * ScalarTyNumElements, VecTy);
6568 }
6569 return Cost;
6570 }
6571 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6572 CostKind, ForPoisonSrc, VL);
6573}
6574
6575/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6576/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6577static InstructionCost getVectorInstrCost(
6578 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6579 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6580 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6581 if (Opcode == Instruction::ExtractElement) {
6582 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6583 assert(SLPReVec && "Only supported by REVEC.");
6584 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6585 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
6586 cast<VectorType>(Val), {}, CostKind,
6587 Index * VecTy->getNumElements(), VecTy);
6588 }
6589 }
6590 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6591 ScalarUserAndIdx);
6592}
6593
6594/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6595/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6596static InstructionCost getExtractWithExtendCost(
6597 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6598 VectorType *VecTy, unsigned Index,
6599 TTI::TargetCostKind CostKind) {
6600 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6601 assert(SLPReVec && "Only supported by REVEC.");
6602 auto *SubTp =
6603 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6604 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
6605 Index * ScalarTy->getNumElements(), SubTp) +
6606 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6607 CostKind);
6608 }
6609 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6610}
6611
6612/// Creates subvector insert. Generates shuffle using \p Generator or
6613/// using default shuffle.
6614static Value *createInsertVector(
6615 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6616 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6617 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6618 return Vec;
6619 const unsigned SubVecVF = getNumElements(V->getType());
6620 // Create a shuffle; insertvector requires that the index is a multiple of
6621 // the subvector length.
6622 const unsigned VecVF = getNumElements(Vec->getType());
6623 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6624 if (isa<PoisonValue>(Vec)) {
6625 auto *Begin = std::next(Mask.begin(), Index);
6626 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6627 Vec = Builder.CreateShuffleVector(V, Mask);
6628 return Vec;
6629 }
6630 std::iota(Mask.begin(), Mask.end(), 0);
6631 std::iota(std::next(Mask.begin(), Index),
6632 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6633 if (Generator)
6634 return Generator(Vec, V, Mask);
6635 // 1. Resize V to the size of Vec.
6636 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6637 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6638 V = Builder.CreateShuffleVector(V, ResizeMask);
6639 // 2. Insert V into Vec.
6640 return Builder.CreateShuffleVector(Vec, V, Mask);
6641}
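// Illustrative example (arbitrary sizes): inserting a 4-element vector V into
// an 8-element vector Vec at Index 4 first widens V with the mask
// {0, 1, 2, 3, poison, poison, poison, poison} and then blends with the mask
// {0, 1, 2, 3, 8, 9, 10, 11}, i.e. the first half comes from Vec and the
// second half from the widened V.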
6642
6643/// Generates a subvector extract using a shuffle.
6644static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6645 unsigned SubVecVF, unsigned Index) {
6646 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6647 std::iota(Mask.begin(), Mask.end(), Index);
6648 return Builder.CreateShuffleVector(Vec, Mask);
6649}
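// Illustrative example: extracting 4 elements starting at Index 8 from a
// 16-element vector uses the shuffle mask {8, 9, 10, 11}.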
6650
6651/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6652/// with \p Order.
6653/// \return true if the mask represents strided access, false otherwise.
6654static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6655 ArrayRef<unsigned> Order, Type *ScalarTy,
6656 const DataLayout &DL, ScalarEvolution &SE,
6657 SmallVectorImpl<int> &CompressMask) {
6658 const unsigned Sz = PointerOps.size();
6659 CompressMask.assign(Sz, PoisonMaskElem);
6660 // The first element is always set.
6661 CompressMask[0] = 0;
6662 // Check if the mask represents strided access.
6663 std::optional<unsigned> Stride = 0;
6664 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6665 for (unsigned I : seq<unsigned>(1, Sz)) {
6666 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6667 std::optional<int64_t> OptPos =
6668 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6669 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6670 return false;
6671 unsigned Pos = static_cast<unsigned>(*OptPos);
6672 CompressMask[I] = Pos;
6673 if (!Stride)
6674 continue;
6675 if (*Stride == 0) {
6676 *Stride = Pos;
6677 continue;
6678 }
6679 if (Pos != *Stride * I)
6680 Stride.reset();
6681 }
6682 return Stride.has_value();
6683}
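// Illustrative example (arbitrary offsets): pointers at element offsets
// {0, 2, 4, 6} from the first pointer produce CompressMask = {0, 2, 4, 6} and
// the function returns true (stride 2); offsets {0, 1, 3, 6} produce
// CompressMask = {0, 1, 3, 6} and the function returns false, but the mask
// can still drive a load + compress shuffle.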
6684
6685/// Checks if the \p VL can be transformed to a (masked)load + compress or
6686/// (masked) interleaved load.
6687static bool isMaskedLoadCompress(
6688 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6689 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6690 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6691 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6692 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6693 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6694 VectorType *&LoadVecTy) {
6695 InterleaveFactor = 0;
6696 Type *ScalarTy = VL.front()->getType();
6697 const size_t Sz = VL.size();
6698 auto *VecTy = getWidenedType(ScalarTy, Sz);
6700 SmallVector<int> Mask;
6701 if (!Order.empty())
6702 inversePermutation(Order, Mask);
6703 // Check external uses.
6704 for (const auto [I, V] : enumerate(VL)) {
6705 if (AreAllUsersVectorized(V))
6706 continue;
6707 InstructionCost ExtractCost =
6708 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6709 Mask.empty() ? I : Mask[I]);
6710 InstructionCost ScalarCost =
6711 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6712 if (ExtractCost <= ScalarCost)
6713 return false;
6714 }
6715 Value *Ptr0;
6716 Value *PtrN;
6717 if (Order.empty()) {
6718 Ptr0 = PointerOps.front();
6719 PtrN = PointerOps.back();
6720 } else {
6721 Ptr0 = PointerOps[Order.front()];
6722 PtrN = PointerOps[Order.back()];
6723 }
6724 std::optional<int64_t> Diff =
6725 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6726 if (!Diff)
6727 return false;
6728 const size_t MaxRegSize =
6729 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6730 .getFixedValue();
6731 // Check for very large distances between elements.
6732 if (*Diff / Sz >= MaxRegSize / 8)
6733 return false;
6734 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6735 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6736 Align CommonAlignment = LI->getAlign();
6737 IsMasked = !isSafeToLoadUnconditionally(
6738 Ptr0, LoadVecTy, CommonAlignment, DL,
6739 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6740 &TLI);
6741 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6742 LI->getPointerAddressSpace()))
6743 return false;
6744 // TODO: perform the analysis of each scalar load for better
6745 // safe-load-unconditionally analysis.
6746 bool IsStrided =
6747 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6748 assert(CompressMask.size() >= 2 && "At least two elements are required");
6749 SmallVector<Value *> OrderedPointerOps(PointerOps);
6750 if (!Order.empty())
6751 reorderScalars(OrderedPointerOps, Mask);
6752 auto [ScalarGEPCost, VectorGEPCost] =
6753 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6754 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6755 // The cost of scalar loads.
6756 InstructionCost ScalarLoadsCost =
6757 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6758 [&](InstructionCost C, Value *V) {
6759 return C + TTI.getInstructionCost(cast<Instruction>(V),
6760 CostKind);
6761 }) +
6762 ScalarGEPCost;
6763 APInt DemandedElts = APInt::getAllOnes(Sz);
6764 InstructionCost GatherCost =
6765 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6766 /*Insert=*/true,
6767 /*Extract=*/false, CostKind) +
6768 ScalarLoadsCost;
6769 InstructionCost LoadCost = 0;
6770 if (IsMasked) {
6771 LoadCost =
6772 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6773 LI->getPointerAddressSpace(), CostKind);
6774 } else {
6775 LoadCost =
6776 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6777 LI->getPointerAddressSpace(), CostKind);
6778 }
6779 if (IsStrided && !IsMasked && Order.empty()) {
6780 // Check for potential segmented(interleaved) loads.
6781 VectorType *AlignedLoadVecTy = getWidenedType(
6782 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6783 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6784 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6785 &TLI))
6786 AlignedLoadVecTy = LoadVecTy;
6787 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6788 CommonAlignment,
6789 LI->getPointerAddressSpace())) {
6790 InstructionCost InterleavedCost =
6791 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6792 Instruction::Load, AlignedLoadVecTy,
6793 CompressMask[1], {}, CommonAlignment,
6794 LI->getPointerAddressSpace(), CostKind, IsMasked);
6795 if (InterleavedCost < GatherCost) {
6796 InterleaveFactor = CompressMask[1];
6797 LoadVecTy = AlignedLoadVecTy;
6798 return true;
6799 }
6800 }
6801 }
6802 InstructionCost CompressCost = ::getShuffleCost(
6803 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6804 if (!Order.empty()) {
6805 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6806 for (unsigned I : seq<unsigned>(Sz)) {
6807 NewMask[I] = CompressMask[Mask[I]];
6808 }
6809 CompressMask.swap(NewMask);
6810 }
6811 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6812 return TotalVecCost < GatherCost;
6813}
6814
6815/// Checks if the \p VL can be transformed to a (masked)load + compress or
6816/// (masked) interleaved load.
6817static bool
6818isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6819 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6820 const DataLayout &DL, ScalarEvolution &SE,
6821 AssumptionCache &AC, const DominatorTree &DT,
6822 const TargetLibraryInfo &TLI,
6823 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6824 bool IsMasked;
6825 unsigned InterleaveFactor;
6826 SmallVector<int> CompressMask;
6827 VectorType *LoadVecTy;
6828 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6829 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6830 CompressMask, LoadVecTy);
6831}
6832
6833/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6834/// PointerOps:
6835/// 1. Target with strided load support is detected.
6836/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6837/// potential stride <= MaxProfitableLoadStride and the potential stride is
6838/// power-of-2 (to avoid perf regressions for the very small number of loads)
6839/// and max distance > number of loads, or potential stride is -1.
6840/// 3. The loads are ordered, or number of unordered loads <=
6841/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6842/// to avoid extra costs for very expensive shuffles).
6843/// 4. Any pointer operand is an instruction with users outside of the
6844/// current graph (for masked gathers extra extractelement instructions
6845/// might be required).
6846bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
6847 Align Alignment, const int64_t Diff, Value *Ptr0,
6848 Value *PtrN, StridedPtrInfo &SPtrInfo) const {
6849 const size_t Sz = PointerOps.size();
6850 if (Diff % (Sz - 1) != 0)
6851 return false;
6852
6853 // Try to generate strided load node.
6854 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6855 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6856 return !isVectorized(U) && !MustGather.contains(U);
6857 });
6858 });
6859
6860 const uint64_t AbsoluteDiff = std::abs(Diff);
6861 auto *VecTy = getWidenedType(ScalarTy, Sz);
6862 if (IsAnyPointerUsedOutGraph ||
6863 (AbsoluteDiff > Sz &&
6864 (Sz > MinProfitableStridedLoads ||
6865 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6866 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6867 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6868 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6869 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6870 return false;
6871 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6872 return false;
6873
6874 // Iterate through all pointers and check if all distances are
6875 // unique multiples of Stride.
6877 for (Value *Ptr : PointerOps) {
6878 int64_t Dist = 0;
6879 if (Ptr == PtrN)
6880 Dist = Diff;
6881 else if (Ptr != Ptr0)
6882 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
6883 // If the strides are not the same or repeated, we can't
6884 // vectorize.
6885 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6886 break;
6887 }
6888 if (Dists.size() == Sz) {
6889 Type *StrideTy = DL->getIndexType(Ptr0->getType());
6890 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6891 SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
6892 return true;
6893 }
6894 }
6895 return false;
6896}
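// Illustrative example (arbitrary offsets): four i32 loads at element offsets
// {0, 3, 6, 9} give Diff = 9, which is divisible by Sz - 1 = 3, so the
// candidate stride is 3 elements; assuming the load-count and target-support
// checks pass, every per-pointer distance {0, 3, 6, 9} is a unique multiple
// of that stride and SPtrInfo records a constant stride of 3.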
6897
6898bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
6899 Type *ScalarTy, Align CommonAlignment,
6900 SmallVectorImpl<unsigned> &SortedIndices,
6901 StridedPtrInfo &SPtrInfo) const {
6902 const unsigned Sz = PointerOps.size();
6903 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
6904 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6905 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6906 return false;
6907 if (const SCEV *Stride =
6908 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
6909 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
6910 SPtrInfo.StrideSCEV = Stride;
6911 return true;
6912 }
6913 return false;
6914}
6915
6916BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
6917 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
6918 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
6919 unsigned *BestVF, bool TryRecursiveCheck) const {
6920 // Check that a vectorized load would load the same memory as a scalar
6921 // load. For example, we don't want to vectorize loads that are smaller
6922 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6923 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6924 // from such a struct, we read/write packed bits disagreeing with the
6925 // unvectorized version.
6926 if (BestVF)
6927 *BestVF = 0;
6928 if (areKnownNonVectorizableLoads(VL))
6929 return LoadsState::Gather;
6930 Type *ScalarTy = VL0->getType();
6931
6932 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6933 return LoadsState::Gather;
6934
6935 // Make sure all loads in the bundle are simple - we can't vectorize
6936 // atomic or volatile loads.
6937 PointerOps.clear();
6938 const size_t Sz = VL.size();
6939 PointerOps.resize(Sz);
6940 auto *POIter = PointerOps.begin();
6941 for (Value *V : VL) {
6942 auto *L = dyn_cast<LoadInst>(V);
6943 if (!L || !L->isSimple())
6944 return LoadsState::Gather;
6945 *POIter = L->getPointerOperand();
6946 ++POIter;
6947 }
6948
6949 Order.clear();
6950 // Check the order of pointer operands or that all pointers are the same.
6951 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6952
6953 auto *VecTy = getWidenedType(ScalarTy, Sz);
6954 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6955 if (!IsSorted) {
6956 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
6957 SPtrInfo))
6958 return LoadsState::StridedVectorize;
6959
6960 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6961 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6962 return LoadsState::Gather;
6963
6964 if (!all_of(PointerOps, [&](Value *P) {
6965 return arePointersCompatible(P, PointerOps.front(), *TLI);
6966 }))
6967 return LoadsState::Gather;
6968
6969 } else {
6970 Value *Ptr0;
6971 Value *PtrN;
6972 if (Order.empty()) {
6973 Ptr0 = PointerOps.front();
6974 PtrN = PointerOps.back();
6975 } else {
6976 Ptr0 = PointerOps[Order.front()];
6977 PtrN = PointerOps[Order.back()];
6978 }
6979 std::optional<int64_t> Diff =
6980 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6981 // Check that the sorted loads are consecutive.
6982 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6983 return LoadsState::Vectorize;
6984 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
6985 *TLI, [&](Value *V) {
6986 return areAllUsersVectorized(
6987 cast<Instruction>(V), UserIgnoreList);
6988 }))
6989 return LoadsState::CompressVectorize;
6990 Align Alignment =
6991 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
6992 ->getAlign();
6993 if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
6994 SPtrInfo))
6995 return LoadsState::StridedVectorize;
6996 }
6997 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6998 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6999 return LoadsState::Gather;
7000 // Compare the cost of loads + shuffles against strided/masked gather
7001 // loads. Returns true if the vectorized + shuffles representation is
7002 // better than just gather.
7003 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7004 unsigned *BestVF,
7005 bool ProfitableGatherPointers) {
7006 if (BestVF)
7007 *BestVF = 0;
7008 // Compare masked gather cost and loads + insert subvector costs.
7010 auto [ScalarGEPCost, VectorGEPCost] =
7011 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7012 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7013 // Estimate the cost of masked gather GEP. If not a splat, roughly
7014 // estimate as a buildvector, otherwise estimate as splat.
7015 APInt DemandedElts = APInt::getAllOnes(Sz);
7016 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7017 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7018 if (static_cast<unsigned>(count_if(
7019 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7020 any_of(PointerOps, [&](Value *V) {
7021 return getUnderlyingObject(V) !=
7022 getUnderlyingObject(PointerOps.front());
7023 }))
7024 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7025 DemandedElts, /*Insert=*/true,
7026 /*Extract=*/false, CostKind);
7027 else
7028 VectorGEPCost +=
7029 getScalarizationOverhead(
7030 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7031 /*Insert=*/true, /*Extract=*/false, CostKind) +
7032 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7033 // The cost of scalar loads.
7034 InstructionCost ScalarLoadsCost =
7035 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7036 [&](InstructionCost C, Value *V) {
7037 return C + TTI.getInstructionCost(
7038 cast<Instruction>(V), CostKind);
7039 }) +
7040 ScalarGEPCost;
7041 // The cost of masked gather.
7042 InstructionCost MaskedGatherCost =
7043 TTI.getGatherScatterOpCost(
7044 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7045 /*VariableMask=*/false, CommonAlignment, CostKind) +
7046 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7047 InstructionCost GatherCost =
7048 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7049 /*Insert=*/true,
7050 /*Extract=*/false, CostKind) +
7051 ScalarLoadsCost;
7052 // The list of loads is small, or a partial check was already performed;
7053 // directly compare the masked gather cost and the gather cost.
7054 constexpr unsigned ListLimit = 4;
7055 if (!TryRecursiveCheck || VL.size() < ListLimit)
7056 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7057
7058 // FIXME: The following code has not been updated for non-power-of-2
7059 // vectors (and not whole registers). The splitting logic here does not
7060 // cover the original vector if the vector factor is not a power of two.
7061 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7062 return false;
7063
7064 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7065 unsigned MinVF = getMinVF(2 * Sz);
7066 DemandedElts.clearAllBits();
7067 // Iterate through possible vectorization factors and check if vectorized +
7068 // shuffles is better than just gather.
7069 for (unsigned VF =
7070 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7071 VF >= MinVF;
7072 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7073 SmallVector<LoadsState> States;
7074 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7075 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7076 SmallVector<unsigned> Order;
7077 SmallVector<Value *> PointerOps;
7078 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7079 PointerOps, SPtrInfo, BestVF,
7080 /*TryRecursiveCheck=*/false);
7081 // Check that the sorted loads are consecutive.
7082 if (LS == LoadsState::Gather) {
7083 if (BestVF) {
7084 DemandedElts.setAllBits();
7085 break;
7086 }
7087 DemandedElts.setBits(Cnt, Cnt + VF);
7088 continue;
7089 }
7090 // If the reorder is needed, consider it as a high-cost masked gather for now.
7091 if ((LS == LoadsState::Vectorize ||
7092 LS == LoadsState::StridedVectorize ||
7093 LS == LoadsState::CompressVectorize) &&
7094 !Order.empty() && !isReverseOrder(Order))
7095 LS = LoadsState::ScatterVectorize;
7096 States.push_back(LS);
7097 }
7098 if (DemandedElts.isAllOnes())
7099 // All loads gathered - try smaller VF.
7100 continue;
7101 // Can be vectorized later as a series of loads/insertelements.
7102 InstructionCost VecLdCost = 0;
7103 if (!DemandedElts.isZero()) {
7104 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7105 /*Insert=*/true,
7106 /*Extract=*/false, CostKind) +
7107 ScalarGEPCost;
7108 for (unsigned Idx : seq<unsigned>(VL.size()))
7109 if (DemandedElts[Idx])
7110 VecLdCost +=
7111 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7112 }
7113 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7114 for (auto [I, LS] : enumerate(States)) {
7115 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7116 InstructionCost VectorGEPCost =
7117 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7118 ? 0
7119 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7120 LI0->getPointerOperand(),
7121 Instruction::GetElementPtr, CostKind, ScalarTy,
7122 SubVecTy)
7123 .second;
7124 if (LS == LoadsState::ScatterVectorize) {
7125 if (static_cast<unsigned>(
7126 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7127 PointerOps.size() - 1 ||
7128 any_of(PointerOps, [&](Value *V) {
7129 return getUnderlyingObject(V) !=
7130 getUnderlyingObject(PointerOps.front());
7131 }))
7132 VectorGEPCost += getScalarizationOverhead(
7133 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7134 /*Insert=*/true, /*Extract=*/false, CostKind);
7135 else
7136 VectorGEPCost +=
7138 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7139 /*Insert=*/true, /*Extract=*/false, CostKind) +
7140 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7141 CostKind);
7142 }
7143 switch (LS) {
7144 case LoadsState::Vectorize:
7145 VecLdCost +=
7146 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7147 LI0->getPointerAddressSpace(), CostKind,
7149 VectorGEPCost;
7150 break;
7151 case LoadsState::StridedVectorize:
7152 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7153 LI0->getPointerOperand(),
7154 /*VariableMask=*/false,
7155 CommonAlignment, CostKind) +
7156 VectorGEPCost;
7157 break;
7158 case LoadsState::CompressVectorize:
7159 VecLdCost += TTI.getMaskedMemoryOpCost(
7160 Instruction::Load, SubVecTy, CommonAlignment,
7161 LI0->getPointerAddressSpace(), CostKind) +
7162 VectorGEPCost +
7163 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7164 {}, CostKind);
7165 break;
7166 case LoadsState::ScatterVectorize:
7167 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7168 LI0->getPointerOperand(),
7169 /*VariableMask=*/false,
7170 CommonAlignment, CostKind) +
7171 VectorGEPCost;
7172 break;
7173 case LoadsState::Gather:
7174 // Gathers are already calculated - ignore.
7175 continue;
7176 }
7177 SmallVector<int> ShuffleMask(VL.size());
7178 for (int Idx : seq<int>(0, VL.size()))
7179 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7180 if (I > 0)
7181 VecLdCost +=
7182 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7183 CostKind, I * VF, SubVecTy);
7184 }
7185 // If the masked gather cost is higher, it is better to vectorize, so
7186 // consider it as a gather node. It will be better estimated
7187 // later.
7188 if (MaskedGatherCost >= VecLdCost &&
7189 VecLdCost - GatherCost < -SLPCostThreshold) {
7190 if (BestVF)
7191 *BestVF = VF;
7192 return true;
7193 }
7194 }
7195 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7196 };
7197 // TODO: need to improve analysis of the pointers, if not all of them are
7198 // GEPs or have > 2 operands, we end up with a gather node, which just
7199 // increases the cost.
7200 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7201 bool ProfitableGatherPointers =
7202 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7203 return L->isLoopInvariant(V);
7204 })) <= Sz / 2;
7205 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7206 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7207 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7208 (GEP && GEP->getNumOperands() == 2 &&
7209 isa<Constant, Instruction>(GEP->getOperand(1)));
7210 })) {
7211 // Check if a potential masked gather can be represented as a series
7212 // of loads + insertsubvectors.
7213 // If masked gather cost is higher - better to vectorize, so
7214 // consider it as a gather node. It will be better estimated
7215 // later.
7216 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7217 ProfitableGatherPointers))
7218 return LoadsState::ScatterVectorize;
7219 }
7220
7221 return LoadsState::Gather;
7222}
7223
7224static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7225 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7226 const DataLayout &DL, ScalarEvolution &SE,
7227 SmallVectorImpl<unsigned> &SortedIndices) {
7228 assert(
7229 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7230 "Expected list of pointer operands.");
7231 // Map from bases to a vector of (Ptr, Offset, OrigIdx). Each Ptr is
7232 // inserted into this map, the entries are sorted, and the sorted indices
7233 // are returned with related values placed next to one another.
7235 std::pair<BasicBlock *, Value *>,
7237 Bases;
7238 Bases
7239 .try_emplace(std::make_pair(
7241 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7242
7243 SortedIndices.clear();
7244 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7245 auto Key = std::make_pair(BBs[Cnt + 1],
7246 getUnderlyingObject(Ptr, RecursionMaxDepth));
7247 bool Found = any_of(Bases.try_emplace(Key).first->second,
7248 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7249 std::optional<int64_t> Diff =
7250 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7251 ElemTy, Ptr, DL, SE,
7252 /*StrictCheck=*/true);
7253 if (!Diff)
7254 return false;
7255
7256 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7257 return true;
7258 });
7259
7260 if (!Found) {
7261 // If we haven't found enough to usefully cluster, return early.
7262 if (Bases.size() > VL.size() / 2 - 1)
7263 return false;
7264
7265 // Not found already - add a new Base
7266 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7267 }
7268 }
7269
7270 if (Bases.size() == VL.size())
7271 return false;
7272
7273 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7274 Bases.front().second.size() == VL.size()))
7275 return false;
7276
7277 // For each of the bases, sort the pointers by Offset and check if any of
7278 // the bases become consecutive.
7279 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7280 SmallPtrSet<Value *, 13> FirstPointers;
7281 SmallPtrSet<Value *, 13> SecondPointers;
7282 Value *P1 = Ptr1;
7283 Value *P2 = Ptr2;
7284 unsigned Depth = 0;
7285 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7286 if (P1 == P2 || Depth > RecursionMaxDepth)
7287 return false;
7288 FirstPointers.insert(P1);
7289 SecondPointers.insert(P2);
7290 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7291 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7292 ++Depth;
7293 }
7294 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7295 "Unable to find matching root.");
7296 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7297 };
7298 for (auto &Base : Bases) {
7299 for (auto &Vec : Base.second) {
7300 if (Vec.size() > 1) {
7302 int64_t InitialOffset = std::get<1>(Vec[0]);
7303 bool AnyConsecutive =
7304 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7305 return std::get<1>(P.value()) ==
7306 int64_t(P.index()) + InitialOffset;
7307 });
7308 // Fill the SortedIndices array only if it looks worthwhile to sort the
7309 // pointers.
7310 if (!AnyConsecutive)
7311 return false;
7312 }
7313 }
7314 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7315 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7316 });
7317 }
7318
7319 for (auto &T : Bases)
7320 for (const auto &Vec : T.second)
7321 for (const auto &P : Vec)
7322 SortedIndices.push_back(std::get<2>(P));
7323
7324 assert(SortedIndices.size() == VL.size() &&
7325 "Expected SortedIndices to be the size of VL");
7326 return true;
7327}
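// Illustrative example (hypothetical bases, not from the original source):
// for eight pointers {A, B, A+1, B+1, A+2, B+2, A+3, B+3} with two distinct
// underlying objects A and B, the pointers are grouped per base, each group
// is checked for consecutive offsets, and SortedIndices clusters the A-based
// accesses next to each other and the B-based accesses next to each other
// (e.g. {0, 2, 4, 6, 1, 3, 5, 7}, depending on how the bases compare).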
7328
7329std::optional<BoUpSLP::OrdersType>
7330BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7331 assert(TE.isGather() && "Expected gather node only.");
7332 Type *ScalarTy = TE.Scalars[0]->getType();
7333
7334 SmallVector<Value *> Ptrs;
7335 Ptrs.reserve(TE.Scalars.size());
7336 SmallVector<BasicBlock *> BBs;
7337 BBs.reserve(TE.Scalars.size());
7338 for (Value *V : TE.Scalars) {
7339 auto *L = dyn_cast<LoadInst>(V);
7340 if (!L || !L->isSimple())
7341 return std::nullopt;
7342 Ptrs.push_back(L->getPointerOperand());
7343 BBs.push_back(L->getParent());
7344 }
7345
7346 BoUpSLP::OrdersType Order;
7347 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7348 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7349 return std::move(Order);
7350 return std::nullopt;
7351}
7352
7353/// Check if two insertelement instructions are from the same buildvector.
7354static bool areTwoInsertFromSameBuildVector(
7355 InsertElementInst *VU, InsertElementInst *V,
7356 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7357 // Instructions must be from the same basic blocks.
7358 if (VU->getParent() != V->getParent())
7359 return false;
7360 // Checks if 2 insertelements are from the same buildvector.
7361 if (VU->getType() != V->getType())
7362 return false;
7363 // Multiple used inserts are separate nodes.
7364 if (!VU->hasOneUse() && !V->hasOneUse())
7365 return false;
7366 auto *IE1 = VU;
7367 auto *IE2 = V;
7368 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7369 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7370 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7371 return false;
7372 // Go through the vector operand of insertelement instructions trying to find
7373 // either VU as the original vector for IE2 or V as the original vector for
7374 // IE1.
7375 SmallBitVector ReusedIdx(
7376 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7377 bool IsReusedIdx = false;
7378 do {
7379 if (IE2 == VU && !IE1)
7380 return VU->hasOneUse();
7381 if (IE1 == V && !IE2)
7382 return V->hasOneUse();
7383 if (IE1 && IE1 != V) {
7384 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7385 IsReusedIdx |= ReusedIdx.test(Idx1);
7386 ReusedIdx.set(Idx1);
7387 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7388 IE1 = nullptr;
7389 else
7390 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7391 }
7392 if (IE2 && IE2 != VU) {
7393 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7394 IsReusedIdx |= ReusedIdx.test(Idx2);
7395 ReusedIdx.set(Idx2);
7396 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7397 IE2 = nullptr;
7398 else
7399 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7400 }
7401 } while (!IsReusedIdx && (IE1 || IE2));
7402 return false;
7403}
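// Illustrative example (hypothetical IR, not from the original source): given
// the buildvector chain
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0, float %b, i32 1
// the query (VU = %v1, V = %v0) walks the vector operands, reaches %v0 from
// %v1 without reusing a lane, and returns true (assuming %v0 has no other
// uses); inserts that overwrite the same lane or live in different blocks
// return false.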
7404
7405/// Checks if the specified instruction \p I is an alternate operation for
7406/// the given \p MainOp and \p AltOp instructions.
7407static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7408 Instruction *AltOp,
7409 const TargetLibraryInfo &TLI);
7410
7411std::optional<BoUpSLP::OrdersType>
7412BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7413 bool IgnoreReorder) {
7414 // No need to reorder if we need to shuffle reuses; the node still needs to
7415 // be shuffled.
7416 if (!TE.ReuseShuffleIndices.empty()) {
7417 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7418 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7419 "Reshuffling scalars not yet supported for nodes with padding");
7420
7421 if (isSplat(TE.Scalars))
7422 return std::nullopt;
7423 // Check if reuse shuffle indices can be improved by reordering.
7424 // For this, check that the reuse mask is "clustered", i.e. each scalar value
7425 // is used once in each submask of size <number_of_scalars>.
7426 // Example: 4 scalar values.
7427 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7428 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7429 // element 3 is used twice in the second submask.
7430 unsigned Sz = TE.Scalars.size();
7431 if (TE.isGather()) {
7432 if (std::optional<OrdersType> CurrentOrder =
7433 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7434 SmallVector<int> Mask;
7435 fixupOrderingIndices(*CurrentOrder);
7436 inversePermutation(*CurrentOrder, Mask);
7437 ::addMask(Mask, TE.ReuseShuffleIndices);
7438 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7439 unsigned Sz = TE.Scalars.size();
7440 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7441 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7442 if (Idx != PoisonMaskElem)
7443 Res[Idx + K * Sz] = I + K * Sz;
7444 }
7445 return std::move(Res);
7446 }
7447 }
7448 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7449 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7450 2 * TE.getVectorFactor())) == 1)
7451 return std::nullopt;
7452 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7453 return std::nullopt;
7454 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7455 Sz)) {
7456 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7457 if (TE.ReorderIndices.empty())
7458 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7459 else
7460 inversePermutation(TE.ReorderIndices, ReorderMask);
7461 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7462 unsigned VF = ReorderMask.size();
7463 OrdersType ResOrder(VF, VF);
7464 unsigned NumParts = divideCeil(VF, Sz);
7465 SmallBitVector UsedVals(NumParts);
7466 for (unsigned I = 0; I < VF; I += Sz) {
7467 int Val = PoisonMaskElem;
7468 unsigned UndefCnt = 0;
7469 unsigned Limit = std::min(Sz, VF - I);
7470 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7471 [&](int Idx) {
7472 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7473 Val = Idx;
7474 if (Idx == PoisonMaskElem)
7475 ++UndefCnt;
7476 return Idx != PoisonMaskElem && Idx != Val;
7477 }) ||
7478 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7479 UndefCnt > Sz / 2)
7480 return std::nullopt;
7481 UsedVals.set(Val);
7482 for (unsigned K = 0; K < NumParts; ++K) {
7483 unsigned Idx = Val + Sz * K;
7484 if (Idx < VF && I + K < VF)
7485 ResOrder[Idx] = I + K;
7486 }
7487 }
7488 return std::move(ResOrder);
7489 }
7490 unsigned VF = TE.getVectorFactor();
7491 // Try to build the correct order for extractelement instructions.
7492 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7493 TE.ReuseShuffleIndices.end());
7494 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7495 all_of(TE.Scalars, [Sz](Value *V) {
7496 if (isa<PoisonValue>(V))
7497 return true;
7498 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7499 return Idx && *Idx < Sz;
7500 })) {
7501 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7502 "by BinaryOperator and CastInst.");
7503 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7504 if (TE.ReorderIndices.empty())
7505 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7506 else
7507 inversePermutation(TE.ReorderIndices, ReorderMask);
7508 for (unsigned I = 0; I < VF; ++I) {
7509 int &Idx = ReusedMask[I];
7510 if (Idx == PoisonMaskElem)
7511 continue;
7512 Value *V = TE.Scalars[ReorderMask[Idx]];
7513 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7514 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7515 }
7516 }
7517 // Build the order of VF size; the reuses shuffles need to be reordered, and
7518 // they are always of VF size.
7519 OrdersType ResOrder(VF);
7520 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7521 auto *It = ResOrder.begin();
7522 for (unsigned K = 0; K < VF; K += Sz) {
7523 OrdersType CurrentOrder(TE.ReorderIndices);
7524 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7525 if (SubMask.front() == PoisonMaskElem)
7526 std::iota(SubMask.begin(), SubMask.end(), 0);
7527 reorderOrder(CurrentOrder, SubMask);
7528 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7529 std::advance(It, Sz);
7530 }
7531 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7532 return Data.index() == Data.value();
7533 }))
7534 return std::nullopt; // No need to reorder.
7535 return std::move(ResOrder);
7536 }
7537 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7538 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7539 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7540 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7541 return std::nullopt;
7542 if (TE.State == TreeEntry::SplitVectorize ||
7543 ((TE.State == TreeEntry::Vectorize ||
7544 TE.State == TreeEntry::StridedVectorize ||
7545 TE.State == TreeEntry::CompressVectorize) &&
7546 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7547 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7548 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7549 "Alternate instructions are only supported by "
7550 "BinaryOperator and CastInst.");
7551 return TE.ReorderIndices;
7552 }
7553 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7554 TE.isAltShuffle()) {
7555 assert(TE.ReuseShuffleIndices.empty() &&
7556 "ReuseShuffleIndices should be "
7557 "empty for alternate instructions.");
7558 SmallVector<int> Mask;
7559 TE.buildAltOpShuffleMask(
7560 [&](Instruction *I) {
7561 assert(TE.getMatchingMainOpOrAltOp(I) &&
7562 "Unexpected main/alternate opcode");
7563 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7564 },
7565 Mask);
7566 const int VF = TE.getVectorFactor();
7567 OrdersType ResOrder(VF, VF);
7568 for (unsigned I : seq<unsigned>(VF)) {
7569 if (Mask[I] == PoisonMaskElem)
7570 continue;
7571 ResOrder[Mask[I] % VF] = I;
7572 }
7573 return std::move(ResOrder);
7574 }
7575 if (!TE.ReorderIndices.empty())
7576 return TE.ReorderIndices;
7577 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7578 if (!TE.ReorderIndices.empty())
7579 return TE.ReorderIndices;
7580
7581 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7582 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7583 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7584 continue;
7585 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7586 if (!II)
7587 continue;
7588 Instruction *BVHead = nullptr;
7589 BasicBlock *BB = II->getParent();
7590 while (II && II->hasOneUse() && II->getParent() == BB) {
7591 BVHead = II;
7592 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7593 }
7594 I = BVHead;
7595 }
7596
7597 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7598 assert(BB1 != BB2 && "Expected different basic blocks.");
7599 if (!DT->isReachableFromEntry(BB1))
7600 return false;
7601 if (!DT->isReachableFromEntry(BB2))
7602 return true;
7603 auto *NodeA = DT->getNode(BB1);
7604 auto *NodeB = DT->getNode(BB2);
7605 assert(NodeA && "Should only process reachable instructions");
7606 assert(NodeB && "Should only process reachable instructions");
7607 assert((NodeA == NodeB) ==
7608 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7609 "Different nodes should have different DFS numbers");
7610 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7611 };
7612 auto PHICompare = [&](unsigned I1, unsigned I2) {
7613 Value *V1 = TE.Scalars[I1];
7614 Value *V2 = TE.Scalars[I2];
7615 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7616 return false;
7617 if (isa<PoisonValue>(V1))
7618 return true;
7619 if (isa<PoisonValue>(V2))
7620 return false;
7621 if (V1->getNumUses() < V2->getNumUses())
7622 return true;
7623 if (V1->getNumUses() > V2->getNumUses())
7624 return false;
7625 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7626 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7627 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7628 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7629 FirstUserOfPhi2->getParent());
7630 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7631 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7632 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7633 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7634 if (IE1 && !IE2)
7635 return true;
7636 if (!IE1 && IE2)
7637 return false;
7638 if (IE1 && IE2) {
7639 if (UserBVHead[I1] && !UserBVHead[I2])
7640 return true;
7641 if (!UserBVHead[I1])
7642 return false;
7643 if (UserBVHead[I1] == UserBVHead[I2])
7644 return getElementIndex(IE1) < getElementIndex(IE2);
7645 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7646 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7647 UserBVHead[I2]->getParent());
7648 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7649 }
7650 if (EE1 && !EE2)
7651 return true;
7652 if (!EE1 && EE2)
7653 return false;
7654 if (EE1 && EE2) {
7655 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7656 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7657 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7658 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7659 if (!Inst2 && !P2)
7660 return Inst1 || P1;
7661 if (EE1->getOperand(0) == EE2->getOperand(0))
7662 return getElementIndex(EE1) < getElementIndex(EE2);
7663 if (!Inst1 && Inst2)
7664 return false;
7665 if (Inst1 && Inst2) {
7666 if (Inst1->getParent() != Inst2->getParent())
7667 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7668 return Inst1->comesBefore(Inst2);
7669 }
7670 if (!P1 && P2)
7671 return false;
7672 assert(P1 && P2 &&
7673 "Expected either instructions or arguments vector operands.");
7674 return P1->getArgNo() < P2->getArgNo();
7675 }
7676 return false;
7677 };
7678 OrdersType Phis(TE.Scalars.size());
7679 std::iota(Phis.begin(), Phis.end(), 0);
7680 stable_sort(Phis, PHICompare);
7681 if (isIdentityOrder(Phis))
7682 return std::nullopt; // No need to reorder.
7683 return std::move(Phis);
7684 }
7685 if (TE.isGather() &&
7686 (!TE.hasState() || !TE.isAltShuffle() ||
7687 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7688 allSameType(TE.Scalars)) {
7689 // TODO: add analysis of other gather nodes with extractelement
7690 // instructions and other values/instructions, not only undefs.
7691 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7692 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7693 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7694 all_of(TE.Scalars, [](Value *V) {
7695 auto *EE = dyn_cast<ExtractElementInst>(V);
7696 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7697 })) {
7698 // Check that gather of extractelements can be represented as
7699 // just a shuffle of a single vector.
7700 OrdersType CurrentOrder;
7701 bool Reuse =
7702 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7703 if (Reuse || !CurrentOrder.empty())
7704 return std::move(CurrentOrder);
7705 }
7706 // If the gather node is <undef, v, .., poison> and
7707 // insertelement poison, v, 0 [+ permute]
7708 // is cheaper than
7709 // insertelement poison, v, n - try to reorder.
7710 // If rotating the whole graph, exclude the permute cost, the whole graph
7711 // might be transformed.
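 // For illustration (actual costs are target-dependent): for a 4-wide gather
 // <poison, v, poison, poison> the code below compares the cost of inserting
 // v at lane 0 plus a single-source permute that moves it to lane 1 against
 // the cost of inserting v directly at lane 1; when rotating the whole graph
 // (TopToBottom) the permute cost is ignored.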
7712 int Sz = TE.Scalars.size();
7713 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7714 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7715 const auto *It = find_if_not(TE.Scalars, isConstant);
7716 if (It == TE.Scalars.begin())
7717 return OrdersType();
7718 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7719 if (It != TE.Scalars.end()) {
7720 OrdersType Order(Sz, Sz);
7721 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7722 Order[Idx] = 0;
7723 fixupOrderingIndices(Order);
7724 SmallVector<int> Mask;
7725 inversePermutation(Order, Mask);
7726 InstructionCost PermuteCost =
7727 TopToBottom
7728 ? 0
7729 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7730 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7731 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7732 PoisonValue::get(Ty), *It);
7733 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7734 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7735 PoisonValue::get(Ty), *It);
7736 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7737 OrdersType Order(Sz, Sz);
7738 Order[Idx] = 0;
7739 return std::move(Order);
7740 }
7741 }
7742 }
7743 if (isSplat(TE.Scalars))
7744 return std::nullopt;
7745 if (TE.Scalars.size() >= 3)
7746 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7747 return Order;
7748 // Check if we can include the order of vectorized loads. For masked gathers,
7749 // do extra analysis later, so include such nodes in a special list.
7750 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7751 SmallVector<Value *> PointerOps;
7752 StridedPtrInfo SPtrInfo;
7753 OrdersType CurrentOrder;
7754 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7755 CurrentOrder, PointerOps, SPtrInfo);
7756 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7757 Res == LoadsState::CompressVectorize)
7758 return std::move(CurrentOrder);
7759 }
7760 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7761 // has been audited for correctness with non-power-of-two vectors.
7762 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7763 if (std::optional<OrdersType> CurrentOrder =
7764 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7765 return CurrentOrder;
7766 }
7767 return std::nullopt;
7768}
7769
7770/// Checks if the given mask is a "clustered" mask with the same clusters of
7771/// size \p Sz, which are not identity submasks.
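/// E.g., with \p Sz == 2 the mask <1,0,1,0,1,0> qualifies: every 2-element
/// cluster equals the first one and <1,0> is not an identity submask, so the
/// function returns true, while <0,1,0,1> is rejected because its first
/// cluster is the identity.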
7772static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7773 unsigned Sz) {
7774 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7775 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7776 return false;
7777 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7778 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7779 if (Cluster != FirstCluster)
7780 return false;
7781 }
7782 return true;
7783}
7784
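// Illustrative sketch of the clustered-reuses case handled below: a gather of
// {a, b} whose reordered reuses mask is <1,0,1,0> (with empty ReorderIndices)
// is rewritten so that the scalars become {b, a} and the reuses mask becomes
// the identity clusters <0,1,0,1>, which still selects the sequence b,a,b,a.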
7785void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7786 // Reorder reuses mask.
7787 reorderReuses(TE.ReuseShuffleIndices, Mask);
7788 const unsigned Sz = TE.Scalars.size();
7789 // For vectorized nodes and non-clustered reuses, no need to do anything else.
7790 if (!TE.isGather() ||
7791 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7792 Sz) ||
7793 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7794 return;
7795 SmallVector<int> NewMask;
7796 inversePermutation(TE.ReorderIndices, NewMask);
7797 addMask(NewMask, TE.ReuseShuffleIndices);
7798 // Clear reorder since it is going to be applied to the new mask.
7799 TE.ReorderIndices.clear();
7800 // Try to improve gathered nodes with clustered reuses, if possible.
7801 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7802 SmallVector<unsigned> NewOrder(Slice);
7803 inversePermutation(NewOrder, NewMask);
7804 reorderScalars(TE.Scalars, NewMask);
7805 // Fill the reuses mask with the identity submasks.
7806 for (auto *It = TE.ReuseShuffleIndices.begin(),
7807 *End = TE.ReuseShuffleIndices.end();
7808 It != End; std::advance(It, Sz))
7809 std::iota(It, std::next(It, Sz), 0);
7810}
7811
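// Fills unset slots of \p Order (marked with the value Order.size()) from
// \p SecondaryOrder, or with their own index if no secondary order is given.
// E.g., with size 4 (so 4 marks an unset slot), combining {2,4,1,4} with the
// secondary order {0,3,1,2} fills slot 1 with 3 (still unused) but leaves
// slot 3 unset because 2 is already taken, giving {2,3,1,4}.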
7812static void combineOrders(MutableArrayRef<unsigned> Order,
7813 ArrayRef<unsigned> SecondaryOrder) {
7814 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7815 "Expected same size of orders");
7816 size_t Sz = Order.size();
7817 SmallBitVector UsedIndices(Sz);
7818 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7819 if (Order[Idx] != Sz)
7820 UsedIndices.set(Order[Idx]);
7821 }
7822 if (SecondaryOrder.empty()) {
7823 for (unsigned Idx : seq<unsigned>(0, Sz))
7824 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7825 Order[Idx] = Idx;
7826 } else {
7827 for (unsigned Idx : seq<unsigned>(0, Sz))
7828 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7829 !UsedIndices.test(SecondaryOrder[Idx]))
7830 Order[Idx] = SecondaryOrder[Idx];
7831 }
7832}
7833
7836 return false;
7837
7838 constexpr unsigned TinyVF = 2;
7839 constexpr unsigned TinyTree = 10;
7840 constexpr unsigned PhiOpsLimit = 12;
7841 constexpr unsigned GatherLoadsLimit = 2;
7842 if (VectorizableTree.size() <= TinyTree)
7843 return true;
7844 if (VectorizableTree.front()->hasState() &&
7845 !VectorizableTree.front()->isGather() &&
7846 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7847 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7848 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7849 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7850 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7851 VectorizableTree.front()->ReorderIndices.empty()) {
7852 // Check if the tree has only a single store and a single (unordered) load
7853 // node, while the other nodes are phis or geps/binops combined with phis,
7854 // and/or a single gather load node.
7855 if (VectorizableTree.front()->hasState() &&
7856 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7857 VectorizableTree.front()->Scalars.size() == TinyVF &&
7858 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7859 return false;
7860 // A single node which requires reordering - skip.
7861 if (VectorizableTree.front()->hasState() &&
7862 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7863 VectorizableTree.front()->ReorderIndices.empty()) {
7864 const unsigned ReorderedSplitsCnt =
7865 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7866 return TE->State == TreeEntry::SplitVectorize &&
7867 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7868 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7869 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7870 });
7871 if (ReorderedSplitsCnt <= 1 &&
7872 static_cast<unsigned>(count_if(
7873 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7874 return ((!TE->isGather() &&
7875 (TE->ReorderIndices.empty() ||
7876 (TE->UserTreeIndex.UserTE &&
7877 TE->UserTreeIndex.UserTE->State ==
7878 TreeEntry::Vectorize &&
7879 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7880 .empty()))) ||
7881 (TE->isGather() && TE->ReorderIndices.empty() &&
7882 (!TE->hasState() || TE->isAltShuffle() ||
7883 TE->getOpcode() == Instruction::Load ||
7884 TE->getOpcode() == Instruction::ZExt ||
7885 TE->getOpcode() == Instruction::SExt))) &&
7886 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7887 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7888 return !isConstant(V) && isVectorized(V);
7889 }));
7890 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7891 return false;
7892 }
7893 bool HasPhis = false;
7894 bool HasLoad = true;
7895 unsigned GatherLoads = 0;
7896 for (const std::unique_ptr<TreeEntry> &TE :
7897 ArrayRef(VectorizableTree).drop_front()) {
7898 if (TE->State == TreeEntry::SplitVectorize)
7899 continue;
7900 if (!TE->hasState()) {
7901 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7903 continue;
7904 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7906 continue;
7907 return true;
7908 }
7909 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7910 if (!TE->isGather()) {
7911 HasLoad = false;
7912 continue;
7913 }
7914 if (HasLoad)
7915 return true;
7916 ++GatherLoads;
7917 if (GatherLoads >= GatherLoadsLimit)
7918 return true;
7919 }
7920 if (TE->getOpcode() == Instruction::GetElementPtr ||
7921 Instruction::isBinaryOp(TE->getOpcode()))
7922 continue;
7923 if (TE->getOpcode() != Instruction::PHI &&
7924 (!TE->hasCopyableElements() ||
7925 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
7926 TE->Scalars.size() / 2))
7927 return true;
7928 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7929 TE->getNumOperands() > PhiOpsLimit)
7930 return false;
7931 HasPhis = true;
7932 }
7933 return !HasPhis;
7934 }
7935 return true;
7936}
7937
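// A minimal example of the mask stitching done below: for a split node with
// vector factor 8 whose second operand starts at offset 4, reordering operand
// Idx == 1 with Mask <2,0,1,3> yields NewMask <0,1,2,3,6,4,5,7>: lanes of the
// untouched operand keep the identity, lanes of the reordered operand are
// shifted by the offset.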
7938void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7939 ArrayRef<int> MaskOrder) {
7940 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7941 SmallVector<int> NewMask(getVectorFactor());
7942 SmallVector<int> NewMaskOrder(getVectorFactor());
7943 std::iota(NewMask.begin(), NewMask.end(), 0);
7944 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7945 if (Idx == 0) {
7946 copy(Mask, NewMask.begin());
7947 copy(MaskOrder, NewMaskOrder.begin());
7948 } else {
7949 assert(Idx == 1 && "Expected either 0 or 1 index.");
7950 unsigned Offset = CombinedEntriesWithIndices.back().second;
7951 for (unsigned I : seq<unsigned>(Mask.size())) {
7952 NewMask[I + Offset] = Mask[I] + Offset;
7953 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7954 }
7955 }
7956 reorderScalars(Scalars, NewMask);
7957 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
7958 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
7959 ReorderIndices.clear();
7960}
7961
7962void BoUpSLP::reorderTopToBottom() {
7963 // Maps VF to the graph nodes.
7964 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
7965 // ExtractElement gather nodes which can be vectorized and need to handle
7966 // their ordering.
7967 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7968
7969 // Phi nodes can have preferred ordering based on their result users
7970 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7971
7972 // AltShuffles can also have a preferred ordering that leads to fewer
7973 // instructions, e.g., the addsub instruction in x86.
7974 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7975
7976 // Maps a TreeEntry to the reorder indices of external users.
7977 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
7978 ExternalUserReorderMap;
7979 // Find all reorderable nodes with the given VF.
7980 // Currently these are vectorized stores, loads, extracts + some gathering
7981 // of extracts.
7982 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7983 const std::unique_ptr<TreeEntry> &TE) {
7984 // Look for external users that will probably be vectorized.
7985 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7986 findExternalStoreUsersReorderIndices(TE.get());
7987 if (!ExternalUserReorderIndices.empty()) {
7988 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7989 ExternalUserReorderMap.try_emplace(TE.get(),
7990 std::move(ExternalUserReorderIndices));
7991 }
7992
7993 // Patterns like [fadd,fsub] can be combined into a single instruction in
7994 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7995 // to take into account their order when looking for the most used order.
7996 if (TE->hasState() && TE->isAltShuffle() &&
7997 TE->State != TreeEntry::SplitVectorize) {
7998 Type *ScalarTy = TE->Scalars[0]->getType();
7999 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8000 unsigned Opcode0 = TE->getOpcode();
8001 unsigned Opcode1 = TE->getAltOpcode();
8002 SmallBitVector OpcodeMask(
8003 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8004 // If this pattern is supported by the target then we consider the order.
8005 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8006 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8007 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8008 }
8009 // TODO: Check the reverse order too.
8010 }
8011
8012 bool IgnoreReorder =
8013 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8014 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8015 VectorizableTree.front()->getOpcode() == Instruction::Store);
8016 if (std::optional<OrdersType> CurrentOrder =
8017 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8018 // Do not include ordering for nodes used in the alt opcode vectorization,
8019 // better to reorder them during the bottom-to-top stage. If we follow the
8020 // order here, it causes reordering of the whole graph, though actually it is
8021 // profitable just to reorder the subgraph that starts from the alternate
8022 // opcode vectorization node. Such nodes already end up with a shuffle
8023 // instruction, and it is enough to change this shuffle rather than
8024 // rotate the scalars for the whole graph.
8025 unsigned Cnt = 0;
8026 const TreeEntry *UserTE = TE.get();
8027 while (UserTE && Cnt < RecursionMaxDepth) {
8028 if (!UserTE->UserTreeIndex)
8029 break;
8030 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8031 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8032 UserTE->UserTreeIndex.UserTE->Idx != 0)
8033 return;
8034 UserTE = UserTE->UserTreeIndex.UserTE;
8035 ++Cnt;
8036 }
8037 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8038 if (!(TE->State == TreeEntry::Vectorize ||
8039 TE->State == TreeEntry::StridedVectorize ||
8040 TE->State == TreeEntry::SplitVectorize ||
8041 TE->State == TreeEntry::CompressVectorize) ||
8042 !TE->ReuseShuffleIndices.empty())
8043 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8044 if (TE->State == TreeEntry::Vectorize &&
8045 TE->getOpcode() == Instruction::PHI)
8046 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8047 }
8048 });
8049
8050 // Reorder the graph nodes according to their vectorization factor.
8051 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8052 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8053 auto It = VFToOrderedEntries.find(VF);
8054 if (It == VFToOrderedEntries.end())
8055 continue;
8056 // Try to find the most profitable order. We are just looking for the most
8057 // used order and reorder scalar elements in the nodes according to this
8058 // most used order.
8059 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8060 // Delete VF entry upon exit.
8061 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8062
8063 // All operands are reordered and used only in this node - propagate the
8064 // most used order to the user node.
8065 MapVector<OrdersType, unsigned,
8066 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8067 OrdersUses;
8068 for (const TreeEntry *OpTE : OrderedEntries) {
8069 // No need to reorder these nodes; still need to extend and to use a shuffle,
8070 // just need to merge the reordering shuffle and the reuse shuffle.
8071 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8072 OpTE->State != TreeEntry::SplitVectorize)
8073 continue;
8074 // Count number of orders uses.
8075 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8076 &PhisToOrders]() -> const OrdersType & {
8077 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8078 auto It = GathersToOrders.find(OpTE);
8079 if (It != GathersToOrders.end())
8080 return It->second;
8081 }
8082 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8083 auto It = AltShufflesToOrders.find(OpTE);
8084 if (It != AltShufflesToOrders.end())
8085 return It->second;
8086 }
8087 if (OpTE->State == TreeEntry::Vectorize &&
8088 OpTE->getOpcode() == Instruction::PHI) {
8089 auto It = PhisToOrders.find(OpTE);
8090 if (It != PhisToOrders.end())
8091 return It->second;
8092 }
8093 return OpTE->ReorderIndices;
8094 }();
8095 // First consider the order of the external scalar users.
8096 auto It = ExternalUserReorderMap.find(OpTE);
8097 if (It != ExternalUserReorderMap.end()) {
8098 const auto &ExternalUserReorderIndices = It->second;
8099 // If the OpTE vector factor != number of scalars - use natural order,
8100 // it is an attempt to reorder node with reused scalars but with
8101 // external uses.
8102 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8103 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8104 ExternalUserReorderIndices.size();
8105 } else {
8106 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8107 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8108 }
8109 // No other useful reorder data in this entry.
8110 if (Order.empty())
8111 continue;
8112 }
8113 // Stores actually store the mask, not the order, need to invert.
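      // E.g., a store node whose recorded indices form the mask <1,2,0,3>
      // contributes the inverted order <2,0,1,3> to the counting below.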
8114 if (OpTE->State == TreeEntry::Vectorize &&
8115 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8116 assert(!OpTE->isAltShuffle() &&
8117 "Alternate instructions are only supported by BinaryOperator "
8118 "and CastInst.");
8119 SmallVector<int> Mask;
8120 inversePermutation(Order, Mask);
8121 unsigned E = Order.size();
8122 OrdersType CurrentOrder(E, E);
8123 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8124 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8125 });
8126 fixupOrderingIndices(CurrentOrder);
8127 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8128 } else {
8129 ++OrdersUses.try_emplace(Order, 0).first->second;
8130 }
8131 }
8132 if (OrdersUses.empty())
8133 continue;
8134 // Choose the most used order.
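    // E.g., if three entries vote for the order <1,0,3,2> and only two vote
    // for an identity (or empty) order, <1,0,3,2> wins below; on a tie the
    // identity order is generally preferred, except for the filled-identity
    // special case handled in the loop.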
8135 unsigned IdentityCnt = 0;
8136 unsigned FilledIdentityCnt = 0;
8137 OrdersType IdentityOrder(VF, VF);
8138 for (auto &Pair : OrdersUses) {
8139 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8140 if (!Pair.first.empty())
8141 FilledIdentityCnt += Pair.second;
8142 IdentityCnt += Pair.second;
8143 combineOrders(IdentityOrder, Pair.first);
8144 }
8145 }
8146 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8147 unsigned Cnt = IdentityCnt;
8148 for (auto &Pair : OrdersUses) {
8149 // Prefer the identity order. But if a filled identity (non-empty order) is
8150 // found with the same number of uses as the new candidate order, we can
8151 // choose this candidate order.
8152 if (Cnt < Pair.second ||
8153 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8154 Cnt == Pair.second && !BestOrder.empty() &&
8155 isIdentityOrder(BestOrder))) {
8156 combineOrders(Pair.first, BestOrder);
8157 BestOrder = Pair.first;
8158 Cnt = Pair.second;
8159 } else {
8160 combineOrders(BestOrder, Pair.first);
8161 }
8162 }
8163 // Set order of the user node.
8164 if (isIdentityOrder(BestOrder))
8165 continue;
8166 fixupOrderingIndices(BestOrder);
8167 SmallVector<int> Mask;
8168 inversePermutation(BestOrder, Mask);
8169 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8170 unsigned E = BestOrder.size();
8171 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8172 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8173 });
8174 // Do an actual reordering, if profitable.
8175 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8176 // Just do the reordering for the nodes with the given VF.
8177 if (TE->Scalars.size() != VF) {
8178 if (TE->ReuseShuffleIndices.size() == VF) {
8179 assert(TE->State != TreeEntry::SplitVectorize &&
8180 "Split vectorized not expected.");
8181 // Need to reorder the reuses masks of the operands with smaller VF to
8182 // be able to find the match between the graph nodes and scalar
8183 // operands of the given node during vectorization/cost estimation.
8184 assert(
8185 (!TE->UserTreeIndex ||
8186 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8187 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8188 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8189 "All users must be of VF size.");
8190 if (SLPReVec) {
8191 assert(SLPReVec && "Only supported by REVEC.");
8192 // ShuffleVectorInst does not do reorderOperands (and it should not
8193 // because ShuffleVectorInst supports only a limited set of
8194 // patterns). Only do reorderNodeWithReuses if the user is not
8195 // ShuffleVectorInst.
8196 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8197 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8198 continue;
8199 }
8200 // Update ordering of the operands with the smaller VF than the given
8201 // one.
8202 reorderNodeWithReuses(*TE, Mask);
8203 // Update orders in user split vectorize nodes.
8204 if (TE->UserTreeIndex &&
8205 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8206 TE->UserTreeIndex.UserTE->reorderSplitNode(
8207 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8208 }
8209 continue;
8210 }
8211 if ((TE->State == TreeEntry::SplitVectorize &&
8212 TE->ReuseShuffleIndices.empty()) ||
8213 ((TE->State == TreeEntry::Vectorize ||
8214 TE->State == TreeEntry::StridedVectorize ||
8215 TE->State == TreeEntry::CompressVectorize) &&
8216 (isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
8217 InsertElementInst>(TE->getMainOp()) ||
8218 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8219 assert(
8220 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8221 TE->ReuseShuffleIndices.empty())) &&
8222 "Alternate instructions are only supported by BinaryOperator "
8223 "and CastInst.");
8224 // Build correct orders for extract{element,value}, loads,
8225 // stores and alternate (split) nodes.
8226 reorderOrder(TE->ReorderIndices, Mask);
8227 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8228 TE->reorderOperands(Mask);
8229 } else {
8230 // Reorder the node and its operands.
8231 TE->reorderOperands(Mask);
8232 assert(TE->ReorderIndices.empty() &&
8233 "Expected empty reorder sequence.");
8234 reorderScalars(TE->Scalars, Mask);
8235 }
8236 if (!TE->ReuseShuffleIndices.empty()) {
8237 // Apply reversed order to keep the original ordering of the reused
8238 // elements to avoid extra reorder indices shuffling.
8239 OrdersType CurrentOrder;
8240 reorderOrder(CurrentOrder, MaskOrder);
8241 SmallVector<int> NewReuses;
8242 inversePermutation(CurrentOrder, NewReuses);
8243 addMask(NewReuses, TE->ReuseShuffleIndices);
8244 TE->ReuseShuffleIndices.swap(NewReuses);
8245 } else if (TE->UserTreeIndex &&
8246 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8247 // Update orders in user split vectorize nodes.
8248 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8249 Mask, MaskOrder);
8250 }
8251 }
8252}
8253
8254void BoUpSLP::buildReorderableOperands(
8255 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8256 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8257 SmallVectorImpl<TreeEntry *> &GatherOps) {
8258 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8259 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8260 return OpData.first == I &&
8261 (OpData.second->State == TreeEntry::Vectorize ||
8262 OpData.second->State == TreeEntry::StridedVectorize ||
8263 OpData.second->State == TreeEntry::CompressVectorize ||
8264 OpData.second->State == TreeEntry::SplitVectorize);
8265 }))
8266 continue;
8267 // Do not request operands, if they do not exist.
8268 if (UserTE->hasState()) {
8269 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8270 UserTE->getOpcode() == Instruction::ExtractValue)
8271 continue;
8272 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8273 continue;
8274 if (UserTE->getOpcode() == Instruction::Store &&
8275 UserTE->State == TreeEntry::Vectorize && I == 1)
8276 continue;
8277 if (UserTE->getOpcode() == Instruction::Load &&
8278 (UserTE->State == TreeEntry::Vectorize ||
8279 UserTE->State == TreeEntry::StridedVectorize ||
8280 UserTE->State == TreeEntry::CompressVectorize))
8281 continue;
8282 }
8283 TreeEntry *TE = getOperandEntry(UserTE, I);
8284 assert(TE && "Expected operand entry.");
8285 if (!TE->isGather()) {
8286 // Add the node to the list of the ordered nodes with the identity
8287 // order.
8288 Edges.emplace_back(I, TE);
8289 // Add ScatterVectorize nodes to the list of operands, where just
8290 // reordering of the scalars is required. Similar to the gathers, so
8291 // simply add to the list of gathered ops.
8292 // If there are reused scalars, process this node as a regular vectorize
8293 // node, just reorder reuses mask.
8294 if (TE->State == TreeEntry::ScatterVectorize &&
8295 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8296 GatherOps.push_back(TE);
8297 continue;
8298 }
8299 if (ReorderableGathers.contains(TE))
8300 GatherOps.push_back(TE);
8301 }
8302}
8303
8304void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8305 struct TreeEntryCompare {
8306 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8307 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8308 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8309 return LHS->Idx < RHS->Idx;
8310 }
8311 };
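  // With this comparator the priority queue pops entries whose user node has
  // the largest index first, i.e. operands created deeper in the graph are
  // processed before their users - hence the bottom-to-top traversal.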
8312 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8313 DenseSet<const TreeEntry *> GathersToOrders;
8314 // Find all reorderable leaf nodes with the given VF.
8315 // Currently these are vectorized loads, extracts without alternate operands
8316 // + some gathering of extracts.
8317 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8318 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8319 if (TE->State != TreeEntry::Vectorize &&
8320 TE->State != TreeEntry::StridedVectorize &&
8321 TE->State != TreeEntry::CompressVectorize &&
8322 TE->State != TreeEntry::SplitVectorize)
8323 NonVectorized.insert(TE.get());
8324 if (std::optional<OrdersType> CurrentOrder =
8325 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8326 Queue.push(TE.get());
8327 if (!(TE->State == TreeEntry::Vectorize ||
8328 TE->State == TreeEntry::StridedVectorize ||
8329 TE->State == TreeEntry::CompressVectorize ||
8330 TE->State == TreeEntry::SplitVectorize) ||
8331 !TE->ReuseShuffleIndices.empty())
8332 GathersToOrders.insert(TE.get());
8333 }
8334 }
8335
8336 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8337 // I.e., if the node has operands, that are reordered, try to make at least
8338 // one operand order in the natural order and reorder others + reorder the
8339 // user node itself.
8340 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8341 while (!Queue.empty()) {
8342 // 1. Filter out only reordered nodes.
8343 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8344 TreeEntry *TE = Queue.top();
8345 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8346 Queue.pop();
8347 SmallVector<TreeEntry *> OrderedOps(1, TE);
8348 while (!Queue.empty()) {
8349 TE = Queue.top();
8350 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8351 break;
8352 Queue.pop();
8353 OrderedOps.push_back(TE);
8354 }
8355 for (TreeEntry *TE : OrderedOps) {
8356 if (!(TE->State == TreeEntry::Vectorize ||
8357 TE->State == TreeEntry::StridedVectorize ||
8358 TE->State == TreeEntry::CompressVectorize ||
8359 TE->State == TreeEntry::SplitVectorize ||
8360 (TE->isGather() && GathersToOrders.contains(TE))) ||
8361 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8362 !Visited.insert(TE).second)
8363 continue;
8364 // Build a map between user nodes and their operand orders to speed up the
8365 // search. The graph currently does not provide this dependency directly.
8366 Users.first = TE->UserTreeIndex.UserTE;
8367 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8368 }
8369 if (Users.first) {
8370 auto &Data = Users;
8371 if (Data.first->State == TreeEntry::SplitVectorize) {
8372 assert(
8373 Data.second.size() <= 2 &&
8374 "Expected not greater than 2 operands for split vectorize node.");
8375 if (any_of(Data.second,
8376 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8377 continue;
8378 // Update orders in user split vectorize nodes.
8379 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8380 "Expected exactly 2 entries.");
8381 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8382 TreeEntry &OpTE = *VectorizableTree[P.first];
8383 OrdersType Order = OpTE.ReorderIndices;
8384 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8385 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8386 continue;
8387 const auto BestOrder =
8388 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8389 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8390 continue;
8391 Order = *BestOrder;
8392 }
8393 fixupOrderingIndices(Order);
8394 SmallVector<int> Mask;
8395 inversePermutation(Order, Mask);
8396 const unsigned E = Order.size();
8397 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8398 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8399 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8400 });
8401 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8402 // Clear ordering of the operand.
8403 if (!OpTE.ReorderIndices.empty()) {
8404 OpTE.ReorderIndices.clear();
8405 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8406 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8407 } else {
8408 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8409 reorderScalars(OpTE.Scalars, Mask);
8410 }
8411 }
8412 if (Data.first->ReuseShuffleIndices.empty() &&
8413 !Data.first->ReorderIndices.empty()) {
8414 // Insert user node to the list to try to sink reordering deeper in
8415 // the graph.
8416 Queue.push(Data.first);
8417 }
8418 continue;
8419 }
8420 // Check that operands are used only in the User node.
8421 SmallVector<TreeEntry *> GatherOps;
8422 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8423 GatherOps);
8424 // All operands are reordered and used only in this node - propagate the
8425 // most used order to the user node.
8426 MapVector<OrdersType, unsigned,
8427 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8428 OrdersUses;
8429 // Do the analysis for each tree entry only once, otherwise the order of
8430 // the same node may be considered several times, though it might not be
8431 // profitable.
8432 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8433 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8434 for (const auto &Op : Data.second) {
8435 TreeEntry *OpTE = Op.second;
8436 if (!VisitedOps.insert(OpTE).second)
8437 continue;
8438 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8439 continue;
8440 const auto Order = [&]() -> const OrdersType {
8441 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8442 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8443 IgnoreReorder)
8444 .value_or(OrdersType(1));
8445 return OpTE->ReorderIndices;
8446 }();
8447 // The order is partially ordered, skip it in favor of fully non-ordered
8448 // orders.
8449 if (Order.size() == 1)
8450 continue;
8451
8452 // Check that the reordering does not increase the number of shuffles, i.e.
8453 // same-values nodes have the same parents or their parents have the same parents.
8454 if (!Order.empty() && !isIdentityOrder(Order)) {
8455 Value *Root = OpTE->hasState()
8456 ? OpTE->getMainOp()
8457 : *find_if_not(OpTE->Scalars, isConstant);
8458 auto GetSameNodesUsers = [&](Value *Root) {
8459 SmallSetVector<TreeEntry *, 4> Res;
8460 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8461 if (TE != OpTE && TE->UserTreeIndex &&
8462 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8463 TE->Scalars.size() == OpTE->Scalars.size() &&
8464 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8465 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8466 Res.insert(TE->UserTreeIndex.UserTE);
8467 }
8468 for (const TreeEntry *TE : getTreeEntries(Root)) {
8469 if (TE != OpTE && TE->UserTreeIndex &&
8470 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8471 TE->Scalars.size() == OpTE->Scalars.size() &&
8472 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8473 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8474 Res.insert(TE->UserTreeIndex.UserTE);
8475 }
8476 return Res.takeVector();
8477 };
8478 auto GetNumOperands = [](const TreeEntry *TE) {
8479 if (TE->State == TreeEntry::SplitVectorize)
8480 return TE->getNumOperands();
8481 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8482 return CI->arg_size();
8483 return TE->getNumOperands();
8484 };
8485 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8486 const TreeEntry *TE) {
8487 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8488 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8489 ID = getVectorIntrinsicIDForCall(CI, TLI);
8490 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8491 if (ID != Intrinsic::not_intrinsic &&
8492 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8493 continue;
8494 const TreeEntry *Op = getOperandEntry(TE, Idx);
8495 if (Op->isGather() && Op->hasState()) {
8496 const TreeEntry *VecOp =
8497 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8498 if (VecOp)
8499 Op = VecOp;
8500 }
8501 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8502 return false;
8503 }
8504 return true;
8505 };
8506 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8507 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8508 if (!RevisitedOps.insert(UTE).second)
8509 return false;
8510 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8511 !UTE->ReuseShuffleIndices.empty() ||
8512 (UTE->UserTreeIndex &&
8513 UTE->UserTreeIndex.UserTE == Data.first) ||
8514 (Data.first->UserTreeIndex &&
8515 Data.first->UserTreeIndex.UserTE == UTE) ||
8516 (IgnoreReorder && UTE->UserTreeIndex &&
8517 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8518 NodeShouldBeReorderedWithOperands(UTE);
8519 }))
8520 continue;
8521 for (TreeEntry *UTE : Users) {
8522 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8523 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8524 ID = getVectorIntrinsicIDForCall(CI, TLI);
8525 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8526 if (ID != Intrinsic::not_intrinsic &&
8527 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8528 continue;
8529 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8530 Visited.erase(Op);
8531 Queue.push(const_cast<TreeEntry *>(Op));
8532 }
8533 }
8534 }
8535 unsigned NumOps = count_if(
8536 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8537 return P.second == OpTE;
8538 });
8539 // Stores actually store the mask, not the order, need to invert.
8540 if (OpTE->State == TreeEntry::Vectorize &&
8541 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8542 assert(!OpTE->isAltShuffle() &&
8543 "Alternate instructions are only supported by BinaryOperator "
8544 "and CastInst.");
8545 SmallVector<int> Mask;
8546 inversePermutation(Order, Mask);
8547 unsigned E = Order.size();
8548 OrdersType CurrentOrder(E, E);
8549 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8550 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8551 });
8552 fixupOrderingIndices(CurrentOrder);
8553 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8554 } else {
8555 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8556 }
8557 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8558 const auto AllowsReordering = [&](const TreeEntry *TE) {
8559 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8560 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8561 (IgnoreReorder && TE->Idx == 0))
8562 return true;
8563 if (TE->isGather()) {
8564 if (GathersToOrders.contains(TE))
8565 return !getReorderingData(*TE, /*TopToBottom=*/false,
8566 IgnoreReorder)
8567 .value_or(OrdersType(1))
8568 .empty();
8569 return true;
8570 }
8571 return false;
8572 };
8573 if (OpTE->UserTreeIndex) {
8574 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8575 if (!VisitedUsers.insert(UserTE).second)
8576 continue;
8577 // May reorder user node if it requires reordering, has reused
8578 // scalars, is an alternate op vectorize node or its op nodes require
8579 // reordering.
8580 if (AllowsReordering(UserTE))
8581 continue;
8582 // Check if users allow reordering.
8583 // Currently look up just 1 level of operands to avoid increasing the
8584 // compile time.
8585 // It is profitable to reorder if definitely more operands allow
8586 // reordering than operands with the natural order.
8587 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
8588 if (static_cast<unsigned>(count_if(
8589 Ops, [UserTE, &AllowsReordering](
8590 const std::pair<unsigned, TreeEntry *> &Op) {
8591 return AllowsReordering(Op.second) &&
8592 Op.second->UserTreeIndex.UserTE == UserTE;
8593 })) <= Ops.size() / 2)
8594 ++Res.first->second;
8595 }
8596 }
8597 if (OrdersUses.empty()) {
8598 Visited.insert_range(llvm::make_second_range(Data.second));
8599 continue;
8600 }
8601 // Choose the most used order.
8602 unsigned IdentityCnt = 0;
8603 unsigned VF = Data.second.front().second->getVectorFactor();
8604 OrdersType IdentityOrder(VF, VF);
8605 for (auto &Pair : OrdersUses) {
8606 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8607 IdentityCnt += Pair.second;
8608 combineOrders(IdentityOrder, Pair.first);
8609 }
8610 }
8611 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8612 unsigned Cnt = IdentityCnt;
8613 for (auto &Pair : OrdersUses) {
8614 // Prefer the identity order. But if a filled identity (non-empty order)
8615 // is found with the same number of uses as the new candidate order, we
8616 // can choose this candidate order.
8617 if (Cnt < Pair.second) {
8618 combineOrders(Pair.first, BestOrder);
8619 BestOrder = Pair.first;
8620 Cnt = Pair.second;
8621 } else {
8622 combineOrders(BestOrder, Pair.first);
8623 }
8624 }
8625 // Set order of the user node.
8626 if (isIdentityOrder(BestOrder)) {
8627 Visited.insert_range(llvm::make_second_range(Data.second));
8628 continue;
8629 }
8630 fixupOrderingIndices(BestOrder);
8631 // Erase operands from OrderedEntries list and adjust their orders.
8632 VisitedOps.clear();
8633 SmallVector<int> Mask;
8634 inversePermutation(BestOrder, Mask);
8635 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8636 unsigned E = BestOrder.size();
8637 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8638 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8639 });
8640 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8641 TreeEntry *TE = Op.second;
8642 if (!VisitedOps.insert(TE).second)
8643 continue;
8644 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8645 reorderNodeWithReuses(*TE, Mask);
8646 continue;
8647 }
8648 // Gathers are processed separately.
8649 if (TE->State != TreeEntry::Vectorize &&
8650 TE->State != TreeEntry::StridedVectorize &&
8651 TE->State != TreeEntry::CompressVectorize &&
8652 TE->State != TreeEntry::SplitVectorize &&
8653 (TE->State != TreeEntry::ScatterVectorize ||
8654 TE->ReorderIndices.empty()))
8655 continue;
8656 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8657 TE->ReorderIndices.empty()) &&
8658 "Non-matching sizes of user/operand entries.");
8659 reorderOrder(TE->ReorderIndices, Mask);
8660 if (IgnoreReorder && TE == VectorizableTree.front().get())
8661 IgnoreReorder = false;
8662 }
8663 // For gathers just need to reorder its scalars.
8664 for (TreeEntry *Gather : GatherOps) {
8665 assert(Gather->ReorderIndices.empty() &&
8666 "Unexpected reordering of gathers.");
8667 if (!Gather->ReuseShuffleIndices.empty()) {
8668 // Just reorder reuses indices.
8669 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8670 continue;
8671 }
8672 reorderScalars(Gather->Scalars, Mask);
8673 Visited.insert(Gather);
8674 }
8675 // Reorder operands of the user node and set the ordering for the user
8676 // node itself.
8677 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8678 return TE.isAltShuffle() &&
8679 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8680 TE.ReorderIndices.empty());
8681 };
8682 if (Data.first->State != TreeEntry::Vectorize ||
8683 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8684 Data.first->getMainOp()) ||
8685 IsNotProfitableAltCodeNode(*Data.first))
8686 Data.first->reorderOperands(Mask);
8687 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8688 IsNotProfitableAltCodeNode(*Data.first) ||
8689 Data.first->State == TreeEntry::StridedVectorize ||
8690 Data.first->State == TreeEntry::CompressVectorize) {
8691 reorderScalars(Data.first->Scalars, Mask);
8692 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8693 /*BottomOrder=*/true);
8694 if (Data.first->ReuseShuffleIndices.empty() &&
8695 !Data.first->ReorderIndices.empty() &&
8696 !IsNotProfitableAltCodeNode(*Data.first)) {
8697 // Insert user node to the list to try to sink reordering deeper in
8698 // the graph.
8699 Queue.push(Data.first);
8700 }
8701 } else {
8702 reorderOrder(Data.first->ReorderIndices, Mask);
8703 }
8704 }
8705 }
8706 // If the reordering is unnecessary, just remove the reorder.
8707 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8708 VectorizableTree.front()->ReuseShuffleIndices.empty())
8709 VectorizableTree.front()->ReorderIndices.clear();
8710}
8711
8712Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8713 if (Entry.hasState() &&
8714 (Entry.getOpcode() == Instruction::Store ||
8715 Entry.getOpcode() == Instruction::Load) &&
8716 Entry.State == TreeEntry::StridedVectorize &&
8717 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8718 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8719 return dyn_cast<Instruction>(Entry.Scalars.front());
8720}
8721
8722void BoUpSLP::buildExternalUses(
8723 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8724 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8725 DenseMap<Value *, unsigned> ScalarToExtUses;
8726 SmallPtrSet<Value *, 4> ExternalUsers;
8727 // Collect the values that we need to extract from the tree.
8728 for (auto &TEPtr : VectorizableTree) {
8729 TreeEntry *Entry = TEPtr.get();
8730
8731 // No need to handle users of gathered values.
8732 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8733 continue;
8734
8735 // For each lane:
8736 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8737 Value *Scalar = Entry->Scalars[Lane];
8738 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8739 continue;
8740
8741 // All uses must be replaced already? No need to do it again.
8742 auto It = ScalarToExtUses.find(Scalar);
8743 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8744 continue;
8745
8746 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8747 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8748 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8749 << " from " << *Scalar << " for many users.\n");
8750 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8751 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8752 ExternalUsesWithNonUsers.insert(Scalar);
8753 continue;
8754 }
8755
8756 // Check if the scalar is externally used as an extra arg.
8757 const auto ExtI = ExternallyUsedValues.find(Scalar);
8758 if (ExtI != ExternallyUsedValues.end()) {
8759 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8760 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8761 << FoundLane << " from " << *Scalar << ".\n");
8762 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8763 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8764 continue;
8765 }
8766 for (User *U : Scalar->users()) {
8767 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8768
8769 Instruction *UserInst = dyn_cast<Instruction>(U);
8770 if (!UserInst || isDeleted(UserInst))
8771 continue;
8772
8773 // Ignore users in the user ignore list.
8774 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8775 continue;
8776
8777 // Skip in-tree scalars that become vectors
8778 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8779 !UseEntries.empty()) {
8780 // Some in-tree scalars will remain as scalar in vectorized
8781 // instructions. If that is the case, the one in FoundLane will
8782 // be used.
8783 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8784 isa<LoadInst, StoreInst>(UserInst)) ||
8785 isa<CallInst>(UserInst)) ||
8786 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8787 return UseEntry->State == TreeEntry::ScatterVectorize ||
8788 !doesInTreeUserNeedToExtract(
8789 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8790 TTI);
8791 })) {
8792 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8793 << ".\n");
8794 assert(none_of(UseEntries,
8795 [](TreeEntry *UseEntry) {
8796 return UseEntry->isGather();
8797 }) &&
8798 "Bad state");
8799 continue;
8800 }
8801 U = nullptr;
8802 if (It != ScalarToExtUses.end()) {
8803 ExternalUses[It->second].User = nullptr;
8804 break;
8805 }
8806 }
8807
8808 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8809 U = nullptr;
8810 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8811 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8812 << " from lane " << FoundLane << " from " << *Scalar
8813 << ".\n");
8814 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8815 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8816 ExternalUsesWithNonUsers.insert(Scalar);
8817 if (!U)
8818 break;
8819 }
8820 }
8821 }
8822}
8823
8824SmallVector<SmallVector<StoreInst *>>
8825 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8826 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
8827 SmallVector<StoreInst *>, 8>
8828 PtrToStoresMap;
8829 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8830 Value *V = TE->Scalars[Lane];
8831 // Don't iterate over the users of constant data.
8832 if (!isa<Instruction>(V))
8833 continue;
8834 // To save compilation time we don't visit if we have too many users.
8835 if (V->hasNUsesOrMore(UsesLimit))
8836 break;
8837
8838 // Collect stores per pointer object.
8839 for (User *U : V->users()) {
8840 auto *SI = dyn_cast<StoreInst>(U);
8841 // Test whether we can handle the store. V might be a global, which could
8842 // be used in a different function.
8843 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8844 !isValidElementType(SI->getValueOperand()->getType()))
8845 continue;
8846 // Skip the entry if it is already vectorized.
8847 if (isVectorized(U))
8848 continue;
8849
8850 Value *Ptr =
8851 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8852 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8853 SI->getValueOperand()->getType(), Ptr}];
8854 // For now just keep one store per pointer object per lane.
8855 // TODO: Extend this to support multiple stores per pointer per lane
8856 if (StoresVec.size() > Lane)
8857 continue;
8858 if (!StoresVec.empty()) {
8859 std::optional<int64_t> Diff = getPointersDiff(
8860 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8861 SI->getValueOperand()->getType(),
8862 StoresVec.front()->getPointerOperand(), *DL, *SE,
8863 /*StrictCheck=*/true);
8864 // We failed to compare the pointers so just abandon this store.
8865 if (!Diff)
8866 continue;
8867 }
8868 StoresVec.push_back(SI);
8869 }
8870 }
8871 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8872 unsigned I = 0;
8873 for (auto &P : PtrToStoresMap) {
8874 Res[I].swap(P.second);
8875 ++I;
8876 }
8877 return Res;
8878}
8879
8880bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8881 OrdersType &ReorderIndices) const {
8882 // We check whether the stores in StoresVec can form a vector by sorting them
8883 // and checking whether they are consecutive.
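  // E.g., four stores whose offsets from the first store are 0, +2, +1 and +3
  // sort into the consecutive sequence 0,1,2,3 and produce ReorderIndices
  // {0,2,1,3}; offsets 0, +2, +4, +6 are rejected because the sorted
  // differences are not all 1.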
8884
8885 // To avoid calling getPointersDiff() while sorting we create a vector of
8886 // pairs {store, offset from first} and sort this instead.
8887 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8888 StoreInst *S0 = StoresVec[0];
8889 StoreOffsetVec.emplace_back(0, 0);
8890 Type *S0Ty = S0->getValueOperand()->getType();
8891 Value *S0Ptr = S0->getPointerOperand();
8892 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8893 StoreInst *SI = StoresVec[Idx];
8894 std::optional<int64_t> Diff =
8895 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8896 SI->getPointerOperand(), *DL, *SE,
8897 /*StrictCheck=*/true);
8898 StoreOffsetVec.emplace_back(*Diff, Idx);
8899 }
8900
8901 // Check if the stores are consecutive by checking if their difference is 1.
8902 if (StoreOffsetVec.size() != StoresVec.size())
8903 return false;
8904 sort(StoreOffsetVec, llvm::less_first());
8905 unsigned Idx = 0;
8906 int64_t PrevDist = 0;
8907 for (const auto &P : StoreOffsetVec) {
8908 if (Idx > 0 && P.first != PrevDist + 1)
8909 return false;
8910 PrevDist = P.first;
8911 ++Idx;
8912 }
8913
8914 // Calculate the shuffle indices according to their offset against the sorted
8915 // StoreOffsetVec.
8916 ReorderIndices.assign(StoresVec.size(), 0);
8917 bool IsIdentity = true;
8918 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8919 ReorderIndices[P.second] = I;
8920 IsIdentity &= P.second == I;
8921 }
8922 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8923 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8924 // same convention here.
8925 if (IsIdentity)
8926 ReorderIndices.clear();
8927
8928 return true;
8929}
8930
8931#ifndef NDEBUG
8932static void dumpOrder(const BoUpSLP::OrdersType &Order) {
8933 for (unsigned Idx : Order)
8934 dbgs() << Idx << ", ";
8935 dbgs() << "\n";
8936}
8937#endif
8938
8939SmallVector<BoUpSLP::OrdersType, 1>
8940 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8941 unsigned NumLanes = TE->Scalars.size();
8942
8943 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8944
8945 // Holds the reorder indices for each candidate store vector that is a user of
8946 // the current TreeEntry.
8947 SmallVector<OrdersType, 1> ExternalReorderIndices;
8948
8949 // Now inspect the stores collected per pointer and look for vectorization
8950 // candidates. For each candidate calculate the reorder index vector and push
8951 // it into `ExternalReorderIndices`
8952 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8953 // If we have fewer than NumLanes stores, then we can't form a vector.
8954 if (StoresVec.size() != NumLanes)
8955 continue;
8956
8957 // If the stores are not consecutive then abandon this StoresVec.
8958 OrdersType ReorderIndices;
8959 if (!canFormVector(StoresVec, ReorderIndices))
8960 continue;
8961
8962 // We now know that the scalars in StoresVec can form a vector instruction,
8963 // so set the reorder indices.
8964 ExternalReorderIndices.push_back(ReorderIndices);
8965 }
8966 return ExternalReorderIndices;
8967}
8968
8969void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8970 const SmallDenseSet<Value *> &UserIgnoreLst) {
8971 deleteTree();
8972 assert(TreeEntryToStridedPtrInfoMap.empty() &&
8973 "TreeEntryToStridedPtrInfoMap is not cleared");
8974 UserIgnoreList = &UserIgnoreLst;
8975 if (!allSameType(Roots))
8976 return;
8977 buildTreeRec(Roots, 0, EdgeInfo());
8978}
8979
8980void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8981 deleteTree();
8982 assert(TreeEntryToStridedPtrInfoMap.empty() &&
8983 "TreeEntryToStridedPtrInfoMap is not cleared");
8984 if (!allSameType(Roots))
8985 return;
8986 buildTreeRec(Roots, 0, EdgeInfo());
8987}
8988
8989 /// Tries to find a subvector of loads and builds a new vector of only loads
8990 /// if it can be profitable.
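/// E.g., simple loads from p[0], p[3] and p[1] (same base pointer, type and
/// basic block) form one cluster with distances {0, 3, 1}, while a load from
/// an unrelated base starts a new cluster; clusters are then matched against
/// already gathered loads by a common pointer distance.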
8991static void gatherPossiblyVectorizableLoads(
8992 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8993 ScalarEvolution &SE, const TargetTransformInfo &TTI,
8994 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8995 bool AddNew = true) {
8996 if (VL.empty())
8997 return;
8998 Type *ScalarTy = getValueType(VL.front());
8999 if (!isValidElementType(ScalarTy))
9000 return;
9002 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9003 for (Value *V : VL) {
9004 auto *LI = dyn_cast<LoadInst>(V);
9005 if (!LI)
9006 continue;
9007 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9008 continue;
9009 bool IsFound = false;
9010 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9011 assert(LI->getParent() == Data.front().first->getParent() &&
9012 LI->getType() == Data.front().first->getType() &&
9013 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9014 getUnderlyingObject(Data.front().first->getPointerOperand(),
9016 "Expected loads with the same type, same parent and same "
9017 "underlying pointer.");
9018 std::optional<int64_t> Dist = getPointersDiff(
9019 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9020 Data.front().first->getPointerOperand(), DL, SE,
9021 /*StrictCheck=*/true);
9022 if (!Dist)
9023 continue;
9024 auto It = Map.find(*Dist);
9025 if (It != Map.end() && It->second != LI)
9026 continue;
9027 if (It == Map.end()) {
9028 Data.emplace_back(LI, *Dist);
9029 Map.try_emplace(*Dist, LI);
9030 }
9031 IsFound = true;
9032 break;
9033 }
9034 if (!IsFound) {
9035 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9036 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9037 }
9038 }
9039 auto FindMatchingLoads =
9042 &GatheredLoads,
9043 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9044 int64_t &Offset, unsigned &Start) {
9045 if (Loads.empty())
9046 return GatheredLoads.end();
9047 LoadInst *LI = Loads.front().first;
9048 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9049 if (Idx < Start)
9050 continue;
9051 ToAdd.clear();
9052 if (LI->getParent() != Data.front().first->getParent() ||
9053 LI->getType() != Data.front().first->getType())
9054 continue;
9055 std::optional<int64_t> Dist =
9057 Data.front().first->getType(),
9058 Data.front().first->getPointerOperand(), DL, SE,
9059 /*StrictCheck=*/true);
9060 if (!Dist)
9061 continue;
9062 SmallSet<int64_t, 4> DataDists;
9064 for (std::pair<LoadInst *, int64_t> P : Data) {
9065 DataDists.insert(P.second);
9066 DataLoads.insert(P.first);
9067 }
9068 // Found matching gathered loads - check if all loads are unique or
9069 // can be effectively vectorized.
9070 unsigned NumUniques = 0;
9071 for (auto [Cnt, Pair] : enumerate(Loads)) {
9072 bool Used = DataLoads.contains(Pair.first);
9073 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9074 ++NumUniques;
9075 ToAdd.insert(Cnt);
9076 } else if (Used) {
9077 Repeated.insert(Cnt);
9078 }
9079 }
9080 if (NumUniques > 0 &&
9081 (Loads.size() == NumUniques ||
9082 (Loads.size() - NumUniques >= 2 &&
9083 Loads.size() - NumUniques >= Loads.size() / 2 &&
9084 (has_single_bit(Data.size() + NumUniques) ||
9085 bit_ceil(Data.size()) <
9086 bit_ceil(Data.size() + NumUniques))))) {
9087 Offset = *Dist;
9088 Start = Idx + 1;
9089 return std::next(GatheredLoads.begin(), Idx);
9090 }
9091 }
9092 ToAdd.clear();
9093 return GatheredLoads.end();
9094 };
9095 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9096 unsigned Start = 0;
9097 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9098 int64_t Offset = 0;
9099 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9100 Offset, Start);
9101 while (It != GatheredLoads.end()) {
9102 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9103 for (unsigned Idx : LocalToAdd)
9104 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9105 ToAdd.insert_range(LocalToAdd);
9106 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9107 Start);
9108 }
9109 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9110 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9111 })) {
9112 auto AddNewLoads =
9114 for (unsigned Idx : seq<unsigned>(Data.size())) {
9115 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9116 continue;
9117 Loads.push_back(Data[Idx]);
9118 }
9119 };
9120 if (!AddNew) {
9121 LoadInst *LI = Data.front().first;
9122 It = find_if(
9123 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9124 return PD.front().first->getParent() == LI->getParent() &&
9125 PD.front().first->getType() == LI->getType();
9126 });
9127 while (It != GatheredLoads.end()) {
9128 AddNewLoads(*It);
9129 It = std::find_if(
9130 std::next(It), GatheredLoads.end(),
9131 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9132 return PD.front().first->getParent() == LI->getParent() &&
9133 PD.front().first->getType() == LI->getType();
9134 });
9135 }
9136 }
9137 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9138 AddNewLoads(GatheredLoads.emplace_back());
9139 }
9140 }
9141}
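#if 0
// Illustrative sketch (not part of the original source): a minimal standalone
// model of the clustering performed by gatherPossiblyVectorizableLoads above.
// Loads are grouped so that, within a cluster, every pointer distance to the
// cluster's first load is unique. Plain integer addresses stand in for
// LoadInst pointers and getPointersDiff; the names Cluster and
// clusterByDistance are illustrative only.
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

// Each input element is (load id, address); each output element pairs an id
// with its distance to the first load of its cluster, mirroring the
// (LoadInst *, int64_t) pairs collected in ClusteredLoads.
using Cluster = std::vector<std::pair<int, int64_t>>;

std::vector<Cluster>
clusterByDistance(const std::vector<std::pair<int, int64_t>> &Loads) {
  std::vector<Cluster> Clusters;
  std::vector<int64_t> FirstAddr;                        // first address per cluster
  std::vector<std::unordered_map<int64_t, int>> DistMap; // distance -> load id
  for (const auto &[Id, Addr] : Loads) {
    bool Found = false;
    for (size_t I = 0; I < Clusters.size(); ++I) {
      int64_t Dist = Addr - FirstAddr[I];
      auto It = DistMap[I].find(Dist);
      if (It != DistMap[I].end() && It->second != Id)
        continue; // slot already taken by another load - try the next cluster
      if (It == DistMap[I].end()) {
        Clusters[I].emplace_back(Id, Dist);
        DistMap[I].emplace(Dist, Id);
      }
      Found = true;
      break;
    }
    if (!Found) { // start a new cluster anchored at this load
      Clusters.push_back({{Id, 0}});
      FirstAddr.push_back(Addr);
      DistMap.push_back({{0, Id}});
    }
  }
  return Clusters;
}
#endif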
9142
9143void BoUpSLP::tryToVectorizeGatheredLoads(
9144 const SmallMapVector<
9145 std::tuple<BasicBlock *, Value *, Type *>,
9146 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9147 &GatheredLoads) {
9148 GatheredLoadsEntriesFirst = VectorizableTree.size();
9149
9150 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9151 LoadEntriesToVectorize.size());
9152 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9153 Set.insert_range(VectorizableTree[Idx]->Scalars);
9154
9155 // Sort loads by distance.
9156 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9157 const std::pair<LoadInst *, int64_t> &L2) {
9158 return L1.second > L2.second;
9159 };
9160
9161 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9162 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9163 Loads.size());
9164 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9165 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9166 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9167 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9168 };
9169
9170 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9171 BoUpSLP::ValueSet &VectorizedLoads,
9172 SmallVectorImpl<LoadInst *> &NonVectorized,
9173 bool Final, unsigned MaxVF) {
9175 unsigned StartIdx = 0;
9176 SmallVector<int> CandidateVFs;
9177 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9178 CandidateVFs.push_back(MaxVF);
9179 for (int NumElts = getFloorFullVectorNumberOfElements(
9180 *TTI, Loads.front()->getType(), MaxVF);
9181 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9182 *TTI, Loads.front()->getType(), NumElts - 1)) {
9183 CandidateVFs.push_back(NumElts);
9184 if (VectorizeNonPowerOf2 && NumElts > 2)
9185 CandidateVFs.push_back(NumElts - 1);
9186 }
9187
9188 if (Final && CandidateVFs.empty())
9189 return Results;
9190
9191 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9192 for (unsigned NumElts : CandidateVFs) {
9193 if (Final && NumElts > BestVF)
9194 continue;
9195 SmallVector<unsigned> MaskedGatherVectorized;
9196 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9197 ++Cnt) {
9198 ArrayRef<LoadInst *> Slice =
9199 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9200 if (VectorizedLoads.count(Slice.front()) ||
9201 VectorizedLoads.count(Slice.back()) ||
9203 continue;
 9204 // Check if it is profitable to try vectorizing gathered loads. It is
 9205 // profitable if we have at least 3 consecutive loads, or fewer when
 9206 // all their users are vectorized or deleted.
9207 bool AllowToVectorize = false;
9208 // Check if it is profitable to vectorize 2-elements loads.
9209 if (NumElts == 2) {
9210 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9211 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9212 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9213 for (LoadInst *LI : Slice) {
9214 // If single use/user - allow to vectorize.
9215 if (LI->hasOneUse())
9216 continue;
9217 // 1. Check if number of uses equals number of users.
9218 // 2. All users are deleted.
9219 // 3. The load broadcasts are not allowed or the load is not
9220 // broadcasted.
9221 if (static_cast<unsigned int>(std::distance(
9222 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9223 return false;
9224 if (!IsLegalBroadcastLoad)
9225 continue;
9226 if (LI->hasNUsesOrMore(UsesLimit))
9227 return false;
9228 for (User *U : LI->users()) {
9229 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9230 continue;
9231 for (const TreeEntry *UTE : getTreeEntries(U)) {
9232 for (int I : seq<int>(UTE->getNumOperands())) {
9233 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9234 return V == LI || isa<PoisonValue>(V);
9235 }))
9236 // Found legal broadcast - do not vectorize.
9237 return false;
9238 }
9239 }
9240 }
9241 }
9242 return true;
9243 };
9244 AllowToVectorize = CheckIfAllowed(Slice);
9245 } else {
9246 AllowToVectorize =
9247 (NumElts >= 3 ||
9248 any_of(ValueToGatherNodes.at(Slice.front()),
9249 [=](const TreeEntry *TE) {
9250 return TE->Scalars.size() == 2 &&
9251 ((TE->Scalars.front() == Slice.front() &&
9252 TE->Scalars.back() == Slice.back()) ||
9253 (TE->Scalars.front() == Slice.back() &&
9254 TE->Scalars.back() == Slice.front()));
9255 })) &&
9256 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9257 Slice.size());
9258 }
9259 if (AllowToVectorize) {
9260 SmallVector<Value *> PointerOps;
9261 OrdersType CurrentOrder;
9262 // Try to build vector load.
9263 ArrayRef<Value *> Values(
9264 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9265 StridedPtrInfo SPtrInfo;
9266 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9267 PointerOps, SPtrInfo, &BestVF);
9268 if (LS != LoadsState::Gather ||
9269 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9270 if (LS == LoadsState::ScatterVectorize) {
9271 if (MaskedGatherVectorized.empty() ||
9272 Cnt >= MaskedGatherVectorized.back() + NumElts)
9273 MaskedGatherVectorized.push_back(Cnt);
9274 continue;
9275 }
9276 if (LS != LoadsState::Gather) {
9277 Results.emplace_back(Values, LS);
9278 VectorizedLoads.insert_range(Slice);
9279 // If we vectorized initial block, no need to try to vectorize it
9280 // again.
9281 if (Cnt == StartIdx)
9282 StartIdx += NumElts;
9283 }
9284 // Check if the whole array was vectorized already - exit.
9285 if (StartIdx >= Loads.size())
9286 break;
9287 // Erase last masked gather candidate, if another candidate within
9288 // the range is found to be better.
9289 if (!MaskedGatherVectorized.empty() &&
9290 Cnt < MaskedGatherVectorized.back() + NumElts)
9291 MaskedGatherVectorized.pop_back();
9292 Cnt += NumElts - 1;
9293 continue;
9294 }
9295 }
9296 if (!AllowToVectorize || BestVF == 0)
9298 }
9299 // Mark masked gathers candidates as vectorized, if any.
9300 for (unsigned Cnt : MaskedGatherVectorized) {
9301 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9302 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9303 ArrayRef<Value *> Values(
9304 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9305 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9306 VectorizedLoads.insert_range(Slice);
9307 // If we vectorized initial block, no need to try to vectorize it again.
9308 if (Cnt == StartIdx)
9309 StartIdx += NumElts;
9310 }
9311 }
9312 for (LoadInst *LI : Loads) {
9313 if (!VectorizedLoads.contains(LI))
9314 NonVectorized.push_back(LI);
9315 }
9316 return Results;
9317 };
9318 auto ProcessGatheredLoads =
9319 [&, &TTI = *TTI](
9321 bool Final = false) {
9322 SmallVector<LoadInst *> NonVectorized;
9323 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9324 GatheredLoads) {
9325 if (LoadsDists.size() <= 1) {
9326 NonVectorized.push_back(LoadsDists.back().first);
9327 continue;
9328 }
9330 LoadsDists);
9331 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9332 stable_sort(LocalLoadsDists, LoadSorter);
9334 unsigned MaxConsecutiveDistance = 0;
9335 unsigned CurrentConsecutiveDist = 1;
9336 int64_t LastDist = LocalLoadsDists.front().second;
9337 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9338 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9339 if (isVectorized(L.first))
9340 continue;
9341 assert(LastDist >= L.second &&
9342 "Expected first distance always not less than second");
9343 if (static_cast<uint64_t>(LastDist - L.second) ==
9344 CurrentConsecutiveDist) {
9345 ++CurrentConsecutiveDist;
9346 MaxConsecutiveDistance =
9347 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9348 Loads.push_back(L.first);
9349 continue;
9350 }
9351 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9352 !Loads.empty())
9353 Loads.pop_back();
9354 CurrentConsecutiveDist = 1;
9355 LastDist = L.second;
9356 Loads.push_back(L.first);
9357 }
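// Illustrative note (not part of the original source): a sample run of the
// scan above. With loads sorted by decreasing distance {7, 6, 5, 3, 2, 1, 0},
// LastDist starts at 7; 7-6=1 and 7-5=2 extend the first run to length 3;
// 7-3=4 breaks it, so LastDist resets to 3; 3-2, 3-1 and 3-0 then extend the
// second run to length 4, giving MaxConsecutiveDistance = 4.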
9358 if (Loads.size() <= 1)
9359 continue;
9360 if (AllowMaskedGather)
9361 MaxConsecutiveDistance = Loads.size();
9362 else if (MaxConsecutiveDistance < 2)
9363 continue;
9364 BoUpSLP::ValueSet VectorizedLoads;
9365 SmallVector<LoadInst *> SortedNonVectorized;
9367 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9368 Final, MaxConsecutiveDistance);
9369 if (!Results.empty() && !SortedNonVectorized.empty() &&
9370 OriginalLoads.size() == Loads.size() &&
9371 MaxConsecutiveDistance == Loads.size() &&
9373 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9374 return P.second == LoadsState::ScatterVectorize;
9375 })) {
9376 VectorizedLoads.clear();
9377 SmallVector<LoadInst *> UnsortedNonVectorized;
9379 UnsortedResults =
9380 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9381 UnsortedNonVectorized, Final,
9382 OriginalLoads.size());
9383 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9384 SortedNonVectorized.swap(UnsortedNonVectorized);
9385 Results.swap(UnsortedResults);
9386 }
9387 }
9388 for (auto [Slice, _] : Results) {
9389 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9390 << Slice.size() << ")\n");
9391 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9392 for (Value *L : Slice)
9393 if (!isVectorized(L))
9394 SortedNonVectorized.push_back(cast<LoadInst>(L));
9395 continue;
9396 }
9397
 9398 // Select the maximum VF as the maximum of the sizes of the user gather
 9399 // nodes and the distance between scalar loads in these nodes.
9400 unsigned MaxVF = Slice.size();
9401 unsigned UserMaxVF = 0;
9402 unsigned InterleaveFactor = 0;
9403 if (MaxVF == 2) {
9404 UserMaxVF = MaxVF;
9405 } else {
 9406 // Find the distance between segments of the interleaved loads.
9407 std::optional<unsigned> InterleavedLoadsDistance = 0;
9408 unsigned Order = 0;
9409 std::optional<unsigned> CommonVF = 0;
9410 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9411 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9412 for (auto [Idx, V] : enumerate(Slice)) {
9413 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9414 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9415 unsigned Pos =
9416 EntryToPosition.try_emplace(E, Idx).first->second;
9417 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9418 if (CommonVF) {
9419 if (*CommonVF == 0) {
9420 CommonVF = E->Scalars.size();
9421 continue;
9422 }
9423 if (*CommonVF != E->Scalars.size())
9424 CommonVF.reset();
9425 }
9426 // Check if the load is the part of the interleaved load.
9427 if (Pos != Idx && InterleavedLoadsDistance) {
9428 if (!DeinterleavedNodes.contains(E) &&
9429 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9430 if (isa<Constant>(V))
9431 return false;
9432 if (isVectorized(V))
9433 return true;
9434 const auto &Nodes = ValueToGatherNodes.at(V);
9435 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9436 !is_contained(Slice, V);
9437 })) {
9438 InterleavedLoadsDistance.reset();
9439 continue;
9440 }
9441 DeinterleavedNodes.insert(E);
9442 if (*InterleavedLoadsDistance == 0) {
9443 InterleavedLoadsDistance = Idx - Pos;
9444 continue;
9445 }
9446 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9447 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9448 InterleavedLoadsDistance.reset();
9449 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9450 }
9451 }
9452 }
9453 DeinterleavedNodes.clear();
9454 // Check if the large load represents interleaved load operation.
9455 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9456 CommonVF.value_or(0) != 0) {
9457 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9458 unsigned VF = *CommonVF;
9459 OrdersType Order;
9460 SmallVector<Value *> PointerOps;
9461 StridedPtrInfo SPtrInfo;
9462 // Segmented load detected - vectorize at maximum vector factor.
9463 if (InterleaveFactor <= Slice.size() &&
9464 TTI.isLegalInterleavedAccessType(
9465 getWidenedType(Slice.front()->getType(), VF),
9466 InterleaveFactor,
9467 cast<LoadInst>(Slice.front())->getAlign(),
9468 cast<LoadInst>(Slice.front())
9470 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9471 SPtrInfo) == LoadsState::Vectorize) {
9472 UserMaxVF = InterleaveFactor * VF;
9473 } else {
9474 InterleaveFactor = 0;
9475 }
9476 }
9477 // Cannot represent the loads as consecutive vectorizable nodes -
9478 // just exit.
9479 unsigned ConsecutiveNodesSize = 0;
9480 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9481 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9482 [&, Slice = Slice](const auto &P) {
9483 const auto *It = find_if(Slice, [&](Value *V) {
9484 return std::get<1>(P).contains(V);
9485 });
9486 if (It == Slice.end())
9487 return false;
9488 const TreeEntry &TE =
9489 *VectorizableTree[std::get<0>(P)];
9490 ArrayRef<Value *> VL = TE.Scalars;
9491 OrdersType Order;
9492 SmallVector<Value *> PointerOps;
9493 StridedPtrInfo SPtrInfo;
9495 VL, VL.front(), Order, PointerOps, SPtrInfo);
9496 if (State == LoadsState::ScatterVectorize ||
9498 return false;
9499 ConsecutiveNodesSize += VL.size();
9500 size_t Start = std::distance(Slice.begin(), It);
9501 size_t Sz = Slice.size() - Start;
9502 return Sz < VL.size() ||
9503 Slice.slice(Start, VL.size()) != VL;
9504 }))
9505 continue;
9506 // Try to build long masked gather loads.
9507 UserMaxVF = bit_ceil(UserMaxVF);
9508 if (InterleaveFactor == 0 &&
9509 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9510 [&, Slice = Slice](unsigned Idx) {
9511 OrdersType Order;
9512 SmallVector<Value *> PointerOps;
9513 StridedPtrInfo SPtrInfo;
9514 return canVectorizeLoads(
9515 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9516 Slice[Idx * UserMaxVF], Order, PointerOps,
9517 SPtrInfo) == LoadsState::ScatterVectorize;
9518 }))
9519 UserMaxVF = MaxVF;
9520 if (Slice.size() != ConsecutiveNodesSize)
9521 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9522 }
9523 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9524 bool IsVectorized = true;
9525 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9526 ArrayRef<Value *> SubSlice =
9527 Slice.slice(I, std::min(VF, E - I));
9528 if (isVectorized(SubSlice.front()))
9529 continue;
 9530 // Check if the subslice is a to-be-vectorized entry that is not
 9531 // equal to this entry.
9532 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9533 [&](const auto &P) {
9534 return !SubSlice.equals(
9535 VectorizableTree[std::get<0>(P)]
9536 ->Scalars) &&
9537 set_is_subset(SubSlice, std::get<1>(P));
9538 }))
9539 continue;
9540 unsigned Sz = VectorizableTree.size();
9541 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9542 if (Sz == VectorizableTree.size()) {
9543 IsVectorized = false;
9544 // Try non-interleaved vectorization with smaller vector
9545 // factor.
9546 if (InterleaveFactor > 0) {
9547 VF = 2 * (MaxVF / InterleaveFactor);
9548 InterleaveFactor = 0;
9549 }
9550 continue;
9551 }
9552 }
9553 if (IsVectorized)
9554 break;
9555 }
9556 }
9557 NonVectorized.append(SortedNonVectorized);
9558 }
9559 return NonVectorized;
9560 };
9561 for (const auto &GLs : GatheredLoads) {
9562 const auto &Ref = GLs.second;
9563 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9564 if (!Ref.empty() && !NonVectorized.empty() &&
9565 std::accumulate(
9566 Ref.begin(), Ref.end(), 0u,
9567 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9568 -> unsigned { return S + LoadsDists.size(); }) !=
9569 NonVectorized.size() &&
9570 IsMaskedGatherSupported(NonVectorized)) {
9572 FinalGatheredLoads;
9573 for (LoadInst *LI : NonVectorized) {
 9574 // Reinsert non-vectorized loads into another list of loads with the
 9575 // same base pointers.
9576 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9577 FinalGatheredLoads,
9578 /*AddNew=*/false);
9579 }
9580 // Final attempt to vectorize non-vectorized loads.
9581 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9582 }
9583 }
9584 // Try to vectorize postponed load entries, previously marked as gathered.
9585 for (unsigned Idx : LoadEntriesToVectorize) {
9586 const TreeEntry &E = *VectorizableTree[Idx];
9587 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9588 // Avoid reordering, if possible.
9589 if (!E.ReorderIndices.empty()) {
9590 // Build a mask out of the reorder indices and reorder scalars per this
9591 // mask.
9592 SmallVector<int> ReorderMask;
9593 inversePermutation(E.ReorderIndices, ReorderMask);
9594 reorderScalars(GatheredScalars, ReorderMask);
9595 }
9596 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9597 }
 9598 // If no new entries were created, there are no gathered-load entries left to
 9599 // be handled.
9600 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9601 VectorizableTree.size())
9602 GatheredLoadsEntriesFirst.reset();
9603}
9604
 9605/// Generates a key/subkey pair for the given value to provide effective sorting
 9606/// of the values and better detection of vectorizable value sequences. The
 9607/// keys are used for sorting the values themselves, the subkeys for sorting
 9608/// within value subgroups.
9609static std::pair<size_t, size_t> generateKeySubkey(
9610 Value *V, const TargetLibraryInfo *TLI,
9611 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9612 bool AllowAlternate) {
9613 hash_code Key = hash_value(V->getValueID() + 2);
9614 hash_code SubKey = hash_value(0);
9615 // Sort the loads by the distance between the pointers.
9616 if (auto *LI = dyn_cast<LoadInst>(V)) {
9617 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9618 if (LI->isSimple())
9619 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9620 else
9621 Key = SubKey = hash_value(LI);
9622 } else if (isVectorLikeInstWithConstOps(V)) {
9623 // Sort extracts by the vector operands.
9625 Key = hash_value(Value::UndefValueVal + 1);
9626 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9627 if (!isUndefVector(EI->getVectorOperand()).all() &&
9628 !isa<UndefValue>(EI->getIndexOperand()))
9629 SubKey = hash_value(EI->getVectorOperand());
9630 }
9631 } else if (auto *I = dyn_cast<Instruction>(V)) {
9632 // Sort other instructions just by the opcodes except for CMPInst.
9633 // For CMP also sort by the predicate kind.
9635 isValidForAlternation(I->getOpcode())) {
9636 if (AllowAlternate)
9637 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9638 else
9639 Key = hash_combine(hash_value(I->getOpcode()), Key);
9640 SubKey = hash_combine(
9641 hash_value(I->getOpcode()), hash_value(I->getType()),
9643 ? I->getType()
9644 : cast<CastInst>(I)->getOperand(0)->getType()));
9645 // For casts, look through the only operand to improve compile time.
9646 if (isa<CastInst>(I)) {
9647 std::pair<size_t, size_t> OpVals =
9648 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9649 /*AllowAlternate=*/true);
9650 Key = hash_combine(OpVals.first, Key);
9651 SubKey = hash_combine(OpVals.first, SubKey);
9652 }
9653 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9654 CmpInst::Predicate Pred = CI->getPredicate();
9655 if (CI->isCommutative())
9656 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9658 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9659 hash_value(SwapPred),
9660 hash_value(CI->getOperand(0)->getType()));
9661 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9664 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9665 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9666 SubKey = hash_combine(hash_value(I->getOpcode()),
9667 hash_value(Call->getCalledFunction()));
9668 } else {
9670 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9671 }
9672 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9673 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9674 hash_value(Op.Tag), SubKey);
9675 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9676 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9677 SubKey = hash_value(Gep->getPointerOperand());
9678 else
9679 SubKey = hash_value(Gep);
9680 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9681 !isa<ConstantInt>(I->getOperand(1))) {
9682 // Do not try to vectorize instructions with potentially high cost.
9683 SubKey = hash_value(I);
9684 } else {
9685 SubKey = hash_value(I->getOpcode());
9686 }
9687 Key = hash_combine(hash_value(I->getParent()), Key);
9688 }
9689 return std::make_pair(Key, SubKey);
9690}
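// Illustrative note (not part of the original source): with the hashing above,
// simple loads of the same type share a Key, and their SubKey is delegated to
// the caller-provided LoadsSubkeyGenerator (so, e.g., loads from the same base
// pointer can land in the same subgroup), while a non-simple load, or an
// integer div/rem with a non-constant divisor, hashes the instruction itself
// and therefore never groups with any other value.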
9691
 9692/// Checks if the specified instruction \p I is a main operation for the given
 9693/// \p MainOp and \p AltOp instructions.
9694static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9695 Instruction *AltOp, const TargetLibraryInfo &TLI);
9696
9697bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9698 ArrayRef<Value *> VL) const {
9699 Type *ScalarTy = S.getMainOp()->getType();
9700 unsigned Opcode0 = S.getOpcode();
9701 unsigned Opcode1 = S.getAltOpcode();
9702 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9703 // If this pattern is supported by the target then consider it profitable.
9704 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9705 Opcode1, OpcodeMask))
9706 return true;
9708 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9709 Operands.emplace_back();
9710 // Prepare the operand vector.
9711 for (Value *V : VL) {
9712 if (isa<PoisonValue>(V)) {
9713 Operands.back().push_back(
9714 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9715 continue;
9716 }
9717 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9718 }
9719 }
9720 if (Operands.size() == 2) {
 9721 // Try to find the best operand candidates.
9722 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9724 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9725 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9726 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9727 std::optional<int> Res = findBestRootPair(Candidates);
9728 switch (Res.value_or(0)) {
9729 case 0:
9730 break;
9731 case 1:
9732 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9733 break;
9734 case 2:
9735 std::swap(Operands[0][I], Operands[1][I]);
9736 break;
9737 default:
9738 llvm_unreachable("Unexpected index.");
9739 }
9740 }
9741 }
9742 DenseSet<unsigned> UniqueOpcodes;
9743 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9744 unsigned NonInstCnt = 0;
9745 // Estimate number of instructions, required for the vectorized node and for
9746 // the buildvector node.
9747 unsigned UndefCnt = 0;
9748 // Count the number of extra shuffles, required for vector nodes.
9749 unsigned ExtraShuffleInsts = 0;
 9750 // Check that the operands do not contain the same values and create either a
 9751 // perfect diamond match or a shuffled match.
9752 if (Operands.size() == 2) {
9753 // Do not count same operands twice.
9754 if (Operands.front() == Operands.back()) {
9755 Operands.erase(Operands.begin());
9756 } else if (!allConstant(Operands.front()) &&
9757 all_of(Operands.front(), [&](Value *V) {
9758 return is_contained(Operands.back(), V);
9759 })) {
9760 Operands.erase(Operands.begin());
9761 ++ExtraShuffleInsts;
9762 }
9763 }
9764 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
 9765 // Vectorize the node if:
 9766 // 1. At least a single operand is constant or splat.
 9767 // 2. Operands have many loop invariants (while the instructions themselves
 9768 // are not loop invariant).
 9769 // 3. At least a single unique operand is supposed to be vectorized.
9770 return none_of(Operands,
9771 [&](ArrayRef<Value *> Op) {
9772 if (allConstant(Op) ||
9773 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9774 getSameOpcode(Op, *TLI)))
9775 return false;
9776 DenseMap<Value *, unsigned> Uniques;
9777 for (Value *V : Op) {
9779 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9780 if (isa<UndefValue>(V))
9781 ++UndefCnt;
9782 continue;
9783 }
9784 auto Res = Uniques.try_emplace(V, 0);
9785 // Found first duplicate - need to add shuffle.
9786 if (!Res.second && Res.first->second == 1)
9787 ++ExtraShuffleInsts;
9788 ++Res.first->getSecond();
9789 if (auto *I = dyn_cast<Instruction>(V))
9790 UniqueOpcodes.insert(I->getOpcode());
9791 else if (Res.second)
9792 ++NonInstCnt;
9793 }
9794 return none_of(Uniques, [&](const auto &P) {
9795 return P.first->hasNUsesOrMore(P.second + 1) &&
9796 none_of(P.first->users(), [&](User *U) {
9797 return isVectorized(U) || Uniques.contains(U);
9798 });
9799 });
9800 }) ||
 9801 // Do not vectorize the node if the estimated number of vector instructions
 9802 // is greater than the estimated number of buildvector instructions. The
 9803 // number of vector operations is the number of vector instructions plus the
 9804 // number of vector instructions for the operands (buildvectors). The number
 9805 // of buildvector instructions is just number_of_operands * number_of_scalars.
9806 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9807 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9808 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9809}
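// Illustrative note (not part of the original source): for the final estimate
// above, with VL.size() = 4 and a binary main opcode the buildvector side is
// costed as 2 operands x 4 scalars = 8 instructions, while the vector side is
// the number of unique operand opcodes + non-instruction operands + extra
// shuffles + 3 (main + alt + shuffle), e.g. 2 + 0 + 1 + 3 = 6 < 8, so the
// alternate node is considered profitable (provided the operands are not
// dominated by undefs, per the UndefCnt check).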
9810
9811/// Builds the arguments types vector for the given call instruction with the
9812/// given \p ID for the specified vector factor.
9815 const unsigned VF, unsigned MinBW,
9816 const TargetTransformInfo *TTI) {
9817 SmallVector<Type *> ArgTys;
9818 for (auto [Idx, Arg] : enumerate(CI->args())) {
9821 ArgTys.push_back(Arg->getType());
9822 continue;
9823 }
9824 if (MinBW > 0) {
9825 ArgTys.push_back(
9826 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9827 continue;
9828 }
9829 }
9830 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9831 }
9832 return ArgTys;
9833}
9834
 9835/// Calculates the costs of the vectorized intrinsic call (if possible) and of
 9836/// the vectorized library function call (if possible). Returns an invalid cost
 9837/// for the corresponding call if it cannot be vectorized / will be scalarized.
9838static std::pair<InstructionCost, InstructionCost>
9841 ArrayRef<Type *> ArgTys) {
9842 auto Shape = VFShape::get(CI->getFunctionType(),
9844 false /*HasGlobalPred*/);
9845 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9846 auto LibCost = InstructionCost::getInvalid();
9847 if (!CI->isNoBuiltin() && VecFunc) {
9848 // Calculate the cost of the vector library call.
9849 // If the corresponding vector call is cheaper, return its cost.
9850 LibCost =
9851 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9852 }
9854
9855 // Calculate the cost of the vector intrinsic call.
9856 FastMathFlags FMF;
9857 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9858 FMF = FPCI->getFastMathFlags();
9859 const InstructionCost ScalarLimit = 10000;
9860 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9861 LibCost.isValid() ? LibCost : ScalarLimit);
9862 auto IntrinsicCost =
9863 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9864 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9865 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9867
9868 return {IntrinsicCost, LibCost};
9869}
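// Illustrative note (not part of the original source): the returned pair is
// (intrinsic cost, library-call cost). If both are invalid, the call would be
// scalarized, which is what the Call handling in getScalarsVectorizationState
// below uses to gather such bundles.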
9870
9871BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9872 const InstructionsState &S, ArrayRef<Value *> VL,
9873 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9874 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9875 assert(S.getMainOp() &&
9876 "Expected instructions with same/alternate opcodes only.");
9877
9878 unsigned ShuffleOrOp =
9879 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9880 Instruction *VL0 = S.getMainOp();
9881 switch (ShuffleOrOp) {
9882 case Instruction::PHI: {
9883 // Too many operands - gather, most probably won't be vectorized.
9884 if (VL0->getNumOperands() > MaxPHINumOperands)
9885 return TreeEntry::NeedToGather;
9886 // Check for terminator values (e.g. invoke).
9887 for (Value *V : VL) {
9888 auto *PHI = dyn_cast<PHINode>(V);
9889 if (!PHI)
9890 continue;
9891 for (Value *Incoming : PHI->incoming_values()) {
9893 if (Term && Term->isTerminator()) {
9895 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9896 return TreeEntry::NeedToGather;
9897 }
9898 }
9899 }
9900
9901 return TreeEntry::Vectorize;
9902 }
9903 case Instruction::ExtractElement:
9904 if (any_of(VL, [&](Value *V) {
9905 auto *EI = dyn_cast<ExtractElementInst>(V);
9906 if (!EI)
9907 return true;
9908 return isVectorized(EI->getOperand(0));
9909 }))
9910 return TreeEntry::NeedToGather;
9911 [[fallthrough]];
9912 case Instruction::ExtractValue: {
9913 bool Reuse = canReuseExtract(VL, CurrentOrder);
9914 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9915 // non-full registers).
9916 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9917 return TreeEntry::NeedToGather;
9918 if (Reuse || !CurrentOrder.empty())
9919 return TreeEntry::Vectorize;
9920 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9921 return TreeEntry::NeedToGather;
9922 }
9923 case Instruction::InsertElement: {
9924 // Check that we have a buildvector and not a shuffle of 2 or more
9925 // different vectors.
9926 ValueSet SourceVectors;
9927 for (Value *V : VL) {
9928 if (isa<PoisonValue>(V)) {
9929 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
9930 return TreeEntry::NeedToGather;
9931 }
9932 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
9933 assert(getElementIndex(V) != std::nullopt &&
9934 "Non-constant or undef index?");
9935 }
9936
9937 if (count_if(VL, [&SourceVectors](Value *V) {
9938 return !SourceVectors.contains(V);
9939 }) >= 2) {
9940 // Found 2nd source vector - cancel.
9941 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9942 "different source vectors.\n");
9943 return TreeEntry::NeedToGather;
9944 }
9945
9946 if (any_of(VL, [&SourceVectors](Value *V) {
9947 // The last InsertElement can have multiple uses.
9948 return SourceVectors.contains(V) && !V->hasOneUse();
9949 })) {
9950 assert(SLPReVec && "Only supported by REVEC.");
9951 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9952 "multiple uses.\n");
9953 return TreeEntry::NeedToGather;
9954 }
9955
9956 return TreeEntry::Vectorize;
9957 }
9958 case Instruction::Load: {
9959 // Check that a vectorized load would load the same memory as a scalar
9960 // load. For example, we don't want to vectorize loads that are smaller
9961 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9962 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9963 // from such a struct, we read/write packed bits disagreeing with the
9964 // unvectorized version.
9965 auto IsGatheredNode = [&]() {
9966 if (!GatheredLoadsEntriesFirst)
9967 return false;
9968 return all_of(VL, [&](Value *V) {
9969 if (isa<PoisonValue>(V))
9970 return true;
9971 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
9972 return TE->Idx >= *GatheredLoadsEntriesFirst;
9973 });
9974 });
9975 };
9976 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
9978 return TreeEntry::Vectorize;
9980 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9981 // Delay slow vectorized nodes for better vectorization attempts.
9982 LoadEntriesToVectorize.insert(VectorizableTree.size());
9983 return TreeEntry::NeedToGather;
9984 }
9985 return IsGatheredNode() ? TreeEntry::NeedToGather
9986 : TreeEntry::CompressVectorize;
9988 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9989 // Delay slow vectorized nodes for better vectorization attempts.
9990 LoadEntriesToVectorize.insert(VectorizableTree.size());
9991 return TreeEntry::NeedToGather;
9992 }
9993 return IsGatheredNode() ? TreeEntry::NeedToGather
9994 : TreeEntry::ScatterVectorize;
9996 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9997 // Delay slow vectorized nodes for better vectorization attempts.
9998 LoadEntriesToVectorize.insert(VectorizableTree.size());
9999 return TreeEntry::NeedToGather;
10000 }
10001 return IsGatheredNode() ? TreeEntry::NeedToGather
10002 : TreeEntry::StridedVectorize;
10003 case LoadsState::Gather:
10004#ifndef NDEBUG
10005 Type *ScalarTy = VL0->getType();
10006 if (DL->getTypeSizeInBits(ScalarTy) !=
10007 DL->getTypeAllocSizeInBits(ScalarTy))
10008 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10009 else if (any_of(VL, [](Value *V) {
10010 auto *LI = dyn_cast<LoadInst>(V);
10011 return !LI || !LI->isSimple();
10012 }))
10013 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10014 else
10015 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10016#endif // NDEBUG
10018 return TreeEntry::NeedToGather;
10019 }
10020 llvm_unreachable("Unexpected state of loads");
10021 }
10022 case Instruction::ZExt:
10023 case Instruction::SExt:
10024 case Instruction::FPToUI:
10025 case Instruction::FPToSI:
10026 case Instruction::FPExt:
10027 case Instruction::PtrToInt:
10028 case Instruction::IntToPtr:
10029 case Instruction::SIToFP:
10030 case Instruction::UIToFP:
10031 case Instruction::Trunc:
10032 case Instruction::FPTrunc:
10033 case Instruction::BitCast: {
10034 Type *SrcTy = VL0->getOperand(0)->getType();
10035 for (Value *V : VL) {
10036 if (isa<PoisonValue>(V))
10037 continue;
10038 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10039 if (Ty != SrcTy || !isValidElementType(Ty)) {
10040 LLVM_DEBUG(
10041 dbgs() << "SLP: Gathering casts with different src types.\n");
10042 return TreeEntry::NeedToGather;
10043 }
10044 }
10045 return TreeEntry::Vectorize;
10046 }
10047 case Instruction::ICmp:
10048 case Instruction::FCmp: {
10049 // Check that all of the compares have the same predicate.
10050 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10052 Type *ComparedTy = VL0->getOperand(0)->getType();
10053 for (Value *V : VL) {
10054 if (isa<PoisonValue>(V))
10055 continue;
10056 auto *Cmp = cast<CmpInst>(V);
10057 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10058 Cmp->getOperand(0)->getType() != ComparedTy) {
10059 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10060 return TreeEntry::NeedToGather;
10061 }
10062 }
10063 return TreeEntry::Vectorize;
10064 }
10065 case Instruction::Select:
10066 case Instruction::FNeg:
10067 case Instruction::Add:
10068 case Instruction::FAdd:
10069 case Instruction::Sub:
10070 case Instruction::FSub:
10071 case Instruction::Mul:
10072 case Instruction::FMul:
10073 case Instruction::UDiv:
10074 case Instruction::SDiv:
10075 case Instruction::FDiv:
10076 case Instruction::URem:
10077 case Instruction::SRem:
10078 case Instruction::FRem:
10079 case Instruction::Shl:
10080 case Instruction::LShr:
10081 case Instruction::AShr:
10082 case Instruction::And:
10083 case Instruction::Or:
10084 case Instruction::Xor:
10085 case Instruction::Freeze:
10086 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10087 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10088 auto *I = dyn_cast<Instruction>(V);
10089 return I && I->isBinaryOp() && !I->isFast();
10090 }))
10091 return TreeEntry::NeedToGather;
10092 return TreeEntry::Vectorize;
10093 case Instruction::GetElementPtr: {
10094 // We don't combine GEPs with complicated (nested) indexing.
10095 for (Value *V : VL) {
10096 auto *I = dyn_cast<GetElementPtrInst>(V);
10097 if (!I)
10098 continue;
10099 if (I->getNumOperands() != 2) {
10100 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10101 return TreeEntry::NeedToGather;
10102 }
10103 }
10104
10105 // We can't combine several GEPs into one vector if they operate on
10106 // different types.
10107 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10108 for (Value *V : VL) {
10109 auto *GEP = dyn_cast<GEPOperator>(V);
10110 if (!GEP)
10111 continue;
10112 Type *CurTy = GEP->getSourceElementType();
10113 if (Ty0 != CurTy) {
10114 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10115 return TreeEntry::NeedToGather;
10116 }
10117 }
10118
10119 // We don't combine GEPs with non-constant indexes.
10120 Type *Ty1 = VL0->getOperand(1)->getType();
10121 for (Value *V : VL) {
10122 auto *I = dyn_cast<GetElementPtrInst>(V);
10123 if (!I)
10124 continue;
10125 auto *Op = I->getOperand(1);
10126 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10127 (Op->getType() != Ty1 &&
10128 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10129 Op->getType()->getScalarSizeInBits() >
10130 DL->getIndexSizeInBits(
10131 V->getType()->getPointerAddressSpace())))) {
10132 LLVM_DEBUG(
10133 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10134 return TreeEntry::NeedToGather;
10135 }
10136 }
10137
10138 return TreeEntry::Vectorize;
10139 }
10140 case Instruction::Store: {
10141 // Check if the stores are consecutive or if we need to swizzle them.
10142 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10143 // Avoid types that are padded when being allocated as scalars, while
10144 // being packed together in a vector (such as i1).
10145 if (DL->getTypeSizeInBits(ScalarTy) !=
10146 DL->getTypeAllocSizeInBits(ScalarTy)) {
10147 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10148 return TreeEntry::NeedToGather;
10149 }
10150 // Make sure all stores in the bundle are simple - we can't vectorize
10151 // atomic or volatile stores.
10152 for (Value *V : VL) {
10153 auto *SI = cast<StoreInst>(V);
10154 if (!SI->isSimple()) {
10155 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10156 return TreeEntry::NeedToGather;
10157 }
10158 PointerOps.push_back(SI->getPointerOperand());
10159 }
10160
10161 // Check the order of pointer operands.
10162 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10163 Value *Ptr0;
10164 Value *PtrN;
10165 if (CurrentOrder.empty()) {
10166 Ptr0 = PointerOps.front();
10167 PtrN = PointerOps.back();
10168 } else {
10169 Ptr0 = PointerOps[CurrentOrder.front()];
10170 PtrN = PointerOps[CurrentOrder.back()];
10171 }
10172 std::optional<int64_t> Dist =
10173 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10174 // Check that the sorted pointer operands are consecutive.
10175 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10176 return TreeEntry::Vectorize;
10177 }
10178
10179 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10180 return TreeEntry::NeedToGather;
10181 }
10182 case Instruction::Call: {
10183 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10184 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10185 auto *I = dyn_cast<Instruction>(V);
10186 return I && !I->isFast();
10187 }))
10188 return TreeEntry::NeedToGather;
10189 // Check if the calls are all to the same vectorizable intrinsic or
10190 // library function.
10191 CallInst *CI = cast<CallInst>(VL0);
10193
10194 VFShape Shape = VFShape::get(
10195 CI->getFunctionType(),
10196 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10197 false /*HasGlobalPred*/);
10198 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10199
10200 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10201 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10202 return TreeEntry::NeedToGather;
10203 }
10204 Function *F = CI->getCalledFunction();
10205 unsigned NumArgs = CI->arg_size();
10206 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10207 for (unsigned J = 0; J != NumArgs; ++J)
10209 ScalarArgs[J] = CI->getArgOperand(J);
10210 for (Value *V : VL) {
10211 CallInst *CI2 = dyn_cast<CallInst>(V);
10212 if (!CI2 || CI2->getCalledFunction() != F ||
10213 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10214 (VecFunc &&
10215 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10217 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10218 << "\n");
10219 return TreeEntry::NeedToGather;
10220 }
10221 // Some intrinsics have scalar arguments and should be same in order for
10222 // them to be vectorized.
10223 for (unsigned J = 0; J != NumArgs; ++J) {
10225 Value *A1J = CI2->getArgOperand(J);
10226 if (ScalarArgs[J] != A1J) {
10228 << "SLP: mismatched arguments in call:" << *CI
10229 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10230 return TreeEntry::NeedToGather;
10231 }
10232 }
10233 }
10234 // Verify that the bundle operands are identical between the two calls.
10235 if (CI->hasOperandBundles() &&
10236 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10237 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10238 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10239 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10240 << "!=" << *V << '\n');
10241 return TreeEntry::NeedToGather;
10242 }
10243 }
10244 SmallVector<Type *> ArgTys =
10245 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10246 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10247 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10248 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10249 return TreeEntry::NeedToGather;
10250
10251 return TreeEntry::Vectorize;
10252 }
10253 case Instruction::ShuffleVector: {
10254 if (!S.isAltShuffle()) {
 10255 // REVEC can support non-alternate shuffles.
10257 return TreeEntry::Vectorize;
10258 // If this is not an alternate sequence of opcode like add-sub
10259 // then do not vectorize this instruction.
10260 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10261 return TreeEntry::NeedToGather;
10262 }
10263 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10264 LLVM_DEBUG(
10265 dbgs()
10266 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10267 "the whole alt sequence is not profitable.\n");
10268 return TreeEntry::NeedToGather;
10269 }
10270
10271 return TreeEntry::Vectorize;
10272 }
10273 default:
10274 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10275 return TreeEntry::NeedToGather;
10276 }
10277}
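// Illustrative note (not part of the original source): for the Store case
// above, four i32 stores to p, p+1, p+2 and p+3 (in any program order) can be
// sorted by address, and getPointersDiff(Ptr0, PtrN) == 3 == VL.size() - 1,
// so the bundle is classified as Vectorize; with a gap (p, p+1, p+2, p+4) the
// distance is 4 and the bundle falls through to NeedToGather as a
// non-consecutive store.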
10278
10279namespace {
 10280/// Handles the operands of the phi nodes correctly, based on the \p Main
 10281/// PHINode's order of incoming basic blocks/values.
10282class PHIHandler {
10283 DominatorTree &DT;
10284 PHINode *Main = nullptr;
10287
10288public:
10289 PHIHandler() = delete;
10290 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10291 : DT(DT), Main(Main), Phis(Phis),
10292 Operands(Main->getNumIncomingValues(),
10293 SmallVector<Value *>(Phis.size(), nullptr)) {}
10294 void buildOperands() {
10295 constexpr unsigned FastLimit = 4;
10296 if (Main->getNumIncomingValues() <= FastLimit) {
10297 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10298 BasicBlock *InBB = Main->getIncomingBlock(I);
10299 if (!DT.isReachableFromEntry(InBB)) {
10300 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10301 continue;
10302 }
10303 // Prepare the operand vector.
10304 for (auto [Idx, V] : enumerate(Phis)) {
10305 auto *P = dyn_cast<PHINode>(V);
10306 if (!P) {
10308 "Expected isa instruction or poison value.");
10309 Operands[I][Idx] = V;
10310 continue;
10311 }
10312 if (P->getIncomingBlock(I) == InBB)
10313 Operands[I][Idx] = P->getIncomingValue(I);
10314 else
10315 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10316 }
10317 }
10318 return;
10319 }
10320 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10321 Blocks;
10322 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10323 BasicBlock *InBB = Main->getIncomingBlock(I);
10324 if (!DT.isReachableFromEntry(InBB)) {
10325 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10326 continue;
10327 }
10328 Blocks.try_emplace(InBB).first->second.push_back(I);
10329 }
10330 for (auto [Idx, V] : enumerate(Phis)) {
10331 if (isa<PoisonValue>(V)) {
10332 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10333 Operands[I][Idx] = V;
10334 continue;
10335 }
10336 auto *P = cast<PHINode>(V);
10337 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10338 BasicBlock *InBB = P->getIncomingBlock(I);
10339 if (InBB == Main->getIncomingBlock(I)) {
10341 continue;
10342 Operands[I][Idx] = P->getIncomingValue(I);
10343 continue;
10344 }
10345 auto *It = Blocks.find(InBB);
10346 if (It == Blocks.end())
10347 continue;
10348 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10349 }
10350 }
10351 for (const auto &P : Blocks) {
10352 ArrayRef<unsigned> IncomingValues = P.second;
10353 if (IncomingValues.size() <= 1)
10354 continue;
10355 unsigned BasicI = IncomingValues.consume_front();
10356 for (unsigned I : IncomingValues) {
10358 [&](const auto &Data) {
10359 return !Data.value() ||
10360 Data.value() == Operands[BasicI][Data.index()];
10361 }) &&
10362 "Expected empty operands list.");
10363 Operands[I] = Operands[BasicI];
10364 }
10365 }
10366 }
10367 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10368};
10369} // namespace
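// Illustrative note (not part of the original source): PHIHandler effectively
// transposes the PHI bundle. For Phis = {phi1, phi2}, each with incoming
// blocks {BB0, BB1}, buildOperands() produces
// Operands[0] = {phi1(BB0), phi2(BB0)} and Operands[1] = {phi1(BB1), phi2(BB1)};
// unreachable incoming blocks are filled with poison, and above the FastLimit
// of 4 incoming values the matching is done per incoming block rather than
// per operand index.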
10370
 10371/// Returns the main/alternate instructions for the given \p VL. Unlike
 10372/// getSameOpcode, it supports non-compatible instructions for better
 10373/// SplitVectorize node support.
 10374/// \returns the first main/alt instructions if the list contains only poisons
 10375/// and instructions with exactly 2 different opcodes; a pair of nullptr otherwise.
10376static std::pair<Instruction *, Instruction *>
10378 Instruction *MainOp = nullptr;
10379 Instruction *AltOp = nullptr;
10380 for (Value *V : VL) {
10381 if (isa<PoisonValue>(V))
10382 continue;
10383 auto *I = dyn_cast<Instruction>(V);
10384 if (!I)
10385 return {};
10386 if (!MainOp) {
10387 MainOp = I;
10388 continue;
10389 }
10390 if (MainOp->getOpcode() == I->getOpcode()) {
10391 if (I->getParent() != MainOp->getParent())
10392 return {};
10393 continue;
10394 }
10395 if (!AltOp) {
10396 AltOp = I;
10397 continue;
10398 }
10399 if (AltOp->getOpcode() == I->getOpcode()) {
10400 if (I->getParent() != AltOp->getParent())
10401 return {};
10402 continue;
10403 }
10404 return {};
10405 }
10406 if (!AltOp)
10407 return {};
10408 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10409 "Expected different main and alt instructions.");
10410 return std::make_pair(MainOp, AltOp);
10411}
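// Illustrative note (not part of the original source): for
// VL = {add, sub, add, poison, sub} in a single basic block the helper above
// returns {add, sub}; a third distinct opcode, an instruction from a different
// block, or a bundle with a single opcode (no alternate) all yield
// {nullptr, nullptr}.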
10412
10413/// Checks that every instruction appears once in the list and if not, packs
10414/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10415/// unique scalars is extended by poison values to the whole register size.
10416///
10417/// \returns false if \p VL could not be uniquified, in which case \p VL is
10418/// unchanged and \p ReuseShuffleIndices is empty.
10420 SmallVectorImpl<int> &ReuseShuffleIndices,
10421 const TargetTransformInfo &TTI,
10422 const TargetLibraryInfo &TLI,
10423 const InstructionsState &S,
10424 const BoUpSLP::EdgeInfo &UserTreeIdx,
10425 bool TryPad = false) {
10426 // Check that every instruction appears once in this bundle.
10427 SmallVector<Value *> UniqueValues;
10428 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10429 for (Value *V : VL) {
10430 if (isConstant(V)) {
10431 // Constants are always considered distinct, even if the same constant
10432 // appears multiple times in VL.
10433 ReuseShuffleIndices.emplace_back(
10434 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10435 UniqueValues.emplace_back(V);
10436 continue;
10437 }
10438 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10439 ReuseShuffleIndices.emplace_back(Res.first->second);
10440 if (Res.second)
10441 UniqueValues.emplace_back(V);
10442 }
10443
10444 // Easy case: VL has unique values and a "natural" size
10445 size_t NumUniqueScalarValues = UniqueValues.size();
10446 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10447 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10448 if (NumUniqueScalarValues == VL.size() &&
10449 (VectorizeNonPowerOf2 || IsFullVectors)) {
10450 ReuseShuffleIndices.clear();
10451 return true;
10452 }
10453
 10454 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10455 if ((UserTreeIdx.UserTE &&
10456 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10458 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10459 "for nodes with padding.\n");
10460 ReuseShuffleIndices.clear();
10461 return false;
10462 }
10463
10464 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10465 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10466 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10467 return isa<UndefValue>(V) || !isConstant(V);
10468 }))) {
10469 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10470 S.getMainOp()->isSafeToRemove() &&
10471 (S.areInstructionsWithCopyableElements() ||
10472 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
 10473 // Find the number of elements that forms full vectors.
10474 unsigned PWSz = getFullVectorNumberOfElements(
10475 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10476 PWSz = std::min<unsigned>(PWSz, VL.size());
10477 if (PWSz == VL.size()) {
10478 // We ended up with the same size after removing duplicates and
10479 // upgrading the resulting vector size to a "nice size". Just keep
10480 // the initial VL then.
10481 ReuseShuffleIndices.clear();
10482 } else {
10483 // Pad unique values with poison to grow the vector to a "nice" size
10484 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10485 UniqueValues.end());
10486 PaddedUniqueValues.append(
10487 PWSz - UniqueValues.size(),
10488 PoisonValue::get(UniqueValues.front()->getType()));
 10489 // Check that the operations, extended with poisons/copyable elements, are
 10490 // still valid for vectorization (div/rem are not allowed).
10491 if (!S.areInstructionsWithCopyableElements() &&
10492 !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
10493 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10494 ReuseShuffleIndices.clear();
10495 return false;
10496 }
10497 VL = std::move(PaddedUniqueValues);
10498 }
10499 return true;
10500 }
10501 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10502 ReuseShuffleIndices.clear();
10503 return false;
10504 }
10505 VL = std::move(UniqueValues);
10506 return true;
10507}
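// Illustrative note (not part of the original source): for VL = {A, B, A, B}
// the routine above shrinks VL to the unique values {A, B} and records
// ReuseShuffleIndices = {0, 1, 0, 1}, i.e. the original bundle is rebuilt from
// the unique vector by a reuse shuffle. Constants are never deduplicated
// (poison lanes get PoisonMaskElem), and with TryPad the unique values may be
// padded with poison up to a full-register size.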
10508
10509bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10510 const InstructionsState &LocalState,
10511 SmallVectorImpl<Value *> &Op1,
10512 SmallVectorImpl<Value *> &Op2,
10513 OrdersType &ReorderIndices) const {
10514 constexpr unsigned SmallNodeSize = 4;
10515 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10517 return false;
10518
10519 // Check if this is a duplicate of another split entry.
10520 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10521 << ".\n");
10522 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10523 if (E->isSame(VL)) {
10524 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10525 << *LocalState.getMainOp() << ".\n");
10526 return false;
10527 }
10528 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10529 if (all_of(VL, [&](Value *V) {
10530 return isa<PoisonValue>(V) || Values.contains(V);
10531 })) {
10532 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10533 return false;
10534 }
10535 }
10536
10537 ReorderIndices.assign(VL.size(), VL.size());
10538 SmallBitVector Op1Indices(VL.size());
10539 for (auto [Idx, V] : enumerate(VL)) {
10540 auto *I = dyn_cast<Instruction>(V);
10541 if (!I) {
10542 Op1.push_back(V);
10543 Op1Indices.set(Idx);
10544 continue;
10545 }
10546 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10547 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10548 *TLI)) ||
10549 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10550 !isAlternateInstruction(I, LocalState.getMainOp(),
10551 LocalState.getAltOp(), *TLI))) {
10552 Op1.push_back(V);
10553 Op1Indices.set(Idx);
10554 continue;
10555 }
10556 Op2.push_back(V);
10557 }
10558 Type *ScalarTy = getValueType(VL.front());
10559 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10560 unsigned Opcode0 = LocalState.getOpcode();
10561 unsigned Opcode1 = LocalState.getAltOpcode();
10562 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10563 // Enable the split node only if the scalars do not form a legal alternate
10564 // instruction (like X86 addsub).
10565 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10566 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10567 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10568 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10569 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10570 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10571 return false;
10572 // Enable the split node only if both halves are power-of-2/full registers.
10573 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10574 for (unsigned Idx : seq<unsigned>(VL.size())) {
10575 if (Op1Indices.test(Idx)) {
10576 ReorderIndices[Op1Cnt] = Idx;
10577 ++Op1Cnt;
10578 } else {
10579 ReorderIndices[Op2Cnt] = Idx;
10580 ++Op2Cnt;
10581 }
10582 }
10583 if (isIdentityOrder(ReorderIndices))
10584 ReorderIndices.clear();
10585 SmallVector<int> Mask;
10586 if (!ReorderIndices.empty())
10587 inversePermutation(ReorderIndices, Mask);
10588 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10589 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10590 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10591 // Bail out on non-profitable single-register ops, which are better
10592 // represented as alternate ops.
10593 if (NumParts >= VL.size())
10594 return false;
10595 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10596 InstructionCost InsertCost = ::getShuffleCost(
10597 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10598 FixedVectorType *SubVecTy =
10599 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10600 InstructionCost NewShuffleCost =
10601 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10602 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10603 (Mask.empty() || InsertCost >= NewShuffleCost))
10604 return false;
10605 if ((LocalState.getMainOp()->isBinaryOp() &&
10606 LocalState.getAltOp()->isBinaryOp() &&
10607 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10608 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10609 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10610 (LocalState.getMainOp()->isUnaryOp() &&
10611 LocalState.getAltOp()->isUnaryOp())) {
10612 InstructionCost OriginalVecOpsCost =
10613 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10614 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10615 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10616 for (unsigned Idx : seq<unsigned>(VL.size())) {
10617 if (isa<PoisonValue>(VL[Idx]))
10618 continue;
10619 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10620 }
10621 InstructionCost OriginalCost =
10622 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10623 VecTy, OriginalMask, Kind);
10624 InstructionCost NewVecOpsCost =
10625 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10626 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10627 InstructionCost NewCost =
10628 NewVecOpsCost + InsertCost +
10629 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10630 VectorizableTree.front()->getOpcode() == Instruction::Store
10631 ? NewShuffleCost
10632 : 0);
10633 // If not profitable to split - exit.
10634 if (NewCost >= OriginalCost)
10635 return false;
10636 }
10637 return true;
10638}
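// Illustrative example (assuming the target has no legal alternate instruction
// such as x86 addsub and 4-wide halves map to full/power-of-2 registers): for
// an 8-lane bundle VL = {add, sub, add, sub, add, sub, add, sub}, Op1 collects
// the adds at positions {0,2,4,6}, Op2 the subs at {1,3,5,7}, and
// ReorderIndices becomes {0,2,4,6,1,3,5,7}, i.e. the k-th element of the split
// order records the original position it came from. For binop/cast/unary
// bundles the split is kept only if two narrower vector ops plus the subvector
// insert are estimated cheaper than one wide op pair plus the two-source blend.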
10639
10640namespace {
10641/// Class accepts an incoming list of values, checks if it is able to model
10642/// "copyable" values as compatible operations, and generates the list of values
10643/// for scheduling and the list of operands for the new nodes.
10644class InstructionsCompatibilityAnalysis {
10645 DominatorTree &DT;
10646 const DataLayout &DL;
10647 const TargetTransformInfo &TTI;
10648 const TargetLibraryInfo &TLI;
10649 unsigned MainOpcode = 0;
10650 Instruction *MainOp = nullptr;
10651
10652 /// Checks if the opcode is supported as the main opcode for copyable
10653 /// elements.
10654 static bool isSupportedOpcode(const unsigned Opcode) {
10655 return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10656 Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10657 Opcode == Instruction::UDiv;
10658 }
10659
10660 /// Identifies the best candidate value, which represents the main opcode
10661 /// operation.
10662 /// Currently the best candidate is the Add instruction in the parent
10663 /// block with the highest DFS incoming number (the block that dominates the others).
10664 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10665 BasicBlock *Parent = nullptr;
10666 // Checks if the instruction has a supported opcode.
10667 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10668 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10669 return false;
10670 return I && isSupportedOpcode(I->getOpcode()) &&
10671 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10672 };
10673 // Exclude operand instructions immediately to improve compile time; they
10674 // cannot be scheduled anyway.
10675 SmallDenseSet<Value *, 8> Operands;
10676 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10677 bool AnyUndef = false;
10678 for (Value *V : VL) {
10679 auto *I = dyn_cast<Instruction>(V);
10680 if (!I) {
10681 AnyUndef |= isa<UndefValue>(V);
10682 continue;
10683 }
10684 if (!DT.isReachableFromEntry(I->getParent()))
10685 continue;
10686 if (Candidates.empty()) {
10687 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10688 Parent = I->getParent();
10689 Operands.insert(I->op_begin(), I->op_end());
10690 continue;
10691 }
10692 if (Parent == I->getParent()) {
10693 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10694 Operands.insert(I->op_begin(), I->op_end());
10695 continue;
10696 }
10697 auto *NodeA = DT.getNode(Parent);
10698 auto *NodeB = DT.getNode(I->getParent());
10699 assert(NodeA && "Should only process reachable instructions");
10700 assert(NodeB && "Should only process reachable instructions");
10701 assert((NodeA == NodeB) ==
10702 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10703 "Different nodes should have different DFS numbers");
10704 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10705 Candidates.clear();
10706 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10707 Parent = I->getParent();
10708 Operands.clear();
10709 Operands.insert(I->op_begin(), I->op_end());
10710 }
10711 }
10712 unsigned BestOpcodeNum = 0;
10713 MainOp = nullptr;
10714 for (const auto &P : Candidates) {
10715 if (P.second.size() < BestOpcodeNum)
10716 continue;
10717 for (Instruction *I : P.second) {
10718 if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
10719 MainOp = I;
10720 BestOpcodeNum = P.second.size();
10721 break;
10722 }
10723 }
10724 }
10725 if (MainOp) {
10726 // Do not match, if any copyable is a terminator from the same block as
10727 // the main operation.
10728 if (any_of(VL, [&](Value *V) {
10729 auto *I = dyn_cast<Instruction>(V);
10730 return I && I->getParent() == MainOp->getParent() &&
10731 I->isTerminator();
10732 })) {
10733 MainOp = nullptr;
10734 return;
10735 }
10736 MainOpcode = MainOp->getOpcode();
10737 }
10738 }
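// For illustration (hypothetical bundle): given VL = {a = add i32 ..., b = add
// i32 ..., c = shl i32 ...} all in the same reachable block, Add is the most
// frequent supported opcode, so one of the adds that is not itself an operand
// of another candidate becomes MainOp; the shl lane is then treated as a
// copyable element.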
10739
10740 /// Returns the idempotent value for the \p MainOp with the detected \p
10741 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10742 /// the operand itself, since V or V == V.
10743 Value *selectBestIdempotentValue() const {
10744 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10745 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10746 !MainOp->isCommutative());
10747 }
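// For example, with MainOpcode == Add a copyable element V is modeled as
// "add V, 0", with Shl/LShr as "shl/lshr V, 0", and with SDiv/UDiv as
// "sdiv/udiv V, 1" (the right identity returned by getBinOpIdentity).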
10748
10749 /// Returns the value and operands for \p V, considering whether it is an
10750 /// original instruction whose actual operands should be returned, or a
10751 /// copyable element that should be represented as an idempotent instruction.
10752 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10753 if (isa<PoisonValue>(V))
10754 return {V, V};
10755 if (!S.isCopyableElement(V))
10756 return convertTo(cast<Instruction>(V), S).second;
10757 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10758 return {V, selectBestIdempotentValue()};
10759 }
10760
10761 /// Builds operands for the original instructions.
10762 void
10763 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10764 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10765
10766 unsigned ShuffleOrOp =
10767 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10768 Instruction *VL0 = S.getMainOp();
10769
10770 switch (ShuffleOrOp) {
10771 case Instruction::PHI: {
10772 auto *PH = cast<PHINode>(VL0);
10773
10774 // Keeps the reordered operands to avoid code duplication.
10775 PHIHandler Handler(DT, PH, VL);
10776 Handler.buildOperands();
10777 Operands.assign(PH->getNumOperands(), {});
10778 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10779 Operands[I].assign(Handler.getOperands(I).begin(),
10780 Handler.getOperands(I).end());
10781 return;
10782 }
10783 case Instruction::ExtractValue:
10784 case Instruction::ExtractElement:
10785 // This is a special case, as it does not gather, but at the same time
10786 // we are not extending buildTree_rec() towards the operands.
10787 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10788 return;
10789 case Instruction::InsertElement:
10790 Operands.assign(2, {VL.size(), nullptr});
10791 for (auto [Idx, V] : enumerate(VL)) {
10792 auto *IE = cast<InsertElementInst>(V);
10793 for (auto [OpIdx, Ops] : enumerate(Operands))
10794 Ops[Idx] = IE->getOperand(OpIdx);
10795 }
10796 return;
10797 case Instruction::Load:
10798 Operands.assign(
10799 1, {VL.size(),
10800 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10801 for (auto [V, Op] : zip(VL, Operands.back())) {
10802 auto *LI = dyn_cast<LoadInst>(V);
10803 if (!LI)
10804 continue;
10805 Op = LI->getPointerOperand();
10806 }
10807 return;
10808 case Instruction::ZExt:
10809 case Instruction::SExt:
10810 case Instruction::FPToUI:
10811 case Instruction::FPToSI:
10812 case Instruction::FPExt:
10813 case Instruction::PtrToInt:
10814 case Instruction::IntToPtr:
10815 case Instruction::SIToFP:
10816 case Instruction::UIToFP:
10817 case Instruction::Trunc:
10818 case Instruction::FPTrunc:
10819 case Instruction::BitCast:
10820 case Instruction::ICmp:
10821 case Instruction::FCmp:
10822 case Instruction::Select:
10823 case Instruction::FNeg:
10824 case Instruction::Add:
10825 case Instruction::FAdd:
10826 case Instruction::Sub:
10827 case Instruction::FSub:
10828 case Instruction::Mul:
10829 case Instruction::FMul:
10830 case Instruction::UDiv:
10831 case Instruction::SDiv:
10832 case Instruction::FDiv:
10833 case Instruction::URem:
10834 case Instruction::SRem:
10835 case Instruction::FRem:
10836 case Instruction::Shl:
10837 case Instruction::LShr:
10838 case Instruction::AShr:
10839 case Instruction::And:
10840 case Instruction::Or:
10841 case Instruction::Xor:
10842 case Instruction::Freeze:
10843 case Instruction::Store:
10844 case Instruction::ShuffleVector:
10845 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10846 for (auto [Idx, V] : enumerate(VL)) {
10847 auto *I = dyn_cast<Instruction>(V);
10848 if (!I) {
10849 for (auto [OpIdx, Ops] : enumerate(Operands))
10850 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10851 continue;
10852 }
10853 auto [Op, ConvertedOps] = convertTo(I, S);
10854 for (auto [OpIdx, Ops] : enumerate(Operands))
10855 Ops[Idx] = ConvertedOps[OpIdx];
10856 }
10857 return;
10858 case Instruction::GetElementPtr: {
10859 Operands.assign(2, {VL.size(), nullptr});
10860 // Need to cast all indices to the same type before vectorization to
10861 // avoid a crash.
10862 // Required to be able to find correct matches between different gather
10863 // nodes and reuse the vectorized values rather than trying to gather them
10864 // again.
10865 const unsigned IndexIdx = 1;
10866 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10867 Type *Ty =
10868 all_of(VL,
10869 [&](Value *V) {
10870 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10871 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10872 })
10873 ? VL0Ty
10874 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10875 ->getPointerOperandType()
10876 ->getScalarType());
10877 for (auto [Idx, V] : enumerate(VL)) {
10878 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10879 if (!GEP) {
10880 Operands[0][Idx] = V;
10881 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10882 continue;
10883 }
10884 Operands[0][Idx] = GEP->getPointerOperand();
10885 auto *Op = GEP->getOperand(IndexIdx);
10886 auto *CI = dyn_cast<ConstantInt>(Op);
10887 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10888 CI, Ty, CI->getValue().isSignBitSet(), DL)
10889 : Op;
10890 }
10891 return;
10892 }
10893 case Instruction::Call: {
10894 auto *CI = cast<CallInst>(VL0);
10895 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
10896 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10897 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
10898 continue;
10899 auto &Ops = Operands.emplace_back();
10900 for (Value *V : VL) {
10901 auto *I = dyn_cast<Instruction>(V);
10902 Ops.push_back(I ? I->getOperand(Idx)
10903 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10904 }
10905 }
10906 return;
10907 }
10908 default:
10909 break;
10910 }
10911 llvm_unreachable("Unexpected vectorization of the instructions.");
10912 }
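// A hypothetical GetElementPtr example for the index canonicalization above:
// for VL = { getelementptr i32, ptr %p, i64 1 ; getelementptr i32, ptr %q,
// i32 2 } the common index type comes from the data layout (e.g. i64), so
// Operands[0] = {%p, %q} and Operands[1] = {1, 2} with both constants folded
// to i64; a non-GEP lane %v would contribute {%v, 0} instead.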
10913
10914public:
10915 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10916 const TargetTransformInfo &TTI,
10917 const TargetLibraryInfo &TLI)
10918 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10919
10920 InstructionsState
10921 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
10922 bool TryCopyableElementsVectorization,
10923 bool WithProfitabilityCheck = false,
10924 bool SkipSameCodeCheck = false) {
10925 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
10926 ? InstructionsState::invalid()
10927 : getSameOpcode(VL, TLI);
10928 if (S)
10929 return S;
10930 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
10931 return S;
10932 findAndSetMainInstruction(VL, R);
10933 if (!MainOp)
10934 return InstructionsState::invalid();
10935 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
10936 if (!WithProfitabilityCheck)
10937 return S;
10938 // Check if it is profitable to vectorize the instruction.
10939 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
10940 auto BuildCandidates =
10941 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
10942 Value *V2) {
10943 if (V1 != V2 && isa<PHINode>(V1))
10944 return;
10945 auto *I1 = dyn_cast<Instruction>(V1);
10946 auto *I2 = dyn_cast<Instruction>(V2);
10947 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
10948 I1->getParent() != I2->getParent())
10949 return;
10950 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
10951 };
10952 if (VL.size() == 2) {
10953 // Check if the operands allow better vectorization.
10954 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
10955 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
10956 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
10957 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
10958 R.findBestRootPair(Candidates1) &&
10959 R.findBestRootPair(Candidates2);
10960 if (!Res && isCommutative(MainOp)) {
10961 Candidates1.clear();
10962 Candidates2.clear();
10963 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
10964 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
10965 Res = !Candidates1.empty() && !Candidates2.empty() &&
10966 R.findBestRootPair(Candidates1) &&
10967 R.findBestRootPair(Candidates2);
10968 }
10969 if (!Res)
10970 return InstructionsState::invalid();
10971 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10972 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10973 InstructionCost VectorCost;
10974 FixedVectorType *VecTy =
10975 getWidenedType(S.getMainOp()->getType(), VL.size());
10976 switch (MainOpcode) {
10977 case Instruction::Add:
10978 case Instruction::LShr:
10979 case Instruction::Shl:
10980 case Instruction::SDiv:
10981 case Instruction::UDiv:
10982 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10983 break;
10984 default:
10985 llvm_unreachable("Unexpected instruction.");
10986 }
10987 if (VectorCost > ScalarCost)
10988 return InstructionsState::invalid();
10989 return S;
10990 }
10991 assert(Operands.size() == 2 && "Unexpected number of operands!");
10992 unsigned CopyableNum =
10993 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
10994 if (CopyableNum < VL.size() / 2)
10995 return S;
10996 // Too many phi copyables - exit.
10997 const unsigned Limit = VL.size() / 24;
10998 if ((CopyableNum >= VL.size() - Limit ||
10999 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11000 CopyableNum >= MaxPHINumOperands) &&
11001 all_of(VL, [&](Value *V) {
11002 return isa<PHINode>(V) || !S.isCopyableElement(V);
11003 }))
11004 return InstructionsState::invalid();
11005 // Check profitability if number of copyables > VL.size() / 2.
11006 // 1. Reorder operands for better matching.
11007 if (isCommutative(MainOp)) {
11008 for (auto &Ops : Operands) {
11009 // Make instructions the first operands.
11010 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11011 std::swap(Ops.front(), Ops.back());
11012 continue;
11013 }
11014 // Make constants the second operands.
11015 if (isa<Constant>(Ops.front())) {
11016 std::swap(Ops.front(), Ops.back());
11017 continue;
11018 }
11019 }
11020 }
11021 // 2. Check if operands can be vectorized.
11022 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11023 return InstructionsState::invalid();
11024 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11025 if (allConstant(Ops) || isSplat(Ops))
11026 return true;
11027 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11028 // single one is different.
11029 constexpr unsigned Limit = 4;
11030 if (Operands.front().size() >= Limit) {
11031 SmallDenseMap<const Value *, unsigned> Counters;
11032 for (Value *V : Ops) {
11033 if (isa<UndefValue>(V))
11034 continue;
11035 ++Counters[V];
11036 }
11037 if (Counters.size() == 2 &&
11038 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11039 return C.second == 1;
11040 }))
11041 return true;
11042 }
11043 // First operand not a constant or splat? Last attempt - check for
11044 // potential vectorization.
11045 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11046 InstructionsState OpS = Analysis.buildInstructionsState(
11047 Ops, R, /*TryCopyableElementsVectorization=*/true);
11048 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11049 return false;
11050 unsigned CopyableNum =
11051 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11052 return CopyableNum <= VL.size() / 2;
11053 };
11054 if (!CheckOperand(Operands.front()))
11055 return InstructionsState::invalid();
11056
11057 return S;
11058 }
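// For a hypothetical 2-element bundle {%x = add i32 %a, %b, %c} with %c
// modeled as the copyable "add %c, 0", the profitability check above first
// requires the operand pairs {%a, %c} / {%b, 0} to form good roots and then
// compares the scalar add cost against a <2 x i32> add; the copyable match is
// rejected if the vector form is more expensive under the target cost model.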
11059
11060 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11061 ArrayRef<Value *> VL) {
11062 assert(S && "Invalid state!");
11063 SmallVector<BoUpSLP::ValueList> Operands;
11064 if (S.areInstructionsWithCopyableElements()) {
11065 MainOp = S.getMainOp();
11066 MainOpcode = S.getOpcode();
11067 Operands.assign(MainOp->getNumOperands(),
11068 BoUpSLP::ValueList(VL.size(), nullptr));
11069 for (auto [Idx, V] : enumerate(VL)) {
11070 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11071 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11072 Operands[OperandIdx][Idx] = Operand;
11073 }
11074 } else {
11075 buildOriginalOperands(S, VL, Operands);
11076 }
11077 return Operands;
11078 }
11079};
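// Operand construction for copyable elements, on a made-up bundle: with main
// opcode Add and VL = {%x = add i32 %a, %b ; %c} where %c is copyable,
// buildOperands() yields Operands[0] = {%a, %c} and Operands[1] = {%b, 0},
// i.e. %c is represented as the idempotent "add %c, 0" so both lanes share
// the Add opcode.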
11080} // namespace
11081
11082BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11083 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11084 bool TryCopyableElementsVectorization) const {
11085 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11086
11087 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11088 InstructionsState S = Analysis.buildInstructionsState(
11089 VL, *this, TryCopyableElementsVectorization,
11090 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11091
11092 // Don't go into catchswitch blocks, which can happen with PHIs.
11093 // Such blocks can only have PHIs and the catchswitch. There is no
11094 // place to insert a shuffle if we need to, so just avoid that issue.
11095 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11096 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11097 // Do not try to pack to avoid extra instructions here.
11098 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11099 /*TryToFindDuplicates=*/false);
11100 }
11101
11102 // Check if this is a duplicate of another entry.
11103 if (S) {
11104 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11105 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11106 if (E->isSame(VL)) {
11107 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11108 << ".\n");
11109 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11110 }
11111 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11111 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11112 if (all_of(VL, [&](Value *V) {
11113 return isa<PoisonValue>(V) || Values.contains(V) ||
11114 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11115 LI->getLoopFor(S.getMainOp()->getParent()) &&
11116 isVectorized(V));
11117 })) {
11118 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11119 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11120 }
11121 }
11122 }
11123
11124 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11125 // a load), in which case peek through to include it in the tree, without
11126 // ballooning over-budget.
11127 if (Depth >= RecursionMaxDepth &&
11128 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11129 (match(S.getMainOp(), m_Load(m_Value())) ||
11130 all_of(VL, [&S](const Value *I) {
11131 return match(I,
11132 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11133 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11134 })))) {
11135 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11136 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11137 }
11138
11139 // Don't handle scalable vectors
11140 if (S && S.getOpcode() == Instruction::ExtractElement &&
11141 isa<ScalableVectorType>(
11142 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11143 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11144 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11145 }
11146
11147 // Don't handle vectors.
11148 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11149 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11150 // Do not try to pack to avoid extra instructions here.
11151 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11152 /*TryToFindDuplicates=*/false);
11153 }
11154
11155 // If all of the operands are identical or constant we have a simple solution.
11156 // If we deal with insert/extract instructions, they all must have constant
11157 // indices, otherwise we should gather them, not try to vectorize.
11158 // If this is an alternate op node with 2 elements and gathered operands - do
11159 // not vectorize.
11160 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11161 if (!S || !S.isAltShuffle() || VL.size() > 2)
11162 return false;
11163 if (VectorizableTree.size() < MinTreeSize)
11164 return false;
11165 if (Depth >= RecursionMaxDepth - 1)
11166 return true;
11167 // Check if all operands are extracts, part of vector node or can build a
11168 // regular vectorize node.
11169 SmallVector<unsigned, 8> InstsCount;
11170 for (Value *V : VL) {
11171 auto *I = cast<Instruction>(V);
11172 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11173 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11174 }));
11175 }
11176 bool IsCommutative =
11177 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11178 if ((IsCommutative &&
11179 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11180 (!IsCommutative &&
11181 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11182 return true;
11183 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11184 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11185 auto *I1 = cast<Instruction>(VL.front());
11186 auto *I2 = cast<Instruction>(VL.back());
11187 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11188 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11189 I2->getOperand(Op));
11190 if (static_cast<unsigned>(count_if(
11191 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11192 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
11193 })) >= S.getMainOp()->getNumOperands() / 2)
11194 return false;
11195 if (S.getMainOp()->getNumOperands() > 2)
11196 return true;
11197 if (IsCommutative) {
11198 // Check permuted operands.
11199 Candidates.clear();
11200 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11201 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11202 I2->getOperand((Op + 1) % E));
11203 if (any_of(
11204 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11205 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
11206 }))
11207 return false;
11208 }
11209 return true;
11210 };
11211 SmallVector<unsigned> SortedIndices;
11212 BasicBlock *BB = nullptr;
11213 bool IsScatterVectorizeUserTE =
11214 UserTreeIdx.UserTE &&
11215 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11216 bool AreAllSameBlock = S.valid();
11217 bool AreScatterAllGEPSameBlock =
11218 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11219 VL.size() > 2 &&
11220 all_of(VL,
11221 [&BB](Value *V) {
11222 auto *I = dyn_cast<GetElementPtrInst>(V);
11223 if (!I)
11224 return doesNotNeedToBeScheduled(V);
11225 if (!BB)
11226 BB = I->getParent();
11227 return BB == I->getParent() && I->getNumOperands() == 2;
11228 }) &&
11229 BB &&
11230 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11231 SortedIndices));
11232 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11233 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11234 (S &&
11235 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11236 S.getMainOp()) &&
11237 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11238 NotProfitableForVectorization(VL)) {
11239 if (!S) {
11240 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11241 "C,S,B,O, small shuffle. \n";
11242 dbgs() << "[";
11243 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11244 dbgs() << "]\n");
11245 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11246 /*TryToFindDuplicates=*/true,
11247 /*TrySplitVectorize=*/true);
11248 }
11249 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11250 dbgs() << "[";
11251 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11252 dbgs() << "]\n");
11253 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11254 }
11255
11256 // Don't vectorize ephemeral values.
11257 if (S && !EphValues.empty()) {
11258 for (Value *V : VL) {
11259 if (EphValues.count(V)) {
11260 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11261 << ") is ephemeral.\n");
11262 // Do not try to pack to avoid extra instructions here.
11263 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11264 /*TryToFindDuplicates=*/false);
11265 }
11266 }
11267 }
11268
11269 // We now know that this is a vector of instructions of the same type from
11270 // the same block.
11271
11272 // Check that none of the instructions in the bundle are already in the tree
11273 // and that the node is not unprofitable to vectorize as a small
11274 // alternate node.
11275 if (S && S.isAltShuffle()) {
11276 auto GetNumVectorizedExtracted = [&]() {
11277 APInt Extracted = APInt::getZero(VL.size());
11278 APInt Vectorized = APInt::getAllOnes(VL.size());
11279 for (auto [Idx, V] : enumerate(VL)) {
11280 auto *I = dyn_cast<Instruction>(V);
11281 if (!I || doesNotNeedToBeScheduled(I) ||
11282 all_of(I->operands(), [&](const Use &U) {
11283 return isa<ExtractElementInst>(U.get());
11284 }))
11285 continue;
11286 if (isVectorized(I))
11287 Vectorized.clearBit(Idx);
11288 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11289 Extracted.setBit(Idx);
11290 }
11291 return std::make_pair(Vectorized, Extracted);
11292 };
11293 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11294 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11295 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11296 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11297 // Rough cost estimation: check whether the vector code (+ potential extracts)
11298 // is more profitable than the scalar code + buildvector.
11299 Type *ScalarTy = VL.front()->getType();
11300 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11301 InstructionCost VectorizeCostEstimate =
11302 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11303 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11304 /*Insert=*/false, /*Extract=*/true, Kind);
11305 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11306 *TTI, ScalarTy, VecTy, Vectorized,
11307 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11308 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11309 }
11310 if (PreferScalarize) {
11311 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11312 "node is not profitable.\n");
11313 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11314 }
11315 }
11316
11317 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11318 if (UserIgnoreList && !UserIgnoreList->empty()) {
11319 for (Value *V : VL) {
11320 if (UserIgnoreList->contains(V)) {
11321 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11322 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11323 }
11324 }
11325 }
11326
11327 // Special processing for sorted pointers for ScatterVectorize node with
11328 // constant indices only.
11329 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11330 assert(VL.front()->getType()->isPointerTy() &&
11331 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
11332 "Expected pointers only.");
11333 // Reset S to make it a GetElementPtr kind of node.
11334 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11335 assert(It != VL.end() && "Expected at least one GEP.");
11336 S = getSameOpcode(*It, *TLI);
11337 }
11338
11339 // Check that all of the users of the scalars that we want to vectorize are
11340 // schedulable.
11341 Instruction *VL0 = S.getMainOp();
11342 BB = VL0->getParent();
11343
11344 if (S &&
11345 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11346 !DT->isReachableFromEntry(BB))) {
11347 // Don't go into unreachable blocks. They may contain instructions with
11348 // dependency cycles which confuse the final scheduling.
11349 // Do not vectorize EH and non-returning blocks, not profitable in most
11350 // cases.
11351 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11352 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11353 }
11354 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11355}
11356
11357void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11358 const EdgeInfo &UserTreeIdx,
11359 unsigned InterleaveFactor) {
11360 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11361
11362 SmallVector<int> ReuseShuffleIndices;
11363 SmallVector<Value *> VL(VLRef);
11364
11365 // Tries to build split node.
11366 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11367 SmallVector<Value *> Op1, Op2;
11368 OrdersType ReorderIndices;
11369 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11370 return false;
11371
11372 auto Invalid = ScheduleBundle::invalid();
11373 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11374 UserTreeIdx, {}, ReorderIndices);
11375 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11376 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11377 InstructionsState S = getSameOpcode(Op, *TLI);
11378 if (S && (isa<LoadInst>(S.getMainOp()) ||
11379 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11380 // Build gather node for loads, they will be gathered later.
11381 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11382 Idx == 0 ? 0 : Op1.size());
11383 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11384 } else {
11385 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11386 Idx == 0 ? 0 : Op1.size());
11387 buildTreeRec(Op, Depth, {TE, Idx});
11388 }
11389 };
11390 AddNode(Op1, 0);
11391 AddNode(Op2, 1);
11392 return true;
11393 };
11394
11395 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11396 bool AreConsts = false;
11397 for (Value *V : VL) {
11398 if (isa<PoisonValue>(V))
11399 continue;
11400 if (isa<Constant>(V)) {
11401 AreConsts = true;
11402 continue;
11403 }
11404 if (!isa<PHINode>(V))
11405 return false;
11406 }
11407 return AreConsts;
11408 };
11409 if (AreOnlyConstsWithPHIs(VL)) {
11410 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11411 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11412 return;
11413 }
11414
11415 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11416 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11417 InstructionsState S = Legality.getInstructionsState();
11418 if (!Legality.isLegal()) {
11419 if (Legality.trySplitVectorize()) {
11420 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11421 // Last chance to try to vectorize alternate node.
11422 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11423 return;
11424 }
11425 if (!S)
11426 Legality = getScalarsVectorizationLegality(
11427 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11428 if (!Legality.isLegal()) {
11429 if (Legality.tryToFindDuplicates())
11430 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11431 UserTreeIdx);
11432
11433 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11434 return;
11435 }
11436 S = Legality.getInstructionsState();
11437 }
11438
11439 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11440 if (S.isAltShuffle() && TrySplitNode(S))
11441 return;
11442
11443 // Check that every instruction appears once in this bundle.
11444 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11445 /*TryPad=*/true)) {
11446 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11447 return;
11448 }
11449
11450 // Perform specific checks for each particular instruction kind.
11451 bool IsScatterVectorizeUserTE =
11452 UserTreeIdx.UserTE &&
11453 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11454 OrdersType CurrentOrder;
11455 SmallVector<Value *> PointerOps;
11456 StridedPtrInfo SPtrInfo;
11457 TreeEntry::EntryState State = getScalarsVectorizationState(
11458 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11459 if (State == TreeEntry::NeedToGather) {
11460 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11461 return;
11462 }
11463
11464 Instruction *VL0 = S.getMainOp();
11465 BasicBlock *BB = VL0->getParent();
11466 auto &BSRef = BlocksSchedules[BB];
11467 if (!BSRef)
11468 BSRef = std::make_unique<BlockScheduling>(BB);
11469
11470 BlockScheduling &BS = *BSRef;
11471
11472 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11473 std::optional<ScheduleBundle *> BundlePtr =
11474 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11475#ifdef EXPENSIVE_CHECKS
11476 // Make sure we didn't break any internal invariants
11477 BS.verify();
11478#endif
11479 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11480 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11481 // Last chance to try to vectorize alternate node.
11482 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11483 return;
11484 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11485 NonScheduledFirst.insert(VL.front());
11486 if (S.getOpcode() == Instruction::Load &&
11487 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11488 registerNonVectorizableLoads(VL);
11489 return;
11490 }
11491 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11492 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11493 ScheduleBundle Empty;
11494 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11495 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11496
11497 unsigned ShuffleOrOp =
11498 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11499 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11500 // Postpone PHI node creation.
11501 SmallVector<unsigned> PHIOps;
11502 for (unsigned I : seq<unsigned>(Operands.size())) {
11503 ArrayRef<Value *> Op = Operands[I];
11504 if (Op.empty())
11505 continue;
11506 InstructionsState S = getSameOpcode(Op, *TLI);
11507 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11508 buildTreeRec(Op, Depth + 1, {TE, I});
11509 else
11510 PHIOps.push_back(I);
11511 }
11512 for (unsigned I : PHIOps)
11513 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11514 };
11515 switch (ShuffleOrOp) {
11516 case Instruction::PHI: {
11517 TreeEntry *TE =
11518 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11519 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11520 TE->dump());
11521
11522 TE->setOperands(Operands);
11523 CreateOperandNodes(TE, Operands);
11524 return;
11525 }
11526 case Instruction::ExtractValue:
11527 case Instruction::ExtractElement: {
11528 if (CurrentOrder.empty()) {
11529 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11530 } else {
11531 LLVM_DEBUG({
11532 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11533 "with order";
11534 for (unsigned Idx : CurrentOrder)
11535 dbgs() << " " << Idx;
11536 dbgs() << "\n";
11537 });
11538 fixupOrderingIndices(CurrentOrder);
11539 }
11540 // Insert new order with initial value 0, if it does not exist,
11541 // otherwise return the iterator to the existing one.
11542 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11543 ReuseShuffleIndices, CurrentOrder);
11544 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11545 "(ExtractValueInst/ExtractElementInst).\n";
11546 TE->dump());
11547 // This is a special case, as it does not gather, but at the same time
11548 // we are not extending buildTreeRec() towards the operands.
11549 TE->setOperands(Operands);
11550 return;
11551 }
11552 case Instruction::InsertElement: {
11553 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11554
11555 auto OrdCompare = [](const std::pair<int, int> &P1,
11556 const std::pair<int, int> &P2) {
11557 return P1.first > P2.first;
11558 };
11559 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11560 decltype(OrdCompare)>
11561 Indices(OrdCompare);
11562 for (int I = 0, E = VL.size(); I < E; ++I) {
11563 unsigned Idx = *getElementIndex(VL[I]);
11564 Indices.emplace(Idx, I);
11565 }
11566 OrdersType CurrentOrder(VL.size(), VL.size());
11567 bool IsIdentity = true;
11568 for (int I = 0, E = VL.size(); I < E; ++I) {
11569 CurrentOrder[Indices.top().second] = I;
11570 IsIdentity &= Indices.top().second == I;
11571 Indices.pop();
11572 }
11573 if (IsIdentity)
11574 CurrentOrder.clear();
11575 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11576 {}, CurrentOrder);
11577 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11578 TE->dump());
11579
11580 TE->setOperands(Operands);
11581 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11582 return;
11583 }
11584 case Instruction::Load: {
11585 // Check that a vectorized load would load the same memory as a scalar
11586 // load. For example, we don't want to vectorize loads that are smaller
11587 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11588 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11589 // from such a struct, we read/write packed bits disagreeing with the
11590 // unvectorized version.
11591 TreeEntry *TE = nullptr;
11592 fixupOrderingIndices(CurrentOrder);
11593 switch (State) {
11594 case TreeEntry::Vectorize:
11595 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11596 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11597 if (CurrentOrder.empty())
11598 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11599 TE->dump());
11600 else
11601 LLVM_DEBUG(dbgs()
11602 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11603 TE->dump());
11604 break;
11605 case TreeEntry::CompressVectorize:
11606 // Vectorizing non-consecutive loads with (masked)load + compress.
11607 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11608 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11609 LLVM_DEBUG(
11610 dbgs()
11611 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11612 TE->dump());
11613 break;
11614 case TreeEntry::StridedVectorize:
11615 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11616 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11617 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11618 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11619 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11620 TE->dump());
11621 break;
11622 case TreeEntry::ScatterVectorize:
11623 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11624 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11625 UserTreeIdx, ReuseShuffleIndices);
11626 LLVM_DEBUG(
11627 dbgs()
11628 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11629 TE->dump());
11630 break;
11631 case TreeEntry::CombinedVectorize:
11632 case TreeEntry::SplitVectorize:
11633 case TreeEntry::NeedToGather:
11634 llvm_unreachable("Unexpected loads state.");
11635 }
11636 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11637 assert(Operands.size() == 1 && "Expected a single operand only");
11638 SmallVector<int> Mask;
11639 inversePermutation(CurrentOrder, Mask);
11640 reorderScalars(Operands.front(), Mask);
11641 }
11642 TE->setOperands(Operands);
11643 if (State == TreeEntry::ScatterVectorize)
11644 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11645 return;
11646 }
11647 case Instruction::ZExt:
11648 case Instruction::SExt:
11649 case Instruction::FPToUI:
11650 case Instruction::FPToSI:
11651 case Instruction::FPExt:
11652 case Instruction::PtrToInt:
11653 case Instruction::IntToPtr:
11654 case Instruction::SIToFP:
11655 case Instruction::UIToFP:
11656 case Instruction::Trunc:
11657 case Instruction::FPTrunc:
11658 case Instruction::BitCast: {
11659 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11660 std::make_pair(std::numeric_limits<unsigned>::min(),
11661 std::numeric_limits<unsigned>::max()));
11662 if (ShuffleOrOp == Instruction::ZExt ||
11663 ShuffleOrOp == Instruction::SExt) {
11664 CastMaxMinBWSizes = std::make_pair(
11665 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11666 PrevMaxBW),
11667 std::min<unsigned>(
11668 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11669 PrevMinBW));
11670 } else if (ShuffleOrOp == Instruction::Trunc) {
11671 CastMaxMinBWSizes = std::make_pair(
11672 std::max<unsigned>(
11673 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11674 PrevMaxBW),
11675 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11676 PrevMinBW));
11677 }
11678 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11679 ReuseShuffleIndices);
11680 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11681 TE->dump());
11682
11683 TE->setOperands(Operands);
11684 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11685 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11686 if (ShuffleOrOp == Instruction::Trunc) {
11687 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11688 } else if (ShuffleOrOp == Instruction::SIToFP ||
11689 ShuffleOrOp == Instruction::UIToFP) {
11690 unsigned NumSignBits =
11691 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11692 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11693 APInt Mask = DB->getDemandedBits(OpI);
11694 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11695 }
11696 if (NumSignBits * 2 >=
11697 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11698 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11699 }
11700 return;
11701 }
11702 case Instruction::ICmp:
11703 case Instruction::FCmp: {
11704 // Check that all of the compares have the same predicate.
11705 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11706 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11707 ReuseShuffleIndices);
11708 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11709 TE->dump());
11710
11711 VLOperands Ops(VL, Operands, S, *this);
11712 if (cast<CmpInst>(VL0)->isCommutative()) {
11713 // Commutative predicate - collect + sort operands of the instructions
11714 // so that each side is more likely to have the same opcode.
11715 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
11716 "Commutative Predicate mismatch");
11717 Ops.reorder();
11718 Operands.front() = Ops.getVL(0);
11719 Operands.back() = Ops.getVL(1);
11720 } else {
11721 // Collect operands - commute if it uses the swapped predicate.
11722 for (auto [Idx, V] : enumerate(VL)) {
11723 if (isa<PoisonValue>(V))
11724 continue;
11725 auto *Cmp = cast<CmpInst>(V);
11726 if (Cmp->getPredicate() != P0)
11727 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11728 }
11729 }
11730 TE->setOperands(Operands);
11731 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11732 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11733 if (ShuffleOrOp == Instruction::ICmp) {
11734 unsigned NumSignBits0 =
11735 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11736 if (NumSignBits0 * 2 >=
11737 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11738 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11739 unsigned NumSignBits1 =
11740 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11741 if (NumSignBits1 * 2 >=
11742 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11743 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11744 }
11745 return;
11746 }
11747 case Instruction::Select:
11748 case Instruction::FNeg:
11749 case Instruction::Add:
11750 case Instruction::FAdd:
11751 case Instruction::Sub:
11752 case Instruction::FSub:
11753 case Instruction::Mul:
11754 case Instruction::FMul:
11755 case Instruction::UDiv:
11756 case Instruction::SDiv:
11757 case Instruction::FDiv:
11758 case Instruction::URem:
11759 case Instruction::SRem:
11760 case Instruction::FRem:
11761 case Instruction::Shl:
11762 case Instruction::LShr:
11763 case Instruction::AShr:
11764 case Instruction::And:
11765 case Instruction::Or:
11766 case Instruction::Xor:
11767 case Instruction::Freeze: {
11768 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11769 ReuseShuffleIndices);
11770 LLVM_DEBUG(
11771 dbgs() << "SLP: added a new TreeEntry "
11772 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11773 TE->dump());
11774
11775 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11776 VLOperands Ops(VL, Operands, S, *this);
11777 Ops.reorder();
11778 Operands[0] = Ops.getVL(0);
11779 Operands[1] = Ops.getVL(1);
11780 }
11781 TE->setOperands(Operands);
11782 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11783 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11784 return;
11785 }
11786 case Instruction::GetElementPtr: {
11787 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11788 ReuseShuffleIndices);
11789 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11790 TE->dump());
11791 TE->setOperands(Operands);
11792
11793 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11794 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11795 return;
11796 }
11797 case Instruction::Store: {
11798 bool Consecutive = CurrentOrder.empty();
11799 if (!Consecutive)
11800 fixupOrderingIndices(CurrentOrder);
11801 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11802 ReuseShuffleIndices, CurrentOrder);
11803 if (Consecutive)
11804 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11805 TE->dump());
11806 else
11807 LLVM_DEBUG(
11808 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11809 TE->dump());
11810 TE->setOperands(Operands);
11811 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11812 return;
11813 }
11814 case Instruction::Call: {
11815 // Check if the calls are all to the same vectorizable intrinsic or
11816 // library function.
11817 CallInst *CI = cast<CallInst>(VL0);
11818 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11819
11820 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11821 ReuseShuffleIndices);
11822 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11823 TE->dump());
11824 if (isCommutative(VL0)) {
11825 VLOperands Ops(VL, Operands, S, *this);
11826 Ops.reorder();
11827 Operands[0] = Ops.getVL(0);
11828 Operands[1] = Ops.getVL(1);
11829 }
11830 TE->setOperands(Operands);
11831 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11832 // For scalar operands there is no need to create an entry, since there is
11833 // no need to vectorize them.
11834 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
11835 continue;
11836 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11837 }
11838 return;
11839 }
11840 case Instruction::ShuffleVector: {
11841 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11842 ReuseShuffleIndices);
11843 if (S.isAltShuffle()) {
11844 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11845 TE->dump());
11846 } else {
11847 assert(SLPReVec && "Only supported by REVEC.");
11848 LLVM_DEBUG(
11849 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11850 TE->dump());
11851 }
11852
11853 // Reorder operands if reordering would enable vectorization.
11854 auto *CI = dyn_cast<CmpInst>(VL0);
11855 if (CI && any_of(VL, [](Value *V) {
11856 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11857 })) {
11858 auto *MainCI = cast<CmpInst>(S.getMainOp());
11859 auto *AltCI = cast<CmpInst>(S.getAltOp());
11860 CmpInst::Predicate MainP = MainCI->getPredicate();
11861 CmpInst::Predicate AltP = AltCI->getPredicate();
11862 assert(MainP != AltP &&
11863 "Expected different main/alternate predicates.");
11864 // Collect operands - commute if it uses the swapped predicate or
11865 // alternate operation.
11866 for (auto [Idx, V] : enumerate(VL)) {
11867 if (isa<PoisonValue>(V))
11868 continue;
11869 auto *Cmp = cast<CmpInst>(V);
11870
11871 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11872 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11873 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11874 } else {
11875 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11876 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11877 }
11878 }
11879 TE->setOperands(Operands);
11880 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11881 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11882 return;
11883 }
11884
11885 if (isa<BinaryOperator>(VL0) || CI) {
11886 VLOperands Ops(VL, Operands, S, *this);
11887 Ops.reorder();
11888 Operands[0] = Ops.getVL(0);
11889 Operands[1] = Ops.getVL(1);
11890 }
11891 TE->setOperands(Operands);
11892 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11893 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11894 return;
11895 }
11896 default:
11897 break;
11898 }
11899 llvm_unreachable("Unexpected vectorization of the instructions.");
11900}
11901
11902 unsigned BoUpSLP::canMapToVector(Type *T) const {
11903 unsigned N = 1;
11904 Type *EltTy = T;
11905
11906 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
11907 if (EltTy->isEmptyTy())
11908 return 0;
11909 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11910 // Check that struct is homogeneous.
11911 for (const auto *Ty : ST->elements())
11912 if (Ty != *ST->element_begin())
11913 return 0;
11914 N *= ST->getNumElements();
11915 EltTy = *ST->element_begin();
11916 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11917 N *= AT->getNumElements();
11918 EltTy = AT->getElementType();
11919 } else {
11920 auto *VT = cast<FixedVectorType>(EltTy);
11921 N *= VT->getNumElements();
11922 EltTy = VT->getElementType();
11923 }
11924 }
11925
11926 if (!isValidElementType(EltTy))
11927 return 0;
11928 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
11929 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
11930 VTSize != DL->getTypeStoreSizeInBits(T))
11931 return 0;
11932 return N;
11933}
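// Example: for T = [4 x i32] (or a homogeneous struct of four i32s) the
// function returns 4, provided <4 x i32> lies between MinVecRegSize and
// MaxVecRegSize and its store size matches that of T; a struct with mixed
// element types returns 0.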
11934
11935bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
11936 SmallVectorImpl<unsigned> &CurrentOrder,
11937 bool ResizeAllowed) const {
11938 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
11939 assert(It != VL.end() && "Expected at least one extract instruction.");
11940 auto *E0 = cast<Instruction>(*It);
11941 assert(
11942 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
11943 "Invalid opcode");
11944 // Check if all of the extracts come from the same vector and from the
11945 // correct offset.
11946 Value *Vec = E0->getOperand(0);
11947
11948 CurrentOrder.clear();
11949
11950 // We have to extract from a vector/aggregate with the same number of elements.
11951 unsigned NElts;
11952 if (E0->getOpcode() == Instruction::ExtractValue) {
11953 NElts = canMapToVector(Vec->getType());
11954 if (!NElts)
11955 return false;
11956 // Check if load can be rewritten as load of vector.
11957 LoadInst *LI = dyn_cast<LoadInst>(Vec);
11958 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
11959 return false;
11960 } else {
11961 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
11962 }
11963
11964 unsigned E = VL.size();
11965 if (!ResizeAllowed && NElts != E)
11966 return false;
11967 SmallVector<int> Indices(E, PoisonMaskElem);
11968 unsigned MinIdx = NElts, MaxIdx = 0;
11969 for (auto [I, V] : enumerate(VL)) {
11970 auto *Inst = dyn_cast<Instruction>(V);
11971 if (!Inst)
11972 continue;
11973 if (Inst->getOperand(0) != Vec)
11974 return false;
11975 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
11976 if (isa<UndefValue>(EE->getIndexOperand()))
11977 continue;
11978 std::optional<unsigned> Idx = getExtractIndex(Inst);
11979 if (!Idx)
11980 return false;
11981 const unsigned ExtIdx = *Idx;
11982 if (ExtIdx >= NElts)
11983 continue;
11984 Indices[I] = ExtIdx;
11985 if (MinIdx > ExtIdx)
11986 MinIdx = ExtIdx;
11987 if (MaxIdx < ExtIdx)
11988 MaxIdx = ExtIdx;
11989 }
11990 if (MaxIdx - MinIdx + 1 > E)
11991 return false;
11992 if (MaxIdx + 1 <= E)
11993 MinIdx = 0;
11994
11995 // Check that all of the indices extract from the correct offset.
11996 bool ShouldKeepOrder = true;
11997 // Assign to all items the initial value E + 1 so we can check if the extract
11998 // instruction index was used already.
11999 // Also, later we can check that all the indices are used and we have a
12000 // consecutive access in the extract instructions, by checking that no
12001 // element of CurrentOrder still has value E + 1.
12002 CurrentOrder.assign(E, E);
12003 for (unsigned I = 0; I < E; ++I) {
12004 if (Indices[I] == PoisonMaskElem)
12005 continue;
12006 const unsigned ExtIdx = Indices[I] - MinIdx;
12007 if (CurrentOrder[ExtIdx] != E) {
12008 CurrentOrder.clear();
12009 return false;
12010 }
12011 ShouldKeepOrder &= ExtIdx == I;
12012 CurrentOrder[ExtIdx] = I;
12013 }
12014 if (ShouldKeepOrder)
12015 CurrentOrder.clear();
12016
12017 return ShouldKeepOrder;
12018}
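// Example: if VL extracts lanes 0,1,2,3 (in that order) from a single
// 4-element vector, the function returns true and clears CurrentOrder (no
// shuffle needed); if it extracts lanes 1,0,3,2 it returns false with
// CurrentOrder = {1,0,3,2}, describing the reordering needed to reuse the
// source vector.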
12019
12020bool BoUpSLP::areAllUsersVectorized(
12021 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12022 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12023 all_of(I->users(), [this](User *U) {
12024 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12025 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12026 });
12027}
12028
12029void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12030 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12031 SmallVectorImpl<Value *> *OpScalars,
12032 SmallVectorImpl<Value *> *AltScalars) const {
12033 unsigned Sz = Scalars.size();
12034 Mask.assign(Sz, PoisonMaskElem);
12035 SmallVector<int> OrderMask;
12036 if (!ReorderIndices.empty())
12037 inversePermutation(ReorderIndices, OrderMask);
12038 for (unsigned I = 0; I < Sz; ++I) {
12039 unsigned Idx = I;
12040 if (!ReorderIndices.empty())
12041 Idx = OrderMask[I];
12042 if (isa<PoisonValue>(Scalars[Idx]))
12043 continue;
12044 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12045 if (IsAltOp(OpInst)) {
12046 Mask[I] = Sz + Idx;
12047 if (AltScalars)
12048 AltScalars->push_back(OpInst);
12049 } else {
12050 Mask[I] = Idx;
12051 if (OpScalars)
12052 OpScalars->push_back(OpInst);
12053 }
12054 }
12055 if (!ReuseShuffleIndices.empty()) {
12056 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12057 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12058 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12059 });
12060 Mask.swap(NewMask);
12061 }
12062}
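
// Illustrative sketch (not part of the pass): for an alternate-opcode node the
// routine above picks lane I either from the vectorized main ops (mask value I)
// or from the vectorized alternate ops (mask value Sz + I). The helper below
// (hypothetical name; reordering and reuse indices are ignored) shows just that
// selection for a per-lane flag vector.
[[maybe_unused]] static void modelAltOpShuffleMask(ArrayRef<bool> IsAltLane,
                                                   SmallVectorImpl<int> &Mask) {
  unsigned Sz = IsAltLane.size();
  Mask.assign(Sz, PoisonMaskElem);
  for (unsigned I = 0; I < Sz; ++I)
    Mask[I] = IsAltLane[I] ? static_cast<int>(Sz + I) : static_cast<int>(I);
}
// For scalars {add, sub, add, sub}, i.e. IsAltLane = {false, true, false, true}
// and Sz == 4, this produces the mask <0, 5, 2, 7>: lanes 0 and 2 come from the
// vectorized main (add) ops, lanes 1 and 3 from the vectorized alternate (sub)
// ops.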
12063
12065 Instruction *AltOp,
12066 const TargetLibraryInfo &TLI) {
12067 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12068}
12069
12071 Instruction *AltOp,
12072 const TargetLibraryInfo &TLI) {
12073 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12074 auto *AltCI = cast<CmpInst>(AltOp);
12075 CmpInst::Predicate MainP = MainCI->getPredicate();
12076 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12077 assert(MainP != AltP && "Expected different main/alternate predicates.");
12078 auto *CI = cast<CmpInst>(I);
12079 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12080 return false;
12081 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12082 return true;
12083 CmpInst::Predicate P = CI->getPredicate();
12084 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12085
12086 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12087 "CmpInst expected to match either main or alternate predicate or "
12088 "their swap.");
12089 return MainP != P && MainP != SwappedP;
12090 }
12091 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12092}
12093
12094TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12095 assert(!Ops.empty());
12096 const auto *Op0 = Ops.front();
12097
12098 const bool IsConstant = all_of(Ops, [](Value *V) {
12099 // TODO: We should allow undef elements here
12100 return isConstant(V) && !isa<UndefValue>(V);
12101 });
12102 const bool IsUniform = all_of(Ops, [=](Value *V) {
12103 // TODO: We should allow undef elements here
12104 return V == Op0;
12105 });
12106 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12107 // TODO: We should allow undef elements here
12108 if (auto *CI = dyn_cast<ConstantInt>(V))
12109 return CI->getValue().isPowerOf2();
12110 return false;
12111 });
12112 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12113 // TODO: We should allow undef elements here
12114 if (auto *CI = dyn_cast<ConstantInt>(V))
12115 return CI->getValue().isNegatedPowerOf2();
12116 return false;
12117 });
12118
12119 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12120 if (IsConstant && IsUniform)
12121 VK = TTI::OK_UniformConstantValue;
12122 else if (IsConstant)
12123 VK = TTI::OK_NonUniformConstantValue;
12124 else if (IsUniform)
12125 VK = TTI::OK_UniformValue;
12126
12127 TTI::OperandValueProperties VP = TTI::OP_None;
12128 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12129 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12130
12131 return {VK, VP};
12132}
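
// Illustrative sketch (not part of the pass): the classification above feeds
// TTI cost queries with an (operand kind, operand properties) pair. The helper
// below is hypothetical and works on integer literals only, so every bundle is
// "constant" in this toy model: {4, 4, 4, 4} is a uniform power-of-two
// constant, {1, 2, 3, 4} is a non-uniform constant.
[[maybe_unused]] static std::pair<TTI::OperandValueKind, TTI::OperandValueProperties>
modelOperandInfo(ArrayRef<int> Ops) {
  assert(!Ops.empty() && "Expected a non-empty operand bundle.");
  const bool IsUniform =
      all_of(Ops, [&](int V) { return V == Ops.front(); });
  const bool IsPowerOfTwo =
      all_of(Ops, [](int V) { return V > 0 && (V & (V - 1)) == 0; });
  TTI::OperandValueKind VK = IsUniform ? TTI::OK_UniformConstantValue
                                       : TTI::OK_NonUniformConstantValue;
  TTI::OperandValueProperties VP =
      IsPowerOfTwo ? TTI::OP_PowerOf2 : TTI::OP_None;
  return {VK, VP};
}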
12133
12134namespace {
12135/// The base class for shuffle instruction emission and shuffle cost estimation.
12136class BaseShuffleAnalysis {
12137protected:
12138 Type *ScalarTy = nullptr;
12139
12140 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12141
12142 /// V is expected to be a vectorized value.
12143 /// When REVEC is disabled, there is no difference between VF and
12144 /// VNumElements.
12145 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12146 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12147 /// of 8.
12148 unsigned getVF(Value *V) const {
12149 assert(V && "V cannot be nullptr");
12150 assert(isa<FixedVectorType>(V->getType()) &&
12151 "V does not have FixedVectorType");
12152 assert(ScalarTy && "ScalarTy cannot be nullptr");
12153 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12154 unsigned VNumElements =
12155 cast<FixedVectorType>(V->getType())->getNumElements();
12156 assert(VNumElements > ScalarTyNumElements &&
12157 "the number of elements of V is not large enough");
12158 assert(VNumElements % ScalarTyNumElements == 0 &&
12159 "the number of elements of V is not a vectorized value");
12160 return VNumElements / ScalarTyNumElements;
12161 }
12162
12163 /// Checks if the mask is an identity mask.
12164 /// \param IsStrict if it is true, the function returns false if the mask size
12165 /// does not match the vector size.
12166 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12167 bool IsStrict) {
12168 int Limit = Mask.size();
12169 int VF = VecTy->getNumElements();
12170 int Index = -1;
12171 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12172 return true;
12173 if (!IsStrict) {
12174 // Consider extract subvector starting from index 0.
12175 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12176 Index == 0)
12177 return true;
12178 // All VF-size submasks are identity (e.g.
12179 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12180 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12181 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12182 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12183 ShuffleVectorInst::isIdentityMask(Slice, VF);
12184 }))
12185 return true;
12186 }
12187 return false;
12188 }
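
  // Illustrative sketch (not part of the class interface): with VF == 4 the
  // strict form only accepts <0,1,2,3>, while the non-strict form also accepts
  // an extract-subvector mask starting at index 0 (e.g. <0,1>) and a wider mask
  // whose VF-sized submasks are each all-poison or identity. The hypothetical,
  // never-called helper below spells out those three cases and returns true
  // when given a 4-element vector type.
  static bool isIdentityMaskExamples(const FixedVectorType *VecTy) {
    // The sample masks assume VecTy has exactly 4 elements.
    const int Strict[] = {0, 1, 2, 3};
    const int SubVector[] = {0, 1};
    const int Repeated[] = {PoisonMaskElem, PoisonMaskElem, PoisonMaskElem,
                            PoisonMaskElem, 0, 1, 2, 3};
    return isIdentityMask(Strict, VecTy, /*IsStrict=*/true) &&
           !isIdentityMask(SubVector, VecTy, /*IsStrict=*/true) &&
           isIdentityMask(SubVector, VecTy, /*IsStrict=*/false) &&
           isIdentityMask(Repeated, VecTy, /*IsStrict=*/false);
  }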
12189
12190 /// Tries to combine 2 different masks into a single one.
12191 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12192 /// change the size of the vector, \p LocalVF is the original size of the
12193 /// shuffled vector.
12194 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12195 ArrayRef<int> ExtMask) {
12196 unsigned VF = Mask.size();
12197 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12198 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12199 if (ExtMask[I] == PoisonMaskElem)
12200 continue;
12201 int MaskedIdx = Mask[ExtMask[I] % VF];
12202 NewMask[I] =
12203 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12204 }
12205 Mask.swap(NewMask);
12206 }
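
  // Illustrative sketch (not part of the class interface): a worked example of
  // the combination rule above. With LocalVF == 4, an already-emitted mask
  // <1,0,3,2> and an extension mask <2,3,poison,poison>, element I of the
  // result is Mask[ExtMask[I] % VF] reduced modulo LocalVF, which gives
  // <3,2,poison,poison>. The hypothetical helper below just replays that case
  // and returns true.
  static bool combineMasksExample() {
    SmallVector<int> Mask = {1, 0, 3, 2};
    const int ExtMask[] = {2, 3, PoisonMaskElem, PoisonMaskElem};
    combineMasks(/*LocalVF=*/4, Mask, ExtMask);
    return Mask[0] == 3 && Mask[1] == 2 && Mask[2] == PoisonMaskElem &&
           Mask[3] == PoisonMaskElem;
  }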
12207
12208 /// Looks through shuffles trying to reduce final number of shuffles in the
12209 /// code. The function looks through the previously emitted shuffle
12210 /// instructions and properly marks indices in the mask as undef.
12211 /// For example, given the code
12212 /// \code
12213 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12214 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12215 /// \endcode
12216 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12217 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12218 /// <0, 1, 2, 3> for the shuffle.
12219 /// If 2 operands are of different size, the smallest one will be resized and
12220 /// the mask recalculated properly.
12221 /// For example, given the code
12222 /// \code
12223 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12224 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12225 /// \endcode
12226 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12227 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12228 /// <0, 1, 2, 3> for the shuffle.
12229 /// So, it tries to transform permutations to simple vector merge, if
12230 /// possible.
12231 /// \param V The input vector which must be shuffled using the given \p Mask.
12232 /// If the better candidate is found, \p V is set to this best candidate
12233 /// vector.
12234 /// \param Mask The input mask for the shuffle. If the best candidate is found
12235 /// during looking-through-shuffles attempt, it is updated accordingly.
12236 /// \param SinglePermute true if the shuffle operation is originally a
12237 /// single-value-permutation. In this case the look-through-shuffles procedure
12238 /// may look for resizing shuffles as the best candidates.
12239 /// \return true if the shuffle results in the non-resizing identity shuffle
12240 /// (and thus can be ignored), false - otherwise.
12241 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12242 bool SinglePermute) {
12243 Value *Op = V;
12244 ShuffleVectorInst *IdentityOp = nullptr;
12245 SmallVector<int> IdentityMask;
12246 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12247 // Exit if not a fixed vector type or changing size shuffle.
12248 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12249 if (!SVTy)
12250 break;
12251 // Remember the identity or broadcast mask, if it is not a resizing
12252 // shuffle. If no better candidates are found, this Op and Mask will be
12253 // used in the final shuffle.
12254 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12255 if (!IdentityOp || !SinglePermute ||
12256 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12257 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12258 IdentityMask.size()))) {
12259 IdentityOp = SV;
12260 // Store the current mask in IdentityMask so that we do not lose this
12261 // info later if IdentityOp is selected as the best candidate for the
12262 // permutation.
12263 IdentityMask.assign(Mask);
12264 }
12265 }
12266 // Remember the broadcast mask. If no better candidates are found, this Op
12267 // and Mask will be used in the final shuffle.
12268 // Zero splat can be used as identity too, since it might be used with
12269 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12270 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12271 // expensive, and the analysis finds out that the source vector is just a
12272 // broadcast, the original mask can be transformed to the identity mask <0,
12273 // 1, 2, 3>.
12274 // \code
12275 // %0 = shuffle %v, poison, zeroinitializer
12276 // %res = shuffle %0, poison, <3, 1, 2, 0>
12277 // \endcode
12278 // may be transformed to
12279 // \code
12280 // %0 = shuffle %v, poison, zeroinitializer
12281 // %res = shuffle %0, poison, <0, 1, 2, 3>
12282 // \endcode
12283 if (SV->isZeroEltSplat()) {
12284 IdentityOp = SV;
12285 IdentityMask.assign(Mask);
12286 }
12287 int LocalVF = Mask.size();
12288 if (auto *SVOpTy =
12289 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12290 LocalVF = SVOpTy->getNumElements();
12291 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12292 for (auto [Idx, I] : enumerate(Mask)) {
12293 if (I == PoisonMaskElem ||
12294 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12295 continue;
12296 ExtMask[Idx] = SV->getMaskValue(I);
12297 }
12298 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12299 SV->getOperand(0),
12300 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12301 .all();
12302 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12303 SV->getOperand(1),
12304 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12305 .all();
12306 if (!IsOp1Undef && !IsOp2Undef) {
12307 // Update mask and mark undef elems.
12308 for (int &I : Mask) {
12309 if (I == PoisonMaskElem)
12310 continue;
12311 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12312 PoisonMaskElem)
12313 I = PoisonMaskElem;
12314 }
12315 break;
12316 }
12317 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12318 combineMasks(LocalVF, ShuffleMask, Mask);
12319 Mask.swap(ShuffleMask);
12320 if (IsOp2Undef)
12321 Op = SV->getOperand(0);
12322 else
12323 Op = SV->getOperand(1);
12324 }
12325 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12326 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12327 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12328 if (IdentityOp) {
12329 V = IdentityOp;
12330 assert(Mask.size() == IdentityMask.size() &&
12331 "Expected masks of same sizes.");
12332 // Clear known poison elements.
12333 for (auto [I, Idx] : enumerate(Mask))
12334 if (Idx == PoisonMaskElem)
12335 IdentityMask[I] = PoisonMaskElem;
12336 Mask.swap(IdentityMask);
12337 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12338 return SinglePermute &&
12339 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12340 /*IsStrict=*/true) ||
12341 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12342 Shuffle->isZeroEltSplat() &&
12344 all_of(enumerate(Mask), [&](const auto &P) {
12345 return P.value() == PoisonMaskElem ||
12346 Shuffle->getShuffleMask()[P.index()] == 0;
12347 })));
12348 }
12349 V = Op;
12350 return false;
12351 }
12352 V = Op;
12353 return true;
12354 }
12355
12356 /// Smart shuffle instruction emission, walks through shuffles trees and
12357 /// tries to find the best matching vector for the actual shuffle
12358 /// instruction.
12359 template <typename T, typename ShuffleBuilderTy>
12360 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12361 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12362 assert(V1 && "Expected at least one vector value.");
12363 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12364 SmallVector<int> NewMask(Mask);
12365 if (ScalarTyNumElements != 1) {
12366 assert(SLPReVec && "FixedVectorType is not expected.");
12367 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12368 Mask = NewMask;
12369 }
12370 if (V2)
12371 Builder.resizeToMatch(V1, V2);
12372 int VF = Mask.size();
12373 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12374 VF = FTy->getNumElements();
12375 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12376 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12377 .all()) {
12378 // Peek through shuffles.
12379 Value *Op1 = V1;
12380 Value *Op2 = V2;
12381 int VF =
12382 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12383 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12384 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12385 for (int I = 0, E = Mask.size(); I < E; ++I) {
12386 if (Mask[I] < VF)
12387 CombinedMask1[I] = Mask[I];
12388 else
12389 CombinedMask2[I] = Mask[I] - VF;
12390 }
12391 Value *PrevOp1;
12392 Value *PrevOp2;
12393 do {
12394 PrevOp1 = Op1;
12395 PrevOp2 = Op2;
12396 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12397 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12398 // Check if we have 2 resizing shuffles - need to peek through operands
12399 // again.
12400 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12401 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12402 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12403 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12404 if (I == PoisonMaskElem)
12405 continue;
12406 ExtMask1[Idx] = SV1->getMaskValue(I);
12407 }
12408 SmallBitVector UseMask1 = buildUseMask(
12409 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12410 ->getNumElements(),
12411 ExtMask1, UseMask::SecondArg);
12412 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12413 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12414 if (I == PoisonMaskElem)
12415 continue;
12416 ExtMask2[Idx] = SV2->getMaskValue(I);
12417 }
12418 SmallBitVector UseMask2 = buildUseMask(
12419 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12420 ->getNumElements(),
12421 ExtMask2, UseMask::SecondArg);
12422 if (SV1->getOperand(0)->getType() ==
12423 SV2->getOperand(0)->getType() &&
12424 SV1->getOperand(0)->getType() != SV1->getType() &&
12425 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12426 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12427 Op1 = SV1->getOperand(0);
12428 Op2 = SV2->getOperand(0);
12429 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12430 int LocalVF = ShuffleMask1.size();
12431 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12432 LocalVF = FTy->getNumElements();
12433 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12434 CombinedMask1.swap(ShuffleMask1);
12435 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12436 LocalVF = ShuffleMask2.size();
12437 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12438 LocalVF = FTy->getNumElements();
12439 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12440 CombinedMask2.swap(ShuffleMask2);
12441 }
12442 }
12443 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12444 Builder.resizeToMatch(Op1, Op2);
12445 VF = std::max(cast<VectorType>(Op1->getType())
12446 ->getElementCount()
12447 .getKnownMinValue(),
12448 cast<VectorType>(Op2->getType())
12449 ->getElementCount()
12450 .getKnownMinValue());
12451 for (int I = 0, E = Mask.size(); I < E; ++I) {
12452 if (CombinedMask2[I] != PoisonMaskElem) {
12453 assert(CombinedMask1[I] == PoisonMaskElem &&
12454 "Expected undefined mask element");
12455 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12456 }
12457 }
12458 if (Op1 == Op2 &&
12459 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12460 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12461 isa<ShuffleVectorInst>(Op1) &&
12462 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12463 ArrayRef(CombinedMask1))))
12464 return Builder.createIdentity(Op1);
12465 return Builder.createShuffleVector(
12466 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12467 CombinedMask1);
12468 }
12469 if (isa<PoisonValue>(V1))
12470 return Builder.createPoison(
12471 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12472 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12473 assert(V1 && "Expected non-null value after looking through shuffles.");
12474
12475 if (!IsIdentity)
12476 return Builder.createShuffleVector(V1, NewMask);
12477 return Builder.createIdentity(V1);
12478 }
12479
12480 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12481 /// shuffle emission.
12482 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12483 ArrayRef<int> Mask) {
12484 for (unsigned I : seq<unsigned>(CommonMask.size()))
12485 if (Mask[I] != PoisonMaskElem)
12486 CommonMask[I] = I;
12487 }
12488};
12489} // namespace
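
// Illustrative sketch (not part of the pass): transformMaskAfterShuffle above
// rewrites the accumulated common mask once a shuffle has actually been
// emitted, so that every lane the new vector already provides simply refers to
// its own position. The snippet below (standalone, since the helper is
// class-internal) replays the rewrite for the mask <2, poison, 0, 1> and
// returns true.
[[maybe_unused]] static bool transformMaskAfterShuffleExample() {
  int CommonMask[] = {2, PoisonMaskElem, 0, 1};
  const int EmittedMask[] = {2, PoisonMaskElem, 0, 1};
  for (unsigned I = 0; I < 4; ++I)
    if (EmittedMask[I] != PoisonMaskElem)
      CommonMask[I] = static_cast<int>(I);
  // Lanes produced by the emitted shuffle now map to themselves; the poisoned
  // lane stays untouched: <0, poison, 2, 3>.
  return CommonMask[0] == 0 && CommonMask[1] == PoisonMaskElem &&
         CommonMask[2] == 2 && CommonMask[3] == 3;
}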
12490
12491/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
12492 static std::pair<InstructionCost, InstructionCost>
12493 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12494 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12495 Type *ScalarTy, VectorType *VecTy) {
12496 InstructionCost ScalarCost = 0;
12497 InstructionCost VecCost = 0;
12498 // Here we differentiate two cases: (1) when Ptrs represent a regular
12499 // vectorization tree node (as they are pointer arguments of scattered
12500 // loads) or (2) when Ptrs are the arguments of loads or stores being
12501 // vectorized as plain wide unit-stride load/store since all the
12502 // loads/stores are known to be from/to adjacent locations.
12503 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12504 // Case 2: estimate the pointer-related costs when vectorizing to
12505 // a wide load/store.
12506 // Scalar cost is estimated as a set of pointers with known relationship
12507 // between them.
12508 // For vector code we will use BasePtr as argument for the wide load/store
12509 // but we also need to account all the instructions which are going to
12510 // stay in vectorized code due to uses outside of these scalar
12511 // loads/stores.
12512 ScalarCost = TTI.getPointersChainCost(
12513 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12514 CostKind);
12515
12516 SmallVector<const Value *> PtrsRetainedInVecCode;
12517 for (Value *V : Ptrs) {
12518 if (V == BasePtr) {
12519 PtrsRetainedInVecCode.push_back(V);
12520 continue;
12521 }
12522 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12523 // For simplicity, assume Ptr stays in vectorized code if it's not a
12524 // GEP instruction. We don't care since its cost is considered free.
12525 // TODO: We should check for any uses outside of vectorizable tree
12526 // rather than just single use.
12527 if (!Ptr || !Ptr->hasOneUse())
12528 PtrsRetainedInVecCode.push_back(V);
12529 }
12530
12531 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12532 // If all pointers stay in vectorized code then we don't have
12533 // any savings on that.
12534 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12535 }
12536 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12537 TTI::PointersChainInfo::getKnownStride(),
12538 VecTy, CostKind);
12539 } else {
12540 // Case 1: Ptrs are the arguments of loads that we are going to transform
12541 // into masked gather load intrinsic.
12542 // All the scalar GEPs will be removed as a result of vectorization.
12543 // For any external uses of some lanes extract element instructions will
12544 // be generated (which cost is estimated separately).
12545 TTI::PointersChainInfo PtrsInfo =
12546 all_of(Ptrs,
12547 [](const Value *V) {
12548 const auto *Ptr = dyn_cast<GEPOperator>(V);
12549 return Ptr && !Ptr->hasAllConstantIndices();
12550 })
12551 ? TTI::PointersChainInfo::getUnknownStride()
12552 : TTI::PointersChainInfo::getKnownStride();
12553
12554 ScalarCost =
12555 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12556 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12557 if (!BaseGEP) {
12558 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12559 if (It != Ptrs.end())
12560 BaseGEP = cast<GEPOperator>(*It);
12561 }
12562 if (BaseGEP) {
12563 SmallVector<const Value *> Indices(BaseGEP->indices());
12564 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12565 BaseGEP->getPointerOperand(), Indices, VecTy,
12566 CostKind);
12567 }
12568 }
12569
12570 return std::make_pair(ScalarCost, VecCost);
12571}
12572
12573void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12574 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12575 "Expected gather node without reordering.");
12576 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12577 SmallSet<size_t, 2> LoadKeyUsed;
12578
12579 // Do not reorder nodes if the node is small (just 2 elements), all-constant,
12580 // or all instructions already have the same opcode.
12581 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12582 all_of(TE.Scalars, isConstant))
12583 return;
12584
12585 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12586 return VectorizableTree[Idx]->isSame(TE.Scalars);
12587 }))
12588 return;
12589
12590 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12591 Key = hash_combine(hash_value(LI->getParent()), Key);
12592 Value *Ptr =
12593 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12594 if (LoadKeyUsed.contains(Key)) {
12595 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12596 if (LIt != LoadsMap.end()) {
12597 for (LoadInst *RLI : LIt->second) {
12598 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12599 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12600 /*StrictCheck=*/true))
12601 return hash_value(RLI->getPointerOperand());
12602 }
12603 for (LoadInst *RLI : LIt->second) {
12604 if (arePointersCompatible(RLI->getPointerOperand(),
12605 LI->getPointerOperand(), *TLI)) {
12606 hash_code SubKey = hash_value(RLI->getPointerOperand());
12607 return SubKey;
12608 }
12609 }
12610 if (LIt->second.size() > 2) {
12611 hash_code SubKey =
12612 hash_value(LIt->second.back()->getPointerOperand());
12613 return SubKey;
12614 }
12615 }
12616 }
12617 LoadKeyUsed.insert(Key);
12618 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12619 return hash_value(LI->getPointerOperand());
12620 };
12621 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12622 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12623 bool IsOrdered = true;
12624 unsigned NumInstructions = 0;
12625 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12626 // nodes.
12627 for (auto [I, V] : enumerate(TE.Scalars)) {
12628 size_t Key = 1, Idx = 1;
12629 if (auto *Inst = dyn_cast<Instruction>(V);
12631 !isDeleted(Inst) && !isVectorized(V)) {
12632 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12633 /*AllowAlternate=*/false);
12634 ++NumInstructions;
12635 }
12636 auto &Container = SortedValues[Key];
12637 if (IsOrdered && !KeyToIndex.contains(V) &&
12640 ((Container.contains(Idx) &&
12641 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12642 (!Container.empty() && !Container.contains(Idx) &&
12643 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12644 IsOrdered = false;
12645 auto &KTI = KeyToIndex[V];
12646 if (KTI.empty())
12647 Container[Idx].push_back(V);
12648 KTI.push_back(I);
12649 }
12650 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12651 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12652 if (!IsOrdered && NumInstructions > 1) {
12653 unsigned Cnt = 0;
12654 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12655 for (const auto &D : SortedValues) {
12656 for (const auto &P : D.second) {
12657 unsigned Sz = 0;
12658 for (Value *V : P.second) {
12659 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12660 for (auto [K, Idx] : enumerate(Indices)) {
12661 TE.ReorderIndices[Cnt + K] = Idx;
12662 TE.Scalars[Cnt + K] = V;
12663 }
12664 Sz += Indices.size();
12665 Cnt += Indices.size();
12666 }
12667 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12668 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12669 *TTI, TE.Scalars.front()->getType(), Sz);
12670 SubVectors.emplace_back(Cnt - Sz, SubVF);
12671 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12672 DemandedElts.clearBit(I);
12673 } else if (!P.second.empty() && isConstant(P.second.front())) {
12674 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12675 DemandedElts.clearBit(I);
12676 }
12677 }
12678 }
12679 }
12680 // Reuses always require shuffles, so consider it as profitable.
12681 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12682 return;
12683 // Do simple cost estimation.
12684 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12685 InstructionCost Cost = 0;
12686 auto *ScalarTy = TE.Scalars.front()->getType();
12687 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12688 for (auto [Idx, Sz] : SubVectors) {
12689 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12690 Idx, getWidenedType(ScalarTy, Sz));
12691 }
12692 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12693 /*Insert=*/true,
12694 /*Extract=*/false, CostKind);
12695 int Sz = TE.Scalars.size();
12696 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12697 TE.ReorderIndices.end());
12698 for (unsigned I : seq<unsigned>(Sz)) {
12699 Value *V = TE.getOrdered(I);
12700 if (isa<PoisonValue>(V)) {
12701 ReorderMask[I] = PoisonMaskElem;
12702 } else if (isConstant(V) || DemandedElts[I]) {
12703 ReorderMask[I] = I + TE.ReorderIndices.size();
12704 }
12705 }
12706 Cost += ::getShuffleCost(*TTI,
12707 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12708 ? TTI::SK_PermuteTwoSrc
12709 : TTI::SK_PermuteSingleSrc,
12710 VecTy, ReorderMask);
12711 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12712 ReorderMask.assign(Sz, PoisonMaskElem);
12713 for (unsigned I : seq<unsigned>(Sz)) {
12714 Value *V = TE.getOrdered(I);
12715 if (isConstant(V)) {
12716 DemandedElts.clearBit(I);
12717 if (!isa<PoisonValue>(V))
12718 ReorderMask[I] = I;
12719 } else {
12720 ReorderMask[I] = I + Sz;
12721 }
12722 }
12723 InstructionCost BVCost =
12724 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12725 /*Insert=*/true, /*Extract=*/false, CostKind);
12726 if (!DemandedElts.isAllOnes())
12727 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12728 if (Cost >= BVCost) {
12729 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12730 reorderScalars(TE.Scalars, Mask);
12731 TE.ReorderIndices.clear();
12732 }
12733}
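
// Illustrative sketch (not part of the pass): reorderGatherNode clusters equal
// or related scalars of a gather node next to each other so that contiguous
// sub-slices can later become real vectorized nodes. The hypothetical helper
// below computes such a clustered order for plain integer keys using a stable
// grouping; e.g. keys {1, 2, 1, 2} give the order {0, 2, 1, 3}, while already
// clustered keys give the identity order, in which case the real code keeps
// the node as is.
[[maybe_unused]] static void modelClusterOrder(ArrayRef<int> Keys,
                                               SmallVectorImpl<unsigned> &Order) {
  Order.clear();
  SmallVector<int> SeenKeys;
  for (int Key : Keys) {
    if (is_contained(SeenKeys, Key))
      continue;
    SeenKeys.push_back(Key);
    // Emit all positions that share this key, preserving their relative order.
    for (unsigned I = 0, E = Keys.size(); I < E; ++I)
      if (Keys[I] == Key)
        Order.push_back(I);
  }
}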
12734
12735/// Check if we can convert fadd/fsub sequence to FMAD.
12736/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
12737 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12738 const InstructionsState &S,
12739 DominatorTree &DT, const DataLayout &DL,
12740 TargetTransformInfo &TTI,
12741 const TargetLibraryInfo &TLI) {
12742 assert(all_of(VL,
12743 [](Value *V) {
12744 return V->getType()->getScalarType()->isFloatingPointTy();
12745 }) &&
12746 "Can only convert to FMA for floating point types");
12747 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12748
12749 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12750 FastMathFlags FMF;
12751 FMF.set();
12752 for (Value *V : VL) {
12753 auto *I = dyn_cast<Instruction>(V);
12754 if (!I)
12755 continue;
12756 if (S.isCopyableElement(I))
12757 continue;
12758 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12759 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12760 continue;
12761 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12762 FMF &= FPCI->getFastMathFlags();
12763 }
12764 return FMF.allowContract();
12765 };
12766 if (!CheckForContractable(VL))
12767 return InstructionCost::getInvalid();
12768 // The fmul also should be contractable.
12769 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12770 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12771
12772 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12773 if (!OpS.valid())
12774 return InstructionCost::getInvalid();
12775
12776 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12777 return InstructionCost::getInvalid();
12778 if (!CheckForContractable(Operands.front()))
12779 return InstructionCost::getInvalid();
12780 // Compare the costs.
12781 InstructionCost FMulPlusFAddCost = 0;
12782 InstructionCost FMACost = 0;
12783 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12784 FastMathFlags FMF;
12785 FMF.set();
12786 for (Value *V : VL) {
12787 auto *I = dyn_cast<Instruction>(V);
12788 if (!I)
12789 continue;
12790 if (!S.isCopyableElement(I))
12791 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12792 FMF &= FPCI->getFastMathFlags();
12793 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12794 }
12795 unsigned NumOps = 0;
12796 for (auto [V, Op] : zip(VL, Operands.front())) {
12797 if (S.isCopyableElement(V))
12798 continue;
12799 auto *I = dyn_cast<Instruction>(Op);
12800 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12801 if (auto *OpI = dyn_cast<Instruction>(V))
12802 FMACost += TTI.getInstructionCost(OpI, CostKind);
12803 if (I)
12804 FMACost += TTI.getInstructionCost(I, CostKind);
12805 continue;
12806 }
12807 ++NumOps;
12808 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12809 FMF &= FPCI->getFastMathFlags();
12810 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12811 }
12812 Type *Ty = VL.front()->getType();
12813 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12814 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12815 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12816}
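
// Illustrative sketch (not part of the pass): the check above fires for
// contractable patterns like "fadd contract %c, (fmul contract %a, %b)" where
// each fmul has a single use, and compares the cost of keeping separate
// fmul + fadd pairs against using llvm.fmuladd for the fusable lanes. The toy
// comparison below (hypothetical helper, made-up unit costs) only shows the
// shape of that trade-off, not the exact cost model.
[[maybe_unused]] static bool modelFMAProfitability(unsigned NumLanes,
                                                   unsigned FusableLanes,
                                                   unsigned FMulCost,
                                                   unsigned FAddCost,
                                                   unsigned FMACost) {
  assert(FusableLanes <= NumLanes && "More fusable lanes than lanes.");
  // Cost of keeping a separate fmul + fadd in every lane.
  unsigned Separate = NumLanes * (FMulCost + FAddCost);
  // Cost when fusable lanes become one fmuladd each and the rest stay split.
  unsigned Fused = FusableLanes * FMACost +
                   (NumLanes - FusableLanes) * (FMulCost + FAddCost);
  return Fused < Separate;
}
// With 4 lanes, all of them fusable, FMulCost = FAddCost = FMACost = 1, the
// fused form costs 4 instead of 8, so the node would be marked as FMulAdd.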
12817
12818 void BoUpSLP::transformNodes() {
12819 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12820 BaseGraphSize = VectorizableTree.size();
12821 // Turn graph transforming mode on and off, when done.
12822 class GraphTransformModeRAAI {
12823 bool &SavedIsGraphTransformMode;
12824
12825 public:
12826 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12827 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12828 IsGraphTransformMode = true;
12829 }
12830 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12831 } TransformContext(IsGraphTransformMode);
12832 // Operands are profitable if they are:
12833 // 1. At least one constant
12834 // or
12835 // 2. Splats
12836 // or
12837 // 3. Results in good vectorization opportunity, i.e. may generate vector
12838 // nodes and reduce cost of the graph.
12839 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12840 const InstructionsState &S) {
12841 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12842 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12843 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12844 I2->getOperand(Op));
12845 return all_of(
12846 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12847 return all_of(Cand,
12848 [](const std::pair<Value *, Value *> &P) {
12849 return isa<Constant>(P.first) ||
12850 isa<Constant>(P.second) || P.first == P.second;
12851 }) ||
12852 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12853 });
12854 };
12855
12856 // Try to reorder gather nodes for better vectorization opportunities.
12857 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12858 TreeEntry &E = *VectorizableTree[Idx];
12859 if (E.isGather())
12860 reorderGatherNode(E);
12861 }
12862
12863 // Better to use the full gathered-loads analysis if there are only 2
12864 // gathered load nodes, each having fewer than 16 elements.
12865 constexpr unsigned VFLimit = 16;
12866 bool ForceLoadGather =
12867 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12868 return TE->isGather() && TE->hasState() &&
12869 TE->getOpcode() == Instruction::Load &&
12870 TE->getVectorFactor() < VFLimit;
12871 }) == 2;
12872
12873 // Checks if the scalars are used in other node.
12874 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12875 function_ref<bool(Value *)> CheckContainer) {
12876 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12877 if (isa<PoisonValue>(V))
12878 return true;
12879 auto *I = dyn_cast<Instruction>(V);
12880 if (!I)
12881 return false;
12882 return is_contained(TE->Scalars, I) || CheckContainer(I);
12883 });
12884 };
12885 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12886 if (E.hasState()) {
12887 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12888 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12889 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12890 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12891 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12892 return is_contained(TEs, TE);
12893 });
12894 });
12895 }))
12896 return true;
12897 ;
12898 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12899 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12900 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12901 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12902 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12903 return is_contained(TEs, TE);
12904 });
12905 });
12906 }))
12907 return true;
12908 } else {
12909 // Check if the gather node is a full copy of a split node.
12910 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12911 if (It != E.Scalars.end()) {
12912 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12913 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12914 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12915 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12916 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12917 return is_contained(TEs, TE);
12918 });
12919 });
12920 }))
12921 return true;
12922 }
12923 }
12924 return false;
12925 };
12926 // The tree may grow here, so iterate over the nodes built before.
12927 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12928 TreeEntry &E = *VectorizableTree[Idx];
12929 if (E.isGather()) {
12930 ArrayRef<Value *> VL = E.Scalars;
12931 const unsigned Sz = getVectorElementSize(VL.front());
12932 unsigned MinVF = getMinVF(2 * Sz);
12933 // Do not try partial vectorization for small nodes (<= 2), nodes with the
12934 // same opcode and same parent block or all constants.
12935 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
12936 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
12937 // We use allSameOpcode instead of isAltShuffle because we don't
12938 // want to use interchangeable instructions here.
12939 !allSameOpcode(VL) || !allSameBlock(VL)) ||
12940 allConstant(VL) || isSplat(VL))
12941 continue;
12942 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
12943 continue;
12944 // Check if the node is a copy of other vector nodes.
12945 if (CheckForSameVectorNodes(E))
12946 continue;
12947 // Try to find vectorizable sequences and transform them into a series of
12948 // insertvector instructions.
12949 unsigned StartIdx = 0;
12950 unsigned End = VL.size();
12951 for (unsigned VF = getFloorFullVectorNumberOfElements(
12952 *TTI, VL.front()->getType(), VL.size() - 1);
12953 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
12954 *TTI, VL.front()->getType(), VF - 1)) {
12955 if (StartIdx + VF > End)
12956 continue;
12957 SmallVector<std::pair<unsigned, unsigned>> Slices;
12958 bool AllStrided = true;
12959 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
12960 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
12961 // If any instruction is vectorized already - do not try again.
12962 // Reuse the existing node, if it fully matches the slice.
12963 if (isVectorized(Slice.front()) &&
12964 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
12965 continue;
12966 // Constant already handled effectively - skip.
12967 if (allConstant(Slice))
12968 continue;
12969 // Do not try to vectorize small splats (smaller than a vector register
12970 // and with only a single non-undef element).
12971 bool IsSplat = isSplat(Slice);
12972 bool IsTwoRegisterSplat = true;
12973 if (IsSplat && VF == 2) {
12974 unsigned NumRegs2VF = ::getNumberOfParts(
12975 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
12976 IsTwoRegisterSplat = NumRegs2VF == 2;
12977 }
12978 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
12979 count(Slice, Slice.front()) ==
12980 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
12981 : 1)) {
12982 if (IsSplat)
12983 continue;
12984 InstructionsState S = getSameOpcode(Slice, *TLI);
12985 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
12986 (S.getOpcode() == Instruction::Load &&
12987 areKnownNonVectorizableLoads(Slice)) ||
12988 (S.getOpcode() != Instruction::Load &&
12989 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
12990 continue;
12991 if (VF == 2) {
12992 // Try to vectorize reduced values or if all users are vectorized.
12993 // For expensive instructions extra extracts might be profitable.
12994 if ((!UserIgnoreList || E.Idx != 0) &&
12995 TTI->getInstructionCost(S.getMainOp(), CostKind) <
12996 TTI::TCC_Expensive &&
12997 !all_of(Slice, [&](Value *V) {
12998 if (isa<PoisonValue>(V))
12999 return true;
13000 return areAllUsersVectorized(cast<Instruction>(V),
13001 UserIgnoreList);
13002 }))
13003 continue;
13004 if (S.getOpcode() == Instruction::Load) {
13005 OrdersType Order;
13006 SmallVector<Value *> PointerOps;
13007 StridedPtrInfo SPtrInfo;
13008 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13009 PointerOps, SPtrInfo);
13010 AllStrided &= Res == LoadsState::StridedVectorize ||
13012 Res == LoadsState::Gather;
13013 // Do not vectorize gathers.
13014 if (Res == LoadsState::ScatterVectorize ||
13015 Res == LoadsState::Gather) {
13016 if (Res == LoadsState::Gather) {
13017 registerNonVectorizableLoads(Slice);
13018 // If reductions and the scalars from the root node are
13019 // analyzed - mark as non-vectorizable reduction.
13020 if (UserIgnoreList && E.Idx == 0)
13021 analyzedReductionVals(Slice);
13022 }
13023 continue;
13024 }
13025 } else if (S.getOpcode() == Instruction::ExtractElement ||
13026 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13027 TTI::TCC_Expensive &&
13028 !CheckOperandsProfitability(
13029 S.getMainOp(),
13032 S))) {
13033 // Do not vectorize extractelements (handled effectively
13034 // already). Do not vectorize non-profitable instructions (with
13035 // low cost and non-vectorizable operands).
13036 continue;
13037 }
13038 }
13039 }
13040 Slices.emplace_back(Cnt, Slice.size());
13041 }
13042 // Do not try to vectorize if all slices are strided or gathered with
13043 // vector factor 2 and there are more than 2 slices. Better to handle
13044 // them in the gathered loads analysis, which may result in better vectorization.
13045 if (VF == 2 && AllStrided && Slices.size() > 2)
13046 continue;
13047 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13048 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13049 if (StartIdx == Cnt)
13050 StartIdx = Cnt + Sz;
13051 if (End == Cnt + Sz)
13052 End = Cnt;
13053 };
13054 for (auto [Cnt, Sz] : Slices) {
13055 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13056 const TreeEntry *SameTE = nullptr;
13057 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13058 It != Slice.end()) {
13059 // If any instruction is vectorized already - do not try again.
13060 SameTE = getSameValuesTreeEntry(*It, Slice);
13061 }
13062 unsigned PrevSize = VectorizableTree.size();
13063 [[maybe_unused]] unsigned PrevEntriesSize =
13064 LoadEntriesToVectorize.size();
13065 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13066 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13067 VectorizableTree[PrevSize]->isGather() &&
13068 VectorizableTree[PrevSize]->hasState() &&
13069 VectorizableTree[PrevSize]->getOpcode() !=
13070 Instruction::ExtractElement &&
13071 !isSplat(Slice)) {
13072 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13073 analyzedReductionVals(Slice);
13074 VectorizableTree.pop_back();
13075 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13076 "LoadEntriesToVectorize expected to remain the same");
13077 continue;
13078 }
13079 AddCombinedNode(PrevSize, Cnt, Sz);
13080 }
13081 }
13082 // Restore ordering, if no extra vectorization happened.
13083 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13084 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13085 reorderScalars(E.Scalars, Mask);
13086 E.ReorderIndices.clear();
13087 }
13088 }
13089 if (!E.hasState())
13090 continue;
13091 switch (E.getOpcode()) {
13092 case Instruction::Load: {
13093 // No need to reorder masked gather loads, just reorder the scalar
13094 // operands.
13095 if (E.State != TreeEntry::Vectorize)
13096 break;
13097 Type *ScalarTy = E.getMainOp()->getType();
13098 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13099 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13100 // Check if profitable to represent consecutive load + reverse as strided
13101 // load with stride -1.
13102 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13103 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13104 SmallVector<int> Mask;
13105 inversePermutation(E.ReorderIndices, Mask);
13106 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13107 InstructionCost OriginalVecCost =
13108 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13109 BaseLI->getPointerAddressSpace(), CostKind,
13110 TTI::OperandValueInfo()) +
13111 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13112 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13113 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13114 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13115 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13116 // Strided load is more profitable than consecutive load + reverse -
13117 // transform the node to strided load.
13118 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13119 ->getPointerOperand()
13120 ->getType());
13121 StridedPtrInfo SPtrInfo;
13122 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13123 SPtrInfo.Ty = VecTy;
13124 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13125 E.State = TreeEntry::StridedVectorize;
13126 }
13127 }
13128 break;
13129 }
13130 case Instruction::Store: {
13131 Type *ScalarTy =
13132 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13133 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13134 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13135 // Check if profitable to represent reverse + consecutive store as strided
13136 // store with stride -1.
13137 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13138 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13139 SmallVector<int> Mask;
13140 inversePermutation(E.ReorderIndices, Mask);
13141 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13142 InstructionCost OriginalVecCost =
13143 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13144 BaseSI->getPointerAddressSpace(), CostKind,
13145 TTI::OperandValueInfo()) +
13146 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13147 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13148 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13149 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13150 if (StridedCost < OriginalVecCost)
13151 // Strided store is more profitable than reverse + consecutive store -
13152 // transform the node to strided store.
13153 E.State = TreeEntry::StridedVectorize;
13154 } else if (!E.ReorderIndices.empty()) {
13155 // Check for interleaved stores.
13156 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13157 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13158 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13159 if (Mask.size() < 4)
13160 return 0u;
13161 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13162 if (ShuffleVectorInst::isInterleaveMask(
13163 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13164 TTI.isLegalInterleavedAccessType(
13165 VecTy, Factor, BaseSI->getAlign(),
13166 BaseSI->getPointerAddressSpace()))
13167 return Factor;
13168 }
13169
13170 return 0u;
13171 };
13172 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13173 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13174 if (InterleaveFactor != 0)
13175 E.setInterleave(InterleaveFactor);
13176 }
13177 break;
13178 }
13179 case Instruction::Select: {
13180 if (E.State != TreeEntry::Vectorize)
13181 break;
13182 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13183 if (MinMaxID == Intrinsic::not_intrinsic)
13184 break;
13185 // This node is a minmax node.
13186 E.CombinedOp = TreeEntry::MinMax;
13187 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13188 if (SelectOnly && CondEntry->UserTreeIndex &&
13189 CondEntry->State == TreeEntry::Vectorize) {
13190 // The condition node is part of the combined minmax node.
13191 CondEntry->State = TreeEntry::CombinedVectorize;
13192 }
13193 break;
13194 }
13195 case Instruction::FSub:
13196 case Instruction::FAdd: {
13197 // Check if possible to convert (a*b)+c to fma.
13198 if (E.State != TreeEntry::Vectorize ||
13199 !E.getOperations().isAddSubLikeOp())
13200 break;
13201 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13202 .isValid())
13203 break;
13204 // This node is a fmuladd node.
13205 E.CombinedOp = TreeEntry::FMulAdd;
13206 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13207 if (FMulEntry->UserTreeIndex &&
13208 FMulEntry->State == TreeEntry::Vectorize) {
13209 // The FMul node is part of the combined fmuladd node.
13210 FMulEntry->State = TreeEntry::CombinedVectorize;
13211 }
13212 break;
13213 }
13214 default:
13215 break;
13216 }
13217 }
13218
13219 if (LoadEntriesToVectorize.empty()) {
13220 // Single load node - exit.
13221 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13222 VectorizableTree.front()->getOpcode() == Instruction::Load)
13223 return;
13224 // Small graph with small VF - exit.
13225 constexpr unsigned SmallTree = 3;
13226 constexpr unsigned SmallVF = 2;
13227 if ((VectorizableTree.size() <= SmallTree &&
13228 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13229 (VectorizableTree.size() <= 2 && UserIgnoreList))
13230 return;
13231
13232 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13233 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13234 getCanonicalGraphSize() <= SmallTree &&
13235 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13236 [](const std::unique_ptr<TreeEntry> &TE) {
13237 return TE->isGather() && TE->hasState() &&
13238 TE->getOpcode() == Instruction::Load &&
13239 !allSameBlock(TE->Scalars);
13240 }) == 1)
13241 return;
13242 }
13243
13244 // A list of loads to be gathered during the vectorization process. We can
13245 // try to vectorize them at the end, if profitable.
13246 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13247 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13248 GatheredLoads;
13249
13250 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13251 TreeEntry &E = *TE;
13252 if (E.isGather() &&
13253 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13254 (!E.hasState() && any_of(E.Scalars,
13255 [&](Value *V) {
13256 return isa<LoadInst>(V) &&
13257 !isVectorized(V) &&
13258 !isDeleted(cast<Instruction>(V));
13259 }))) &&
13260 !isSplat(E.Scalars)) {
13261 for (Value *V : E.Scalars) {
13262 auto *LI = dyn_cast<LoadInst>(V);
13263 if (!LI)
13264 continue;
13265 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13266 continue;
13267 gatherPossiblyVectorizableLoads(
13268 *this, V, *DL, *SE, *TTI,
13269 GatheredLoads[std::make_tuple(
13270 LI->getParent(),
13271 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13272 LI->getType())]);
13273 }
13274 }
13275 }
13276 // Try to vectorize gathered loads if this is not just a gather of loads.
13277 if (!GatheredLoads.empty())
13278 tryToVectorizeGatheredLoads(GatheredLoads);
13279}
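
// Illustrative sketch (not part of the pass): one transform above replaces
// "consecutive loads + reverse shuffle" (ReorderIndices {3,2,1,0}) with a
// single strided load that starts at the last scalar's pointer and walks
// backwards, so the lanes come back already reversed. The hypothetical helper
// below shows the lane-to-address mapping for an element size ElemSize and a
// negative stride of one element.
[[maybe_unused]] static void
modelReverseStridedAddresses(uint64_t LastElemAddr, uint64_t ElemSize,
                             unsigned NumLanes,
                             SmallVectorImpl<uint64_t> &LaneAddrs) {
  LaneAddrs.clear();
  for (unsigned Lane = 0; Lane < NumLanes; ++Lane)
    LaneAddrs.push_back(LastElemAddr - Lane * ElemSize);
}
// For LastElemAddr = 0x100, ElemSize = 4 and 4 lanes this yields
// {0x100, 0xfc, 0xf8, 0xf4}: the vector is filled in reversed memory order
// without a separate SK_Reverse shuffle.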
13280
13281/// Merges shuffle masks and emits final shuffle instruction, if required. It
13282/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
13283/// when the actual shuffle instruction is generated only if this is actually
13284/// required. Otherwise, the shuffle instruction emission is delayed till the
13285/// end of the process, to reduce the number of emitted instructions and further
13286/// analysis/transformations.
13287class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13288 bool IsFinalized = false;
13289 SmallVector<int> CommonMask;
13290 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13291 const TargetTransformInfo &TTI;
13292 InstructionCost Cost = 0;
13293 SmallDenseSet<Value *> VectorizedVals;
13294 BoUpSLP &R;
13295 SmallPtrSetImpl<Value *> &CheckedExtracts;
13296 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13297 /// While set, still trying to estimate the cost for the same nodes and we
13298 /// can delay actual cost estimation (virtual shuffle instruction emission).
13299 /// May help better estimate the cost if same nodes must be permuted + allows
13300 /// to move most of the long shuffles cost estimation to TTI.
13301 bool SameNodesEstimated = true;
13302
13303 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13304 if (Ty->getScalarType()->isPointerTy()) {
13305 Constant *Res = ConstantExpr::getIntToPtr(
13306 Constant::getAllOnesValue(
13307 IntegerType::get(Ty->getContext(),
13308 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13309 Ty->getScalarType());
13310 if (auto *VTy = dyn_cast<VectorType>(Ty))
13311 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13312 return Res;
13313 }
13314 return Constant::getAllOnesValue(Ty);
13315 }
13316
13317 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13318 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13319 return TTI::TCC_Free;
13320 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13321 InstructionCost GatherCost = 0;
13322 SmallVector<Value *> Gathers(VL);
13323 if (!Root && isSplat(VL)) {
13324 // Found the broadcasting of the single scalar, calculate the cost as
13325 // the broadcast.
13326 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13327 assert(It != VL.end() && "Expected at least one non-undef value.");
13328 // Add broadcast for non-identity shuffle only.
13329 bool NeedShuffle =
13330 count(VL, *It) > 1 &&
13331 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13332 if (!NeedShuffle) {
13333 if (isa<FixedVectorType>(ScalarTy)) {
13334 assert(SLPReVec && "FixedVectorType is not expected.");
13335 return TTI.getShuffleCost(
13336 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13337 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13338 cast<FixedVectorType>(ScalarTy));
13339 }
13340 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13341 CostKind, std::distance(VL.begin(), It),
13342 PoisonValue::get(VecTy), *It);
13343 }
13344
13345 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13346 transform(VL, ShuffleMask.begin(), [](Value *V) {
13347 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13348 });
13349 InstructionCost InsertCost =
13350 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13351 PoisonValue::get(VecTy), *It);
13352 return InsertCost + ::getShuffleCost(TTI,
13353 TTI::SK_Broadcast,
13354 VecTy, ShuffleMask, CostKind,
13355 /*Index=*/0, /*SubTp=*/nullptr,
13356 /*Args=*/*It);
13357 }
13358 return GatherCost +
13359 (all_of(Gathers, IsaPred<UndefValue>)
13360 ? TTI::TCC_Free
13361 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13362 ScalarTy));
13363 };
13364
13365 /// Compute the cost of creating a vector containing the extracted values from
13366 /// \p VL.
13367 InstructionCost
13368 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13369 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13370 unsigned NumParts) {
13371 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13372 unsigned NumElts =
13373 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13374 auto *EE = dyn_cast<ExtractElementInst>(V);
13375 if (!EE)
13376 return Sz;
13377 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13378 if (!VecTy)
13379 return Sz;
13380 return std::max(Sz, VecTy->getNumElements());
13381 });
13382 // FIXME: this must be moved to TTI for better estimation.
13383 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13384 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13385 SmallVectorImpl<unsigned> &Indices,
13386 SmallVectorImpl<unsigned> &SubVecSizes)
13387 -> std::optional<TTI::ShuffleKind> {
13388 if (NumElts <= EltsPerVector)
13389 return std::nullopt;
13390 int OffsetReg0 =
13391 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13392 [](int S, int I) {
13393 if (I == PoisonMaskElem)
13394 return S;
13395 return std::min(S, I);
13396 }),
13397 EltsPerVector);
13398 int OffsetReg1 = OffsetReg0;
13399 DenseSet<int> RegIndices;
13400 // Check if we are trying to permute the same single or 2 input vectors.
13401 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13402 int FirstRegId = -1;
13403 Indices.assign(1, OffsetReg0);
13404 for (auto [Pos, I] : enumerate(Mask)) {
13405 if (I == PoisonMaskElem)
13406 continue;
13407 int Idx = I - OffsetReg0;
13408 int RegId =
13409 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13410 if (FirstRegId < 0)
13411 FirstRegId = RegId;
13412 RegIndices.insert(RegId);
13413 if (RegIndices.size() > 2)
13414 return std::nullopt;
13415 if (RegIndices.size() == 2) {
13416 ShuffleKind = TTI::SK_PermuteTwoSrc;
13417 if (Indices.size() == 1) {
13418 OffsetReg1 = alignDown(
13419 std::accumulate(
13420 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13421 [&](int S, int I) {
13422 if (I == PoisonMaskElem)
13423 return S;
13424 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13425 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13426 if (RegId == FirstRegId)
13427 return S;
13428 return std::min(S, I);
13429 }),
13430 EltsPerVector);
13431 unsigned Index = OffsetReg1 % NumElts;
13432 Indices.push_back(Index);
13433 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13434 }
13435 Idx = I - OffsetReg1;
13436 }
13437 I = (Idx % NumElts) % EltsPerVector +
13438 (RegId == FirstRegId ? 0 : EltsPerVector);
13439 }
13440 return ShuffleKind;
13441 };
13442 InstructionCost Cost = 0;
13443
13444 // Process extracts in blocks of EltsPerVector to check if the source vector
13445 // operand can be re-used directly. If not, add the cost of creating a
13446 // shuffle to extract the values into a vector register.
13447 for (unsigned Part : seq<unsigned>(NumParts)) {
13448 if (!ShuffleKinds[Part])
13449 continue;
13450 ArrayRef<int> MaskSlice = Mask.slice(
13451 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13452 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13453 copy(MaskSlice, SubMask.begin());
13454 SmallVector<unsigned, 2> Indices;
13455 SmallVector<unsigned, 2> SubVecSizes;
13456 std::optional<TTI::ShuffleKind> RegShuffleKind =
13457 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13458 if (!RegShuffleKind) {
13459 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13460 !ShuffleVectorInst::isIdentityMask(
13461 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13462 Cost +=
13463 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13464 getWidenedType(ScalarTy, NumElts), MaskSlice);
13465 continue;
13466 }
13467 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13468 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13469 Cost +=
13470 ::getShuffleCost(TTI, *RegShuffleKind,
13471 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13472 }
13473 const unsigned BaseVF = getFullVectorNumberOfElements(
13474 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13475 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13476 assert((Idx + SubVecSize) <= BaseVF &&
13477 "SK_ExtractSubvector index out of range");
13478 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13479 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13480 Idx, getWidenedType(ScalarTy, SubVecSize));
13481 }
13482 // Second attempt: check if just a permute is estimated as cheaper than
13483 // a subvector extract.
13484 SubMask.assign(NumElts, PoisonMaskElem);
13485 copy(MaskSlice, SubMask.begin());
13486 InstructionCost OriginalCost = ::getShuffleCost(
13487 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13488 if (OriginalCost < Cost)
13489 Cost = OriginalCost;
13490 }
13491 return Cost;
13492 }
13493 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13494 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13495 /// elements.
13496 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13497 ArrayRef<int> Mask, unsigned Part,
13498 unsigned SliceSize) {
13499 if (SameNodesEstimated) {
13500 // Delay the cost estimation if the same nodes are reshuffling.
13501 // If we already requested the cost of reshuffling of E1 and E2 before, no
13502 // need to estimate another cost with the sub-Mask, instead include this
13503 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13504 // estimation.
13505 if ((InVectors.size() == 2 &&
13506 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13507 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13508 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13509 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13510 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13511 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13512 "Expected all poisoned elements.");
13513 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13514 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13515 return;
13516 }
13517 // Found non-matching nodes - need to estimate the cost for the matched
13518 // and transform mask.
13519 Cost += createShuffle(InVectors.front(),
13520 InVectors.size() == 1 ? nullptr : InVectors.back(),
13521 CommonMask);
13522 transformMaskAfterShuffle(CommonMask, CommonMask);
13523 } else if (InVectors.size() == 2) {
13524 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13525 transformMaskAfterShuffle(CommonMask, CommonMask);
13526 }
13527 SameNodesEstimated = false;
13528 if (!E2 && InVectors.size() == 1) {
13529 unsigned VF = E1.getVectorFactor();
13530 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13531 VF = std::max(VF, getVF(V1));
13532 } else {
13533 const auto *E = cast<const TreeEntry *>(InVectors.front());
13534 VF = std::max(VF, E->getVectorFactor());
13535 }
13536 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13537 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13538 CommonMask[Idx] = Mask[Idx] + VF;
13539 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13540 transformMaskAfterShuffle(CommonMask, CommonMask);
13541 } else {
13542 auto P = InVectors.front();
13543 Cost += createShuffle(&E1, E2, Mask);
13544 unsigned VF = Mask.size();
13545 if (Value *V1 = dyn_cast<Value *>(P)) {
13546 VF = std::max(VF,
13547 getNumElements(V1->getType()));
13548 } else {
13549 const auto *E = cast<const TreeEntry *>(P);
13550 VF = std::max(VF, E->getVectorFactor());
13551 }
13552 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13553 if (Mask[Idx] != PoisonMaskElem)
13554 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13555 Cost += createShuffle(P, InVectors.front(), CommonMask);
13556 transformMaskAfterShuffle(CommonMask, CommonMask);
13557 }
13558 }
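// Note: the SameNodesEstimated path above means that several Part slices of
// the same (E1, E2) pair are folded into one CommonMask and priced with a
// single createShuffle call later, rather than once per slice.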
13559
13560 class ShuffleCostBuilder {
13561 const TargetTransformInfo &TTI;
13562
13563 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13564 int Index = -1;
13565 return Mask.empty() ||
13566 (VF == Mask.size() &&
13567 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13568 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13569 Index == 0);
13570 }
13571
13572 public:
13573 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13574 ~ShuffleCostBuilder() = default;
13575 InstructionCost createShuffleVector(Value *V1, Value *,
13576 ArrayRef<int> Mask) const {
13577 // Empty mask or identity mask are free.
13578 unsigned VF =
13579 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13580 if (isEmptyOrIdentity(Mask, VF))
13581 return TTI::TCC_Free;
13582 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13583 cast<VectorType>(V1->getType()), Mask);
13584 }
13585 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13586 // Empty mask or identity mask are free.
13587 unsigned VF =
13588 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13589 if (isEmptyOrIdentity(Mask, VF))
13590 return TTI::TCC_Free;
13591 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13592 cast<VectorType>(V1->getType()), Mask);
13593 }
13594 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13595 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13596 return TTI::TCC_Free;
13597 }
13598 void resizeToMatch(Value *&, Value *&) const {}
13599 };
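// ShuffleCostBuilder acts as the cost-model counterpart of the IR-emitting
// shuffle builder used with BaseShuffleAnalysis::createShuffle: it returns
// TTI costs instead of creating instructions. Empty masks, identity masks and
// masks that extract the leading subvector are treated as free; e.g. for a
// 4-element source, <0,1,2,3> costs TTI::TCC_Free while <1,0,3,2> is priced
// as an SK_PermuteSingleSrc shuffle.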
13600
13601 /// Smart shuffle instruction emission, walks through shuffles trees and
13602 /// tries to find the best matching vector for the actual shuffle
13603 /// instruction.
13604 InstructionCost
13605 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13606 const PointerUnion<Value *, const TreeEntry *> &P2,
13607 ArrayRef<int> Mask) {
13608 ShuffleCostBuilder Builder(TTI);
13609 SmallVector<int> CommonMask(Mask);
13610 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13611 unsigned CommonVF = Mask.size();
13612 InstructionCost ExtraCost = 0;
13613 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13614 unsigned VF) -> InstructionCost {
13615 if (E.isGather() && allConstant(E.Scalars))
13616 return TTI::TCC_Free;
13617 Type *EScalarTy = E.Scalars.front()->getType();
13618 bool IsSigned = true;
13619 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13620 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13621 IsSigned = It->second.second;
13622 }
13623 if (EScalarTy != ScalarTy) {
13624 unsigned CastOpcode = Instruction::Trunc;
13625 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13626 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13627 if (DstSz > SrcSz)
13628 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13629 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13630 getWidenedType(EScalarTy, VF),
13631 TTI::CastContextHint::None, CostKind);
13632 }
13633 return TTI::TCC_Free;
13634 };
13635 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13636 if (isa<Constant>(V))
13637 return TTI::TCC_Free;
13638 auto *VecTy = cast<VectorType>(V->getType());
13639 Type *EScalarTy = VecTy->getElementType();
13640 if (EScalarTy != ScalarTy) {
13641 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13642 unsigned CastOpcode = Instruction::Trunc;
13643 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13644 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13645 if (DstSz > SrcSz)
13646 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13647 return TTI.getCastInstrCost(
13648 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13649 VecTy, TTI::CastContextHint::None, CostKind);
13650 }
13651 return TTI::TCC_Free;
13652 };
13653 if (!V1 && !V2 && !P2.isNull()) {
13654 // Shuffle 2 entry nodes.
13655 const TreeEntry *E = cast<const TreeEntry *>(P1);
13656 unsigned VF = E->getVectorFactor();
13657 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13658 CommonVF = std::max(VF, E2->getVectorFactor());
13659 assert(all_of(Mask,
13660 [=](int Idx) {
13661 return Idx < 2 * static_cast<int>(CommonVF);
13662 }) &&
13663 "All elements in mask must be less than 2 * CommonVF.");
13664 if (E->Scalars.size() == E2->Scalars.size()) {
13665 SmallVector<int> EMask = E->getCommonMask();
13666 SmallVector<int> E2Mask = E2->getCommonMask();
13667 if (!EMask.empty() || !E2Mask.empty()) {
13668 for (int &Idx : CommonMask) {
13669 if (Idx == PoisonMaskElem)
13670 continue;
13671 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13672 Idx = EMask[Idx];
13673 else if (Idx >= static_cast<int>(CommonVF))
13674 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13675 E->Scalars.size();
13676 }
13677 }
13678 CommonVF = E->Scalars.size();
13679 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13680 GetNodeMinBWAffectedCost(*E2, CommonVF);
13681 } else {
13682 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13683 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13684 }
13685 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13686 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13687 } else if (!V1 && P2.isNull()) {
13688 // Shuffle single entry node.
13689 const TreeEntry *E = cast<const TreeEntry *>(P1);
13690 unsigned VF = E->getVectorFactor();
13691 CommonVF = VF;
13692 assert(
13693 all_of(Mask,
13694 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13695 "All elements in mask must be less than CommonVF.");
13696 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13697 SmallVector<int> EMask = E->getCommonMask();
13698 assert(!EMask.empty() && "Expected non-empty common mask.");
13699 for (int &Idx : CommonMask) {
13700 if (Idx != PoisonMaskElem)
13701 Idx = EMask[Idx];
13702 }
13703 CommonVF = E->Scalars.size();
13704 } else if (unsigned Factor = E->getInterleaveFactor();
13705 Factor > 0 && E->Scalars.size() != Mask.size() &&
13706 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13707 Factor)) {
13708 // Deinterleaved nodes are free.
13709 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13710 }
13711 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13712 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13713 // Not identity/broadcast? Try to see if the original vector is better.
13714 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13715 CommonVF == CommonMask.size() &&
13716 any_of(enumerate(CommonMask),
13717 [](const auto &&P) {
13718 return P.value() != PoisonMaskElem &&
13719 static_cast<unsigned>(P.value()) != P.index();
13720 }) &&
13721 any_of(CommonMask,
13722 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13723 SmallVector<int> ReorderMask;
13724 inversePermutation(E->ReorderIndices, ReorderMask);
13725 ::addMask(CommonMask, ReorderMask);
13726 }
13727 } else if (V1 && P2.isNull()) {
13728 // Shuffle single vector.
13729 ExtraCost += GetValueMinBWAffectedCost(V1);
13730 CommonVF = getVF(V1);
13731 assert(
13732 all_of(Mask,
13733 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13734 "All elements in mask must be less than CommonVF.");
13735 } else if (V1 && !V2) {
13736 // Shuffle vector and tree node.
13737 unsigned VF = getVF(V1);
13738 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13739 CommonVF = std::max(VF, E2->getVectorFactor());
13740 assert(all_of(Mask,
13741 [=](int Idx) {
13742 return Idx < 2 * static_cast<int>(CommonVF);
13743 }) &&
13744 "All elements in mask must be less than 2 * CommonVF.");
13745 if (E2->Scalars.size() == VF && VF != CommonVF) {
13746 SmallVector<int> E2Mask = E2->getCommonMask();
13747 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13748 for (int &Idx : CommonMask) {
13749 if (Idx == PoisonMaskElem)
13750 continue;
13751 if (Idx >= static_cast<int>(CommonVF))
13752 Idx = E2Mask[Idx - CommonVF] + VF;
13753 }
13754 CommonVF = VF;
13755 }
13756 ExtraCost += GetValueMinBWAffectedCost(V1);
13757 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13758 ExtraCost += GetNodeMinBWAffectedCost(
13759 *E2, std::min(CommonVF, E2->getVectorFactor()));
13760 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13761 } else if (!V1 && V2) {
13762 // Shuffle vector and tree node.
13763 unsigned VF = getVF(V2);
13764 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13765 CommonVF = std::max(VF, E1->getVectorFactor());
13766 assert(all_of(Mask,
13767 [=](int Idx) {
13768 return Idx < 2 * static_cast<int>(CommonVF);
13769 }) &&
13770 "All elements in mask must be less than 2 * CommonVF.");
13771 if (E1->Scalars.size() == VF && VF != CommonVF) {
13772 SmallVector<int> E1Mask = E1->getCommonMask();
13773 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13774 for (int &Idx : CommonMask) {
13775 if (Idx == PoisonMaskElem)
13776 continue;
13777 if (Idx >= static_cast<int>(CommonVF))
13778 Idx = E1Mask[Idx - CommonVF] + VF;
13779 else
13780 Idx = E1Mask[Idx];
13781 }
13782 CommonVF = VF;
13783 }
13784 ExtraCost += GetNodeMinBWAffectedCost(
13785 *E1, std::min(CommonVF, E1->getVectorFactor()));
13786 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13787 ExtraCost += GetValueMinBWAffectedCost(V2);
13788 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13789 } else {
13790 assert(V1 && V2 && "Expected both vectors.");
13791 unsigned VF = getVF(V1);
13792 CommonVF = std::max(VF, getVF(V2));
13793 assert(all_of(Mask,
13794 [=](int Idx) {
13795 return Idx < 2 * static_cast<int>(CommonVF);
13796 }) &&
13797 "All elements in mask must be less than 2 * CommonVF.");
13798 ExtraCost +=
13799 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13800 if (V1->getType() != V2->getType()) {
13801 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13802 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13803 } else {
13804 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13805 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13806 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13807 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13808 }
13809 }
13810 InVectors.front() =
13811 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13812 if (InVectors.size() == 2)
13813 InVectors.pop_back();
13814 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13815 V1, V2, CommonMask, Builder, ScalarTy);
13816 }
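// createShuffle covers every operand combination reaching it: two tree
// entries, a single tree entry, a single vector value, a vector value paired
// with a tree entry (in either order), and two vector values. Placeholder
// null / all-ones constants of the common width stand in for the eventual
// vectors, so only the shuffle and min-bitwidth cast costs are accumulated.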
13817
13818public:
13819 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13820 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13821 SmallPtrSetImpl<Value *> &CheckedExtracts)
13822 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13823 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13824 CheckedExtracts(CheckedExtracts) {}
13825 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13826 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13827 unsigned NumParts, bool &UseVecBaseAsInput) {
13828 UseVecBaseAsInput = false;
13829 if (Mask.empty())
13830 return nullptr;
13831 Value *VecBase = nullptr;
13832 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13833 if (!E->ReorderIndices.empty()) {
13834 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13835 E->ReorderIndices.end());
13836 reorderScalars(VL, ReorderMask);
13837 }
13838 // Check if this node can be considered reused, i.e. the same
13839 // extractelements were already vectorized in an earlier tree entry.
13840 bool PrevNodeFound = any_of(
13841 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13842 [&](const std::unique_ptr<TreeEntry> &TE) {
13843 return ((TE->hasState() && !TE->isAltShuffle() &&
13844 TE->getOpcode() == Instruction::ExtractElement) ||
13845 TE->isGather()) &&
13846 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13847 return VL.size() > Data.index() &&
13848 (Mask[Data.index()] == PoisonMaskElem ||
13849 isa<UndefValue>(VL[Data.index()]) ||
13850 Data.value() == VL[Data.index()]);
13851 });
13852 });
13853 SmallPtrSet<Value *, 4> UniqueBases;
13854 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13855 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13856 for (unsigned Part : seq<unsigned>(NumParts)) {
13857 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13858 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13859 for (auto [I, V] :
13860 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13861 // Ignore non-extractelement scalars.
13862 if (isa<UndefValue>(V) ||
13863 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13864 continue;
13865 // If all users of the instruction are going to be vectorized and this
13866 // instruction itself is not going to be vectorized, consider this
13867 // instruction as dead and remove its cost from the final cost of the
13868 // vectorized tree.
13869 // Also, avoid adjusting the cost for extractelements with multiple uses
13870 // in different graph entries.
13871 auto *EE = cast<ExtractElementInst>(V);
13872 VecBase = EE->getVectorOperand();
13873 UniqueBases.insert(VecBase);
13874 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13875 if (!CheckedExtracts.insert(V).second ||
13876 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13877 any_of(EE->users(),
13878 [&](User *U) {
13879 return isa<GetElementPtrInst>(U) &&
13880 !R.areAllUsersVectorized(cast<Instruction>(U),
13881 &VectorizedVals);
13882 }) ||
13883 (!VEs.empty() && !is_contained(VEs, E)))
13884 continue;
13885 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13886 if (!EEIdx)
13887 continue;
13888 unsigned Idx = *EEIdx;
13889 // Take credit for instruction that will become dead.
13890 if (EE->hasOneUse() || !PrevNodeFound) {
13891 Instruction *Ext = EE->user_back();
13892 if (isa<SExtInst, ZExtInst>(Ext) &&
13893 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13894 // Use getExtractWithExtendCost() to calculate the cost of
13895 // extractelement/ext pair.
13896 Cost -= TTI.getExtractWithExtendCost(
13897 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13898 Idx, CostKind);
13899 // Add back the cost of s|zext which is subtracted separately.
13900 Cost += TTI.getCastInstrCost(
13901 Ext->getOpcode(), Ext->getType(), EE->getType(),
13902 TTI::CastContextHint::None, CostKind, Ext);
13903 continue;
13904 }
13905 }
13906 APInt &DemandedElts =
13907 VectorOpsToExtracts
13908 .try_emplace(VecBase,
13909 APInt::getZero(getNumElements(VecBase->getType())))
13910 .first->getSecond();
13911 DemandedElts.setBit(Idx);
13912 }
13913 }
13914 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13915 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13916 DemandedElts, /*Insert=*/false,
13917 /*Extract=*/true, CostKind);
13918 // Check that the gather of extractelements can be represented as just a
13919 // shuffle of one or two vectors from which the scalars are extracted.
13920 // We found a bunch of extractelement instructions that must be gathered
13921 // into a vector and that can be represented as a permutation of elements
13922 // of one or two input vectors.
13923 // This step is skipped if the same extractelements were already vectorized.
13924 if (!PrevNodeFound)
13925 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
13926 InVectors.assign(1, E);
13927 CommonMask.assign(Mask.begin(), Mask.end());
13928 transformMaskAfterShuffle(CommonMask, CommonMask);
13929 SameNodesEstimated = false;
13930 if (NumParts != 1 && UniqueBases.size() != 1) {
13931 UseVecBaseAsInput = true;
13932 VecBase =
13933 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13934 }
13935 return VecBase;
13936 }
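// adjustExtracts adjusts the cost in three ways: extractelements that become
// dead after vectorization get credit (including the ext-user case via
// getExtractWithExtendCost), the per-source-vector extraction overhead is
// subtracted via getScalarizationOverhead, and, unless a previous node
// already covered the same extracts, the price of shuffling the source
// vectors into the gathered form is added via computeExtractCost.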
13937 /// Checks if the specified entry \p E needs to be delayed because of its
13938 /// dependency nodes.
13939 std::optional<InstructionCost>
13940 needToDelay(const TreeEntry *,
13941 ArrayRef<SmallVector<const TreeEntry *>>) const {
13942 // No need to delay the cost estimation during analysis.
13943 return std::nullopt;
13944 }
13945 /// Reset the builder to handle perfect diamond match.
13946 void resetForSameNode() {
13947 IsFinalized = false;
13948 CommonMask.clear();
13949 InVectors.clear();
13950 Cost = 0;
13951 VectorizedVals.clear();
13952 SameNodesEstimated = true;
13953 }
13954 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
13955 if (&E1 == &E2) {
13956 assert(all_of(Mask,
13957 [&](int Idx) {
13958 return Idx < static_cast<int>(E1.getVectorFactor());
13959 }) &&
13960 "Expected single vector shuffle mask.");
13961 add(E1, Mask);
13962 return;
13963 }
13964 if (InVectors.empty()) {
13965 CommonMask.assign(Mask.begin(), Mask.end());
13966 InVectors.assign({&E1, &E2});
13967 return;
13968 }
13969 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13970 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13971 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13972 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13973 const auto *It =
13974 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13975 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13976 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
13977 }
13978 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
13979 if (InVectors.empty()) {
13980 CommonMask.assign(Mask.begin(), Mask.end());
13981 InVectors.assign(1, &E1);
13982 return;
13983 }
13984 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13985 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13986 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13987 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13988 const auto *It =
13989 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13990 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13991 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
13992 if (!SameNodesEstimated && InVectors.size() == 1)
13993 InVectors.emplace_back(&E1);
13994 }
13995 /// Adds 2 input vectors and the mask for their shuffling.
13996 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
13997 // May come only for shuffling of 2 vectors with extractelements, already
13998 // handled in adjustExtracts.
13999 assert(InVectors.size() == 1 &&
14000 all_of(enumerate(CommonMask),
14001 [&](auto P) {
14002 if (P.value() == PoisonMaskElem)
14003 return Mask[P.index()] == PoisonMaskElem;
14004 auto *EI = cast<ExtractElementInst>(
14005 cast<const TreeEntry *>(InVectors.front())
14006 ->getOrdered(P.index()));
14007 return EI->getVectorOperand() == V1 ||
14008 EI->getVectorOperand() == V2;
14009 }) &&
14010 "Expected extractelement vectors.");
14011 }
14012 /// Adds another one input vector and the mask for the shuffling.
14013 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14014 if (InVectors.empty()) {
14015 assert(CommonMask.empty() && !ForExtracts &&
14016 "Expected empty input mask/vectors.");
14017 CommonMask.assign(Mask.begin(), Mask.end());
14018 InVectors.assign(1, V1);
14019 return;
14020 }
14021 if (ForExtracts) {
14022 // No need to add vectors here, already handled them in adjustExtracts.
14023 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14024 !CommonMask.empty() &&
14025 all_of(enumerate(CommonMask),
14026 [&](auto P) {
14027 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14028 ->getOrdered(P.index());
14029 if (P.value() == PoisonMaskElem)
14030 return P.value() == Mask[P.index()] ||
14031 isa<UndefValue>(Scalar);
14032 if (isa<Constant>(V1))
14033 return true;
14034 auto *EI = cast<ExtractElementInst>(Scalar);
14035 return EI->getVectorOperand() == V1;
14036 }) &&
14037 "Expected only tree entry for extractelement vectors.");
14038 return;
14039 }
14040 assert(!InVectors.empty() && !CommonMask.empty() &&
14041 "Expected only tree entries from extracts/reused buildvectors.");
14042 unsigned VF = getVF(V1);
14043 if (InVectors.size() == 2) {
14044 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14045 transformMaskAfterShuffle(CommonMask, CommonMask);
14046 VF = std::max<unsigned>(VF, CommonMask.size());
14047 } else if (const auto *InTE =
14048 InVectors.front().dyn_cast<const TreeEntry *>()) {
14049 VF = std::max(VF, InTE->getVectorFactor());
14050 } else {
14051 VF = std::max(
14052 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14053 ->getNumElements());
14054 }
14055 InVectors.push_back(V1);
14056 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14057 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14058 CommonMask[Idx] = Mask[Idx] + VF;
14059 }
14060 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14061 Value *Root = nullptr) {
14062 Cost += getBuildVectorCost(VL, Root);
14063 if (!Root) {
14064 // FIXME: Need to find a way to avoid use of getNullValue here.
14065 SmallVector<Constant *> Vals;
14066 unsigned VF = VL.size();
14067 if (MaskVF != 0)
14068 VF = std::min(VF, MaskVF);
14069 Type *VLScalarTy = VL.front()->getType();
14070 for (Value *V : VL.take_front(VF)) {
14071 Type *ScalarTy = VLScalarTy->getScalarType();
14072 if (isa<PoisonValue>(V)) {
14073 Vals.push_back(PoisonValue::get(ScalarTy));
14074 continue;
14075 }
14076 if (isa<UndefValue>(V)) {
14077 Vals.push_back(UndefValue::get(ScalarTy));
14078 continue;
14079 }
14080 Vals.push_back(Constant::getNullValue(ScalarTy));
14081 }
14082 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14083 assert(SLPReVec && "FixedVectorType is not expected.");
14084 // When REVEC is enabled, we need to expand vector types into scalar
14085 // types.
14086 Vals = replicateMask(Vals, VecTy->getNumElements());
14087 }
14088 return ConstantVector::get(Vals);
14089 }
14090 }
14091 return ConstantVector::getSplat(ElementCount::getFixed(
14092 cast<FixedVectorType>(Root->getType())->getNumElements()),
14093 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14094 }
14095 }
14096 /// Finalize emission of the shuffles.
14097 InstructionCost finalize(
14098 ArrayRef<int> ExtMask,
14099 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14100 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14101 function_ref<void(Value *&, SmallVectorImpl<int> &,
14102 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14103 Action = {}) {
14104 IsFinalized = true;
14105 if (Action) {
14106 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14107 if (InVectors.size() == 2)
14108 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14109 else
14110 Cost += createShuffle(Vec, nullptr, CommonMask);
14111 transformMaskAfterShuffle(CommonMask, CommonMask);
14112 assert(VF > 0 &&
14113 "Expected vector length for the final value before action.");
14114 Value *V = cast<Value *>(Vec);
14115 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14116 Cost += createShuffle(V1, V2, Mask);
14117 return V1;
14118 });
14119 InVectors.front() = V;
14120 }
14121 if (!SubVectors.empty()) {
14122 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14123 if (InVectors.size() == 2)
14124 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14125 else
14126 Cost += createShuffle(Vec, nullptr, CommonMask);
14127 transformMaskAfterShuffle(CommonMask, CommonMask);
14128 // Add subvectors permutation cost.
14129 if (!SubVectorsMask.empty()) {
14130 assert(SubVectorsMask.size() <= CommonMask.size() &&
14131 "Expected same size of masks for subvectors and common mask.");
14132 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14133 copy(SubVectorsMask, SVMask.begin());
14134 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14135 if (I2 != PoisonMaskElem) {
14136 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14137 I1 = I2 + CommonMask.size();
14138 }
14139 }
14140 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14141 getWidenedType(ScalarTy, CommonMask.size()),
14142 SVMask, CostKind);
14143 }
14144 for (auto [E, Idx] : SubVectors) {
14145 Type *EScalarTy = E->Scalars.front()->getType();
14146 bool IsSigned = true;
14147 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14148 EScalarTy =
14149 IntegerType::get(EScalarTy->getContext(), It->second.first);
14150 IsSigned = It->second.second;
14151 }
14152 if (ScalarTy != EScalarTy) {
14153 unsigned CastOpcode = Instruction::Trunc;
14154 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14155 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14156 if (DstSz > SrcSz)
14157 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14158 Cost += TTI.getCastInstrCost(
14159 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14160 getWidenedType(EScalarTy, E->getVectorFactor()),
14161 TTI::CastContextHint::None, CostKind);
14162 }
14163 Cost += ::getShuffleCost(
14164 TTI, TTI::SK_InsertSubvector,
14165 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14166 getWidenedType(ScalarTy, E->getVectorFactor()));
14167 if (!CommonMask.empty()) {
14168 std::iota(std::next(CommonMask.begin(), Idx),
14169 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14170 Idx);
14171 }
14172 }
14173 }
14174
14175 if (!ExtMask.empty()) {
14176 if (CommonMask.empty()) {
14177 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14178 } else {
14179 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14180 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14181 if (ExtMask[I] == PoisonMaskElem)
14182 continue;
14183 NewMask[I] = CommonMask[ExtMask[I]];
14184 }
14185 CommonMask.swap(NewMask);
14186 }
14187 }
14188 if (CommonMask.empty()) {
14189 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14190 return Cost;
14191 }
14192 return Cost +
14193 createShuffle(InVectors.front(),
14194 InVectors.size() == 2 ? InVectors.back() : nullptr,
14195 CommonMask);
14196 }
14197
14198 ~ShuffleCostEstimator() {
14199 assert((IsFinalized || CommonMask.empty()) &&
14200 "Shuffle construction must be finalized.");
14201 }
14202};
14203
14204const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14205 unsigned Idx) const {
14206 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14207 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14208 return Op;
14209}
14210
14211TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14212 if (TE.State == TreeEntry::ScatterVectorize ||
14213 TE.State == TreeEntry::StridedVectorize)
14214 return TTI::CastContextHint::GatherScatter;
14215 if (TE.State == TreeEntry::CompressVectorize)
14216 return TTI::CastContextHint::Masked;
14217 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14218 !TE.isAltShuffle()) {
14219 if (TE.ReorderIndices.empty())
14220 return TTI::CastContextHint::Normal;
14221 SmallVector<int> Mask;
14222 inversePermutation(TE.ReorderIndices, Mask);
14223 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14224 return TTI::CastContextHint::Reversed;
14225 }
14226 return TTI::CastContextHint::None;
14227}
14228
14229 InstructionCost
14230BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14231 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14232 ArrayRef<Value *> VL = E->Scalars;
14233
14234 Type *ScalarTy = getValueType(VL[0]);
14235 if (!isValidElementType(ScalarTy))
14236 return InstructionCost::getInvalid();
14238
14239 // If we have computed a smaller type for the expression, update VecTy so
14240 // that the costs will be accurate.
14241 auto It = MinBWs.find(E);
14242 Type *OrigScalarTy = ScalarTy;
14243 if (It != MinBWs.end()) {
14244 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14245 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14246 if (VecTy)
14247 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14248 }
14249 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14250 unsigned EntryVF = E->getVectorFactor();
14251 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14252
14253 if (E->isGather()) {
14254 if (allConstant(VL))
14255 return 0;
14256 if (isa<InsertElementInst>(VL[0]))
14257 return InstructionCost::getInvalid();
14258 if (isa<CmpInst>(VL.front()))
14259 ScalarTy = VL.front()->getType();
14260 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14261 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14262 }
14263 if (E->State == TreeEntry::SplitVectorize) {
14264 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14265 "Expected exactly 2 combined entries.");
14266 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14267 InstructionCost VectorCost = 0;
14268 if (E->ReorderIndices.empty()) {
14269 VectorCost = ::getShuffleCost(
14270 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14271 E->CombinedEntriesWithIndices.back().second,
14272 getWidenedType(
14273 ScalarTy,
14274 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14275 ->getVectorFactor()));
14276 } else {
14277 unsigned CommonVF =
14278 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14279 ->getVectorFactor(),
14280 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14281 ->getVectorFactor());
14282 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14283 getWidenedType(ScalarTy, CommonVF),
14284 E->getSplitMask(), CostKind);
14285 }
14286 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14287 return VectorCost;
14288 }
14289 InstructionCost CommonCost = 0;
14290 SmallVector<int> Mask;
14291 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14292 (E->State != TreeEntry::StridedVectorize ||
14293 !isReverseOrder(E->ReorderIndices))) {
14294 SmallVector<int> NewMask;
14295 if (E->getOpcode() == Instruction::Store) {
14296 // For stores the order is actually a mask.
14297 NewMask.resize(E->ReorderIndices.size());
14298 copy(E->ReorderIndices, NewMask.begin());
14299 } else {
14300 inversePermutation(E->ReorderIndices, NewMask);
14301 }
14302 ::addMask(Mask, NewMask);
14303 }
14304 if (!E->ReuseShuffleIndices.empty())
14305 ::addMask(Mask, E->ReuseShuffleIndices);
14306 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14307 CommonCost =
14308 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14309 assert((E->State == TreeEntry::Vectorize ||
14310 E->State == TreeEntry::ScatterVectorize ||
14311 E->State == TreeEntry::StridedVectorize ||
14312 E->State == TreeEntry::CompressVectorize) &&
14313 "Unhandled state");
14314 assert(E->getOpcode() &&
14315 ((allSameType(VL) && allSameBlock(VL)) ||
14316 (E->getOpcode() == Instruction::GetElementPtr &&
14317 E->getMainOp()->getType()->isPointerTy()) ||
14318 E->hasCopyableElements()) &&
14319 "Invalid VL");
14320 Instruction *VL0 = E->getMainOp();
14321 unsigned ShuffleOrOp =
14322 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14323 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14324 ShuffleOrOp = E->CombinedOp;
14325 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14326 const unsigned Sz = UniqueValues.size();
14327 SmallBitVector UsedScalars(Sz, false);
14328 for (unsigned I = 0; I < Sz; ++I) {
14329 if (isa<Instruction>(UniqueValues[I]) &&
14330 !E->isCopyableElement(UniqueValues[I]) &&
14331 getTreeEntries(UniqueValues[I]).front() == E)
14332 continue;
14333 UsedScalars.set(I);
14334 }
14335 auto GetCastContextHint = [&](Value *V) {
14336 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14337 return getCastContextHint(*OpTEs.front());
14338 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14339 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14340 !SrcState.isAltShuffle())
14341 return TTI::CastContextHint::GatherScatter;
14342 return TTI::CastContextHint::None;
14343 };
14344 auto GetCostDiff =
14345 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14346 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14347 // Calculate the cost of this instruction.
14348 InstructionCost ScalarCost = 0;
14349 if (isa<CastInst, CallInst>(VL0)) {
14350 // For some of the instructions no need to calculate cost for each
14351 // particular instruction, we can use the cost of the single
14352 // instruction x total number of scalar instructions.
14353 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14354 } else {
14355 for (unsigned I = 0; I < Sz; ++I) {
14356 if (UsedScalars.test(I))
14357 continue;
14358 ScalarCost += ScalarEltCost(I);
14359 }
14360 }
14361
14362 InstructionCost VecCost = VectorCost(CommonCost);
14363 // Check if the current node must be resized, if the parent node is not
14364 // resized.
14365 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14366 E->Idx != 0 &&
14367 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14368 const EdgeInfo &EI = E->UserTreeIndex;
14369 if (!EI.UserTE->hasState() ||
14370 EI.UserTE->getOpcode() != Instruction::Select ||
14371 EI.EdgeIdx != 0) {
14372 auto UserBWIt = MinBWs.find(EI.UserTE);
14373 Type *UserScalarTy =
14374 (EI.UserTE->isGather() ||
14375 EI.UserTE->State == TreeEntry::SplitVectorize)
14376 ? EI.UserTE->Scalars.front()->getType()
14377 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14378 if (UserBWIt != MinBWs.end())
14379 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14380 UserBWIt->second.first);
14381 if (ScalarTy != UserScalarTy) {
14382 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14383 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14384 unsigned VecOpcode;
14385 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14386 if (BWSz > SrcBWSz)
14387 VecOpcode = Instruction::Trunc;
14388 else
14389 VecOpcode =
14390 It->second.second ? Instruction::SExt : Instruction::ZExt;
14391 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14392 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14393 CostKind);
14394 }
14395 }
14396 }
14397 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14398 ScalarCost, "Calculated costs for Tree"));
14399 return VecCost - ScalarCost;
14400 };
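// GetCostDiff returns (vector cost, including the common reorder/reuse
// shuffle) minus the summed scalar cost of the unique scalars owned by this
// entry, so a negative result means the vector form is expected to be cheaper
// for this node.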
14401 // Calculate cost difference from vectorizing set of GEPs.
14402 // Negative value means vectorizing is profitable.
14403 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14404 assert((E->State == TreeEntry::Vectorize ||
14405 E->State == TreeEntry::StridedVectorize ||
14406 E->State == TreeEntry::CompressVectorize) &&
14407 "Entry state expected to be Vectorize, StridedVectorize or "
14408 "MaskedLoadCompressVectorize here.");
14409 InstructionCost ScalarCost = 0;
14410 InstructionCost VecCost = 0;
14411 std::tie(ScalarCost, VecCost) = getGEPCosts(
14412 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14413 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14414 "Calculated GEPs cost for Tree"));
14415
14416 return VecCost - ScalarCost;
14417 };
14418
14419 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14420 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14421 if (MinMaxID == Intrinsic::not_intrinsic)
14422 return InstructionCost::getInvalid();
14423 Type *CanonicalType = Ty;
14424 if (CanonicalType->isPtrOrPtrVectorTy())
14425 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14426 CanonicalType->getContext(),
14427 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14428
14429 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14430 {CanonicalType, CanonicalType});
14431 InstructionCost IntrinsicCost =
14432 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14433 // If the selects are the only uses of the compares, they will be
14434 // dead and we can adjust the cost by removing their cost.
14435 if (VI && SelectOnly) {
14436 assert((!Ty->isVectorTy() || SLPReVec) &&
14437 "Expected only for scalar type.");
14438 auto *CI = cast<CmpInst>(VI->getOperand(0));
14439 IntrinsicCost -= TTI->getCmpSelInstrCost(
14440 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14441 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14442 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14443 }
14444 return IntrinsicCost;
14445 };
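// GetMinMaxCost prices the bundle as a min/max intrinsic when the
// compare+select pattern allows it; pointer types are canonicalized to
// same-width integers first, and when the selects are the only users of the
// compares, the compare cost is credited back since those compares die.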
14446 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14447 Instruction *VI) {
14448 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14449 return Cost;
14450 };
14451 switch (ShuffleOrOp) {
14452 case Instruction::PHI: {
14453 // Count reused scalars.
14454 InstructionCost ScalarCost = 0;
14455 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14456 for (Value *V : UniqueValues) {
14457 auto *PHI = dyn_cast<PHINode>(V);
14458 if (!PHI)
14459 continue;
14460
14461 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14462 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14463 Value *Op = PHI->getIncomingValue(I);
14464 Operands[I] = Op;
14465 }
14466 if (const TreeEntry *OpTE =
14467 getSameValuesTreeEntry(Operands.front(), Operands))
14468 if (CountedOps.insert(OpTE).second &&
14469 !OpTE->ReuseShuffleIndices.empty())
14470 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14471 OpTE->Scalars.size());
14472 }
14473
14474 return CommonCost - ScalarCost;
14475 }
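// For PHIs no per-instruction scalar cost is counted; only the reuse-shuffle
// overhead of already-vectorized incoming operands is credited against the
// common permutation cost of this node.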
14476 case Instruction::ExtractValue:
14477 case Instruction::ExtractElement: {
14478 APInt DemandedElts;
14479 VectorType *SrcVecTy = nullptr;
14480 auto GetScalarCost = [&](unsigned Idx) {
14481 if (isa<PoisonValue>(UniqueValues[Idx]))
14482 return TTI::TCC_Free;
14483
14484 auto *I = cast<Instruction>(UniqueValues[Idx]);
14485 if (!SrcVecTy) {
14486 if (ShuffleOrOp == Instruction::ExtractElement) {
14487 auto *EE = cast<ExtractElementInst>(I);
14488 SrcVecTy = EE->getVectorOperandType();
14489 } else {
14490 auto *EV = cast<ExtractValueInst>(I);
14491 Type *AggregateTy = EV->getAggregateOperand()->getType();
14492 unsigned NumElts;
14493 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14494 NumElts = ATy->getNumElements();
14495 else
14496 NumElts = AggregateTy->getStructNumElements();
14497 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14498 }
14499 }
14500 if (I->hasOneUse()) {
14501 Instruction *Ext = I->user_back();
14502 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14503 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14504 // Use getExtractWithExtendCost() to calculate the cost of
14505 // extractelement/ext pair.
14506 InstructionCost Cost = TTI->getExtractWithExtendCost(
14507 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14508 CostKind);
14509 // Subtract the cost of s|zext which is subtracted separately.
14510 Cost -= TTI->getCastInstrCost(
14511 Ext->getOpcode(), Ext->getType(), I->getType(),
14512 TTI::CastContextHint::None, CostKind);
14513 return Cost;
14514 }
14515 }
14516 if (DemandedElts.isZero())
14517 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14518 DemandedElts.setBit(*getExtractIndex(I));
14519 return TTI::TCC_Free;
14520 };
14521 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14522 return CommonCost - (DemandedElts.isZero()
14523 ? TTI::TCC_Free
14524 : TTI.getScalarizationOverhead(
14525 SrcVecTy, DemandedElts, /*Insert=*/false,
14526 /*Extract=*/true, CostKind));
14527 };
14528 return GetCostDiff(GetScalarCost, GetVectorCost);
14529 }
14530 case Instruction::InsertElement: {
14531 assert(E->ReuseShuffleIndices.empty() &&
14532 "Unique insertelements only are expected.");
14533 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14534 unsigned const NumElts = SrcVecTy->getNumElements();
14535 unsigned const NumScalars = VL.size();
14536
14537 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14538
14539 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14540 unsigned OffsetBeg = *getElementIndex(VL.front());
14541 unsigned OffsetEnd = OffsetBeg;
14542 InsertMask[OffsetBeg] = 0;
14543 for (auto [I, V] : enumerate(VL.drop_front())) {
14544 unsigned Idx = *getElementIndex(V);
14545 if (OffsetBeg > Idx)
14546 OffsetBeg = Idx;
14547 else if (OffsetEnd < Idx)
14548 OffsetEnd = Idx;
14549 InsertMask[Idx] = I + 1;
14550 }
14551 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14552 if (NumOfParts > 0 && NumOfParts < NumElts)
14553 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14554 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14555 VecScalarsSz;
14556 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14557 unsigned InsertVecSz = std::min<unsigned>(
14558 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14559 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14560 bool IsWholeSubvector =
14561 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14562 // Check if we can safely insert a subvector. If it is not possible, just
14563 // generate a whole-sized vector and shuffle the source vector and the new
14564 // subvector.
14565 if (OffsetBeg + InsertVecSz > VecSz) {
14566 // Align OffsetBeg to generate correct mask.
14567 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14568 InsertVecSz = VecSz;
14569 }
14570
14571 APInt DemandedElts = APInt::getZero(NumElts);
14572 // TODO: Add support for Instruction::InsertValue.
14573 SmallVector<int> Mask;
14574 if (!E->ReorderIndices.empty()) {
14575 inversePermutation(E->ReorderIndices, Mask);
14576 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14577 } else {
14578 Mask.assign(VecSz, PoisonMaskElem);
14579 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14580 }
14581 bool IsIdentity = true;
14582 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14583 Mask.swap(PrevMask);
14584 for (unsigned I = 0; I < NumScalars; ++I) {
14585 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14586 DemandedElts.setBit(InsertIdx);
14587 IsIdentity &= InsertIdx - OffsetBeg == I;
14588 Mask[InsertIdx - OffsetBeg] = I;
14589 }
14590 assert(Offset < NumElts && "Failed to find vector index offset");
14591
14592 InstructionCost Cost = CommonCost;
14593 Cost -=
14594 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14595 /*Insert*/ true, /*Extract*/ false, CostKind);
14596
14597 // First cost - resize to actual vector size if not identity shuffle or
14598 // need to shift the vector.
14599 // Do not calculate the cost if the actual size is the register size and
14600 // we can merge this shuffle with the following SK_Select.
14601 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14602 if (!IsIdentity)
14603 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14604 InsertVecTy, Mask);
14605 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14606 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14607 }));
14608 // Second cost - permutation with subvector, if some elements are from the
14609 // initial vector or inserting a subvector.
14610 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14611 // subvector of ActualVecTy.
14612 SmallBitVector InMask =
14613 isUndefVector(FirstInsert->getOperand(0),
14614 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14615 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14616 if (InsertVecSz != VecSz) {
14617 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14618 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14619 CostKind, OffsetBeg - Offset, InsertVecTy);
14620 } else {
14621 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14622 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14623 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14624 I <= End; ++I)
14625 if (Mask[I] != PoisonMaskElem)
14626 Mask[I] = I + VecSz;
14627 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14628 Mask[I] =
14629 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14630 Cost +=
14631 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14632 }
14633 }
14634 return Cost;
14635 }
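// The insertelement case prices building the inserted subvector: the window
// [OffsetBeg, OffsetEnd] is widened to whole sub-register granularity, the
// scalar insertion overhead is credited via getScalarizationOverhead, a
// permute is added when the inserted elements are not already in identity
// order, and either an SK_InsertSubvector or a two-source permute accounts
// for merging with the original (non-undef) destination vector.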
14636 case Instruction::ZExt:
14637 case Instruction::SExt:
14638 case Instruction::FPToUI:
14639 case Instruction::FPToSI:
14640 case Instruction::FPExt:
14641 case Instruction::PtrToInt:
14642 case Instruction::IntToPtr:
14643 case Instruction::SIToFP:
14644 case Instruction::UIToFP:
14645 case Instruction::Trunc:
14646 case Instruction::FPTrunc:
14647 case Instruction::BitCast: {
14648 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14649 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14650 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14651 unsigned Opcode = ShuffleOrOp;
14652 unsigned VecOpcode = Opcode;
14653 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14654 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14655 // Check if the values are candidates to demote.
14656 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14657 if (SrcIt != MinBWs.end()) {
14658 SrcBWSz = SrcIt->second.first;
14659 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14660 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14661 SrcVecTy =
14662 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14663 }
14664 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14665 if (BWSz == SrcBWSz) {
14666 VecOpcode = Instruction::BitCast;
14667 } else if (BWSz < SrcBWSz) {
14668 VecOpcode = Instruction::Trunc;
14669 } else if (It != MinBWs.end()) {
14670 assert(BWSz > SrcBWSz && "Invalid cast!");
14671 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14672 } else if (SrcIt != MinBWs.end()) {
14673 assert(BWSz > SrcBWSz && "Invalid cast!");
14674 VecOpcode =
14675 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14676 }
14677 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14678 !SrcIt->second.second) {
14679 VecOpcode = Instruction::UIToFP;
14680 }
14681 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14682 assert(Idx == 0 && "Expected 0 index only");
14683 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14684 VL0->getOperand(0)->getType(),
14685 TTI::CastContextHint::None, CostKind);
14686 };
14687 auto GetVectorCost = [=](InstructionCost CommonCost) {
14688 // Do not count cost here if minimum bitwidth is in effect and it is just
14689 // a bitcast (here it is just a noop).
14690 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14691 return CommonCost;
14692 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14693 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14694
14695 bool IsArithmeticExtendedReduction =
14696 E->Idx == 0 && UserIgnoreList &&
14697 all_of(*UserIgnoreList, [](Value *V) {
14698 auto *I = cast<Instruction>(V);
14699 return is_contained({Instruction::Add, Instruction::FAdd,
14700 Instruction::Mul, Instruction::FMul,
14701 Instruction::And, Instruction::Or,
14702 Instruction::Xor},
14703 I->getOpcode());
14704 });
14705 if (IsArithmeticExtendedReduction &&
14706 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14707 return CommonCost;
14708 return CommonCost +
14709 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14710 VecOpcode == Opcode ? VI : nullptr);
14711 };
14712 return GetCostDiff(GetScalarCost, GetVectorCost);
14713 }
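// For casts the vector opcode may differ from the scalar one once minimum
// bitwidth analysis kicks in: it can degrade to a bitcast (free here), a
// trunc, or an s/zext chosen by the recorded signedness. Extensions feeding
// a plain arithmetic reduction at the root are also treated as free, since
// such a reduction can be performed in the narrow type.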
14714 case Instruction::FCmp:
14715 case Instruction::ICmp:
14716 case Instruction::Select: {
14717 CmpPredicate VecPred, SwappedVecPred;
14718 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14719 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14720 match(VL0, MatchCmp))
14721 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14722 else
14723 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14724 ? CmpInst::BAD_FCMP_PREDICATE
14725 : CmpInst::BAD_ICMP_PREDICATE;
14726 auto GetScalarCost = [&](unsigned Idx) {
14727 if (isa<PoisonValue>(UniqueValues[Idx]))
14728 return TTI::TCC_Free;
14729
14730 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14731 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14732 ? CmpInst::BAD_FCMP_PREDICATE
14733 : CmpInst::BAD_ICMP_PREDICATE;
14734 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14735 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14736 !match(VI, MatchCmp)) ||
14737 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14738 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14739 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14740 ? CmpInst::BAD_FCMP_PREDICATE
14741 : CmpInst::BAD_ICMP_PREDICATE;
14742
14743 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14744 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14745 CostKind, getOperandInfo(VI->getOperand(0)),
14746 getOperandInfo(VI->getOperand(1)), VI);
14747 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14748 if (IntrinsicCost.isValid())
14749 ScalarCost = IntrinsicCost;
14750
14751 return ScalarCost;
14752 };
14753 auto GetVectorCost = [&](InstructionCost CommonCost) {
14754 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14755
14756 InstructionCost VecCost =
14757 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14758 CostKind, getOperandInfo(E->getOperand(0)),
14759 getOperandInfo(E->getOperand(1)), VL0);
14760 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14761 auto *CondType =
14762 getWidenedType(SI->getCondition()->getType(), VL.size());
14763 unsigned CondNumElements = CondType->getNumElements();
14764 unsigned VecTyNumElements = getNumElements(VecTy);
14765 assert(VecTyNumElements >= CondNumElements &&
14766 VecTyNumElements % CondNumElements == 0 &&
14767 "Cannot vectorize Instruction::Select");
14768 if (CondNumElements != VecTyNumElements) {
14769 // When the return type is i1 but the source is fixed vector type, we
14770 // need to duplicate the condition value.
14771 VecCost += ::getShuffleCost(
14772 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14773 createReplicatedMask(VecTyNumElements / CondNumElements,
14774 CondNumElements));
14775 }
14776 }
14777 return VecCost + CommonCost;
14778 };
14779 return GetCostDiff(GetScalarCost, GetVectorCost);
14780 }
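// For compares/selects a single vector predicate is required; if the scalar
// predicates disagree, the BAD_FCMP/BAD_ICMP sentinel forces a conservative
// generic estimate. When the scalars form a min/max pattern, the intrinsic
// cost replaces the scalar compare+select cost, and the i1 condition may need
// to be replicated to match the vector width under REVEC.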
14781 case TreeEntry::MinMax: {
14782 auto GetScalarCost = [&](unsigned Idx) {
14783 return GetMinMaxCost(OrigScalarTy);
14784 };
14785 auto GetVectorCost = [&](InstructionCost CommonCost) {
14786 InstructionCost VecCost = GetMinMaxCost(VecTy);
14787 return VecCost + CommonCost;
14788 };
14789 return GetCostDiff(GetScalarCost, GetVectorCost);
14790 }
14791 case TreeEntry::FMulAdd: {
14792 auto GetScalarCost = [&](unsigned Idx) {
14793 if (isa<PoisonValue>(UniqueValues[Idx]))
14794 return TTI::TCC_Free;
14795 return GetFMulAddCost(E->getOperations(),
14796 cast<Instruction>(UniqueValues[Idx]));
14797 };
14798 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14799 FastMathFlags FMF;
14800 FMF.set();
14801 for (Value *V : E->Scalars) {
14802 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14803 FMF &= FPCI->getFastMathFlags();
14804 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14805 FMF &= FPCIOp->getFastMathFlags();
14806 }
14807 }
14808 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14809 {VecTy, VecTy, VecTy}, FMF);
14810 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14811 return VecCost + CommonCost;
14812 };
14813 return GetCostDiff(GetScalarCost, GetVectorCost);
14814 }
14815 case Instruction::FNeg:
14816 case Instruction::Add:
14817 case Instruction::FAdd:
14818 case Instruction::Sub:
14819 case Instruction::FSub:
14820 case Instruction::Mul:
14821 case Instruction::FMul:
14822 case Instruction::UDiv:
14823 case Instruction::SDiv:
14824 case Instruction::FDiv:
14825 case Instruction::URem:
14826 case Instruction::SRem:
14827 case Instruction::FRem:
14828 case Instruction::Shl:
14829 case Instruction::LShr:
14830 case Instruction::AShr:
14831 case Instruction::And:
14832 case Instruction::Or:
14833 case Instruction::Xor: {
14834 auto GetScalarCost = [&](unsigned Idx) {
14835 if (isa<PoisonValue>(UniqueValues[Idx]))
14836 return TTI::TCC_Free;
14837
14838 // We cannot retrieve the operand from UniqueValues[Idx] because an
14839 // interchangeable instruction may be used. The order and the actual
14840 // operand might differ from what is retrieved from UniqueValues[Idx].
14841 Value *Op1 = E->getOperand(0)[Idx];
14842 Value *Op2;
14843 SmallVector<const Value *, 2> Operands(1, Op1);
14844 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14845 Op2 = Op1;
14846 } else {
14847 Op2 = E->getOperand(1)[Idx];
14848 Operands.push_back(Op2);
14849 }
14849 }
14850 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14851 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14852 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14853 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14854 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14855 I && (ShuffleOrOp == Instruction::FAdd ||
14856 ShuffleOrOp == Instruction::FSub)) {
14857 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14858 if (IntrinsicCost.isValid())
14859 ScalarCost = IntrinsicCost;
14860 }
14861 return ScalarCost;
14862 };
14863 auto GetVectorCost = [=](InstructionCost CommonCost) {
14864 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14865 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14866 ArrayRef<Value *> Ops = E->getOperand(I);
14867 if (all_of(Ops, [&](Value *Op) {
14868 auto *CI = dyn_cast<ConstantInt>(Op);
14869 return CI && CI->getValue().countr_one() >= It->second.first;
14870 }))
14871 return CommonCost;
14872 }
14873 }
14874 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14875 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14876 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14877 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14878 Op2Info, {}, nullptr, TLI) +
14879 CommonCost;
14880 };
14881 return GetCostDiff(GetScalarCost, GetVectorCost);
14882 }
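// Note for the And + minimum-bitwidth combination above: if one of the
// operands consists of constants whose low bits (at least the minimized
// width) are all ones, the And becomes a no-op after truncation to the
// minimized type, so only the common shuffle cost is charged for the vector
// form.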
14883 case Instruction::GetElementPtr: {
14884 return CommonCost + GetGEPCostDiff(VL, VL0);
14885 }
14886 case Instruction::Load: {
14887 auto GetScalarCost = [&](unsigned Idx) {
14888 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14889 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14890 VI->getAlign(), VI->getPointerAddressSpace(),
14891 CostKind, TTI::OperandValueInfo(), VI);
14892 };
14893 auto *LI0 = cast<LoadInst>(VL0);
14894 auto GetVectorCost = [&](InstructionCost CommonCost) {
14895 InstructionCost VecLdCost;
14896 switch (E->State) {
14897 case TreeEntry::Vectorize:
14898 if (unsigned Factor = E->getInterleaveFactor()) {
14899 VecLdCost = TTI->getInterleavedMemoryOpCost(
14900 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14901 LI0->getPointerAddressSpace(), CostKind);
14902
14903 } else {
14904 VecLdCost = TTI->getMemoryOpCost(
14905 Instruction::Load, VecTy, LI0->getAlign(),
14906 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14907 }
14908 break;
14909 case TreeEntry::StridedVectorize: {
14910 Align CommonAlignment =
14911 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14912 VecLdCost = TTI->getStridedMemoryOpCost(
14913 Instruction::Load, VecTy, LI0->getPointerOperand(),
14914 /*VariableMask=*/false, CommonAlignment, CostKind);
14915 break;
14916 }
14917 case TreeEntry::CompressVectorize: {
14918 bool IsMasked;
14919 unsigned InterleaveFactor;
14920 SmallVector<int> CompressMask;
14921 VectorType *LoadVecTy;
14922 SmallVector<Value *> Scalars(VL);
14923 if (!E->ReorderIndices.empty()) {
14924 SmallVector<int> Mask(E->ReorderIndices.begin(),
14925 E->ReorderIndices.end());
14926 reorderScalars(Scalars, Mask);
14927 }
14928 SmallVector<Value *> PointerOps(Scalars.size());
14929 for (auto [I, V] : enumerate(Scalars))
14930 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14931 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
14932 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
14933 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
14934 CompressMask, LoadVecTy);
14935 assert(IsVectorized && "Failed to vectorize load");
14936 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
14937 InterleaveFactor, IsMasked);
14938 Align CommonAlignment = LI0->getAlign();
14939 if (InterleaveFactor) {
14940 VecLdCost = TTI->getInterleavedMemoryOpCost(
14941 Instruction::Load, LoadVecTy, InterleaveFactor, {},
14942 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
14943 } else if (IsMasked) {
14944 VecLdCost = TTI->getMaskedMemoryOpCost(
14945 Instruction::Load, LoadVecTy, CommonAlignment,
14946 LI0->getPointerAddressSpace(), CostKind);
14947 // TODO: include this cost into CommonCost.
14948 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14949 LoadVecTy, CompressMask, CostKind);
14950 } else {
14951 VecLdCost = TTI->getMemoryOpCost(
14952 Instruction::Load, LoadVecTy, CommonAlignment,
14953 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14954 // TODO: include this cost into CommonCost.
14955 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14956 LoadVecTy, CompressMask, CostKind);
14957 }
14958 break;
14959 }
14960 case TreeEntry::ScatterVectorize: {
14961 Align CommonAlignment =
14962 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14963 VecLdCost = TTI->getGatherScatterOpCost(
14964 Instruction::Load, VecTy, LI0->getPointerOperand(),
14965 /*VariableMask=*/false, CommonAlignment, CostKind);
14966 break;
14967 }
14968 case TreeEntry::CombinedVectorize:
14969 case TreeEntry::SplitVectorize:
14970 case TreeEntry::NeedToGather:
14971 llvm_unreachable("Unexpected vectorization state.");
14972 }
14973 return VecLdCost + CommonCost;
14974 };
14975
14976 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
14977 // If this node generates a masked gather load, it is not a terminal node.
14978 // Hence the address operand cost is estimated separately.
14979 if (E->State == TreeEntry::ScatterVectorize)
14980 return Cost;
14981
14982 // Estimate the cost of GEPs, since this tree node is a terminal node.
14983 SmallVector<Value *> PointerOps(VL.size());
14984 for (auto [I, V] : enumerate(VL))
14985 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14986 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
14987 }
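// Stores: only consecutive (optionally interleaved or reordered) and strided
// stores are expected here; the cost of the pointer GEPs is added separately
// via GetGEPCostDiff below.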
14988 case Instruction::Store: {
14989 bool IsReorder = !E->ReorderIndices.empty();
14990 auto GetScalarCost = [=](unsigned Idx) {
14991 auto *VI = cast<StoreInst>(VL[Idx]);
14992 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
14993 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
14994 VI->getAlign(), VI->getPointerAddressSpace(),
14995 CostKind, OpInfo, VI);
14996 };
14997 auto *BaseSI =
14998 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
14999 auto GetVectorCost = [=](InstructionCost CommonCost) {
15000 // We know that we can merge the stores. Calculate the cost.
15001 InstructionCost VecStCost;
15002 if (E->State == TreeEntry::StridedVectorize) {
15003 Align CommonAlignment =
15004 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15005 VecStCost = TTI->getStridedMemoryOpCost(
15006 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
15007 /*VariableMask=*/false, CommonAlignment, CostKind);
15008 } else {
15009 assert(E->State == TreeEntry::Vectorize &&
15010 "Expected either strided or consecutive stores.");
15011 if (unsigned Factor = E->getInterleaveFactor()) {
15012 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15013 "No reused shuffles expected");
15014 CommonCost = 0;
15015 VecStCost = TTI->getInterleavedMemoryOpCost(
15016 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15017 BaseSI->getPointerAddressSpace(), CostKind);
15018 } else {
15019 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15020 VecStCost = TTI->getMemoryOpCost(
15021 Instruction::Store, VecTy, BaseSI->getAlign(),
15022 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15023 }
15024 }
15025 return VecStCost + CommonCost;
15026 };
15027 SmallVector<Value *> PointerOps(VL.size());
15028 for (auto [I, V] : enumerate(VL)) {
15029 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15030 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15031 }
15032
15033 return GetCostDiff(GetScalarCost, GetVectorCost) +
15034 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15035 }
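// Calls: the vector cost is the cheaper of a vector intrinsic and a vector
// library call (see getVectorCallCosts); the scalar cost uses the intrinsic
// cost when the call maps to a vectorizable intrinsic.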
15036 case Instruction::Call: {
15037 auto GetScalarCost = [&](unsigned Idx) {
15038 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15039 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15040 if (ID != Intrinsic::not_intrinsic) {
15041 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15042 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15043 }
15044 return TTI->getCallInstrCost(CI->getCalledFunction(),
15045 CI->getFunctionType()->getReturnType(),
15046 CI->getFunctionType()->params(), CostKind);
15047 };
15048 auto GetVectorCost = [=](InstructionCost CommonCost) {
15049 auto *CI = cast<CallInst>(VL0);
15050 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15051 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15052 CI, ID, VecTy->getNumElements(),
15053 It != MinBWs.end() ? It->second.first : 0, TTI);
15054 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15055 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15056 };
15057 return GetCostDiff(GetScalarCost, GetVectorCost);
15058 }
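// Alternate-opcode nodes: two opcodes are vectorized separately and blended
// with a shuffle. For illustration (not from the source), a bundle like
//   %a = fadd float %x0, %y0
//   %b = fsub float %x1, %y1
// is costed as one vector fadd plus one vector fsub plus the blend shuffle,
// unless the target supports a fused alternating instruction (e.g. x86
// addsub), which is queried via isLegalAltInstr()/getAltInstrCost() below.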
15059 case Instruction::ShuffleVector: {
15060 if (!SLPReVec || E->isAltShuffle())
15061 assert(E->isAltShuffle() &&
15062 ((Instruction::isBinaryOp(E->getOpcode()) &&
15063 Instruction::isBinaryOp(E->getAltOpcode())) ||
15064 (Instruction::isCast(E->getOpcode()) &&
15065 Instruction::isCast(E->getAltOpcode())) ||
15066 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15067 "Invalid Shuffle Vector Operand");
15068 // Try to find the previous shuffle node with the same operands and same
15069 // main/alternate ops.
15070 auto TryFindNodeWithEqualOperands = [=]() {
15071 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15072 if (TE.get() == E)
15073 break;
15074 if (TE->hasState() && TE->isAltShuffle() &&
15075 ((TE->getOpcode() == E->getOpcode() &&
15076 TE->getAltOpcode() == E->getAltOpcode()) ||
15077 (TE->getOpcode() == E->getAltOpcode() &&
15078 TE->getAltOpcode() == E->getOpcode())) &&
15079 TE->hasEqualOperands(*E))
15080 return true;
15081 }
15082 return false;
15083 };
15084 auto GetScalarCost = [&](unsigned Idx) {
15085 if (isa<PoisonValue>(UniqueValues[Idx]))
15086 return InstructionCost(TTI::TCC_Free);
15087
15088 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15089 assert(E->getMatchingMainOpOrAltOp(VI) &&
15090 "Unexpected main/alternate opcode");
15091 (void)E;
15092 return TTI->getInstructionCost(VI, CostKind);
15093 };
15094 // Need to clear CommonCost since the final shuffle cost is included into
15095 // vector cost.
15096 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15097 // VecCost is equal to sum of the cost of creating 2 vectors
15098 // and the cost of creating shuffle.
15099 InstructionCost VecCost = 0;
15100 if (TryFindNodeWithEqualOperands()) {
15101 LLVM_DEBUG({
15102 dbgs() << "SLP: diamond match for alternate node found.\n";
15103 E->dump();
15104 });
15105 // No need to add new vector costs here since we're going to reuse
15106 // same main/alternate vector ops, just do different shuffling.
15107 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15108 VecCost =
15109 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15110 VecCost +=
15111 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15112 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15113 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15114 VecCost = TTIRef.getCmpSelInstrCost(
15115 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15116 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15117 VL0);
15118 VecCost += TTIRef.getCmpSelInstrCost(
15119 E->getOpcode(), VecTy, MaskTy,
15120 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15121 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15122 E->getAltOp());
15123 } else {
15124 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15125 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15126 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15127 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15128 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15129 unsigned SrcBWSz =
15130 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15131 if (SrcIt != MinBWs.end()) {
15132 SrcBWSz = SrcIt->second.first;
15133 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15134 SrcTy = getWidenedType(SrcSclTy, VL.size());
15135 }
15136 if (BWSz <= SrcBWSz) {
15137 if (BWSz < SrcBWSz)
15138 VecCost =
15139 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15140 TTI::CastContextHint::None, CostKind);
15141 LLVM_DEBUG({
15142 dbgs()
15143 << "SLP: alternate extension, which should be truncated.\n";
15144 E->dump();
15145 });
15146 return VecCost;
15147 }
15148 }
15149 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15150 TTI::CastContextHint::None, CostKind);
15151 VecCost +=
15152 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15153 TTI::CastContextHint::None, CostKind);
15154 }
15155 SmallVector<int> Mask;
15156 E->buildAltOpShuffleMask(
15157 [&](Instruction *I) {
15158 assert(E->getMatchingMainOpOrAltOp(I) &&
15159 "Unexpected main/alternate opcode");
15160 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15161 *TLI);
15162 },
15163 Mask);
15164 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15165 FinalVecTy, Mask, CostKind);
15166 // Patterns like [fadd,fsub] can be combined into a single instruction
15167 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15168 // need to take their order into account when looking for the most used
15169 // order.
15170 unsigned Opcode0 = E->getOpcode();
15171 unsigned Opcode1 = E->getAltOpcode();
15172 SmallBitVector OpcodeMask(
15173 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15174 // If this pattern is supported by the target then we consider the
15175 // order.
15176 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15177 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15178 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15179 return AltVecCost < VecCost ? AltVecCost : VecCost;
15180 }
15181 // TODO: Check the reverse order too.
15182 return VecCost;
15183 };
15184 if (SLPReVec && !E->isAltShuffle())
15185 return GetCostDiff(
15186 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15187 // If a group uses mask in order, the shufflevector can be
15188 // eliminated by instcombine. Then the cost is 0.
15190 "Not supported shufflevector usage.");
15191 auto *SV = cast<ShuffleVectorInst>(VL.front());
15192 unsigned SVNumElements =
15193 cast<FixedVectorType>(SV->getOperand(0)->getType())
15194 ->getNumElements();
15195 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15196 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15197 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15198 int NextIndex = 0;
15199 if (!all_of(Group, [&](Value *V) {
15201 "Not supported shufflevector usage.");
15202 auto *SV = cast<ShuffleVectorInst>(V);
15203 int Index;
15204 [[maybe_unused]] bool IsExtractSubvectorMask =
15205 SV->isExtractSubvectorMask(Index);
15206 assert(IsExtractSubvectorMask &&
15207 "Not supported shufflevector usage.");
15208 if (NextIndex != Index)
15209 return false;
15210 NextIndex += SV->getShuffleMask().size();
15211 return true;
15212 }))
15213 return ::getShuffleCost(
15215 calculateShufflevectorMask(E->Scalars));
15216 }
15217 return TTI::TCC_Free;
15218 });
15219 return GetCostDiff(GetScalarCost, GetVectorCost);
15220 }
15221 case Instruction::Freeze:
15222 return CommonCost;
15223 default:
15224 llvm_unreachable("Unknown instruction");
15225 }
15226}
15227
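// Returns true if a tree of height 1 or 2 can be considered fully
// vectorizable: either a single vectorizable node, or a vectorizable node
// whose second node is a cheap gather (splat, constants, extractelements
// forming a shuffle, or loads).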
15228bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15229 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15230 << VectorizableTree.size() << " is fully vectorizable.\n");
15231
15232 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15233 SmallVector<int> Mask;
15234 return TE->isGather() &&
15235 !any_of(TE->Scalars,
15236 [this](Value *V) { return EphValues.contains(V); }) &&
15237 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15238 TE->Scalars.size() < Limit ||
15239 (((TE->hasState() &&
15240 TE->getOpcode() == Instruction::ExtractElement) ||
15242 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15243 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15244 !TE->isAltShuffle()) ||
15245 any_of(TE->Scalars, IsaPred<LoadInst>));
15246 };
15247
15248 // We only handle trees of heights 1 and 2.
15249 if (VectorizableTree.size() == 1 &&
15250 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15251 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15252 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15253 (ForReduction &&
15254 AreVectorizableGathers(VectorizableTree[0].get(),
15255 VectorizableTree[0]->Scalars.size()) &&
15256 VectorizableTree[0]->getVectorFactor() > 2)))
15257 return true;
15258
15259 if (VectorizableTree.size() != 2)
15260 return false;
15261
15262 // Handle splat and all-constant stores. Also try to vectorize tiny trees
15263 // whose second gather node has fewer scalar operands than the initial tree
15264 // element (it may be profitable to shuffle the second gather), or whose
15265 // scalars are extractelements that form a shuffle.
15266 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15267 AreVectorizableGathers(VectorizableTree[1].get(),
15268 VectorizableTree[0]->Scalars.size()))
15269 return true;
15270
15271 // Gathering cost would be too much for tiny trees.
15272 if (VectorizableTree[0]->isGather() ||
15273 (VectorizableTree[1]->isGather() &&
15274 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15275 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15276 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15277 return false;
15278
15279 return true;
15280}
15281
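// Looks for an or/shl/zext chain over loads that the backend can later fold
// into a single wide load. For illustration (not from the source), a chain
// like
//   %z0 = zext i8 %l0 to i32
//   %z1 = zext i8 %l1 to i32
//   %s1 = shl i32 %z1, 8
//   %o  = or i32 %z0, %s1
// with %l0/%l1 being loads is a load-combine candidate, so SLP should leave
// it to the backend instead of vectorizing it.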
15282 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15283 TargetTransformInfo *TTI,
15284 bool MustMatchOrInst) {
15285 // Look past the root to find a source value. Arbitrarily follow the
15286 // path through operand 0 of any 'or'. Also, peek through optional
15287 // shift-left-by-multiple-of-8-bits.
15288 Value *ZextLoad = Root;
15289 const APInt *ShAmtC;
15290 bool FoundOr = false;
15291 while (!isa<ConstantExpr>(ZextLoad) &&
15292 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15293 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15294 ShAmtC->urem(8) == 0))) {
15295 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15296 ZextLoad = BinOp->getOperand(0);
15297 if (BinOp->getOpcode() == Instruction::Or)
15298 FoundOr = true;
15299 }
15300 // Check if the input is an extended load of the required or/shift expression.
15301 Value *Load;
15302 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15303 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15304 return false;
15305
15306 // Require that the total load bit width is a legal integer type.
15307 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15308 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15309 Type *SrcTy = Load->getType();
15310 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15311 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15312 return false;
15313
15314 // Everything matched - assume that we can fold the whole sequence using
15315 // load combining.
15316 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15317 << *(cast<Instruction>(Root)) << "\n");
15318
15319 return true;
15320}
15321
15322bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15323 if (RdxKind != RecurKind::Or)
15324 return false;
15325
15326 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15327 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15328 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15329 /* MatchOr */ false);
15330}
15331
15332bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15333 // Peek through a final sequence of stores and check if all operations are
15334 // likely to be load-combined.
15335 unsigned NumElts = Stores.size();
15336 for (Value *Scalar : Stores) {
15337 Value *X;
15338 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15339 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15340 return false;
15341 }
15342 return true;
15343}
15344
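// Returns true if the tree should be skipped: it applies a series of
// structural heuristics (gather-only graphs, phi/buildvector-only graphs,
// small nodes with reuses, etc.) and finally falls back to MinTreeSize and
// isFullyVectorizableTinyTree() for the remaining cases.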
15345bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15346 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15347 return true;
15348
15349 // Graph is empty - do nothing.
15350 if (VectorizableTree.empty()) {
15351 assert(ExternalUses.empty() && "We shouldn't have any external users");
15352
15353 return true;
15354 }
15355
15356 // No need to vectorize inserts of gathered values.
15357 if (VectorizableTree.size() == 2 &&
15358 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15359 VectorizableTree[1]->isGather() &&
15360 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15361 !(isSplat(VectorizableTree[1]->Scalars) ||
15362 allConstant(VectorizableTree[1]->Scalars))))
15363 return true;
15364
15365 // If the graph includes only PHI nodes and gathers, it is definitely not
15366 // profitable for vectorization and we can skip it, provided the cost
15367 // threshold is the default. The cost of vectorized PHI nodes is almost
15368 // always 0 plus the cost of gathers/buildvectors.
15369 constexpr int Limit = 4;
15370 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15371 !VectorizableTree.empty() &&
15372 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15373 return (TE->isGather() &&
15374 (!TE->hasState() ||
15375 TE->getOpcode() != Instruction::ExtractElement) &&
15376 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15377 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15378 }))
15379 return true;
15380
15381 // Do not vectorize a small tree of only phis if all vector phis are also
15382 // gathered.
15383 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15384 VectorizableTree.size() <= Limit &&
15385 all_of(VectorizableTree,
15386 [&](const std::unique_ptr<TreeEntry> &TE) {
15387 return (TE->isGather() &&
15388 (!TE->hasState() ||
15389 TE->getOpcode() != Instruction::ExtractElement) &&
15390 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15391 Limit) ||
15392 (TE->hasState() &&
15393 (TE->getOpcode() == Instruction::InsertElement ||
15394 (TE->getOpcode() == Instruction::PHI &&
15395 all_of(TE->Scalars, [&](Value *V) {
15396 return isa<PoisonValue>(V) || MustGather.contains(V);
15397 }))));
15398 }) &&
15399 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15400 return TE->State == TreeEntry::Vectorize &&
15401 TE->getOpcode() == Instruction::PHI;
15402 }))
15403 return true;
15404
15405 // If the tree contains only phis, buildvectors, split nodes and
15406 // small nodes with reuses, we can skip it.
15407 SmallVector<const TreeEntry *> StoreLoadNodes;
15408 unsigned NumGathers = 0;
15409 constexpr int LimitTreeSize = 36;
15410 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15411 all_of(VectorizableTree,
15412 [&](const std::unique_ptr<TreeEntry> &TE) {
15413 if (!TE->isGather() && TE->hasState() &&
15414 (TE->getOpcode() == Instruction::Load ||
15415 TE->getOpcode() == Instruction::Store)) {
15416 StoreLoadNodes.push_back(TE.get());
15417 return true;
15418 }
15419 if (TE->isGather())
15420 ++NumGathers;
15421 return TE->State == TreeEntry::SplitVectorize ||
15422 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15423 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15424 VectorizableTree.size() > LimitTreeSize) ||
15425 (TE->isGather() &&
15426 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15427 (TE->hasState() &&
15428 (TE->getOpcode() == Instruction::PHI ||
15429 (TE->hasCopyableElements() &&
15430 static_cast<unsigned>(count_if(
15431 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15432 TE->Scalars.size() / 2) ||
15433 ((!TE->ReuseShuffleIndices.empty() ||
15434 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15435 TE->Scalars.size() == 2)));
15436 }) &&
15437 (StoreLoadNodes.empty() ||
15438 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15439 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15440 return TE->getOpcode() == Instruction::Store ||
15441 all_of(TE->Scalars, [&](Value *V) {
15442 return !isa<LoadInst>(V) ||
15443 areAllUsersVectorized(cast<Instruction>(V));
15444 });
15445 })))))
15446 return true;
15447
15448 // If the tree contains only buildvectors plus 2 non-buildvector nodes
15449 // (whose user is the root tree node), we can skip it.
15450 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15451 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15452 VectorizableTree.size() >= Limit &&
15453 count_if(ArrayRef(VectorizableTree).drop_front(),
15454 [&](const std::unique_ptr<TreeEntry> &TE) {
15455 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15456 TE->UserTreeIndex.UserTE->Idx == 0;
15457 }) == 2)
15458 return true;
15459
15460 // If the tree only vectorizes a buildvector and the phi node feeding it
15461 // (everything else is gathered) - skip it.
15462 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15463 VectorizableTree.size() > 2 &&
15464 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15465 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15466 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15467 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15468 all_of(
15469 ArrayRef(VectorizableTree).drop_front(2),
15470 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15471 return true;
15472
15473 // We can vectorize the tree if its size is greater than or equal to the
15474 // minimum size specified by the MinTreeSize command line option.
15475 if (VectorizableTree.size() >= MinTreeSize)
15476 return false;
15477
15478 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15479 // can vectorize it if we can prove it fully vectorizable.
15480 if (isFullyVectorizableTinyTree(ForReduction))
15481 return false;
15482
15483 // Check if any of the gather nodes forms an insertelement buildvector
15484 // somewhere.
15485 bool IsAllowedSingleBVNode =
15486 VectorizableTree.size() > 1 ||
15487 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15488 !VectorizableTree.front()->isAltShuffle() &&
15489 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15490 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15491 allSameBlock(VectorizableTree.front()->Scalars));
15492 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15493 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15494 return isa<ExtractElementInst, Constant>(V) ||
15495 (IsAllowedSingleBVNode &&
15496 !V->hasNUsesOrMore(UsesLimit) &&
15497 any_of(V->users(), IsaPred<InsertElementInst>));
15498 });
15499 }))
15500 return false;
15501
15502 if (VectorizableTree.back()->isGather() &&
15503 VectorizableTree.back()->hasState() &&
15504 VectorizableTree.back()->isAltShuffle() &&
15505 VectorizableTree.back()->getVectorFactor() > 2 &&
15506 allSameBlock(VectorizableTree.back()->Scalars) &&
15507 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15508 TTI->getScalarizationOverhead(
15509 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15510 VectorizableTree.back()->getVectorFactor()),
15511 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15512 /*Insert=*/true, /*Extract=*/false,
15514 return false;
15515
15516 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15517 // vectorizable.
15518 return true;
15519}
15520
15523 constexpr unsigned SmallTree = 3;
15524 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15525 getCanonicalGraphSize() <= SmallTree &&
15526 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15527 [](const std::unique_ptr<TreeEntry> &TE) {
15528 return TE->isGather() && TE->hasState() &&
15529 TE->getOpcode() == Instruction::Load &&
15530 !allSameBlock(TE->Scalars);
15531 }) == 1)
15532 return true;
15533 return false;
15534 }
15535 bool Res = false;
15536 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15537 TreeEntry &E = *VectorizableTree[Idx];
15538 if (E.State == TreeEntry::SplitVectorize)
15539 return false;
15540 if (!E.isGather())
15541 continue;
15542 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15543 (!E.hasState() &&
15545 (isa<ExtractElementInst>(E.Scalars.front()) &&
15546 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15547 return false;
15548 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15549 continue;
15550 Res = true;
15551 }
15552 return Res;
15553}
15554
15555InstructionCost BoUpSLP::getSpillCost() {
15556 // Walk from the bottom of the tree to the top, tracking which values are
15557 // live. When we see a call instruction that is not part of our tree,
15558 // query TTI to see if there is a cost to keeping values live over it
15559 // (for example, if spills and fills are required).
15560
15561 const TreeEntry *Root = VectorizableTree.front().get();
15562 if (Root->isGather())
15563 return 0;
15564
15565 InstructionCost Cost = 0;
15566 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15567 EntriesToOperands;
15568 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15569 SmallPtrSet<const Instruction *, 8> LastInstructions;
15570 for (const auto &TEPtr : VectorizableTree) {
15571 if (!TEPtr->isGather()) {
15572 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15573 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15574 LastInstructions.insert(LastInst);
15575 }
15576 if (TEPtr->UserTreeIndex)
15577 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15578 }
15579
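// Returns true if the intrinsic call does not force spills: assume-like
// intrinsics and intrinsics whose intrinsic cost is cheaper than an actual
// call are expected to be lowered without clobbering registers.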
15580 auto NoCallIntrinsic = [this](const Instruction *I) {
15581 const auto *II = dyn_cast<IntrinsicInst>(I);
15582 if (!II)
15583 return false;
15584 if (II->isAssumeLikeIntrinsic())
15585 return true;
15586 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15587 InstructionCost IntrCost =
15588 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15589 InstructionCost CallCost = TTI->getCallInstrCost(
15590 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15591 return IntrCost < CallCost;
15592 };
15593
15594 // Maps the last instruction of an entry to the last instruction of one of
15595 // its operand entries, plus a flag. If the flag is true, there are no calls
15596 // in between these instructions.
15598 CheckedInstructions;
15599 unsigned Budget = 0;
15600 const unsigned BudgetLimit =
15601 ScheduleRegionSizeBudget / VectorizableTree.size();
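// Walks backwards from Last to First within one basic block and reports
// whether the range is free of non-vectorized calls. Results are memoized in
// CheckedInstructions, and the walk gives up (conservatively treated as
// having a call) once the per-tree budget is exhausted.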
15602 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15603 const Instruction *Last) {
15604 assert(First->getParent() == Last->getParent() &&
15605 "Expected instructions in same block.");
15606 if (auto It = CheckedInstructions.find(Last);
15607 It != CheckedInstructions.end()) {
15608 const Instruction *Checked = It->second.getPointer();
15609 if (Checked == First || Checked->comesBefore(First))
15610 return It->second.getInt() != 0;
15611 Last = Checked;
15612 } else if (Last == First || Last->comesBefore(First)) {
15613 return true;
15614 }
15616 ++First->getIterator().getReverse(),
15617 PrevInstIt =
15618 Last->getIterator().getReverse();
15619 SmallVector<const Instruction *> LastInstsInRange;
15620 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15621 // Debug information does not impact spill cost.
15622 // Vectorized calls, represented as vector intrinsics, do not impact spill
15623 // cost.
15624 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15625 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15626 for (const Instruction *LastInst : LastInstsInRange)
15627 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15628 return false;
15629 }
15630 if (LastInstructions.contains(&*PrevInstIt))
15631 LastInstsInRange.push_back(&*PrevInstIt);
15632
15633 ++PrevInstIt;
15634 ++Budget;
15635 }
15636 for (const Instruction *LastInst : LastInstsInRange)
15637 CheckedInstructions.try_emplace(
15638 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15639 Budget <= BudgetLimit ? 1 : 0);
15640 return Budget <= BudgetLimit;
15641 };
15642 auto AddCosts = [&](const TreeEntry *Op) {
15643 Type *ScalarTy = Op->Scalars.front()->getType();
15644 auto It = MinBWs.find(Op);
15645 if (It != MinBWs.end())
15646 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15647 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15648 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15649 if (ScalarTy->isVectorTy()) {
15650 // Handle revec dead vector instructions.
15651 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15652 }
15653 };
15654 // Memoize the relationship between blocks, i.e. whether there is (at least
15655 // one) non-vectorized call between the blocks. This allows skipping the
15656 // analysis of the same block paths multiple times.
15658 ParentOpParentToPreds;
15659 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15660 BasicBlock *OpParent) {
15661 auto Key = std::make_pair(Root, OpParent);
15662 if (auto It = ParentOpParentToPreds.find(Key);
15663 It != ParentOpParentToPreds.end())
15664 return It->second;
15666 if (Pred)
15667 Worklist.push_back(Pred);
15668 else
15669 Worklist.append(pred_begin(Root), pred_end(Root));
15672 ParentsPairsToAdd;
15673 bool Res = false;
15674 auto Cleanup = make_scope_exit([&]() {
15675 for (const auto &KeyPair : ParentsPairsToAdd) {
15676 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15677 "Should not have been added before.");
15678 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15679 }
15680 });
15681 while (!Worklist.empty()) {
15682 BasicBlock *BB = Worklist.pop_back_val();
15683 if (BB == OpParent || !Visited.insert(BB).second)
15684 continue;
15685 auto Pair = std::make_pair(BB, OpParent);
15686 if (auto It = ParentOpParentToPreds.find(Pair);
15687 It != ParentOpParentToPreds.end()) {
15688 Res = It->second;
15689 return Res;
15690 }
15691 ParentsPairsToAdd.insert(Pair);
15692 unsigned BlockSize = BB->size();
15693 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15694 return Res;
15695 Budget += BlockSize;
15696 if (Budget > BudgetLimit)
15697 return Res;
15698 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15699 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15700 BB->getTerminator()))
15701 return Res;
15702 Worklist.append(pred_begin(BB), pred_end(BB));
15703 }
15704 Res = true;
15705 return Res;
15706 };
15707 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15708 while (!LiveEntries.empty()) {
15709 const TreeEntry *Entry = LiveEntries.pop_back_val();
15710 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15711 if (Operands.empty())
15712 continue;
15713 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15714 BasicBlock *Parent = LastInst->getParent();
15715 for (const TreeEntry *Op : Operands) {
15716 if (!Op->isGather())
15717 LiveEntries.push_back(Op);
15718 if (Entry->State == TreeEntry::SplitVectorize ||
15719 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15720 (Op->isGather() && allConstant(Op->Scalars)))
15721 continue;
15722 Budget = 0;
15723 BasicBlock *Pred = nullptr;
15724 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15725 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15726 BasicBlock *OpParent;
15727 Instruction *OpLastInst;
15728 if (Op->isGather()) {
15729 assert(Entry->getOpcode() == Instruction::PHI &&
15730 "Expected phi node only.");
15731 OpParent = cast<PHINode>(Entry->getMainOp())
15732 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15733 OpLastInst = OpParent->getTerminator();
15734 for (Value *V : Op->Scalars) {
15735 auto *Inst = dyn_cast<Instruction>(V);
15736 if (!Inst)
15737 continue;
15738 if (isVectorized(V)) {
15739 OpParent = Inst->getParent();
15740 OpLastInst = Inst;
15741 break;
15742 }
15743 }
15744 } else {
15745 OpLastInst = EntriesToLastInstruction.at(Op);
15746 OpParent = OpLastInst->getParent();
15747 }
15748 // Check the call instructions within the same basic blocks.
15749 if (OpParent == Parent) {
15750 if (Entry->getOpcode() == Instruction::PHI) {
15751 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15752 AddCosts(Op);
15753 continue;
15754 }
15755 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15756 AddCosts(Op);
15757 continue;
15758 }
15759 // Check for call instruction in between blocks.
15760 // 1. Check entry's block to the head.
15761 if (Entry->getOpcode() != Instruction::PHI &&
15762 !CheckForNonVecCallsInSameBlock(
15763 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15764 LastInst)) {
15765 AddCosts(Op);
15766 continue;
15767 }
15768 // 2. Check op's block from the end.
15769 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15770 OpParent->getTerminator())) {
15771 AddCosts(Op);
15772 continue;
15773 }
15774 // 3. Check the predecessors of entry's block till op's block.
15775 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15776 AddCosts(Op);
15777 continue;
15778 }
15779 }
15780 }
15781
15782 return Cost;
15783}
15784
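// For illustration (not from the source): in the buildvector chain
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0, float %b, i32 1
// the first insert %v0 is followed by %v1, so the check below returns true
// for (IE1 = %v0, IE2 = %v1) and false for the swapped pair.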
15785/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
15786/// the buildvector sequence.
15787static bool isFirstInsertElement(const InsertElementInst *IE1,
15788 const InsertElementInst *IE2) {
15789 if (IE1 == IE2)
15790 return false;
15791 const auto *I1 = IE1;
15792 const auto *I2 = IE2;
15793 const InsertElementInst *PrevI1;
15794 const InsertElementInst *PrevI2;
15795 unsigned Idx1 = *getElementIndex(IE1);
15796 unsigned Idx2 = *getElementIndex(IE2);
15797 do {
15798 if (I2 == IE1)
15799 return true;
15800 if (I1 == IE2)
15801 return false;
15802 PrevI1 = I1;
15803 PrevI2 = I2;
15804 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15805 getElementIndex(I1).value_or(Idx2) != Idx2)
15806 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15807 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15808 getElementIndex(I2).value_or(Idx1) != Idx1)
15809 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15810 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15811 llvm_unreachable("Two different buildvectors not expected.");
15812}
15813
15814namespace {
15815/// Returns the incoming Value * if the requested type is Value * too, or a
15816/// default-constructed value otherwise.
15817struct ValueSelect {
15818 template <typename U>
15819 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15820 return V;
15821 }
15822 template <typename U>
15823 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15824 return U();
15825 }
15826};
15827} // namespace
15828
15829/// Does the analysis of the provided shuffle masks and performs the requested
15830/// actions on the vectors with the given shuffle masks. It tries to do it in
15831/// several steps.
15832/// 1. If the Base vector is not an undef vector, resize the very first mask
15833/// to a common VF and perform the action for 2 input vectors (including the
15834/// non-undef Base). Other shuffle masks are combined with the result of the
15835/// first stage and processed as a shuffle of 2 elements.
15836/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15837/// the action only for 1 vector with the given mask, if it is not the identity
15838/// mask.
15839/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15840/// vectors, combining the masks properly between the steps.
15841template <typename T>
15843 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15844 function_ref<unsigned(T *)> GetVF,
15845 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15847 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15848 SmallVector<int> Mask(ShuffleMask.begin()->second);
15849 auto VMIt = std::next(ShuffleMask.begin());
15850 T *Prev = nullptr;
15851 SmallBitVector UseMask =
15852 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15853 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15854 if (!IsBaseUndef.all()) {
15855 // Base is not undef, need to combine it with the next subvectors.
15856 std::pair<T *, bool> Res =
15857 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15858 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15859 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15860 if (Mask[Idx] == PoisonMaskElem)
15861 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15862 else
15863 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15864 }
15865 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15866 assert((!V || GetVF(V) == Mask.size()) &&
15867 "Expected base vector of VF number of elements.");
15868 Prev = Action(Mask, {nullptr, Res.first});
15869 } else if (ShuffleMask.size() == 1) {
15870 // Base is undef and only 1 vector is shuffled - perform the action only
15871 // for a single vector, if the mask is not the identity mask.
15872 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15873 /*ForSingleMask=*/true);
15874 if (Res.second)
15875 // Identity mask is found.
15876 Prev = Res.first;
15877 else
15878 Prev = Action(Mask, {ShuffleMask.begin()->first});
15879 } else {
15880 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15881 // shuffles step by step, combining shuffle between the steps.
15882 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15883 unsigned Vec2VF = GetVF(VMIt->first);
15884 if (Vec1VF == Vec2VF) {
15885 // No need to resize the input vectors since they are of the same size, we
15886 // can shuffle them directly.
15887 ArrayRef<int> SecMask = VMIt->second;
15888 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15889 if (SecMask[I] != PoisonMaskElem) {
15890 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15891 Mask[I] = SecMask[I] + Vec1VF;
15892 }
15893 }
15894 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15895 } else {
15896 // Vectors of different sizes - resize and reshuffle.
15897 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15898 /*ForSingleMask=*/false);
15899 std::pair<T *, bool> Res2 =
15900 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15901 ArrayRef<int> SecMask = VMIt->second;
15902 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15903 if (Mask[I] != PoisonMaskElem) {
15904 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15905 if (Res1.second)
15906 Mask[I] = I;
15907 } else if (SecMask[I] != PoisonMaskElem) {
15908 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15909 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15910 }
15911 }
15912 Prev = Action(Mask, {Res1.first, Res2.first});
15913 }
15914 VMIt = std::next(VMIt);
15915 }
15916 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15917 // Perform requested actions for the remaining masks/vectors.
15918 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
15919 // Shuffle other input vectors, if any.
15920 std::pair<T *, bool> Res =
15921 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15922 ArrayRef<int> SecMask = VMIt->second;
15923 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15924 if (SecMask[I] != PoisonMaskElem) {
15925 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
15926 "Multiple uses of scalars.");
15927 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
15928 } else if (Mask[I] != PoisonMaskElem) {
15929 Mask[I] = I;
15930 }
15931 }
15932 Prev = Action(Mask, {Prev, Res.first});
15933 }
15934 return Prev;
15935}
15936
15937namespace {
15938/// Data type for handling buildvector sequences with the reused scalars from
15939/// other tree entries.
15940template <typename T> struct ShuffledInsertData {
15941 /// List of insertelements to be replaced by shuffles.
15942 SmallVector<InsertElementInst *> InsertElements;
15943 /// The parent vectors and shuffle mask for the given list of inserts.
15944 MapVector<T, SmallVector<int>> ValueMasks;
15945};
15946} // namespace
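// Computes the total cost of the vectorized tree: the per-entry costs from
// getEntryCost(), plus the cost of extracting externally used scalars (or
// keeping them scalar when that is cheaper), shuffles feeding external
// insertelement buildvectors, and any extend/truncate needed for minimum
// bitwidth or reduction resizing.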
15947
15948InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
15949 InstructionCost ReductionCost) {
15950 InstructionCost Cost = ReductionCost;
15951 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
15952 << VectorizableTree.size() << ".\n");
15953
15954 SmallPtrSet<Value *, 4> CheckedExtracts;
15955 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
15956 TreeEntry &TE = *VectorizableTree[I];
15957 // No need to count the cost for combined entries; they are combined into
15958 // other nodes, so just skip their cost.
15959 if (TE.State == TreeEntry::CombinedVectorize) {
15960 LLVM_DEBUG(
15961 dbgs() << "SLP: Skipping cost for combined node that starts with "
15962 << *TE.Scalars[0] << ".\n";
15963 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
15964 continue;
15965 }
15966 if (TE.hasState() &&
15967 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
15968 if (const TreeEntry *E =
15969 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
15970 E && E->getVectorFactor() == TE.getVectorFactor()) {
15971 // Some gather nodes might be exactly the same as some vectorizable
15972 // nodes after reordering; this case needs to be handled here.
15973 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
15974 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15975 << "SLP: Current total cost = " << Cost << "\n");
15976 continue;
15977 }
15978 }
15979
15980 // Exclude cost of gather loads nodes which are not used. These nodes were
15981 // built as part of the final attempt to vectorize gathered loads.
15982 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
15983 "Expected gather nodes with users only.");
15984
15985 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
15986 Cost += C;
15987 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
15988 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15989 << "SLP: Current total cost = " << Cost << "\n");
15990 }
15991
15992 if (Cost >= -SLPCostThreshold &&
15993 none_of(ExternalUses, [](const ExternalUser &EU) {
15994 return isa_and_nonnull<InsertElementInst>(EU.User);
15995 }))
15996 return Cost;
15997
15998 SmallPtrSet<Value *, 16> ExtractCostCalculated;
15999 InstructionCost ExtractCost = 0;
16001 SmallVector<APInt> DemandedElts;
16002 SmallDenseSet<Value *, 4> UsedInserts;
16004 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16006 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16007 // Keep track of each {Scalar, Index, User} tuple. On AArch64, this helps
16008 // fuse the mov instruction associated with an extractelement with an fmul
16009 // in the backend, so that the extractelement is free.
16011 for (ExternalUser &EU : ExternalUses) {
16012 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16013 }
16014 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16015 for (ExternalUser &EU : ExternalUses) {
16016 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16017 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16018 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16019 else dbgs() << " User: nullptr\n");
16020 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16021
16022 // Uses by ephemeral values are free (because the ephemeral value will be
16023 // removed prior to code generation, and so the extraction will be
16024 // removed as well).
16025 if (EphValues.count(EU.User))
16026 continue;
16027
16028 // Check if the scalar for the given user, or for all users, is already accounted for.
16029 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16030 (EU.User &&
16031 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16032 continue;
16033
16034 // Skip scalars used in unreachable blocks or in EH pads (rarely executed),
16035 // or whose user block is terminated with an unreachable instruction.
16036 if (BasicBlock *UserParent =
16037 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16038 UserParent &&
16039 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16040 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16041 continue;
16042
16043 // We only add extract cost once for the same scalar.
16044 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16045 !ExtractCostCalculated.insert(EU.Scalar).second)
16046 continue;
16047
16048 // No extract cost for vector "scalar" if REVEC is disabled
16049 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16050 continue;
16051
16052 // If the found user is an insertelement, do not calculate the extract cost
16053 // but try to detect it as a final shuffled/identity match.
16054 // TODO: what if a user is insertvalue when REVEC is enabled?
16055 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16056 VU && VU->getOperand(1) == EU.Scalar) {
16057 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16058 if (!UsedInserts.insert(VU).second)
16059 continue;
16060 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16061 if (InsertIdx) {
16062 const TreeEntry *ScalarTE = &EU.E;
16063 auto *It = find_if(
16064 ShuffledInserts,
16065 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16066 // Checks if 2 insertelements are from the same buildvector.
16067 InsertElementInst *VecInsert = Data.InsertElements.front();
16069 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16070 Value *Op0 = II->getOperand(0);
16071 if (isVectorized(II) && !isVectorized(Op0))
16072 return nullptr;
16073 return Op0;
16074 });
16075 });
16076 int VecId = -1;
16077 if (It == ShuffledInserts.end()) {
16078 auto &Data = ShuffledInserts.emplace_back();
16079 Data.InsertElements.emplace_back(VU);
16080 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16081 VecId = ShuffledInserts.size() - 1;
16082 auto It = MinBWs.find(ScalarTE);
16083 if (It != MinBWs.end() &&
16084 VectorCasts
16085 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16086 .second) {
16087 unsigned BWSz = It->second.first;
16088 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16089 unsigned VecOpcode;
16090 if (DstBWSz < BWSz)
16091 VecOpcode = Instruction::Trunc;
16092 else
16093 VecOpcode =
16094 It->second.second ? Instruction::SExt : Instruction::ZExt;
16096 InstructionCost C = TTI->getCastInstrCost(
16097 VecOpcode, FTy,
16098 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16099 FTy->getNumElements()),
16101 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16102 << " for extending externally used vector with "
16103 "non-equal minimum bitwidth.\n");
16104 Cost += C;
16105 }
16106 } else {
16107 if (isFirstInsertElement(VU, It->InsertElements.front()))
16108 It->InsertElements.front() = VU;
16109 VecId = std::distance(ShuffledInserts.begin(), It);
16110 }
16111 int InIdx = *InsertIdx;
16112 SmallVectorImpl<int> &Mask =
16113 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16114 if (Mask.empty())
16115 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16116 Mask[InIdx] = EU.Lane;
16117 DemandedElts[VecId].setBit(InIdx);
16118 continue;
16119 }
16120 }
16121 }
16122
16124 // If we plan to rewrite the tree in a smaller type, we will need to sign-
16125 // or zero-extend the extracted value back to the original type. Here, we
16126 // account for the extract and the added cost of the extension if needed.
16127 InstructionCost ExtraCost = TTI::TCC_Free;
16128 auto *ScalarTy = EU.Scalar->getType();
16129 const unsigned BundleWidth = EU.E.getVectorFactor();
16130 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16131 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16132 const TreeEntry *Entry = &EU.E;
16133 auto It = MinBWs.find(Entry);
16134 if (It != MinBWs.end()) {
16135 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16136 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16137 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16138 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16139 ? Instruction::ZExt
16140 : Instruction::SExt;
16141 VecTy = getWidenedType(MinTy, BundleWidth);
16142 ExtraCost =
16143 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16144 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16145 << ExtraCost << "\n");
16146 } else {
16147 ExtraCost =
16148 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16149 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16150 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16151 << *VecTy << ": " << ExtraCost << "\n");
16152 }
16153 // Leave the scalar instructions as is if they are cheaper than extracts.
16154 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16155 Entry->getOpcode() == Instruction::Load) {
16156 // Checks if the user of the external scalar is phi in loop body.
16157 auto IsPhiInLoop = [&](const ExternalUser &U) {
16158 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16159 auto *I = cast<Instruction>(U.Scalar);
16160 const Loop *L = LI->getLoopFor(Phi->getParent());
16161 return L && (Phi->getParent() == I->getParent() ||
16162 L == LI->getLoopFor(I->getParent()));
16163 }
16164 return false;
16165 };
16166 if (!ValueToExtUses) {
16167 ValueToExtUses.emplace();
16168 for (const auto &P : enumerate(ExternalUses)) {
16169 // Ignore phis in loops.
16170 if (IsPhiInLoop(P.value()))
16171 continue;
16172
16173 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16174 }
16175 }
16176 // We can use the original instruction if none of its operands are
16177 // vectorized, or if they are already marked as externally used.
16178 auto *Inst = cast<Instruction>(EU.Scalar);
16179 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16180 auto OperandIsScalar = [&](Value *V) {
16181 if (!isVectorized(V)) {
16182 // Some extractelements might not be vectorized but instead
16183 // transformed into a shuffle and removed from the function;
16184 // consider that case here.
16185 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16186 return !EE->hasOneUse() || !MustGather.contains(EE);
16187 return true;
16188 }
16189 return ValueToExtUses->contains(V);
16190 };
16191 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16192 bool CanBeUsedAsScalarCast = false;
16193 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16194 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16195 Op && all_of(Op->operands(), OperandIsScalar)) {
16196 InstructionCost OpCost =
16197 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16198 ? TTI->getInstructionCost(Op, CostKind)
16199 : 0;
16200 if (ScalarCost + OpCost <= ExtraCost) {
16201 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16202 ScalarCost += OpCost;
16203 }
16204 }
16205 }
16206 if (CanBeUsedAsScalar) {
16207 bool KeepScalar = ScalarCost <= ExtraCost;
16208 // Try to keep the original scalar if the user is a phi node from the same
16209 // block as the root phis currently being vectorized. This preserves better
16210 // ordering info for the PHIs being vectorized.
16211 bool IsProfitablePHIUser =
16212 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16213 VectorizableTree.front()->Scalars.size() > 2)) &&
16214 VectorizableTree.front()->hasState() &&
16215 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16216 !Inst->hasNUsesOrMore(UsesLimit) &&
16217 none_of(Inst->users(),
16218 [&](User *U) {
16219 auto *PHIUser = dyn_cast<PHINode>(U);
16220 return (!PHIUser ||
16221 PHIUser->getParent() !=
16222 cast<Instruction>(
16223 VectorizableTree.front()->getMainOp())
16224 ->getParent()) &&
16225 !isVectorized(U);
16226 }) &&
16227 count_if(Entry->Scalars, [&](Value *V) {
16228 return ValueToExtUses->contains(V);
16229 }) <= 2;
16230 if (IsProfitablePHIUser) {
16231 KeepScalar = true;
16232 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16233 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16234 (!GatheredLoadsEntriesFirst.has_value() ||
16235 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16236 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16237 return ValueToExtUses->contains(V);
16238 });
16239 auto It = ExtractsCount.find(Entry);
16240 if (It != ExtractsCount.end()) {
16241 assert(ScalarUsesCount >= It->getSecond().size() &&
16242 "Expected total number of external uses not less than "
16243 "number of scalar uses.");
16244 ScalarUsesCount -= It->getSecond().size();
16245 }
16246 // Keep the original scalar if the number of externally used instructions
16247 // in the same entry is not a power of 2. It may help to do some extra
16248 // vectorization for now.
16249 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16250 }
16251 if (KeepScalar) {
16252 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16253 for (Value *V : Inst->operands()) {
16254 auto It = ValueToExtUses->find(V);
16255 if (It != ValueToExtUses->end()) {
16256 // Replace all uses to avoid compiler crash.
16257 ExternalUses[It->second].User = nullptr;
16258 }
16259 }
16260 ExtraCost = ScalarCost;
16261 if (!IsPhiInLoop(EU))
16262 ExtractsCount[Entry].insert(Inst);
16263 if (CanBeUsedAsScalarCast) {
16264 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16265 // Update the users of the operands of the cast operand to avoid
16266 // compiler crash.
16267 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16268 for (Value *V : IOp->operands()) {
16269 auto It = ValueToExtUses->find(V);
16270 if (It != ValueToExtUses->end()) {
16271 // Replace all uses to avoid compiler crash.
16272 ExternalUses[It->second].User = nullptr;
16273 }
16274 }
16275 }
16276 }
16277 }
16278 }
16279 }
16280
16281 ExtractCost += ExtraCost;
16282 }
16283 // Insert external uses for operands of casts that will be emitted as
16284 // scalars instead of extractelements.
16285 for (Value *V : ScalarOpsFromCasts) {
16286 ExternalUsesAsOriginalScalar.insert(V);
16287 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16288 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16289 TEs.front()->findLaneForValue(V));
16290 }
16291 }
16292 // Add reduced value cost, if resized.
16293 if (!VectorizedVals.empty()) {
16294 const TreeEntry &Root = *VectorizableTree.front();
16295 auto BWIt = MinBWs.find(&Root);
16296 if (BWIt != MinBWs.end()) {
16297 Type *DstTy = Root.Scalars.front()->getType();
16298 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16299 unsigned SrcSz =
16300 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16301 if (OriginalSz != SrcSz) {
16302 unsigned Opcode = Instruction::Trunc;
16303 if (OriginalSz > SrcSz)
16304 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16305 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16306 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16307 assert(SLPReVec && "Only supported by REVEC.");
16308 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16309 }
16310 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16313 }
16314 }
16315 }
16316
16317 Cost += ExtractCost;
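// ResizeToVF adjusts a tree entry's vector factor to the one implied by the
// insertelement mask, adding the cost of the required resize/reshuffle (if
// it is not an identity) to the total cost.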
16318 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16319 bool ForSingleMask) {
16320 InstructionCost C = 0;
16321 unsigned VF = Mask.size();
16322 unsigned VecVF = TE->getVectorFactor();
16323 bool HasLargeIndex =
16324 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16325 if ((VF != VecVF && HasLargeIndex) ||
16327
16328 if (HasLargeIndex) {
16329 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16330 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16331 OrigMask.begin());
16333 getWidenedType(TE->getMainOp()->getType(), VecVF),
16334 OrigMask);
16335 LLVM_DEBUG(
16336 dbgs() << "SLP: Adding cost " << C
16337 << " for final shuffle of insertelement external users.\n";
16338 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16339 Cost += C;
16340 return std::make_pair(TE, true);
16341 }
16342
16343 if (!ForSingleMask) {
16344 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16345 for (unsigned I = 0; I < VF; ++I) {
16346 if (Mask[I] != PoisonMaskElem)
16347 ResizeMask[Mask[I]] = Mask[I];
16348 }
16349 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16352 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16353 LLVM_DEBUG(
16354 dbgs() << "SLP: Adding cost " << C
16355 << " for final shuffle of insertelement external users.\n";
16356 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16357
16358 Cost += C;
16359 }
16360 }
16361 return std::make_pair(TE, false);
16362 };
16363 // Calculate the cost of the reshuffled vectors, if any.
16364 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16365 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16366 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16367 unsigned VF = 0;
16368 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16370 assert((TEs.size() == 1 || TEs.size() == 2) &&
16371 "Expected exactly 1 or 2 tree entries.");
16372 if (TEs.size() == 1) {
16373 if (VF == 0)
16374 VF = TEs.front()->getVectorFactor();
16375 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16376 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16377 !all_of(enumerate(Mask), [=](const auto &Data) {
16378 return Data.value() == PoisonMaskElem ||
16379 (Data.index() < VF &&
16380 static_cast<int>(Data.index()) == Data.value());
16381 })) {
16382 InstructionCost C =
16383 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16384 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16385 << " for final shuffle of insertelement "
16386 "external users.\n";
16387 TEs.front()->dump();
16388 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16389 Cost += C;
16390 }
16391 } else {
16392 if (VF == 0) {
16393 if (TEs.front() &&
16394 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16395 VF = TEs.front()->getVectorFactor();
16396 else
16397 VF = Mask.size();
16398 }
16399 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16400 InstructionCost C =
16401 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16402 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16403 << " for final shuffle of vector node and external "
16404 "insertelement users.\n";
16405 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16406 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16407 Cost += C;
16408 }
16409 VF = Mask.size();
16410 return TEs.back();
16411 };
16412 (void)performExtractsShuffleAction<const TreeEntry>(
16413 MutableArrayRef(Vector.data(), Vector.size()), Base,
16414 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16415 EstimateShufflesCost);
16416 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16417 cast<FixedVectorType>(
16418 ShuffledInserts[I].InsertElements.front()->getType()),
16419 DemandedElts[I],
16420 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16421 Cost -= InsertCost;
16422 }
16423
16424 // Add the cost for reduced value resize (if required).
16425 if (ReductionBitWidth != 0) {
16426 assert(UserIgnoreList && "Expected reduction tree.");
16427 const TreeEntry &E = *VectorizableTree.front();
16428 auto It = MinBWs.find(&E);
16429 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16430 unsigned SrcSize = It->second.first;
16431 unsigned DstSize = ReductionBitWidth;
16432 unsigned Opcode = Instruction::Trunc;
16433 if (SrcSize < DstSize) {
16434 bool IsArithmeticExtendedReduction =
16435 all_of(*UserIgnoreList, [](Value *V) {
16436 auto *I = cast<Instruction>(V);
16437 return is_contained({Instruction::Add, Instruction::FAdd,
16438 Instruction::Mul, Instruction::FMul,
16439 Instruction::And, Instruction::Or,
16440 Instruction::Xor},
16441 I->getOpcode());
16442 });
16443 if (IsArithmeticExtendedReduction)
16444 Opcode =
16445 Instruction::BitCast; // Handle it by getExtendedReductionCost
16446 else
16447 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16448 }
16449 if (Opcode != Instruction::BitCast) {
16450 auto *SrcVecTy =
16451 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16452 auto *DstVecTy =
16453 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16454 TTI::CastContextHint CCH = getCastContextHint(E);
16455 InstructionCost CastCost;
16456 switch (E.getOpcode()) {
16457 case Instruction::SExt:
16458 case Instruction::ZExt:
16459 case Instruction::Trunc: {
16460 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16461 CCH = getCastContextHint(*OpTE);
16462 break;
16463 }
16464 default:
16465 break;
16466 }
16467 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16468 TTI::TCK_RecipThroughput);
16469 Cost += CastCost;
16470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16471 << " for final resize for reduction from " << SrcVecTy
16472 << " to " << DstVecTy << "\n";
16473 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16474 }
16475 }
16476 }
16477
16478 std::optional<InstructionCost> SpillCost;
16479 if (Cost < -SLPCostThreshold) {
16480 SpillCost = getSpillCost();
16481 Cost += *SpillCost;
16482 }
16483#ifndef NDEBUG
16484 SmallString<256> Str;
16485 {
16486 raw_svector_ostream OS(Str);
16487 OS << "SLP: Spill Cost = ";
16488 if (SpillCost)
16489 OS << *SpillCost;
16490 else
16491 OS << "<skipped>";
16492 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16493 << "SLP: Total Cost = " << Cost << ".\n";
16494 }
16495 LLVM_DEBUG(dbgs() << Str);
16496 if (ViewSLPTree)
16497 ViewGraph(this, "SLP" + F->getName(), false, Str);
16498#endif
16499
16500 return Cost;
16501}
16502
16503/// Tries to find extractelement instructions with constant indices from fixed
16504/// vector type and gather such instructions into a bunch, which highly likely
16505/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16506/// successful, the matched scalars are replaced by poison values in \p VL for
16507/// future analysis.
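/// For illustration only (hypothetical IR, not taken from an actual test):
/// \code
///   %e0 = extractelement <4 x i32> %v, i32 0
///   %e1 = extractelement <4 x i32> %v, i32 1
///   ; VL = { %e0, %e1, %x, %y } is recognized as a single-source shuffle of
///   ; %v with mask <0, 1, poison, poison>, and %e0/%e1 are then replaced by
///   ; poison in VL.
/// \endcode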
16508std::optional<TTI::ShuffleKind>
16509 BoUpSLP::tryToGatherSingleRegisterExtractElements(
16510 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16511 // Scan list of gathered scalars for extractelements that can be represented
16512 // as shuffles.
16513 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16514 SmallVector<int> UndefVectorExtracts;
16515 for (int I = 0, E = VL.size(); I < E; ++I) {
16516 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16517 if (!EI) {
16518 if (isa<UndefValue>(VL[I]))
16519 UndefVectorExtracts.push_back(I);
16520 continue;
16521 }
16522 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16523 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16524 continue;
16525 std::optional<unsigned> Idx = getExtractIndex(EI);
16526 // Undefined index.
16527 if (!Idx) {
16528 UndefVectorExtracts.push_back(I);
16529 continue;
16530 }
16531 if (Idx >= VecTy->getNumElements()) {
16532 UndefVectorExtracts.push_back(I);
16533 continue;
16534 }
16535 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16536 ExtractMask.reset(*Idx);
16537 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16538 UndefVectorExtracts.push_back(I);
16539 continue;
16540 }
16541 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16542 }
16543 // Sort the vector operands by the maximum number of uses in extractelements.
16544 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16545 VectorOpToIdx.takeVector();
16546 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16547 return P1.second.size() > P2.second.size();
16548 });
16549 // Find the best pair of the vectors or a single vector.
16550 const int UndefSz = UndefVectorExtracts.size();
16551 unsigned SingleMax = 0;
16552 unsigned PairMax = 0;
16553 if (!Vectors.empty()) {
16554 SingleMax = Vectors.front().second.size() + UndefSz;
16555 if (Vectors.size() > 1) {
16556 auto *ItNext = std::next(Vectors.begin());
16557 PairMax = SingleMax + ItNext->second.size();
16558 }
16559 }
16560 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16561 return std::nullopt;
16562 // Check whether it is better to perform a shuffle of 2 vectors or just of a
16563 // single vector.
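// For example (hypothetical counts): with 4 scalars where 3 are extracted
// from %v0, 1 is extracted from %v1 and there are no undefs, SingleMax == 3
// and PairMax == 4, so shuffling the pair {%v0, %v1} is preferred.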
16564 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16565 SmallVector<Value *> GatheredExtracts(
16566 VL.size(), PoisonValue::get(VL.front()->getType()));
16567 if (SingleMax >= PairMax && SingleMax) {
16568 for (int Idx : Vectors.front().second)
16569 std::swap(GatheredExtracts[Idx], VL[Idx]);
16570 } else if (!Vectors.empty()) {
16571 for (unsigned Idx : {0, 1})
16572 for (int Idx : Vectors[Idx].second)
16573 std::swap(GatheredExtracts[Idx], VL[Idx]);
16574 }
16575 // Add extracts from undefs too.
16576 for (int Idx : UndefVectorExtracts)
16577 std::swap(GatheredExtracts[Idx], VL[Idx]);
16578 // Check that gather of extractelements can be represented as just a
16579 // shuffle of a single/two vectors the scalars are extracted from.
16580 std::optional<TTI::ShuffleKind> Res =
16581 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16582 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16583 // TODO: try to check other subsets if possible.
16584 // Restore the original VL if attempt was not successful.
16585 copy(SavedVL, VL.begin());
16586 return std::nullopt;
16587 }
16588 // Restore unused scalars from mask, if some of the extractelements were not
16589 // selected for shuffle.
16590 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16591 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16592 isa<UndefValue>(GatheredExtracts[I])) {
16593 std::swap(VL[I], GatheredExtracts[I]);
16594 continue;
16595 }
16596 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16597 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16598 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16599 is_contained(UndefVectorExtracts, I))
16600 continue;
16601 }
16602 return Res;
16603}
16604
16605 /// Tries to find extractelement instructions with constant indices from fixed
16606 /// vector type and gather such instructions into a bunch, processing \p VL in
16607 /// \p NumParts per-register slices, each of which is likely to be detected as
16608 /// a shuffle of 1 or 2 input vectors. For every successful slice, the matched
16609 /// scalars are replaced by poison values in \p VL for future analysis.
16610 SmallVector<std::optional<TTI::ShuffleKind>>
16611BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16612 SmallVectorImpl<int> &Mask,
16613 unsigned NumParts) const {
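// Illustrative sketch (hypothetical sizes): with VL.size() == 8 and
// NumParts == 2 the scalars are analyzed as two 4-element slices; each slice
// is matched independently and its sub-mask is copied back into Mask at
// offset Part * SliceSize.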
16614 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16615 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16616 Mask.assign(VL.size(), PoisonMaskElem);
16617 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16618 for (unsigned Part : seq<unsigned>(NumParts)) {
16619 // Scan list of gathered scalars for extractelements that can be represented
16620 // as shuffles.
16621 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16622 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16623 SmallVector<int> SubMask;
16624 std::optional<TTI::ShuffleKind> Res =
16625 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16626 ShufflesRes[Part] = Res;
16627 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16628 }
16629 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16630 return Res.has_value();
16631 }))
16632 ShufflesRes.clear();
16633 return ShufflesRes;
16634}
16635
16636std::optional<TargetTransformInfo::ShuffleKind>
16637BoUpSLP::isGatherShuffledSingleRegisterEntry(
16638 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16639 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16640 Entries.clear();
16641 // TODO: currently checking only for Scalars in the tree entry, need to count
16642 // reused elements too for better cost estimation.
16643 auto GetUserEntry = [&](const TreeEntry *TE) {
16644 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16645 TE = TE->UserTreeIndex.UserTE;
16646 if (TE == VectorizableTree.front().get())
16647 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16648 return TE->UserTreeIndex;
16649 };
16650 auto HasGatherUser = [&](const TreeEntry *TE) {
16651 while (TE->Idx != 0 && TE->UserTreeIndex) {
16652 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16653 return true;
16654 TE = TE->UserTreeIndex.UserTE;
16655 }
16656 return false;
16657 };
16658 const EdgeInfo TEUseEI = GetUserEntry(TE);
16659 if (!TEUseEI)
16660 return std::nullopt;
16661 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16662 const BasicBlock *TEInsertBlock = nullptr;
16663 // Main node of PHI entries keeps the correct order of operands/incoming
16664 // blocks.
16665 if (auto *PHI = dyn_cast_or_null<PHINode>(
16666 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16667 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16668 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16669 TEInsertPt = TEInsertBlock->getTerminator();
16670 } else {
16671 TEInsertBlock = TEInsertPt->getParent();
16672 }
16673 if (!DT->isReachableFromEntry(TEInsertBlock))
16674 return std::nullopt;
16675 auto *NodeUI = DT->getNode(TEInsertBlock);
16676 assert(NodeUI && "Should only process reachable instructions");
16677 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16678 auto CheckOrdering = [&](const Instruction *InsertPt) {
16679 // Argument InsertPt is an instruction where vector code for some other
16680 // tree entry (one that shares one or more scalars with TE) is going to be
16681 // generated. This lambda returns true if insertion point of vector code
16682 // for the TE dominates that point (otherwise dependency is the other way
16683 // around). The other node is not limited to be of a gather kind. Gather
16684 // nodes are not scheduled and their vector code is inserted before their
16685 // first user. If user is PHI, that is supposed to be at the end of a
16686 // predecessor block. Otherwise it is the last instruction among scalars of
16687 // the user node. So, instead of checking dependency between instructions
16688 // themselves, we check dependency between their insertion points for vector
16689 // code (since each scalar instruction ends up as a lane of a vector
16690 // instruction).
16691 const BasicBlock *InsertBlock = InsertPt->getParent();
16692 auto *NodeEUI = DT->getNode(InsertBlock);
16693 if (!NodeEUI)
16694 return false;
16695 assert((NodeUI == NodeEUI) ==
16696 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16697 "Different nodes should have different DFS numbers");
16698 // Check the order of the gather nodes users.
16699 if (TEInsertPt->getParent() != InsertBlock &&
16700 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16701 return false;
16702 if (TEInsertPt->getParent() == InsertBlock &&
16703 TEInsertPt->comesBefore(InsertPt))
16704 return false;
16705 return true;
16706 };
16707 // Find all tree entries used by the gathered values. If no common entries
16708 // found - not a shuffle.
16709 // Here we build a set of tree nodes for each gathered value and try to
16710 // find the intersection between these sets. If we have at least one common
16711 // tree node for each gathered value - we have just a permutation of the
16712 // single vector. If we have 2 different sets, we are in a situation where we
16713 // have a permutation of 2 input vectors.
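// For example (hypothetical): for VL = {a, b, c, d} where a and b are only
// found in tree entry E1 and c and d only in E2, UsedTEs becomes {{E1}, {E2}}
// and the gather is a permutation of 2 vectors; if all four values were found
// in E1, it would be a permutation of a single vector.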
16714 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16715 SmallDenseMap<Value *, int> UsedValuesEntry;
16716 SmallPtrSet<const Value *, 16> VisitedValue;
16717 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16718 // The node is reused - exit.
16719 if ((TEPtr->getVectorFactor() != VL.size() &&
16720 TEPtr->Scalars.size() != VL.size()) ||
16721 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16722 return false;
16723 UsedTEs.clear();
16724 UsedTEs.emplace_back().insert(TEPtr);
16725 for (Value *V : VL) {
16726 if (isConstant(V))
16727 continue;
16728 UsedValuesEntry.try_emplace(V, 0);
16729 }
16730 return true;
16731 };
16732 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16733 unsigned EdgeIdx) {
16734 const TreeEntry *Ptr1 = User1;
16735 const TreeEntry *Ptr2 = User2;
16736 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16737 while (Ptr2) {
16738 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16739 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16740 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16741 }
16742 while (Ptr1) {
16743 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16744 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16745 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16746 return Idx < It->second;
16747 }
16748 return false;
16749 };
16750 for (Value *V : VL) {
16751 if (isConstant(V) || !VisitedValue.insert(V).second)
16752 continue;
16753 // Build a list of tree entries where V is used.
16754 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16755 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16756 if (TEPtr == TE || TEPtr->Idx == 0)
16757 continue;
16758 assert(any_of(TEPtr->Scalars,
16759 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16760 "Must contain at least single gathered value.");
16761 assert(TEPtr->UserTreeIndex &&
16762 "Expected only single user of a gather node.");
16763 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16764
16765 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16766 UseEI.UserTE->hasState())
16767 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16768 : nullptr;
16769 Instruction *InsertPt =
16770 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16771 : &getLastInstructionInBundle(UseEI.UserTE);
16772 if (TEInsertPt == InsertPt) {
16773 // Check nodes, which might be emitted first.
16774 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16775 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16776 TEUseEI.UserTE->isAltShuffle()) &&
16777 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16778 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16779 (UseEI.UserTE->hasState() &&
16780 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16781 !UseEI.UserTE->isAltShuffle()) ||
16782 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16783 continue;
16784 }
16785
16786 // If the schedulable insertion point is used in multiple entries - just
16787 // exit, no known ordering at this point, available only after real
16788 // scheduling.
16789 if (!doesNotNeedToBeScheduled(InsertPt) &&
16790 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16791 continue;
16792 // If the users are the PHI nodes with the same incoming blocks - skip.
16793 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16794 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16795 UseEI.UserTE->State == TreeEntry::Vectorize &&
16796 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16797 TEUseEI.UserTE != UseEI.UserTE)
16798 continue;
16799 // If 2 gathers are operands of the same entry (regardless of whether
16800 // user is PHI or else), compare operands indices, use the earlier one
16801 // as the base.
16802 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16803 continue;
16804 // If the user instruction is used for some reason in different
16805 // vectorized nodes - make it depend on index.
16806 if (TEUseEI.UserTE != UseEI.UserTE &&
16807 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16808 HasGatherUser(TEUseEI.UserTE)))
16809 continue;
16810 // If the user node is the operand of the other user node - skip.
16811 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16812 continue;
16813 }
16814
16815 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16816 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16817 UseEI.UserTE->doesNotNeedToSchedule() &&
16818 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16819 continue;
16820 // Check if the user node of the TE comes after user node of TEPtr,
16821 // otherwise TEPtr depends on TE.
16822 if ((TEInsertBlock != InsertPt->getParent() ||
16823 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16824 !CheckOrdering(InsertPt))
16825 continue;
16826 // The node is reused - exit.
16827 if (CheckAndUseSameNode(TEPtr))
16828 break;
16829 VToTEs.insert(TEPtr);
16830 }
16831 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16832 const auto *It = find_if(
16833 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16834 if (It != VTEs.end()) {
16835 const TreeEntry *VTE = *It;
16836 if (none_of(TE->CombinedEntriesWithIndices,
16837 [&](const auto &P) { return P.first == VTE->Idx; })) {
16838 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16839 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16840 continue;
16841 }
16842 // The node is reused - exit.
16843 if (CheckAndUseSameNode(VTE))
16844 break;
16845 VToTEs.insert(VTE);
16846 }
16847 }
16848 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16849 const TreeEntry *VTE = VTEs.front();
16850 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16851 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16852 VTEs = VTEs.drop_front();
16853 // Iterate through all vectorized nodes.
16854 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16855 return MTE->State == TreeEntry::Vectorize;
16856 });
16857 if (MIt == VTEs.end())
16858 continue;
16859 VTE = *MIt;
16860 }
16861 if (none_of(TE->CombinedEntriesWithIndices,
16862 [&](const auto &P) { return P.first == VTE->Idx; })) {
16863 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16864 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16865 continue;
16866 }
16867 // The node is reused - exit.
16868 if (CheckAndUseSameNode(VTE))
16869 break;
16870 VToTEs.insert(VTE);
16871 }
16872 if (VToTEs.empty())
16873 continue;
16874 if (UsedTEs.empty()) {
16875 // The first iteration, just insert the list of nodes to vector.
16876 UsedTEs.push_back(VToTEs);
16877 UsedValuesEntry.try_emplace(V, 0);
16878 } else {
16879 // Need to check if there are any previously used tree nodes which use V.
16880 // If there are no such nodes, consider that we have another one input
16881 // vector.
16882 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16883 unsigned Idx = 0;
16884 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16885 // Do we have a non-empty intersection of previously listed tree entries
16886 // and tree entries using current V?
16887 set_intersect(VToTEs, Set);
16888 if (!VToTEs.empty()) {
16889 // Yes, write the new subset and continue analysis for the next
16890 // scalar.
16891 Set.swap(VToTEs);
16892 break;
16893 }
16894 VToTEs = SavedVToTEs;
16895 ++Idx;
16896 }
16897 // No non-empty intersection found - need to add a second set of possible
16898 // source vectors.
16899 if (Idx == UsedTEs.size()) {
16900 // If the number of input vectors is greater than 2 - not a permutation,
16901 // fallback to the regular gather.
16902 // TODO: support multiple reshuffled nodes.
16903 if (UsedTEs.size() == 2)
16904 continue;
16905 UsedTEs.push_back(SavedVToTEs);
16906 Idx = UsedTEs.size() - 1;
16907 }
16908 UsedValuesEntry.try_emplace(V, Idx);
16909 }
16910 }
16911
16912 if (UsedTEs.empty()) {
16913 Entries.clear();
16914 return std::nullopt;
16915 }
16916
16917 unsigned VF = 0;
16918 if (UsedTEs.size() == 1) {
16919 // Keep the order to avoid non-determinism.
16920 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
16921 UsedTEs.front().end());
16922 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16923 return TE1->Idx < TE2->Idx;
16924 });
16925 // Try to find the perfect match in another gather node at first.
16926 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
16927 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
16928 });
16929 if (It != FirstEntries.end() &&
16930 ((*It)->getVectorFactor() == VL.size() ||
16931 ((*It)->getVectorFactor() == TE->Scalars.size() &&
16932 TE->ReuseShuffleIndices.size() == VL.size() &&
16933 (*It)->isSame(TE->Scalars)))) {
16934 Entries.push_back(*It);
16935 if ((*It)->getVectorFactor() == VL.size()) {
16936 std::iota(std::next(Mask.begin(), Part * VL.size()),
16937 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
16938 } else {
16939 SmallVector<int> CommonMask = TE->getCommonMask();
16940 copy(CommonMask, Mask.begin());
16941 }
16942 // Clear undef scalars.
16943 for (unsigned I : seq<unsigned>(VL.size()))
16944 if (isa<PoisonValue>(VL[I]))
16945 Mask[Part * VL.size() + I] = PoisonMaskElem;
16946 return TargetTransformInfo::SK_PermuteSingleSrc;
16947 }
16948 // No perfect match, just shuffle, so choose the first tree node from the
16949 // tree.
16950 Entries.push_back(FirstEntries.front());
16951 // Update mapping between values and corresponding tree entries.
16952 for (auto &P : UsedValuesEntry)
16953 P.second = 0;
16954 VF = FirstEntries.front()->getVectorFactor();
16955 } else {
16956 // Try to find nodes with the same vector factor.
16957 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
16958 // Keep the order of tree nodes to avoid non-determinism.
16959 DenseMap<int, const TreeEntry *> VFToTE;
16960 for (const TreeEntry *TE : UsedTEs.front()) {
16961 unsigned VF = TE->getVectorFactor();
16962 auto It = VFToTE.find(VF);
16963 if (It != VFToTE.end()) {
16964 if (It->second->Idx > TE->Idx)
16965 It->getSecond() = TE;
16966 continue;
16967 }
16968 VFToTE.try_emplace(VF, TE);
16969 }
16970 // Same, keep the order to avoid non-determinism.
16971 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
16972 UsedTEs.back().end());
16973 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16974 return TE1->Idx < TE2->Idx;
16975 });
16976 for (const TreeEntry *TE : SecondEntries) {
16977 auto It = VFToTE.find(TE->getVectorFactor());
16978 if (It != VFToTE.end()) {
16979 VF = It->first;
16980 Entries.push_back(It->second);
16981 Entries.push_back(TE);
16982 break;
16983 }
16984 }
16985 // No 2 source vectors with the same vector factor - just choose 2 with max
16986 // index.
16987 if (Entries.empty()) {
16988 Entries.push_back(*llvm::max_element(
16989 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
16990 return TE1->Idx < TE2->Idx;
16991 }));
16992 Entries.push_back(SecondEntries.front());
16993 VF = std::max(Entries.front()->getVectorFactor(),
16994 Entries.back()->getVectorFactor());
16995 } else {
16996 VF = Entries.front()->getVectorFactor();
16997 }
16998 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
16999 for (const TreeEntry *E : Entries)
17000 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17001 E->Scalars.end());
17002 // Update mapping between values and corresponding tree entries.
17003 for (auto &P : UsedValuesEntry) {
17004 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17005 if (ValuesToEntries[Idx].contains(P.first)) {
17006 P.second = Idx;
17007 break;
17008 }
17009 }
17010 }
17011
17012 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17013 // Checks if the 2 PHIs are compatible in terms of high possibility to be
17014 // vectorized.
17015 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17016 auto *PHI = cast<PHINode>(V);
17017 auto *PHI1 = cast<PHINode>(V1);
17018 // Check that all incoming values are compatible/from same parent (if they
17019 // are instructions).
17020 // The incoming values are compatible if they all are constants, or
17021 // instruction with the same/alternate opcodes from the same basic block.
17022 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17023 Value *In = PHI->getIncomingValue(I);
17024 Value *In1 = PHI1->getIncomingValue(I);
17025 if (isConstant(In) && isConstant(In1))
17026 continue;
17027 if (!getSameOpcode({In, In1}, *TLI))
17028 return false;
17029 if (cast<Instruction>(In)->getParent() !=
17030 cast<Instruction>(In1)->getParent())
17031 return false;
17032 }
17033 return true;
17034 };
17035 // Check if the value can be ignored during analysis for shuffled gathers.
17036 // We suppose it is better to ignore instructions which do not form splats,
17037 // are not vectorized, are not extractelements (these will be handled by the
17038 // extractelements processing) or may form a vector node in the future.
17039 auto MightBeIgnored = [=](Value *V) {
17040 auto *I = dyn_cast<Instruction>(V);
17041 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17042 !isVectorLikeInstWithConstOps(I) &&
17043 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17044 };
17045 // Check that the neighbor instruction may form a full vector node with the
17046 // current instruction V. It is possible, if they have same/alternate opcode
17047 // and same parent basic block.
17048 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17049 Value *V1 = VL[Idx];
17050 bool UsedInSameVTE = false;
17051 auto It = UsedValuesEntry.find(V1);
17052 if (It != UsedValuesEntry.end())
17053 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17054 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17055 getSameOpcode({V, V1}, *TLI) &&
17056 cast<Instruction>(V)->getParent() ==
17057 cast<Instruction>(V1)->getParent() &&
17058 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17059 };
17060 // Build a shuffle mask for better cost estimation and vector emission.
17061 SmallBitVector UsedIdxs(Entries.size());
17062 SmallVector<std::pair<unsigned, int>> EntryLanes;
17063 for (int I = 0, E = VL.size(); I < E; ++I) {
17064 Value *V = VL[I];
17065 auto It = UsedValuesEntry.find(V);
17066 if (It == UsedValuesEntry.end())
17067 continue;
17068 // Do not try to shuffle scalars, if they are constants, or instructions
17069 // that can be vectorized as a result of the following vector build
17070 // vectorization.
17071 if (isConstant(V) || (MightBeIgnored(V) &&
17072 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17073 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17074 continue;
17075 unsigned Idx = It->second;
17076 EntryLanes.emplace_back(Idx, I);
17077 UsedIdxs.set(Idx);
17078 }
17079 // Iterate through all shuffled scalars and select entries, which can be used
17080 // for final shuffle.
17081 SmallVector<const TreeEntry *> TempEntries;
17082 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17083 if (!UsedIdxs.test(I))
17084 continue;
17085 // Fix the entry number for the given scalar. If it is the first entry, set
17086 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17087 // These indices are used when calculating final shuffle mask as the vector
17088 // offset.
17089 for (std::pair<unsigned, int> &Pair : EntryLanes)
17090 if (Pair.first == I)
17091 Pair.first = TempEntries.size();
17092 TempEntries.push_back(Entries[I]);
17093 }
17094 Entries.swap(TempEntries);
17095 if (EntryLanes.size() == Entries.size() &&
17096 !VL.equals(ArrayRef(TE->Scalars)
17097 .slice(Part * VL.size(),
17098 std::min<int>(VL.size(), TE->Scalars.size())))) {
17099 // We may have here 1 or 2 entries only. If the number of scalars is equal
17100 // to the number of entries, no need to do the analysis, it is not very
17101 // profitable. Since VL is not the same as TE->Scalars, it means we already
17102 // have some shuffles before. Cut off not profitable case.
17103 Entries.clear();
17104 return std::nullopt;
17105 }
17106 // Build the final mask, check for the identity shuffle, if possible.
17107 bool IsIdentity = Entries.size() == 1;
17108 // Pair.first is the offset to the vector, while Pair.second is the index of
17109 // scalar in the list.
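// E.g. (hypothetical): Pair == {1, 2} with VF == 4 means lane 2 of VL is
// taken from Entries[1], so Mask[Part * VL.size() + 2] becomes
// 4 + <lane of VL[2] inside Entries[1]>.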
17110 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17111 unsigned Idx = Part * VL.size() + Pair.second;
17112 Mask[Idx] =
17113 Pair.first * VF +
17114 (ForOrder ? std::distance(
17115 Entries[Pair.first]->Scalars.begin(),
17116 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17117 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17118 IsIdentity &= Mask[Idx] == Pair.second;
17119 }
17120 if (ForOrder || IsIdentity || Entries.empty()) {
17121 switch (Entries.size()) {
17122 case 1:
17123 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17124 return TargetTransformInfo::SK_PermuteSingleSrc;
17125 break;
17126 case 2:
17127 if (EntryLanes.size() > 2 || VL.size() <= 2)
17128 return TargetTransformInfo::SK_PermuteTwoSrc;
17129 break;
17130 default:
17131 break;
17132 }
17133 } else if (!isa<VectorType>(VL.front()->getType()) &&
17134 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17135 // Do the cost estimation only if the shuffle is more beneficial than a buildvector.
17136 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17137 std::next(Mask.begin(), (Part + 1) * VL.size()));
17138 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17139 for (int Idx : SubMask) {
17140 if (Idx == PoisonMaskElem)
17141 continue;
17142 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17143 MinElement = Idx;
17144 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17145 MaxElement = Idx;
17146 }
17147 assert(MaxElement >= 0 && MinElement >= 0 &&
17148 MaxElement % VF >= MinElement % VF &&
17149 "Expected at least single element.");
17150 unsigned NewVF = std::max<unsigned>(
17151 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17152 (MaxElement % VF) -
17153 (MinElement % VF) + 1));
17154 if (NewVF < VF) {
17155 for (int &Idx : SubMask) {
17156 if (Idx == PoisonMaskElem)
17157 continue;
17158 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17159 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17160 }
17161 } else {
17162 NewVF = VF;
17163 }
17164
17165 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17166 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17167 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17168 auto GetShuffleCost = [&,
17169 &TTI = *TTI](ArrayRef<int> Mask,
17170 ArrayRef<const TreeEntry *> Entries,
17171 VectorType *VecTy) -> InstructionCost {
17172 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17173 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17174 Mask, Entries.front()->getInterleaveFactor()))
17175 return TTI::TCC_Free;
17176 return ::getShuffleCost(TTI,
17177 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17178 : TTI::SK_PermuteSingleSrc,
17179 VecTy, Mask, CostKind);
17180 };
17181 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17182 InstructionCost FirstShuffleCost = 0;
17183 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17184 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17185 FirstShuffleCost = ShuffleCost;
17186 } else {
17187 // Transform mask to include only first entry.
17188 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17189 bool IsIdentity = true;
17190 for (auto [I, Idx] : enumerate(FirstMask)) {
17191 if (Idx >= static_cast<int>(NewVF)) {
17192 Idx = PoisonMaskElem;
17193 } else {
17194 DemandedElts.clearBit(I);
17195 if (Idx != PoisonMaskElem)
17196 IsIdentity &= static_cast<int>(I) == Idx;
17197 }
17198 }
17199 if (!IsIdentity)
17200 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17201 FirstShuffleCost += getScalarizationOverhead(
17202 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17203 /*Extract=*/false, CostKind);
17204 }
17205 InstructionCost SecondShuffleCost = 0;
17206 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17207 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17208 SecondShuffleCost = ShuffleCost;
17209 } else {
17210 // Transform mask to include only the second entry.
17211 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17212 bool IsIdentity = true;
17213 for (auto [I, Idx] : enumerate(SecondMask)) {
17214 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17215 Idx = PoisonMaskElem;
17216 } else {
17217 DemandedElts.clearBit(I);
17218 if (Idx != PoisonMaskElem) {
17219 Idx -= NewVF;
17220 IsIdentity &= static_cast<int>(I) == Idx;
17221 }
17222 }
17223 }
17224 if (!IsIdentity)
17225 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17226 SecondShuffleCost += getScalarizationOverhead(
17227 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17228 /*Extract=*/false, CostKind);
17229 }
17230 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17231 for (auto [I, Idx] : enumerate(SubMask))
17232 if (Idx == PoisonMaskElem)
17233 DemandedElts.clearBit(I);
17234 InstructionCost BuildVectorCost = getScalarizationOverhead(
17235 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17236 /*Extract=*/false, CostKind);
17237 const TreeEntry *BestEntry = nullptr;
17238 if (FirstShuffleCost < ShuffleCost) {
17239 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17240 std::next(Mask.begin(), (Part + 1) * VL.size()),
17241 [&](int &Idx) {
17242 if (Idx >= static_cast<int>(VF))
17243 Idx = PoisonMaskElem;
17244 });
17245 BestEntry = Entries.front();
17246 ShuffleCost = FirstShuffleCost;
17247 }
17248 if (SecondShuffleCost < ShuffleCost) {
17249 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17250 std::next(Mask.begin(), (Part + 1) * VL.size()),
17251 [&](int &Idx) {
17252 if (Idx < static_cast<int>(VF))
17253 Idx = PoisonMaskElem;
17254 else
17255 Idx -= VF;
17256 });
17257 BestEntry = Entries[1];
17258 ShuffleCost = SecondShuffleCost;
17259 }
17260 if (BuildVectorCost >= ShuffleCost) {
17261 if (BestEntry) {
17262 Entries.clear();
17263 Entries.push_back(BestEntry);
17264 }
17265 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17266 : TargetTransformInfo::SK_PermuteSingleSrc;
17267 }
17268 }
17269 Entries.clear();
17270 // Clear the corresponding mask elements.
17271 std::fill(std::next(Mask.begin(), Part * VL.size()),
17272 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17273 return std::nullopt;
17274}
17275
17276 SmallVector<std::optional<TTI::ShuffleKind>>
17277BoUpSLP::isGatherShuffledEntry(
17278 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17279 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17280 bool ForOrder) {
17281 assert(NumParts > 0 && NumParts < VL.size() &&
17282 "Expected positive number of registers.");
17283 Entries.clear();
17284 // No need to check for the topmost gather node.
17285 if (TE == VectorizableTree.front().get() &&
17286 (!GatheredLoadsEntriesFirst.has_value() ||
17287 none_of(ArrayRef(VectorizableTree).drop_front(),
17288 [](const std::unique_ptr<TreeEntry> &TE) {
17289 return !TE->isGather();
17290 })))
17291 return {};
17292 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17293 // implemented yet.
17294 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17295 return {};
17296 Mask.assign(VL.size(), PoisonMaskElem);
17297 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17298 "Expected only single user of the gather node.");
17299 assert(VL.size() % NumParts == 0 &&
17300 "Number of scalars must be divisible by NumParts.");
17301 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17302 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17303 (TE->Idx == 0 ||
17304 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17305 isSplat(TE->Scalars) ||
17306 (TE->hasState() &&
17307 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17308 return {};
17309 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17310 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17311 for (unsigned Part : seq<unsigned>(NumParts)) {
17312 ArrayRef<Value *> SubVL =
17313 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17314 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17315 std::optional<TTI::ShuffleKind> SubRes =
17316 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17317 ForOrder);
17318 if (!SubRes)
17319 SubEntries.clear();
17320 Res.push_back(SubRes);
17321 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17322 SubEntries.front()->getVectorFactor() == VL.size() &&
17323 (SubEntries.front()->isSame(TE->Scalars) ||
17324 SubEntries.front()->isSame(VL))) {
17325 SmallVector<const TreeEntry *> LocalSubEntries;
17326 LocalSubEntries.swap(SubEntries);
17327 Entries.clear();
17328 Res.clear();
17329 std::iota(Mask.begin(), Mask.end(), 0);
17330 // Clear undef scalars.
17331 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17332 if (isa<PoisonValue>(VL[I]))
17333 Mask[I] = PoisonMaskElem;
17334 Entries.emplace_back(1, LocalSubEntries.front());
17335 Res.push_back(TTI::SK_PermuteSingleSrc);
17336 return Res;
17337 }
17338 }
17339 if (all_of(Res,
17340 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17341 Entries.clear();
17342 return {};
17343 }
17344 return Res;
17345}
17346
17347InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17348 Type *ScalarTy) const {
17349 const unsigned VF = VL.size();
17350 auto *VecTy = getWidenedType(ScalarTy, VF);
17351 // Find the cost of inserting/extracting values from the vector.
17352 // Check if the same elements are inserted several times and count them as
17353 // shuffle candidates.
17354 APInt DemandedElements = APInt::getZero(VF);
17355 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17356 InstructionCost Cost;
17357 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17358 DemandedElements.setBit(I);
17359 if (V->getType() != ScalarTy)
17360 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17361 TTI::CastContextHint::None, CostKind);
17362 };
17363 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17364 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17365 for (auto [I, V] : enumerate(VL)) {
17366 // No need to shuffle duplicates for constants.
17367 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17368 continue;
17369
17370 if (isConstant(V)) {
17371 ConstantShuffleMask[I] = I + VF;
17372 continue;
17373 }
17374 EstimateInsertCost(I, V);
17375 }
17376 // FIXME: add a cost for constant vector materialization.
17377 bool IsAnyNonUndefConst =
17378 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17379 // 1. Shuffle input source vector and constant vector.
17380 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17381 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
17382 ConstantShuffleMask);
17383 }
17384
17385 // 2. Insert unique non-constants.
17386 if (!DemandedElements.isZero())
17387 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17388 /*Insert=*/true,
17389 /*Extract=*/false, CostKind,
17390 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17391 return Cost;
17392}
17393
17394Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17395 auto It = EntryToLastInstruction.find(E);
17396 if (It != EntryToLastInstruction.end())
17397 return *cast<Instruction>(It->second);
17398 Instruction *Res = nullptr;
17399 // Get the basic block this bundle is in. All instructions in the bundle
17400 // should be in this block (except for extractelement-like instructions with
17401 // constant indices or gathered loads or copyables).
17402 Instruction *Front;
17403 unsigned Opcode;
17404 if (E->hasState()) {
17405 Front = E->getMainOp();
17406 Opcode = E->getOpcode();
17407 } else {
17408 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17409 Opcode = Front->getOpcode();
17410 }
17411 auto *BB = Front->getParent();
17412 assert(
17413 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17414 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17415 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17416 all_of(E->Scalars,
17417 [=](Value *V) -> bool {
17418 if (Opcode == Instruction::GetElementPtr &&
17419 !isa<GetElementPtrInst>(V))
17420 return true;
17421 auto *I = dyn_cast<Instruction>(V);
17422 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17423 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17424 })) &&
17425 "Expected gathered loads or GEPs or instructions from same basic "
17426 "block.");
17427
17428 auto FindLastInst = [&]() {
17429 Instruction *LastInst = Front;
17430 for (Value *V : E->Scalars) {
17431 auto *I = dyn_cast<Instruction>(V);
17432 if (!I)
17433 continue;
17434 if (E->isCopyableElement(I))
17435 continue;
17436 if (LastInst->getParent() == I->getParent()) {
17437 if (LastInst->comesBefore(I))
17438 LastInst = I;
17439 continue;
17440 }
17441 assert(((Opcode == Instruction::GetElementPtr &&
17442 !isa<GetElementPtrInst>(I)) ||
17443 E->State == TreeEntry::SplitVectorize ||
17444 (isVectorLikeInstWithConstOps(LastInst) &&
17445 isVectorLikeInstWithConstOps(I)) ||
17446 (GatheredLoadsEntriesFirst.has_value() &&
17447 Opcode == Instruction::Load && E->isGather() &&
17448 E->Idx < *GatheredLoadsEntriesFirst)) &&
17449 "Expected vector-like or non-GEP in GEP node insts only.");
17450 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17451 LastInst = I;
17452 continue;
17453 }
17454 if (!DT->isReachableFromEntry(I->getParent()))
17455 continue;
17456 auto *NodeA = DT->getNode(LastInst->getParent());
17457 auto *NodeB = DT->getNode(I->getParent());
17458 assert(NodeA && "Should only process reachable instructions");
17459 assert(NodeB && "Should only process reachable instructions");
17460 assert((NodeA == NodeB) ==
17461 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17462 "Different nodes should have different DFS numbers");
17463 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17464 LastInst = I;
17465 }
17466 BB = LastInst->getParent();
17467 return LastInst;
17468 };
17469
17470 auto FindFirstInst = [&]() {
17471 Instruction *FirstInst = Front;
17472 for (Value *V : E->Scalars) {
17473 auto *I = dyn_cast<Instruction>(V);
17474 if (!I)
17475 continue;
17476 if (E->isCopyableElement(I))
17477 continue;
17478 if (FirstInst->getParent() == I->getParent()) {
17479 if (I->comesBefore(FirstInst))
17480 FirstInst = I;
17481 continue;
17482 }
17483 assert(((Opcode == Instruction::GetElementPtr &&
17484 !isa<GetElementPtrInst>(I)) ||
17485 (isVectorLikeInstWithConstOps(FirstInst) &&
17486 isVectorLikeInstWithConstOps(I))) &&
17487 "Expected vector-like or non-GEP in GEP node insts only.");
17488 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17489 FirstInst = I;
17490 continue;
17491 }
17492 if (!DT->isReachableFromEntry(I->getParent()))
17493 continue;
17494 auto *NodeA = DT->getNode(FirstInst->getParent());
17495 auto *NodeB = DT->getNode(I->getParent());
17496 assert(NodeA && "Should only process reachable instructions");
17497 assert(NodeB && "Should only process reachable instructions");
17498 assert((NodeA == NodeB) ==
17499 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17500 "Different nodes should have different DFS numbers");
17501 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17502 FirstInst = I;
17503 }
17504 return FirstInst;
17505 };
17506
17507 if (E->State == TreeEntry::SplitVectorize) {
17508 Res = FindLastInst();
17509 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17510 for (auto *E : Entries) {
17511 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17512 if (!I)
17513 I = &getLastInstructionInBundle(E);
17514 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17515 Res = I;
17516 }
17517 }
17518 EntryToLastInstruction.try_emplace(E, Res);
17519 return *Res;
17520 }
17521
17522 // Set insertpoint for gathered loads to the very first load.
17523 if (GatheredLoadsEntriesFirst.has_value() &&
17524 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17525 Opcode == Instruction::Load) {
17526 Res = FindFirstInst();
17527 EntryToLastInstruction.try_emplace(E, Res);
17528 return *Res;
17529 }
17530
17531 // Set the insert point to the beginning of the basic block if the entry
17532 // should not be scheduled.
17533 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17534 if (E->isGather())
17535 return nullptr;
17536 // Found previously that the instructions do not need to be scheduled.
17537 const auto *It = BlocksSchedules.find(BB);
17538 if (It == BlocksSchedules.end())
17539 return nullptr;
17540 for (Value *V : E->Scalars) {
17541 auto *I = dyn_cast<Instruction>(V);
17542 if (!I || isa<PHINode>(I) ||
17543 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17544 continue;
17545 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17546 if (Bundles.empty())
17547 continue;
17548 const auto *It = find_if(
17549 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17550 if (It != Bundles.end())
17551 return *It;
17552 }
17553 return nullptr;
17554 };
17555 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17556 if (!E->isGather() && !Bundle) {
17557 if ((Opcode == Instruction::GetElementPtr &&
17558 any_of(E->Scalars,
17559 [](Value *V) {
17560 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17561 })) ||
17562 all_of(E->Scalars, [&](Value *V) {
17563 return isa<PoisonValue>(V) ||
17564 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17565 E->isCopyableElement(V) ||
17566 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17567 }))
17568 Res = FindLastInst();
17569 else
17570 Res = FindFirstInst();
17571 EntryToLastInstruction.try_emplace(E, Res);
17572 return *Res;
17573 }
17574
17575 // Find the last instruction. The common case should be that BB has been
17576 // scheduled, and the last instruction is VL.back(). So we start with
17577 // VL.back() and iterate over schedule data until we reach the end of the
17578 // bundle. The end of the bundle is marked by null ScheduleData.
17579 if (Bundle) {
17580 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17581 Res = Bundle->getBundle().back()->getInst();
17582 EntryToLastInstruction.try_emplace(E, Res);
17583 return *Res;
17584 }
17585
17586 // LastInst can still be null at this point if there's either not an entry
17587 // for BB in BlocksSchedules or there's no ScheduleData available for
17588 // VL.back(). This can be the case if buildTreeRec aborts for various
17589 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17590 // size is reached, etc.). ScheduleData is initialized in the scheduling
17591 // "dry-run".
17592 //
17593 // If this happens, we can still find the last instruction by brute force. We
17594 // iterate forwards from Front (inclusive) until we either see all
17595 // instructions in the bundle or reach the end of the block. If Front is the
17596 // last instruction in program order, LastInst will be set to Front, and we
17597 // will visit all the remaining instructions in the block.
17598 //
17599 // One of the reasons we exit early from buildTreeRec is to place an upper
17600 // bound on compile-time. Thus, taking an additional compile-time hit here is
17601 // not ideal. However, this should be exceedingly rare since it requires that
17602 // we both exit early from buildTreeRec and that the bundle be out-of-order
17603 // (causing us to iterate all the way to the end of the block).
17604 if (!Res)
17605 Res = FindLastInst();
17606 assert(Res && "Failed to find last instruction in bundle");
17607 EntryToLastInstruction.try_emplace(E, Res);
17608 return *Res;
17609}
17610
17611void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17612 auto *Front = E->getMainOp();
17613 Instruction *LastInst = &getLastInstructionInBundle(E);
17614 assert(LastInst && "Failed to find last instruction in bundle");
17615 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17616 // If the instruction is PHI, set the insert point after all the PHIs.
17617 bool IsPHI = isa<PHINode>(LastInst);
17618 if (IsPHI) {
17619 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17620 if (LastInstIt != LastInst->getParent()->end() &&
17621 LastInstIt->getParent()->isLandingPad())
17622 LastInstIt = std::next(LastInstIt);
17623 }
17624 if (IsPHI ||
17625 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17626 E->doesNotNeedToSchedule()) ||
17627 (GatheredLoadsEntriesFirst.has_value() &&
17628 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17629 E->getOpcode() == Instruction::Load)) {
17630 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17631 } else {
17632 // Set the insertion point after the last instruction in the bundle. Set the
17633 // debug location to Front.
17634 Builder.SetInsertPoint(
17635 LastInst->getParent(),
17636 LastInst->getNextNode()->getIterator());
17637 }
17638 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17639}
17640
17641Value *BoUpSLP::gather(
17642 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17643 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17644 // List of instructions/lanes from current block and/or the blocks which are
17645 // part of the current loop. These instructions will be inserted at the end to
17646 // make it possible to optimize loops and hoist invariant instructions out of
17647 // the loop's body with better chances for success.
17648 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17649 SmallSet<int, 4> PostponedIndices;
17650 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17651 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17652 SmallPtrSet<BasicBlock *, 4> Visited;
17653 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17654 InsertBB = InsertBB->getSinglePredecessor();
17655 return InsertBB && InsertBB == InstBB;
17656 };
17657 for (int I = 0, E = VL.size(); I < E; ++I) {
17658 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17659 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17660 isVectorized(Inst) ||
17661 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17662 PostponedIndices.insert(I).second)
17663 PostponedInsts.emplace_back(Inst, I);
17664 }
17665
17666 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17667 Type *Ty) {
17668 Value *Scalar = V;
17669 if (Scalar->getType() != Ty) {
17670 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17671 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17672 Value *V = Scalar;
17673 if (auto *CI = dyn_cast<CastInst>(Scalar);
17674 isa_and_present<SExtInst, ZExtInst>(CI)) {
17675 Value *Op = CI->getOperand(0);
17676 if (auto *IOp = dyn_cast<Instruction>(Op);
17677 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17678 V = Op;
17679 }
17680 Scalar = Builder.CreateIntCast(
17681 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17682 }
17683
17684 Instruction *InsElt;
17685 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17686 assert(SLPReVec && "FixedVectorType is not expected.");
17687 Vec =
17688 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17689 auto *II = dyn_cast<Instruction>(Vec);
17690 if (!II)
17691 return Vec;
17692 InsElt = II;
17693 } else {
17694 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17695 InsElt = dyn_cast<InsertElementInst>(Vec);
17696 if (!InsElt)
17697 return Vec;
17698 }
17699 GatherShuffleExtractSeq.insert(InsElt);
17700 CSEBlocks.insert(InsElt->getParent());
17701 // Add to our 'need-to-extract' list.
17702 if (isa<Instruction>(V)) {
17703 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17704 // Find which lane we need to extract.
17705 User *UserOp = nullptr;
17706 if (Scalar != V) {
17707 if (auto *SI = dyn_cast<Instruction>(Scalar))
17708 UserOp = SI;
17709 } else {
17710 if (V->getType()->isVectorTy()) {
17711 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17712 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17713 // Find shufflevector, caused by resize.
17714 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17715 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17716 if (SV->getOperand(0) == V)
17717 return SV;
17718 if (SV->getOperand(1) == V)
17719 return SV;
17720 }
17721 return nullptr;
17722 };
17723 InsElt = nullptr;
17724 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17725 InsElt = User;
17726 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17727 InsElt = User;
17728 assert(InsElt &&
17729 "Failed to find shufflevector, caused by resize.");
17730 }
17731 }
17732 UserOp = InsElt;
17733 }
17734 if (UserOp) {
17735 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17736 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17737 }
17738 }
17739 }
17740 return Vec;
17741 };
17742 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17743 Value *Vec = PoisonValue::get(VecTy);
17744 SmallVector<int> NonConsts;
17745 SmallVector<int> Mask(VL.size());
17746 std::iota(Mask.begin(), Mask.end(), 0);
17747 Value *OriginalRoot = Root;
17748 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17749 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17750 SV->getOperand(0)->getType() == VecTy) {
17751 Root = SV->getOperand(0);
17752 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17753 }
17754 // Insert constant values at first.
17755 for (int I = 0, E = VL.size(); I < E; ++I) {
17756 if (PostponedIndices.contains(I))
17757 continue;
17758 if (!isConstant(VL[I])) {
17759 NonConsts.push_back(I);
17760 continue;
17761 }
17762 if (isa<PoisonValue>(VL[I]))
17763 continue;
17764 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17765 Mask[I] = I + E;
17766 }
17767 if (Root) {
17768 if (isa<PoisonValue>(Vec)) {
17769 Vec = OriginalRoot;
17770 } else {
17771 Vec = CreateShuffle(Root, Vec, Mask);
17772 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17773 OI && OI->use_empty() &&
17774 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17775 return TE->VectorizedValue == OI;
17776 }))
17777 eraseInstruction(OI);
17778 }
17779 }
17780 // Insert non-constant values.
17781 for (int I : NonConsts)
17782 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17783 // Append instructions, which are/may be part of the loop, in the end to make
17784 // it possible to hoist non-loop-based instructions.
17785 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17786 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17787
17788 return Vec;
17789}
17790
17791/// Merges shuffle masks and emits final shuffle instruction, if required. It
17792/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
17793/// when the actual shuffle instruction is generated only if this is actually
17794/// required. Otherwise, the shuffle instruction emission is delayed till the
17795/// end of the process, to reduce the number of emitted instructions and further
17796/// analysis/transformations.
17797/// The class also will look through the previously emitted shuffle instructions
17798/// and properly mark indices in mask as undef.
17799/// For example, given the code
17800/// \code
17801/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17802/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17803/// \endcode
17804/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17805/// look through %s1 and %s2 and emit
17806/// \code
17807/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17808/// \endcode
17809/// instead.
17810 /// If the 2 operands are of different sizes, the smaller one is resized and
17811 /// the mask is recalculated accordingly.
17812/// For example, given the code
17813/// \code
17814/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17815/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17816/// \endcode
17817 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17818/// look through %s1 and %s2 and emit
17819/// \code
17820/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17821/// \endcode
17822/// instead.
17823class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17824 bool IsFinalized = false;
17825 /// Combined mask for all applied operands and masks. It is built during
17826 /// analysis and actual emission of shuffle vector instructions.
17827 SmallVector<int> CommonMask;
17828 /// List of operands for the shuffle vector instruction. It holds at most 2
17829 /// operands; if a 3rd one is about to be added, the first 2 are combined into
17830 /// a shuffle with the \p CommonMask mask, the first operand is set to the
17831 /// resulting shuffle and the second operand is set to the newly added
17832 /// operand. \p CommonMask is updated accordingly afterwards.
17833 SmallVector<Value *, 2> InVectors;
17834 IRBuilderBase &Builder;
17835 BoUpSLP &R;
17836
17837 class ShuffleIRBuilder {
17838 IRBuilderBase &Builder;
17839 /// Holds all of the instructions that we gathered.
17840 SetVector<Instruction *> &GatherShuffleExtractSeq;
17841 /// A list of blocks that we are going to CSE.
17842 DenseSet<BasicBlock *> &CSEBlocks;
17843 /// Data layout.
17844 const DataLayout &DL;
17845
17846 public:
17847 ShuffleIRBuilder(IRBuilderBase &Builder,
17848 SetVector<Instruction *> &GatherShuffleExtractSeq,
17849 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17850 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17851 CSEBlocks(CSEBlocks), DL(DL) {}
17852 ~ShuffleIRBuilder() = default;
17853 /// Creates a shufflevector for the 2 operands with the given mask.
17854 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17855 if (V1->getType() != V2->getType()) {
17856 assert(V1->getType()->isIntOrIntVectorTy() &&
17857 V2->getType()->isIntOrIntVectorTy() &&
17858 "Expected integer vector types only.");
17859 if (V1->getType() != V2->getType()) {
17860 if (cast<VectorType>(V2->getType())
17861 ->getElementType()
17862 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17863 ->getElementType()
17864 ->getIntegerBitWidth())
17865 V2 = Builder.CreateIntCast(
17866 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17867 else
17868 V1 = Builder.CreateIntCast(
17869 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17870 }
17871 }
17872 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17873 if (auto *I = dyn_cast<Instruction>(Vec)) {
17874 GatherShuffleExtractSeq.insert(I);
17875 CSEBlocks.insert(I->getParent());
17876 }
17877 return Vec;
17878 }
17879 /// Creates a permutation of the single vector operand with the given mask, if
17880 /// it is not an identity mask.
17881 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17882 if (Mask.empty())
17883 return V1;
17884 unsigned VF = Mask.size();
17885 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17886 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17887 return V1;
17888 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17889 if (auto *I = dyn_cast<Instruction>(Vec)) {
17890 GatherShuffleExtractSeq.insert(I);
17891 CSEBlocks.insert(I->getParent());
17892 }
17893 return Vec;
17894 }
17895 Value *createIdentity(Value *V) { return V; }
17896 Value *createPoison(Type *Ty, unsigned VF) {
17897 return PoisonValue::get(getWidenedType(Ty, VF));
17898 }
17899 /// Resizes the 2 input vectors to matching sizes, if they are not equal yet.
17900 /// The smaller vector is resized to the size of the larger one.
17901 void resizeToMatch(Value *&V1, Value *&V2) {
17902 if (V1->getType() == V2->getType())
17903 return;
17904 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17905 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17906 int VF = std::max(V1VF, V2VF);
17907 int MinVF = std::min(V1VF, V2VF);
17908 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17909 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17910 0);
17911 Value *&Op = MinVF == V1VF ? V1 : V2;
17912 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17913 if (auto *I = dyn_cast<Instruction>(Op)) {
17914 GatherShuffleExtractSeq.insert(I);
17915 CSEBlocks.insert(I->getParent());
17916 }
17917 if (MinVF == V1VF)
17918 V1 = Op;
17919 else
17920 V2 = Op;
17921 }
17922 };
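// Sketch of resizeToMatch for mismatched widths (types here are illustrative):
// given V1 : <4 x i32> and V2 : <2 x i32>, V2 is widened to <4 x i32> with a
// shufflevector whose mask is {0, 1, poison, poison}, so that both operands can
// then feed a single two-source shufflevector of the common width.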
17923
17924 /// Smart shuffle instruction emission: walks through shuffle trees and
17925 /// tries to find the best matching vector for the actual shuffle
17926 /// instruction.
17927 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
17928 assert(V1 && "Expected at least one vector value.");
17929 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17930 R.CSEBlocks, *R.DL);
17931 return BaseShuffleAnalysis::createShuffle<Value *>(
17932 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17933 }
17934
17935 /// Cast value \p V to the vector type with the same number of elements, but
17936 /// the base type \p ScalarTy.
17937 Value *castToScalarTyElem(Value *V,
17938 std::optional<bool> IsSigned = std::nullopt) {
17939 auto *VecTy = cast<VectorType>(V->getType());
17940 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
17941 if (VecTy->getElementType() == ScalarTy->getScalarType())
17942 return V;
17943 return Builder.CreateIntCast(
17944 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
17945 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
17946 }
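// Example of castToScalarTyElem (types are illustrative): for V : <4 x i8> and
// ScalarTy == i32, the value is widened with CreateIntCast to <4 x i32>; a zext
// is used when V is known to be non-negative and a sext otherwise, unless the
// caller passes IsSigned explicitly.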
17947
17948 Value *getVectorizedValue(const TreeEntry &E) {
17949 Value *Vec = E.VectorizedValue;
17950 if (!Vec->getType()->isIntOrIntVectorTy())
17951 return Vec;
17952 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
17953 return !isa<PoisonValue>(V) &&
17954 !isKnownNonNegative(
17955 V, SimplifyQuery(*R.DL));
17956 }));
17957 }
17958
17959public:
17960 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
17961 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17962
17963 /// Adjusts extractelements after reusing them.
17964 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
17965 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17966 unsigned NumParts, bool &UseVecBaseAsInput) {
17967 UseVecBaseAsInput = false;
17968 SmallPtrSet<Value *, 4> UniqueBases;
17969 Value *VecBase = nullptr;
17970 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
17971 if (!E->ReorderIndices.empty()) {
17972 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
17973 E->ReorderIndices.end());
17974 reorderScalars(VL, ReorderMask);
17975 }
17976 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17977 int Idx = Mask[I];
17978 if (Idx == PoisonMaskElem)
17979 continue;
17980 auto *EI = cast<ExtractElementInst>(VL[I]);
17981 VecBase = EI->getVectorOperand();
17982 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
17983 VecBase = TEs.front()->VectorizedValue;
17984 assert(VecBase && "Expected vectorized value.");
17985 UniqueBases.insert(VecBase);
17987 // If the extractelement's only use is vectorized, the extractelement
17988 // itself can be deleted.
17988 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17989 (NumParts != 1 && count(VL, EI) > 1) ||
17990 any_of(EI->users(), [&](User *U) {
17991 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17992 return UTEs.empty() || UTEs.size() > 1 ||
17993 (isa<GetElementPtrInst>(U) &&
17994 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17995 (!UTEs.empty() &&
17996 count_if(R.VectorizableTree,
17997 [&](const std::unique_ptr<TreeEntry> &TE) {
17998 return TE->UserTreeIndex.UserTE ==
17999 UTEs.front() &&
18000 is_contained(VL, EI);
18001 }) != 1);
18002 }))
18003 continue;
18004 R.eraseInstruction(EI);
18005 }
18006 if (NumParts == 1 || UniqueBases.size() == 1) {
18007 assert(VecBase && "Expected vectorized value.");
18008 return castToScalarTyElem(VecBase);
18009 }
18010 UseVecBaseAsInput = true;
18011 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18012 for (auto [I, Idx] : enumerate(Mask))
18013 if (Idx != PoisonMaskElem)
18014 Idx = I;
18015 };
18016 // Perform a multi-register vector shuffle, joining the parts into a single
18017 // virtual long vector.
18018 // Each part needs to be shuffled independently; all these parts are then
18019 // inserted into a long virtual vector register, forming the original vector.
18020 Value *Vec = nullptr;
18021 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18022 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18023 for (unsigned Part : seq<unsigned>(NumParts)) {
18024 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18025 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18026 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18027 constexpr int MaxBases = 2;
18028 SmallVector<Value *, MaxBases> Bases(MaxBases);
18029 auto VLMask = zip(SubVL, SubMask);
18030 const unsigned VF = std::accumulate(
18031 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18032 if (std::get<1>(D) == PoisonMaskElem)
18033 return S;
18034 Value *VecOp =
18035 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18036 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18037 !TEs.empty())
18038 VecOp = TEs.front()->VectorizedValue;
18039 assert(VecOp && "Expected vectorized value.");
18040 const unsigned Size =
18041 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18042 return std::max(S, Size);
18043 });
18044 for (const auto [V, I] : VLMask) {
18045 if (I == PoisonMaskElem)
18046 continue;
18047 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18048 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18049 VecOp = TEs.front()->VectorizedValue;
18050 assert(VecOp && "Expected vectorized value.");
18051 VecOp = castToScalarTyElem(VecOp);
18052 Bases[I / VF] = VecOp;
18053 }
18054 if (!Bases.front())
18055 continue;
18056 Value *SubVec;
18057 if (Bases.back()) {
18058 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18059 TransformToIdentity(SubMask);
18060 } else {
18061 SubVec = Bases.front();
18062 }
18063 if (!Vec) {
18064 Vec = SubVec;
18065 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18066 [&](unsigned P) {
18067 ArrayRef<int> SubMask =
18068 Mask.slice(P * SliceSize,
18069 getNumElems(Mask.size(),
18070 SliceSize, P));
18071 return all_of(SubMask, [](int Idx) {
18072 return Idx == PoisonMaskElem;
18073 });
18074 })) &&
18075 "Expected first part or all previous parts masked.");
18076 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18077 } else {
18078 unsigned NewVF =
18079 cast<FixedVectorType>(Vec->getType())->getNumElements();
18080 if (Vec->getType() != SubVec->getType()) {
18081 unsigned SubVecVF =
18082 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18083 NewVF = std::max(NewVF, SubVecVF);
18084 }
18085 // Adjust SubMask.
18086 for (int &Idx : SubMask)
18087 if (Idx != PoisonMaskElem)
18088 Idx += NewVF;
18089 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18090 Vec = createShuffle(Vec, SubVec, VecMask);
18091 TransformToIdentity(VecMask);
18092 }
18093 }
18094 copy(VecMask, Mask.begin());
18095 return Vec;
18096 }
18097 /// Checks if the specified entry \p E needs to be delayed because of its
18098 /// dependency nodes.
18099 std::optional<Value *>
18100 needToDelay(const TreeEntry *E,
18101 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18102 // No need to delay emission if all deps are ready.
18103 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18104 return all_of(
18105 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18106 }))
18107 return std::nullopt;
18108 // Postpone gather emission, will be emitted after the end of the
18109 // process to keep correct order.
18110 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18111 return Builder.CreateAlignedLoad(
18112 ResVecTy,
18113 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18114 MaybeAlign());
18115 }
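// Note: the aligned load from a poison pointer above is never meant to survive;
// it only acts as a type-correct placeholder for the postponed gather, which is
// expected to be re-emitted and to replace this value once all of its
// dependency nodes have been vectorized.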
18116 /// Resets the builder to handle a perfect diamond match.
18117 void resetForSameNode() {
18118 IsFinalized = false;
18119 CommonMask.clear();
18120 InVectors.clear();
18121 }
18122 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
18123 /// shuffling.
18124 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18125 Value *V1 = getVectorizedValue(E1);
18126 Value *V2 = getVectorizedValue(E2);
18127 add(V1, V2, Mask);
18128 }
18129 /// Adds a single input vector (in the form of a tree entry) and the mask for
18130 /// its shuffling.
18131 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18132 Value *V1 = getVectorizedValue(E1);
18133 add(V1, Mask);
18134 }
18135 /// Adds 2 input vectors and the mask for their shuffling.
18136 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18137 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18140 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18141 V1 = castToScalarTyElem(V1);
18142 V2 = castToScalarTyElem(V2);
18143 if (InVectors.empty()) {
18144 InVectors.push_back(V1);
18145 InVectors.push_back(V2);
18146 CommonMask.assign(Mask.begin(), Mask.end());
18147 return;
18148 }
18149 Value *Vec = InVectors.front();
18150 if (InVectors.size() == 2) {
18151 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18152 transformMaskAfterShuffle(CommonMask, CommonMask);
18153 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18154 Mask.size()) {
18155 Vec = createShuffle(Vec, nullptr, CommonMask);
18156 transformMaskAfterShuffle(CommonMask, CommonMask);
18157 }
18158 V1 = createShuffle(V1, V2, Mask);
18159 unsigned VF = std::max(getVF(V1), getVF(Vec));
18160 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18161 if (Mask[Idx] != PoisonMaskElem)
18162 CommonMask[Idx] = Idx + VF;
18163 InVectors.front() = Vec;
18164 if (InVectors.size() == 2)
18165 InVectors.back() = V1;
18166 else
18167 InVectors.push_back(V1);
18168 }
18169 /// Adds another input vector and the mask for the shuffling.
18170 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18172 "castToScalarTyElem expects V1 to be FixedVectorType");
18173 V1 = castToScalarTyElem(V1);
18174 if (InVectors.empty()) {
18175 InVectors.push_back(V1);
18176 CommonMask.assign(Mask.begin(), Mask.end());
18177 return;
18178 }
18179 const auto *It = find(InVectors, V1);
18180 if (It == InVectors.end()) {
18181 if (InVectors.size() == 2 ||
18182 InVectors.front()->getType() != V1->getType()) {
18183 Value *V = InVectors.front();
18184 if (InVectors.size() == 2) {
18185 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18186 transformMaskAfterShuffle(CommonMask, CommonMask);
18187 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18188 CommonMask.size()) {
18189 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18190 transformMaskAfterShuffle(CommonMask, CommonMask);
18191 }
18192 unsigned VF = std::max(CommonMask.size(), Mask.size());
18193 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18194 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18195 CommonMask[Idx] = V->getType() != V1->getType()
18196 ? Idx + VF
18197 : Mask[Idx] + getVF(V1);
18198 if (V->getType() != V1->getType())
18199 V1 = createShuffle(V1, nullptr, Mask);
18200 InVectors.front() = V;
18201 if (InVectors.size() == 2)
18202 InVectors.back() = V1;
18203 else
18204 InVectors.push_back(V1);
18205 return;
18206 }
18207 // Check if the second vector is required, i.e. whether the new mask covers
18208 // lanes that are not already used from the first one.
18209 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18210 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18211 InVectors.push_back(V1);
18212 break;
18213 }
18214 }
18215 unsigned VF = 0;
18216 for (Value *V : InVectors)
18217 VF = std::max(VF, getVF(V));
18218 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18219 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18220 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18221 }
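// Summary of the accumulation scheme used by the add() overloads: at most two
// vectors are kept in InVectors. When a third one arrives, the pair already
// present is folded through createShuffle(InVectors[0], InVectors[1],
// CommonMask) first; the new operand is pre-shuffled by its own mask, and its
// lanes are then referenced in CommonMask as identity indices offset by the
// common vector factor.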
18222 /// Adds another input vector and the ordering for its shuffling.
18223 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18224 SmallVector<int> NewMask;
18225 inversePermutation(Order, NewMask);
18226 add(V1, NewMask);
18227 }
18228 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18229 Value *Root = nullptr) {
18230 return R.gather(VL, Root, ScalarTy,
18231 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18232 return createShuffle(V1, V2, Mask);
18233 });
18234 }
18235 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18236 /// Finalize emission of the shuffles.
18237 /// \param Action the action (if any) to be performed before the final
18238 /// application of the \p ExtMask mask.
18239 Value *finalize(
18240 ArrayRef<int> ExtMask,
18241 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18242 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18243 function_ref<void(Value *&, SmallVectorImpl<int> &,
18244 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18245 Action = {}) {
18246 IsFinalized = true;
18247 if (Action) {
18248 Value *Vec = InVectors.front();
18249 if (InVectors.size() == 2) {
18250 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18251 InVectors.pop_back();
18252 } else {
18253 Vec = createShuffle(Vec, nullptr, CommonMask);
18254 }
18255 transformMaskAfterShuffle(CommonMask, CommonMask);
18256 assert(VF > 0 &&
18257 "Expected vector length for the final value before action.");
18258 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18259 if (VecVF < VF) {
18260 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18261 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18262 Vec = createShuffle(Vec, nullptr, ResizeMask);
18263 }
18264 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18265 return createShuffle(V1, V2, Mask);
18266 });
18267 InVectors.front() = Vec;
18268 }
18269 if (!SubVectors.empty()) {
18270 Value *Vec = InVectors.front();
18271 if (InVectors.size() == 2) {
18272 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18273 InVectors.pop_back();
18274 } else {
18275 Vec = createShuffle(Vec, nullptr, CommonMask);
18276 }
18277 transformMaskAfterShuffle(CommonMask, CommonMask);
18278 auto CreateSubVectors = [&](Value *Vec,
18279 SmallVectorImpl<int> &CommonMask) {
18280 for (auto [E, Idx] : SubVectors) {
18281 Value *V = getVectorizedValue(*E);
18282 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18283 // Use the scalar version of ScalarTy to correctly handle shuffles
18284 // for revectorization. The revectorization mode operates on
18285 // vectors, but here we need to operate on the scalars, because the
18286 // masks were already transformed for the vector elements and we don't
18287 // need to do this transformation again.
18288 Type *OrigScalarTy = ScalarTy;
18289 ScalarTy = ScalarTy->getScalarType();
18290 Vec = createInsertVector(
18291 Builder, Vec, V, InsertionIndex,
18292 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18293 _3));
18294 ScalarTy = OrigScalarTy;
18295 if (!CommonMask.empty()) {
18296 std::iota(std::next(CommonMask.begin(), Idx),
18297 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18298 Idx);
18299 }
18300 }
18301 return Vec;
18302 };
18303 if (SubVectorsMask.empty()) {
18304 Vec = CreateSubVectors(Vec, CommonMask);
18305 } else {
18306 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18307 copy(SubVectorsMask, SVMask.begin());
18308 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18309 if (I2 != PoisonMaskElem) {
18310 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18311 I1 = I2 + CommonMask.size();
18312 }
18313 }
18314 Value *InsertVec =
18315 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18316 Vec = createShuffle(InsertVec, Vec, SVMask);
18317 transformMaskAfterShuffle(CommonMask, SVMask);
18318 }
18319 InVectors.front() = Vec;
18320 }
18321
18322 if (!ExtMask.empty()) {
18323 if (CommonMask.empty()) {
18324 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18325 } else {
18326 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18327 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18328 if (ExtMask[I] == PoisonMaskElem)
18329 continue;
18330 NewMask[I] = CommonMask[ExtMask[I]];
18331 }
18332 CommonMask.swap(NewMask);
18333 }
18334 }
18335 if (CommonMask.empty()) {
18336 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18337 return InVectors.front();
18338 }
18339 if (InVectors.size() == 2)
18340 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18341 return createShuffle(InVectors.front(), nullptr, CommonMask);
18342 }
18343
18344 ~ShuffleInstructionBuilder() {
18345 assert((IsFinalized || CommonMask.empty()) &&
18346 "Shuffle construction must be finalized.");
18347 }
18348};
18349
18350Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18351 return vectorizeTree(getOperandEntry(E, NodeIdx));
18352}
18353
18354template <typename BVTy, typename ResTy, typename... Args>
18355ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18356 Args &...Params) {
18357 assert(E->isGather() && "Expected gather node.");
18358 unsigned VF = E->getVectorFactor();
18359
18360 bool NeedFreeze = false;
18361 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18362 // Clear values, to be replaced by insertvector instructions.
18363 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18364 for_each(MutableArrayRef(GatheredScalars)
18365 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18366 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18367 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18368 E->CombinedEntriesWithIndices.size());
18369 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18370 [&](const auto &P) {
18371 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18372 });
18373 // Build a mask out of the reorder indices and reorder scalars per this
18374 // mask.
18375 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18376 E->ReorderIndices.end());
18377 if (!ReorderMask.empty())
18378 reorderScalars(GatheredScalars, ReorderMask);
18379 SmallVector<int> SubVectorsMask;
18380 inversePermutation(E->ReorderIndices, SubVectorsMask);
18381 // Transform non-clustered elements in the mask to poison (-1).
18382 // "Clustered" operations will be reordered using this mask later.
18383 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18384 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18385 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18386 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18387 } else {
18388 SubVectorsMask.clear();
18389 }
18390 SmallVector<Value *> StoredGS(GatheredScalars);
18391 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18392 unsigned I, unsigned SliceSize,
18393 bool IsNotPoisonous) {
18394 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18395 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18396 }))
18397 return false;
18398 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18399 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18400 if (UserTE->getNumOperands() != 2)
18401 return false;
18402 if (!IsNotPoisonous) {
18403 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18404 [=](const std::unique_ptr<TreeEntry> &TE) {
18405 return TE->UserTreeIndex.UserTE == UserTE &&
18406 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18407 });
18408 if (It == VectorizableTree.end())
18409 return false;
18410 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18411 if (!(*It)->ReorderIndices.empty()) {
18412 inversePermutation((*It)->ReorderIndices, ReorderMask);
18413 reorderScalars(GS, ReorderMask);
18414 }
18415 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18416 Value *V0 = std::get<0>(P);
18417 Value *V1 = std::get<1>(P);
18418 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18419 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18420 is_contained(E->Scalars, V1));
18421 }))
18422 return false;
18423 }
18424 int Idx;
18425 if ((Mask.size() < InputVF &&
18426 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18427 Idx == 0) ||
18428 (Mask.size() == InputVF &&
18429 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18430 std::iota(
18431 std::next(Mask.begin(), I * SliceSize),
18432 std::next(Mask.begin(),
18433 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18434 0);
18435 } else {
18436 unsigned IVal =
18437 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18438 std::fill(
18439 std::next(Mask.begin(), I * SliceSize),
18440 std::next(Mask.begin(),
18441 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18442 IVal);
18443 }
18444 return true;
18445 };
18446 BVTy ShuffleBuilder(ScalarTy, Params...);
18447 ResTy Res = ResTy();
18448 SmallVector<int> Mask;
18449 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18450 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18451 Value *ExtractVecBase = nullptr;
18452 bool UseVecBaseAsInput = false;
18453 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18454 SmallVector<SmallVector<const TreeEntry *>> Entries;
18455 Type *OrigScalarTy = GatheredScalars.front()->getType();
18456 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18457 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18458 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18459 // Check for gathered extracts.
18460 bool Resized = false;
18461 ExtractShuffles =
18462 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18463 if (!ExtractShuffles.empty()) {
18464 SmallVector<const TreeEntry *> ExtractEntries;
18465 for (auto [Idx, I] : enumerate(ExtractMask)) {
18466 if (I == PoisonMaskElem)
18467 continue;
18468 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18469 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18470 !TEs.empty())
18471 ExtractEntries.append(TEs.begin(), TEs.end());
18472 }
18473 if (std::optional<ResTy> Delayed =
18474 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18475 // Delay emission of gathers which are not ready yet.
18476 PostponedGathers.insert(E);
18477 // Postpone gather emission, will be emitted after the end of the
18478 // process to keep correct order.
18479 return *Delayed;
18480 }
18481 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18482 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18483 ExtractVecBase = VecBase;
18484 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18485 if (VF == VecBaseTy->getNumElements() &&
18486 GatheredScalars.size() != VF) {
18487 Resized = true;
18488 GatheredScalars.append(VF - GatheredScalars.size(),
18489 PoisonValue::get(OrigScalarTy));
18490 NumParts =
18491 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18492 }
18493 }
18494 }
18495 // Gather extracts after we check for fully matched gathers only.
18496 if (!ExtractShuffles.empty() || !E->hasState() ||
18497 E->getOpcode() != Instruction::Load ||
18498 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18499 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18500 any_of(E->Scalars,
18501 [this](Value *V) {
18502 return isa<LoadInst>(V) && isVectorized(V);
18503 })) ||
18504 (E->hasState() && E->isAltShuffle()) ||
18505 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18506 isSplat(E->Scalars) ||
18507 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18508 GatherShuffles =
18509 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18510 }
18511 if (!GatherShuffles.empty()) {
18512 if (std::optional<ResTy> Delayed =
18513 ShuffleBuilder.needToDelay(E, Entries)) {
18514 // Delay emission of gathers which are not ready yet.
18515 PostponedGathers.insert(E);
18516 // Postpone gather emission, will be emitted after the end of the
18517 // process to keep correct order.
18518 return *Delayed;
18519 }
18520 if (GatherShuffles.size() == 1 &&
18521 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18522 Entries.front().front()->isSame(E->Scalars)) {
18523 // Perfect match in the graph, will reuse the previously vectorized
18524 // node. Cost is 0.
18525 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18526 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18527 // Restore the mask for previous partially matched values.
18528 Mask.resize(E->Scalars.size());
18529 const TreeEntry *FrontTE = Entries.front().front();
18530 if (FrontTE->ReorderIndices.empty() &&
18531 ((FrontTE->ReuseShuffleIndices.empty() &&
18532 E->Scalars.size() == FrontTE->Scalars.size()) ||
18533 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18534 std::iota(Mask.begin(), Mask.end(), 0);
18535 } else {
18536 for (auto [I, V] : enumerate(E->Scalars)) {
18537 if (isa<PoisonValue>(V)) {
18538 Mask[I] = PoisonMaskElem;
18539 continue;
18540 }
18541 Mask[I] = FrontTE->findLaneForValue(V);
18542 }
18543 }
18544 // Reset the builder(s) to correctly handle perfect diamond matched
18545 // nodes.
18546 ShuffleBuilder.resetForSameNode();
18547 ShuffleBuilder.add(*FrontTE, Mask);
18548 // Full matched entry found, no need to insert subvectors.
18549 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18550 return Res;
18551 }
18552 if (!Resized) {
18553 if (GatheredScalars.size() != VF &&
18554 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18555 return any_of(TEs, [&](const TreeEntry *TE) {
18556 return TE->getVectorFactor() == VF;
18557 });
18558 }))
18559 GatheredScalars.append(VF - GatheredScalars.size(),
18560 PoisonValue::get(OrigScalarTy));
18561 }
18562 // Remove shuffled elements from list of gathers.
18563 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18564 if (Mask[I] != PoisonMaskElem)
18565 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18566 }
18567 }
18568 }
18569 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18570 SmallVectorImpl<int> &ReuseMask,
18571 bool IsRootPoison) {
18572 // For splats we can emit broadcasts instead of gathers, so try to find
18573 // such sequences.
18574 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18575 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18576 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18577 SmallVector<int> UndefPos;
18578 DenseMap<Value *, unsigned> UniquePositions;
18579 // Gather unique non-const values and all constant values.
18580 // For repeated values, just shuffle them.
18581 int NumNonConsts = 0;
18582 int SinglePos = 0;
18583 for (auto [I, V] : enumerate(Scalars)) {
18584 if (isa<UndefValue>(V)) {
18585 if (!isa<PoisonValue>(V)) {
18586 ReuseMask[I] = I;
18587 UndefPos.push_back(I);
18588 }
18589 continue;
18590 }
18591 if (isConstant(V)) {
18592 ReuseMask[I] = I;
18593 continue;
18594 }
18595 ++NumNonConsts;
18596 SinglePos = I;
18597 Value *OrigV = V;
18598 Scalars[I] = PoisonValue::get(OrigScalarTy);
18599 if (IsSplat) {
18600 Scalars.front() = OrigV;
18601 ReuseMask[I] = 0;
18602 } else {
18603 const auto Res = UniquePositions.try_emplace(OrigV, I);
18604 Scalars[Res.first->second] = OrigV;
18605 ReuseMask[I] = Res.first->second;
18606 }
18607 }
18608 if (NumNonConsts == 1) {
18609 // Restore single insert element.
18610 if (IsSplat) {
18611 ReuseMask.assign(VF, PoisonMaskElem);
18612 std::swap(Scalars.front(), Scalars[SinglePos]);
18613 if (!UndefPos.empty() && UndefPos.front() == 0)
18614 Scalars.front() = UndefValue::get(OrigScalarTy);
18615 }
18616 ReuseMask[SinglePos] = SinglePos;
18617 } else if (!UndefPos.empty() && IsSplat) {
18618 // For undef values, try to replace them with the simple broadcast.
18619 // We can do it if the broadcasted value is guaranteed to be
18620 // non-poisonous, or by freezing the incoming scalar value first.
18621 auto *It = find_if(Scalars, [this, E](Value *V) {
18622 return !isa<UndefValue>(V) &&
18623 (isGuaranteedNotToBePoison(V, AC) ||
18624 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18625 // Check if the value already used in the same operation in
18626 // one of the nodes already.
18627 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18628 is_contained(E->UserTreeIndex.UserTE->Scalars,
18629 U.getUser());
18630 })));
18631 });
18632 if (It != Scalars.end()) {
18633 // Replace undefs by the non-poisoned scalars and emit broadcast.
18634 int Pos = std::distance(Scalars.begin(), It);
18635 for (int I : UndefPos) {
18636 // Set the undef position to the non-poisoned scalar.
18637 ReuseMask[I] = Pos;
18638 // Replace the undef by the poison, in the mask it is replaced by
18639 // non-poisoned scalar already.
18640 if (I != Pos)
18641 Scalars[I] = PoisonValue::get(OrigScalarTy);
18642 }
18643 } else {
18644 // Replace undefs by the poisons, emit broadcast and then emit
18645 // freeze.
18646 for (int I : UndefPos) {
18647 ReuseMask[I] = PoisonMaskElem;
18648 if (isa<UndefValue>(Scalars[I]))
18649 Scalars[I] = PoisonValue::get(OrigScalarTy);
18650 }
18651 NeedFreeze = true;
18652 }
18653 }
18654 };
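// Example of TryPackScalars for a non-splat input (values are illustrative):
// given Scalars = {%a, %b, %a, poison} and IsRootPoison == true, the unique
// scalars stay at their first position, repeats become poison, and the reuse is
// recorded in the mask: Scalars -> {%a, %b, poison, poison},
// ReuseMask -> {0, 1, 0, PoisonMaskElem}.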
18655 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18656 bool IsNonPoisoned = true;
18657 bool IsUsedInExpr = true;
18658 Value *Vec1 = nullptr;
18659 if (!ExtractShuffles.empty()) {
18660 // Gather of extractelements can be represented as just a shuffle of
18661 // a single/two vectors the scalars are extracted from.
18662 // Find input vectors.
18663 Value *Vec2 = nullptr;
18664 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18665 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18666 ExtractMask[I] = PoisonMaskElem;
18667 }
18668 if (UseVecBaseAsInput) {
18669 Vec1 = ExtractVecBase;
18670 } else {
18671 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18672 if (ExtractMask[I] == PoisonMaskElem)
18673 continue;
18674 if (isa<UndefValue>(StoredGS[I]))
18675 continue;
18676 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18677 Value *VecOp = EI->getVectorOperand();
18678 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18679 !TEs.empty() && TEs.front()->VectorizedValue)
18680 VecOp = TEs.front()->VectorizedValue;
18681 if (!Vec1) {
18682 Vec1 = VecOp;
18683 } else if (Vec1 != VecOp) {
18684 assert((!Vec2 || Vec2 == VecOp) &&
18685 "Expected only 1 or 2 vectors shuffle.");
18686 Vec2 = VecOp;
18687 }
18688 }
18689 }
18690 if (Vec2) {
18691 IsUsedInExpr = false;
18692 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18693 isGuaranteedNotToBePoison(Vec2, AC);
18694 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18695 } else if (Vec1) {
18696 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18697 IsUsedInExpr &= FindReusedSplat(
18698 ExtractMask,
18699 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18700 ExtractMask.size(), IsNotPoisonedVec);
18701 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18702 IsNonPoisoned &= IsNotPoisonedVec;
18703 } else {
18704 IsUsedInExpr = false;
18705 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18706 /*ForExtracts=*/true);
18707 }
18708 }
18709 if (!GatherShuffles.empty()) {
18710 unsigned SliceSize =
18711 getPartNumElems(E->Scalars.size(),
18712 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18713 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18714 for (const auto [I, TEs] : enumerate(Entries)) {
18715 if (TEs.empty()) {
18716 assert(!GatherShuffles[I] &&
18717 "No shuffles with empty entries list expected.");
18718 continue;
18719 }
18720 assert((TEs.size() == 1 || TEs.size() == 2) &&
18721 "Expected shuffle of 1 or 2 entries.");
18722 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18723 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18724 VecMask.assign(VecMask.size(), PoisonMaskElem);
18725 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18726 if (TEs.size() == 1) {
18727 bool IsNotPoisonedVec =
18728 TEs.front()->VectorizedValue
18729 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18730 : true;
18731 IsUsedInExpr &=
18732 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18733 SliceSize, IsNotPoisonedVec);
18734 ShuffleBuilder.add(*TEs.front(), VecMask);
18735 IsNonPoisoned &= IsNotPoisonedVec;
18736 } else {
18737 IsUsedInExpr = false;
18738 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18739 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18740 IsNonPoisoned &=
18741 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18742 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18743 }
18744 }
18745 }
18746 // Try to figure out best way to combine values: build a shuffle and insert
18747 // elements or just build several shuffles.
18748 // Insert non-constant scalars.
18749 SmallVector<Value *> NonConstants(GatheredScalars);
18750 int EMSz = ExtractMask.size();
18751 int MSz = Mask.size();
18752 // Try to build a constant vector and shuffle with it only if currently we
18753 // have a single permutation and more than 1 scalar constant.
18754 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18755 bool IsIdentityShuffle =
18756 ((UseVecBaseAsInput ||
18757 all_of(ExtractShuffles,
18758 [](const std::optional<TTI::ShuffleKind> &SK) {
18759 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18760 TTI::SK_PermuteSingleSrc;
18761 })) &&
18762 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18763 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18764 (!GatherShuffles.empty() &&
18765 all_of(GatherShuffles,
18766 [](const std::optional<TTI::ShuffleKind> &SK) {
18767 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18768 TTI::SK_PermuteSingleSrc;
18769 }) &&
18770 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18771 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18772 bool EnoughConstsForShuffle =
18773 IsSingleShuffle &&
18774 (none_of(GatheredScalars,
18775 [](Value *V) {
18776 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18777 }) ||
18778 any_of(GatheredScalars,
18779 [](Value *V) {
18780 return isa<Constant>(V) && !isa<UndefValue>(V);
18781 })) &&
18782 (!IsIdentityShuffle ||
18783 (GatheredScalars.size() == 2 &&
18784 any_of(GatheredScalars,
18785 [](Value *V) { return !isa<UndefValue>(V); })) ||
18786 count_if(GatheredScalars, [](Value *V) {
18787 return isa<Constant>(V) && !isa<PoisonValue>(V);
18788 }) > 1);
18789 // The NonConstants array contains just the non-constant values; GatheredScalars
18790 // contains only the constants used to build the final vector, which is then shuffled.
18791 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18792 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18793 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18794 else
18795 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18796 }
18797 // Generate constants for final shuffle and build a mask for them.
18798 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18799 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18800 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18801 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18802 ShuffleBuilder.add(BV, BVMask);
18803 }
18804 if (all_of(NonConstants, [=](Value *V) {
18805 return isa<PoisonValue>(V) ||
18806 (IsSingleShuffle && ((IsIdentityShuffle &&
18807 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18808 }))
18809 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18810 SubVectorsMask);
18811 else
18812 Res = ShuffleBuilder.finalize(
18813 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18814 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18815 bool IsSplat = isSplat(NonConstants);
18816 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18817 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18818 auto CheckIfSplatIsProfitable = [&]() {
18819 // Estimate the cost of splatting + shuffle and compare with
18820 // insert + shuffle.
18821 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18822 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18823 if (isa<ExtractElementInst>(V) || isVectorized(V))
18824 return false;
18825 InstructionCost SplatCost = TTI->getVectorInstrCost(
18826 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18827 PoisonValue::get(VecTy), V);
18828 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18829 for (auto [Idx, I] : enumerate(BVMask))
18830 if (I != PoisonMaskElem)
18831 NewMask[Idx] = Mask.size();
18832 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18833 NewMask, CostKind);
18834 InstructionCost BVCost = TTI->getVectorInstrCost(
18835 Instruction::InsertElement, VecTy, CostKind,
18836 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18837 Vec, V);
18838 // Shuffle required?
18839 if (count(BVMask, PoisonMaskElem) <
18840 static_cast<int>(BVMask.size() - 1)) {
18841 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18842 for (auto [Idx, I] : enumerate(BVMask))
18843 if (I != PoisonMaskElem)
18844 NewMask[Idx] = I;
18845 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18846 VecTy, NewMask, CostKind);
18847 }
18848 return SplatCost <= BVCost;
18849 };
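// In other words, the splat path below is only taken when an insertelement at
// lane 0 plus a two-source shuffle is estimated to be no more expensive than
// inserting the repeated scalar into Vec directly (plus an optional
// single-source shuffle to place the remaining copies).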
18850 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18851 for (auto [Idx, I] : enumerate(BVMask))
18852 if (I != PoisonMaskElem)
18853 Mask[Idx] = I;
18854 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18855 } else {
18856 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18857 SmallVector<Value *> Values(NonConstants.size(),
18858 PoisonValue::get(ScalarTy));
18859 Values[0] = V;
18860 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18861 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18862 transform(BVMask, SplatMask.begin(), [](int I) {
18863 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18864 });
18865 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18866 BV = CreateShuffle(BV, nullptr, SplatMask);
18867 for (auto [Idx, I] : enumerate(BVMask))
18868 if (I != PoisonMaskElem)
18869 Mask[Idx] = BVMask.size() + Idx;
18870 Vec = CreateShuffle(Vec, BV, Mask);
18871 for (auto [Idx, I] : enumerate(Mask))
18872 if (I != PoisonMaskElem)
18873 Mask[Idx] = Idx;
18874 }
18875 });
18876 } else if (!allConstant(GatheredScalars)) {
18877 // Gather unique scalars and all constants.
18878 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18879 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18880 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18881 ShuffleBuilder.add(BV, ReuseMask);
18882 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18883 SubVectorsMask);
18884 } else {
18885 // Gather all constants.
18886 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18887 for (auto [I, V] : enumerate(GatheredScalars)) {
18888 if (!isa<PoisonValue>(V))
18889 Mask[I] = I;
18890 }
18891 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18892 ShuffleBuilder.add(BV, Mask);
18893 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18894 SubVectorsMask);
18895 }
18896
18897 if (NeedFreeze)
18898 Res = ShuffleBuilder.createFreeze(Res);
18899 return Res;
18900}
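// Rough flow of processBuildVector above: (1) try to serve the gather from
// existing vectors, either by reusing extractelement sources (adjustExtracts)
// or by matching already vectorized tree entries (isGatherShuffledEntry),
// possibly delaying emission via needToDelay; (2) pack the remaining constants
// and non-constants with TryPackScalars and emit them as build vectors; (3)
// combine everything in ShuffleBuilder.finalize(), freezing the result when
// undef lanes had to be filled with a potentially poisonous broadcast.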
18901
18902Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18903 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18904 (void)vectorizeTree(VectorizableTree[EIdx].get());
18905 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18906 Builder, *this);
18907}
18908
18909 /// \returns \p Inst after propagating metadata from \p VL, considering only the
18910 /// instructions in \p VL.
18911 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18912 SmallVector<Value *> Insts;
18913 for (Value *V : VL)
18914 if (isa<Instruction>(V))
18915 Insts.push_back(V);
18916 return llvm::propagateMetadata(Inst, Insts);
18917}
18918
18919 static DebugLoc getDebugLocFromPHI(PHINode &PN) {
18920 if (DebugLoc DL = PN.getDebugLoc())
18921 return DL;
18922 return DebugLoc::getUnknown();
18923}
18924
18925Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
18926 IRBuilderBase::InsertPointGuard Guard(Builder);
18927
18928 Value *V = E->Scalars.front();
18929 Type *ScalarTy = V->getType();
18930 if (!isa<CmpInst>(V))
18931 ScalarTy = getValueType(V);
18932 auto It = MinBWs.find(E);
18933 if (It != MinBWs.end()) {
18934 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18935 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
18936 if (VecTy)
18937 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
18938 }
18939 if (E->VectorizedValue)
18940 return E->VectorizedValue;
18941 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
18942 if (E->isGather()) {
18943 // Set insert point for non-reduction initial nodes.
18944 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18945 setInsertPointAfterBundle(E);
18946 Value *Vec = createBuildVector(E, ScalarTy);
18947 E->VectorizedValue = Vec;
18948 return Vec;
18949 }
18950 if (E->State == TreeEntry::SplitVectorize) {
18951 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18952 "Expected exactly 2 combined entries.");
18953 setInsertPointAfterBundle(E);
18954 TreeEntry &OpTE1 =
18955 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18956 assert(OpTE1.isSame(
18957 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18958 "Expected same first part of scalars.");
18959 Value *Op1 = vectorizeTree(&OpTE1);
18960 TreeEntry &OpTE2 =
18961 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18962 assert(
18963 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18964 "Expected same second part of scalars.");
18965 Value *Op2 = vectorizeTree(&OpTE2);
18966 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
18967 bool IsSigned = false;
18968 auto It = MinBWs.find(OpE);
18969 if (It != MinBWs.end())
18970 IsSigned = It->second.second;
18971 else
18972 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18973 if (isa<PoisonValue>(V))
18974 return false;
18975 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18976 });
18977 return IsSigned;
18978 };
18979 if (cast<VectorType>(Op1->getType())->getElementType() !=
18980 ScalarTy->getScalarType()) {
18981 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18982 Op1 = Builder.CreateIntCast(
18983 Op1,
18984 getWidenedType(
18985 ScalarTy,
18986 cast<FixedVectorType>(Op1->getType())->getNumElements()),
18987 GetOperandSignedness(&OpTE1));
18988 }
18989 if (cast<VectorType>(Op2->getType())->getElementType() !=
18990 ScalarTy->getScalarType()) {
18991 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18992 Op2 = Builder.CreateIntCast(
18993 Op2,
18994 getWidenedType(
18995 ScalarTy,
18996 cast<FixedVectorType>(Op2->getType())->getNumElements()),
18997 GetOperandSignedness(&OpTE2));
18998 }
18999 if (E->ReorderIndices.empty()) {
19000 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19001 std::iota(
19002 Mask.begin(),
19003 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19004 0);
19005 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19006 if (ScalarTyNumElements != 1) {
19007 assert(SLPReVec && "Only supported by REVEC.");
19008 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19009 }
19010 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19011 Vec = createInsertVector(Builder, Vec, Op2,
19012 E->CombinedEntriesWithIndices.back().second *
19013 ScalarTyNumElements);
19014 E->VectorizedValue = Vec;
19015 return Vec;
19016 }
19017 unsigned CommonVF =
19018 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19019 if (getNumElements(Op1->getType()) != CommonVF) {
19020 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19021 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19022 0);
19023 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19024 }
19025 if (getNumElements(Op2->getType()) != CommonVF) {
19026 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19027 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19028 0);
19029 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19030 }
19031 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19032 E->VectorizedValue = Vec;
19033 return Vec;
19034 }
19035
19036 bool IsReverseOrder =
19037 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19038 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19039 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19040 if (E->getOpcode() == Instruction::Store &&
19041 E->State == TreeEntry::Vectorize) {
19042 ArrayRef<int> Mask =
19043 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19044 E->ReorderIndices.size());
19045 ShuffleBuilder.add(V, Mask);
19046 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19047 E->State == TreeEntry::CompressVectorize) {
19048 ShuffleBuilder.addOrdered(V, {});
19049 } else {
19050 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19051 }
19052 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19053 E->CombinedEntriesWithIndices.size());
19054 transform(
19055 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19056 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19057 });
19058 assert(
19059 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19060 "Expected either combined subnodes or reordering");
19061 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19062 };
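// FinalShuffle applies the entry's reordering to the freshly created vector:
// vectorized stores use ReorderIndices directly as a shuffle mask, reversed
// strided loads and compressed loads skip the reordering, and everything else
// goes through addOrdered(); finalize() then also applies ReuseShuffleIndices
// and inserts any combined subvectors.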
19063
19064 assert(!E->isGather() && "Unhandled state");
19065 unsigned ShuffleOrOp =
19066 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19067 Instruction *VL0 = E->getMainOp();
19068 auto GetOperandSignedness = [&](unsigned Idx) {
19069 const TreeEntry *OpE = getOperandEntry(E, Idx);
19070 bool IsSigned = false;
19071 auto It = MinBWs.find(OpE);
19072 if (It != MinBWs.end())
19073 IsSigned = It->second.second;
19074 else
19075 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19076 if (isa<PoisonValue>(V))
19077 return false;
19078 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19079 });
19080 return IsSigned;
19081 };
19082 switch (ShuffleOrOp) {
19083 case Instruction::PHI: {
19084 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19085 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19086 "PHI reordering is free.");
19087 auto *PH = cast<PHINode>(VL0);
19088 Builder.SetInsertPoint(PH->getParent(),
19089 PH->getParent()->getFirstNonPHIIt());
19090 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19091 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19092 Value *V = NewPhi;
19093
19094 // Adjust insertion point once all PHI's have been generated.
19095 Builder.SetInsertPoint(PH->getParent(),
19096 PH->getParent()->getFirstInsertionPt());
19097 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19098
19099 V = FinalShuffle(V, E);
19100
19101 E->VectorizedValue = V;
19102 // If the phi node is fully emitted, exit.
19103 if (NewPhi->getNumIncomingValues() != 0)
19104 return NewPhi;
19105
19106 // PHINodes may have multiple entries from the same block. We want to
19107 // visit every block once.
19108 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19109
19110 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19111 BasicBlock *IBB = PH->getIncomingBlock(I);
19112
19113 // Stop emission if all incoming values are generated.
19114 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19115 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19116 return NewPhi;
19117 }
19118
19119 if (!VisitedBBs.insert(IBB).second) {
19120 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19121 NewPhi->addIncoming(VecOp, IBB);
19122 TreeEntry *OpTE = getOperandEntry(E, I);
19123 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19124 OpTE->VectorizedValue = VecOp;
19125 continue;
19126 }
19127
19128 Builder.SetInsertPoint(IBB->getTerminator());
19129 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19130 Value *Vec = vectorizeOperand(E, I);
19131 if (VecTy != Vec->getType()) {
19132 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19133 MinBWs.contains(getOperandEntry(E, I))) &&
19134 "Expected item in MinBWs.");
19135 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19136 }
19137 NewPhi->addIncoming(Vec, IBB);
19138 }
19139
19140 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19141 "Invalid number of incoming values");
19142 assert(E->VectorizedValue && "Expected vectorized value.");
19143 return E->VectorizedValue;
19144 }
19145
19146 case Instruction::ExtractElement: {
19147 Value *V = E->getSingleOperand(0);
19148 setInsertPointAfterBundle(E);
19149 V = FinalShuffle(V, E);
19150 E->VectorizedValue = V;
19151 return V;
19152 }
19153 case Instruction::ExtractValue: {
19154 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19155 Builder.SetInsertPoint(LI);
19156 Value *Ptr = LI->getPointerOperand();
19157 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19158 Value *NewV = ::propagateMetadata(V, E->Scalars);
19159 NewV = FinalShuffle(NewV, E);
19160 E->VectorizedValue = NewV;
19161 return NewV;
19162 }
19163 case Instruction::InsertElement: {
19164 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19165 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19166 OpE && !OpE->isGather() && OpE->hasState() &&
19167 !OpE->hasCopyableElements())
19168 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19169 else
19170 setInsertPointAfterBundle(E);
19171 Value *V = vectorizeOperand(E, 1);
19172 ArrayRef<Value *> Op = E->getOperand(1);
19173 Type *ScalarTy = Op.front()->getType();
19174 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19175 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19176 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19177 assert(Res.first > 0 && "Expected item in MinBWs.");
19178 V = Builder.CreateIntCast(
19179 V,
19180 getWidenedType(
19181 ScalarTy,
19182 cast<FixedVectorType>(V->getType())->getNumElements()),
19183 Res.second);
19184 }
19185
19186 // Create InsertVector shuffle if necessary
19187 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19188 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19189 }));
19190 const unsigned NumElts =
19191 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19192 const unsigned NumScalars = E->Scalars.size();
19193
19194 unsigned Offset = *getElementIndex(VL0);
19195 assert(Offset < NumElts && "Failed to find vector index offset");
19196
19197 // Create shuffle to resize vector
19198 SmallVector<int> Mask;
19199 if (!E->ReorderIndices.empty()) {
19200 inversePermutation(E->ReorderIndices, Mask);
19201 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19202 } else {
19203 Mask.assign(NumElts, PoisonMaskElem);
19204 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19205 }
19206 // Create InsertVector shuffle if necessary
19207 bool IsIdentity = true;
19208 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19209 Mask.swap(PrevMask);
19210 for (unsigned I = 0; I < NumScalars; ++I) {
19211 Value *Scalar = E->Scalars[PrevMask[I]];
19212 unsigned InsertIdx = *getElementIndex(Scalar);
19213 IsIdentity &= InsertIdx - Offset == I;
19214 Mask[InsertIdx - Offset] = I;
19215 }
19216 if (!IsIdentity || NumElts != NumScalars) {
19217 Value *V2 = nullptr;
19218 bool IsVNonPoisonous =
19219 isGuaranteedNotToBePoison(V, AC);
19220 SmallVector<int> InsertMask(Mask);
19221 if (NumElts != NumScalars && Offset == 0) {
19222 // Follow all insert element instructions from the current buildvector
19223 // sequence.
19224 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19225 do {
19226 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19227 if (!InsertIdx)
19228 break;
19229 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19230 InsertMask[*InsertIdx] = *InsertIdx;
19231 if (!Ins->hasOneUse())
19232 break;
19233 Ins = dyn_cast_or_null<InsertElementInst>(
19234 Ins->getUniqueUndroppableUser());
19235 } while (Ins);
19236 SmallBitVector UseMask =
19237 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19238 SmallBitVector IsFirstPoison =
19239 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19240 SmallBitVector IsFirstUndef =
19241 isUndefVector(FirstInsert->getOperand(0), UseMask);
19242 if (!IsFirstPoison.all()) {
19243 unsigned Idx = 0;
19244 for (unsigned I = 0; I < NumElts; I++) {
19245 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19246 IsFirstUndef.test(I)) {
19247 if (IsVNonPoisonous) {
19248 InsertMask[I] = I < NumScalars ? I : 0;
19249 continue;
19250 }
19251 if (!V2)
19252 V2 = UndefValue::get(V->getType());
19253 if (Idx >= NumScalars)
19254 Idx = NumScalars - 1;
19255 InsertMask[I] = NumScalars + Idx;
19256 ++Idx;
19257 } else if (InsertMask[I] != PoisonMaskElem &&
19258 Mask[I] == PoisonMaskElem) {
19259 InsertMask[I] = PoisonMaskElem;
19260 }
19261 }
19262 } else {
19263 InsertMask = Mask;
19264 }
19265 }
19266 if (!V2)
19267 V2 = PoisonValue::get(V->getType());
19268 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19269 if (auto *I = dyn_cast<Instruction>(V)) {
19270 GatherShuffleExtractSeq.insert(I);
19271 CSEBlocks.insert(I->getParent());
19272 }
19273 }
19274
19275 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19276 for (unsigned I = 0; I < NumElts; I++) {
19277 if (Mask[I] != PoisonMaskElem)
19278 InsertMask[Offset + I] = I;
19279 }
19280 SmallBitVector UseMask =
19281 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19282 SmallBitVector IsFirstUndef =
19283 isUndefVector(FirstInsert->getOperand(0), UseMask);
19284 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19285 NumElts != NumScalars) {
19286 if (IsFirstUndef.all()) {
19287 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19288 SmallBitVector IsFirstPoison =
19289 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19290 if (!IsFirstPoison.all()) {
19291 for (unsigned I = 0; I < NumElts; I++) {
19292 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19293 InsertMask[I] = I + NumElts;
19294 }
19295 }
19296 V = Builder.CreateShuffleVector(
19297 V,
19298 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19299 : FirstInsert->getOperand(0),
19300 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19301 if (auto *I = dyn_cast<Instruction>(V)) {
19302 GatherShuffleExtractSeq.insert(I);
19303 CSEBlocks.insert(I->getParent());
19304 }
19305 }
19306 } else {
19307 SmallBitVector IsFirstPoison =
19308 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19309 for (unsigned I = 0; I < NumElts; I++) {
19310 if (InsertMask[I] == PoisonMaskElem)
19311 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19312 else
19313 InsertMask[I] += NumElts;
19314 }
19315 V = Builder.CreateShuffleVector(
19316 FirstInsert->getOperand(0), V, InsertMask,
19317 cast<Instruction>(E->Scalars.back())->getName());
19318 if (auto *I = dyn_cast<Instruction>(V)) {
19319 GatherShuffleExtractSeq.insert(I);
19320 CSEBlocks.insert(I->getParent());
19321 }
19322 }
19323 }
19324
19325 ++NumVectorInstructions;
19326 E->VectorizedValue = V;
19327 return V;
19328 }
19329 case Instruction::ZExt:
19330 case Instruction::SExt:
19331 case Instruction::FPToUI:
19332 case Instruction::FPToSI:
19333 case Instruction::FPExt:
19334 case Instruction::PtrToInt:
19335 case Instruction::IntToPtr:
19336 case Instruction::SIToFP:
19337 case Instruction::UIToFP:
19338 case Instruction::Trunc:
19339 case Instruction::FPTrunc:
19340 case Instruction::BitCast: {
19341 setInsertPointAfterBundle(E);
19342
19343 Value *InVec = vectorizeOperand(E, 0);
19344
19345 auto *CI = cast<CastInst>(VL0);
19346 Instruction::CastOps VecOpcode = CI->getOpcode();
19347 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19348 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19349 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19350 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19351 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19352 // Check if the values are candidates to demote.
19353 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19354 if (SrcIt != MinBWs.end())
19355 SrcBWSz = SrcIt->second.first;
19356 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19357 if (BWSz == SrcBWSz) {
19358 VecOpcode = Instruction::BitCast;
19359 } else if (BWSz < SrcBWSz) {
19360 VecOpcode = Instruction::Trunc;
19361 } else if (It != MinBWs.end()) {
19362 assert(BWSz > SrcBWSz && "Invalid cast!");
19363 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19364 } else if (SrcIt != MinBWs.end()) {
19365 assert(BWSz > SrcBWSz && "Invalid cast!");
19366 VecOpcode =
19367 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19368 }
19369 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19370 !SrcIt->second.second) {
19371 VecOpcode = Instruction::UIToFP;
19372 }
19373 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19374 ? InVec
19375 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19376 V = FinalShuffle(V, E);
19377
19378 E->VectorizedValue = V;
19379 ++NumVectorInstructions;
19380 return V;
19381 }
19382 case Instruction::FCmp:
19383 case Instruction::ICmp: {
19384 setInsertPointAfterBundle(E);
19385
19386 Value *L = vectorizeOperand(E, 0);
19387 Value *R = vectorizeOperand(E, 1);
19388 if (L->getType() != R->getType()) {
19389 assert((getOperandEntry(E, 0)->isGather() ||
19390 getOperandEntry(E, 1)->isGather() ||
19391 MinBWs.contains(getOperandEntry(E, 0)) ||
19392 MinBWs.contains(getOperandEntry(E, 1))) &&
19393 "Expected item in MinBWs.");
19394 if (cast<VectorType>(L->getType())
19395 ->getElementType()
19396 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19397 ->getElementType()
19398 ->getIntegerBitWidth()) {
19399 Type *CastTy = R->getType();
19400 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19401 } else {
19402 Type *CastTy = L->getType();
19403 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19404 }
19405 }
19406
19407 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19408 Value *V = Builder.CreateCmp(P0, L, R);
19409 propagateIRFlags(V, E->Scalars, VL0);
19410 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19411 ICmp->setSameSign(/*B=*/false);
19412 // Do not cast for cmps.
19413 VecTy = cast<FixedVectorType>(V->getType());
19414 V = FinalShuffle(V, E);
19415
19416 E->VectorizedValue = V;
19417 ++NumVectorInstructions;
19418 return V;
19419 }
19420 case Instruction::Select: {
19421 setInsertPointAfterBundle(E);
19422
19423 Value *Cond = vectorizeOperand(E, 0);
19424 Value *True = vectorizeOperand(E, 1);
19425 Value *False = vectorizeOperand(E, 2);
19426 if (True->getType() != VecTy || False->getType() != VecTy) {
19427 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19428 getOperandEntry(E, 2)->isGather() ||
19429 MinBWs.contains(getOperandEntry(E, 1)) ||
19430 MinBWs.contains(getOperandEntry(E, 2))) &&
19431 "Expected item in MinBWs.");
19432 if (True->getType() != VecTy)
19433 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19434 if (False->getType() != VecTy)
19435 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19436 }
19437
19438 unsigned CondNumElements = getNumElements(Cond->getType());
19439 unsigned TrueNumElements = getNumElements(True->getType());
19440 assert(TrueNumElements >= CondNumElements &&
19441 TrueNumElements % CondNumElements == 0 &&
19442 "Cannot vectorize Instruction::Select");
19443 assert(TrueNumElements == getNumElements(False->getType()) &&
19444 "Cannot vectorize Instruction::Select");
19445 if (CondNumElements != TrueNumElements) {
19446 // When the return type is i1 but the source is fixed vector type, we
19447 // need to duplicate the condition value.
19448 Cond = Builder.CreateShuffleVector(
19449 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19450 CondNumElements));
19451 }
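// For example (REVEC), if the condition has 4 elements and the selected
// values have 8, createReplicatedMask(2, 4) yields <0, 0, 1, 1, 2, 2, 3, 3>,
// duplicating each condition bit so it covers every sub-element.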
19452 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19453 "Cannot vectorize Instruction::Select");
19454 Value *V = Builder.CreateSelect(Cond, True, False);
19455 V = FinalShuffle(V, E);
19456
19457 E->VectorizedValue = V;
19458 ++NumVectorInstructions;
19459 return V;
19460 }
19461 case Instruction::FNeg: {
19462 setInsertPointAfterBundle(E);
19463
19464 Value *Op = vectorizeOperand(E, 0);
19465
19466 Value *V = Builder.CreateUnOp(
19467 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19468 propagateIRFlags(V, E->Scalars, VL0);
19469 if (auto *I = dyn_cast<Instruction>(V))
19470 V = ::propagateMetadata(I, E->Scalars);
19471
19472 V = FinalShuffle(V, E);
19473
19474 E->VectorizedValue = V;
19475 ++NumVectorInstructions;
19476
19477 return V;
19478 }
19479 case Instruction::Freeze: {
19480 setInsertPointAfterBundle(E);
19481
19482 Value *Op = vectorizeOperand(E, 0);
19483
19484 if (Op->getType() != VecTy) {
19485 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19486 MinBWs.contains(getOperandEntry(E, 0))) &&
19487 "Expected item in MinBWs.");
19488 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19489 }
19490 Value *V = Builder.CreateFreeze(Op);
19491 V = FinalShuffle(V, E);
19492
19493 E->VectorizedValue = V;
19494 ++NumVectorInstructions;
19495
19496 return V;
19497 }
19498 case Instruction::Add:
19499 case Instruction::FAdd:
19500 case Instruction::Sub:
19501 case Instruction::FSub:
19502 case Instruction::Mul:
19503 case Instruction::FMul:
19504 case Instruction::UDiv:
19505 case Instruction::SDiv:
19506 case Instruction::FDiv:
19507 case Instruction::URem:
19508 case Instruction::SRem:
19509 case Instruction::FRem:
19510 case Instruction::Shl:
19511 case Instruction::LShr:
19512 case Instruction::AShr:
19513 case Instruction::And:
19514 case Instruction::Or:
19515 case Instruction::Xor: {
19516 setInsertPointAfterBundle(E);
19517
19518 Value *LHS = vectorizeOperand(E, 0);
19519 Value *RHS = vectorizeOperand(E, 1);
19520 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19521 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19522 ArrayRef<Value *> Ops = E->getOperand(I);
19523 if (all_of(Ops, [&](Value *Op) {
19524 auto *CI = dyn_cast<ConstantInt>(Op);
19525 return CI && CI->getValue().countr_one() >= It->second.first;
19526 })) {
19527 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19528 E->VectorizedValue = V;
19529 ++NumVectorInstructions;
19530 return V;
19531 }
19532 }
19533 }
19534 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19535 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19536 getOperandEntry(E, 1)->isGather() ||
19537 MinBWs.contains(getOperandEntry(E, 0)) ||
19538 MinBWs.contains(getOperandEntry(E, 1))) &&
19539 "Expected item in MinBWs.");
19540 if (LHS->getType() != VecTy)
19541 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19542 if (RHS->getType() != VecTy)
19543 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19544 }
19545
19546 Value *V = Builder.CreateBinOp(
19547 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19548 RHS);
19549 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19550 if (auto *I = dyn_cast<Instruction>(V)) {
19551 V = ::propagateMetadata(I, E->Scalars);
19552 // Drop nuw flags for abs(sub(commutative), true).
19553 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19554 any_of(E->Scalars, [](Value *V) {
19555 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19556 }))
19557 I->setHasNoUnsignedWrap(/*b=*/false);
19558 }
19559
19560 V = FinalShuffle(V, E);
19561
19562 E->VectorizedValue = V;
19563 ++NumVectorInstructions;
19564
19565 return V;
19566 }
19567 case Instruction::Load: {
19568 // Loads are inserted at the head of the tree because we don't want to
19569 // sink them all the way down past store instructions.
19570 setInsertPointAfterBundle(E);
19571
19572 LoadInst *LI = cast<LoadInst>(VL0);
19573 Instruction *NewLI;
19574 FixedVectorType *StridedLoadTy = nullptr;
19575 Value *PO = LI->getPointerOperand();
19576 if (E->State == TreeEntry::Vectorize) {
19577 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19578 } else if (E->State == TreeEntry::CompressVectorize) {
19579 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19580 CompressEntryToData.at(E);
19581 Align CommonAlignment = LI->getAlign();
19582 if (IsMasked) {
19583 unsigned VF = getNumElements(LoadVecTy);
19584 SmallVector<Constant *> MaskValues(
19585 VF / getNumElements(LI->getType()),
19586 ConstantInt::getFalse(VecTy->getContext()));
19587 for (int I : CompressMask)
19588 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19589 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19590 assert(SLPReVec && "Only supported by REVEC.");
19591 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19592 }
19593 Constant *MaskValue = ConstantVector::get(MaskValues);
19594 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19595 MaskValue);
19596 } else {
19597 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19598 }
19599 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19600 // TODO: include this cost into CommonCost.
19601 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19602 assert(SLPReVec && "FixedVectorType is not expected.");
19603 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19604 CompressMask);
19605 }
19606 NewLI =
19607 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19608 } else if (E->State == TreeEntry::StridedVectorize) {
19609 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19610 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19611 PO = IsReverseOrder ? PtrN : Ptr0;
19612 Type *StrideTy = DL->getIndexType(PO->getType());
19613 Value *StrideVal;
19614 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19615 StridedLoadTy = SPtrInfo.Ty;
19616 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19617 unsigned StridedLoadEC =
19618 StridedLoadTy->getElementCount().getKnownMinValue();
19619
19620 Value *Stride = SPtrInfo.StrideVal;
19621 if (!Stride) {
19622 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19623 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19624 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19625 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19626 &*Builder.GetInsertPoint());
19627 }
19628 Value *NewStride =
19629 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19630 StrideVal = Builder.CreateMul(
19631 NewStride, ConstantInt::get(
19632 StrideTy, (IsReverseOrder ? -1 : 1) *
19633 static_cast<int>(
19634 DL->getTypeAllocSize(ScalarTy))));
19635 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19636 auto *Inst = Builder.CreateIntrinsic(
19637 Intrinsic::experimental_vp_strided_load,
19638 {StridedLoadTy, PO->getType(), StrideTy},
19639 {PO, StrideVal,
19640 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19641 Builder.getInt32(StridedLoadEC)});
19642 Inst->addParamAttr(
19643 /*ArgNo=*/0,
19644 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19645 NewLI = Inst;
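// For a reverse-order strided load of 4 x i32 the code above emits roughly:
//   %l = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr align 4 %ptrN, i64 -4, <4 x i1> splat (i1 true), i32 4)
// where the stride is in bytes and the alignment is attached as a parameter
// attribute.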
19646 } else {
19647 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19648 Value *VecPtr = vectorizeOperand(E, 0);
19649 if (isa<FixedVectorType>(ScalarTy)) {
19650 assert(SLPReVec && "FixedVectorType is not expected.");
19651 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19652 // to expand VecPtr if ScalarTy is a vector type.
19653 unsigned ScalarTyNumElements =
19654 cast<FixedVectorType>(ScalarTy)->getNumElements();
19655 unsigned VecTyNumElements =
19656 cast<FixedVectorType>(VecTy)->getNumElements();
19657 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19658 "Cannot expand getelementptr.");
19659 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19660 SmallVector<Constant *> Indices(VecTyNumElements);
19661 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19662 return Builder.getInt64(I % ScalarTyNumElements);
19663 });
19664 VecPtr = Builder.CreateGEP(
19665 VecTy->getElementType(),
19666 Builder.CreateShuffleVector(
19667 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19668 ConstantVector::get(Indices));
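// For example, with ScalarTy == <2 x i32> and VecTy == <8 x i32>, each of the
// 4 gathered pointers is replicated twice and offset by 0 or 1 elements,
// producing the 8 lane pointers that CreateMaskedGather expects to match the
// width of the result vector.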
19669 }
19670 // Use the minimum alignment of the gathered loads.
19671 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19672 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19673 }
19674 Value *V = E->State == TreeEntry::CompressVectorize
19675 ? NewLI
19676 : ::propagateMetadata(NewLI, E->Scalars);
19677
19678 V = FinalShuffle(V, E);
19679 E->VectorizedValue = V;
19680 ++NumVectorInstructions;
19681 return V;
19682 }
19683 case Instruction::Store: {
19684 auto *SI = cast<StoreInst>(VL0);
19685
19686 setInsertPointAfterBundle(E);
19687
19688 Value *VecValue = vectorizeOperand(E, 0);
19689 if (VecValue->getType() != VecTy)
19690 VecValue =
19691 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19692 VecValue = FinalShuffle(VecValue, E);
19693
19694 Value *Ptr = SI->getPointerOperand();
19695 Instruction *ST;
19696 if (E->State == TreeEntry::Vectorize) {
19697 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19698 } else {
19699 assert(E->State == TreeEntry::StridedVectorize &&
19700 "Expected either strided or consecutive stores.");
19701 if (!E->ReorderIndices.empty()) {
19702 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19703 Ptr = SI->getPointerOperand();
19704 }
19705 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19706 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19707 auto *Inst = Builder.CreateIntrinsic(
19708 Intrinsic::experimental_vp_strided_store,
19709 {VecTy, Ptr->getType(), StrideTy},
19710 {VecValue, Ptr,
19711 ConstantInt::get(
19712 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19713 Builder.getAllOnesMask(VecTy->getElementCount()),
19714 Builder.getInt32(E->Scalars.size())});
19715 Inst->addParamAttr(
19716 /*ArgNo=*/1,
19717 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19718 ST = Inst;
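// For 4 x i32 scalars stored in reverse order the code above emits roughly:
//   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
//       <4 x i32> %val, ptr align 4 %ptr, i64 -4,
//       <4 x i1> splat (i1 true), i32 4)
// i.e. a negative byte stride that walks the consecutive locations backwards.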
19719 }
19720
19721 Value *V = ::propagateMetadata(ST, E->Scalars);
19722
19723 E->VectorizedValue = V;
19724 ++NumVectorInstructions;
19725 return V;
19726 }
19727 case Instruction::GetElementPtr: {
19728 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19729 setInsertPointAfterBundle(E);
19730
19731 Value *Op0 = vectorizeOperand(E, 0);
19732
19733 SmallVector<Value *> OpVecs;
19734 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19735 Value *OpVec = vectorizeOperand(E, J);
19736 OpVecs.push_back(OpVec);
19737 }
19738
19739 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19740 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19741 SmallVector<Value *> GEPs;
19742 for (Value *V : E->Scalars) {
19743 if (isa<GetElementPtrInst>(V))
19744 GEPs.push_back(V);
19745 }
19746 V = ::propagateMetadata(I, GEPs);
19747 }
19748
19749 V = FinalShuffle(V, E);
19750
19751 E->VectorizedValue = V;
19752 ++NumVectorInstructions;
19753
19754 return V;
19755 }
19756 case Instruction::Call: {
19757 CallInst *CI = cast<CallInst>(VL0);
19758 setInsertPointAfterBundle(E);
19759
19760 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19761
19762 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19763 CI, ID, VecTy->getNumElements(),
19764 It != MinBWs.end() ? It->second.first : 0, TTI);
19765 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19766 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19767 VecCallCosts.first <= VecCallCosts.second;
19768
19769 Value *ScalarArg = nullptr;
19770 SmallVector<Value *> OpVecs;
19771 SmallVector<Type *, 2> TysForDecl;
19772 // Add return type if intrinsic is overloaded on it.
19773 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19774 TysForDecl.push_back(VecTy);
19775 auto *CEI = cast<CallInst>(VL0);
19776 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19777 // Some intrinsics have scalar arguments. This argument should not be
19778 // vectorized.
19779 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19780 ScalarArg = CEI->getArgOperand(I);
19781 // If we decided to reduce the bitwidth of the abs intrinsic, its second
19782 // argument must be set to false (do not return poison if the value is signed min).
19783 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19784 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19785 ScalarArg = Builder.getFalse();
19786 OpVecs.push_back(ScalarArg);
19787 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19788 TysForDecl.push_back(ScalarArg->getType());
19789 continue;
19790 }
19791
19792 Value *OpVec = vectorizeOperand(E, I);
19793 ScalarArg = CEI->getArgOperand(I);
19794 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19795 ScalarArg->getType()->getScalarType() &&
19796 It == MinBWs.end()) {
19797 auto *CastTy =
19798 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19799 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19800 } else if (It != MinBWs.end()) {
19801 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19802 }
19803 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19804 OpVecs.push_back(OpVec);
19805 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19806 TysForDecl.push_back(OpVec->getType());
19807 }
19808
19809 Function *CF;
19810 if (!UseIntrinsic) {
19811 VFShape Shape =
19812 VFShape::get(CI->getFunctionType(),
19813 ElementCount::getFixed(VecTy->getNumElements()),
19814 false /*HasGlobalPred*/);
19815 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19816 } else {
19817 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19818 }
19819
19820 SmallVector<OperandBundleDef, 1> OpBundles;
19821 CI->getOperandBundlesAsDefs(OpBundles);
19822 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19823
19824 propagateIRFlags(V, E->Scalars, VL0);
19825 V = FinalShuffle(V, E);
19826
19827 E->VectorizedValue = V;
19828 ++NumVectorInstructions;
19829 return V;
19830 }
19831 case Instruction::ShuffleVector: {
19832 Value *V;
19833 if (SLPReVec && !E->isAltShuffle()) {
19834 setInsertPointAfterBundle(E);
19835 Value *Src = vectorizeOperand(E, 0);
19836 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19837 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19838 SmallVector<int> NewMask(ThisMask.size());
19839 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19840 return SVSrc->getShuffleMask()[Mask];
19841 });
19842 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19843 SVSrc->getOperand(1), NewMask);
19844 } else {
19845 V = Builder.CreateShuffleVector(Src, ThisMask);
19846 }
19847 propagateIRFlags(V, E->Scalars, VL0);
19848 if (auto *I = dyn_cast<Instruction>(V))
19849 V = ::propagateMetadata(I, E->Scalars);
19850 V = FinalShuffle(V, E);
19851 } else {
19852 assert(E->isAltShuffle() &&
19853 ((Instruction::isBinaryOp(E->getOpcode()) &&
19854 Instruction::isBinaryOp(E->getAltOpcode())) ||
19855 (Instruction::isCast(E->getOpcode()) &&
19856 Instruction::isCast(E->getAltOpcode())) ||
19857 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19858 "Invalid Shuffle Vector Operand");
19859
19860 Value *LHS = nullptr, *RHS = nullptr;
19861 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19862 setInsertPointAfterBundle(E);
19863 LHS = vectorizeOperand(E, 0);
19864 RHS = vectorizeOperand(E, 1);
19865 } else {
19866 setInsertPointAfterBundle(E);
19867 LHS = vectorizeOperand(E, 0);
19868 }
19869 if (LHS && RHS &&
19870 ((Instruction::isBinaryOp(E->getOpcode()) &&
19871 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19872 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19873 assert((It != MinBWs.end() ||
19874 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19875 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19876 MinBWs.contains(getOperandEntry(E, 0)) ||
19877 MinBWs.contains(getOperandEntry(E, 1))) &&
19878 "Expected item in MinBWs.");
19879 Type *CastTy = VecTy;
19880 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19881 if (cast<VectorType>(LHS->getType())
19882 ->getElementType()
19883 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19884 ->getElementType()
19885 ->getIntegerBitWidth())
19886 CastTy = RHS->getType();
19887 else
19888 CastTy = LHS->getType();
19889 }
19890 if (LHS->getType() != CastTy)
19891 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19892 if (RHS->getType() != CastTy)
19893 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19894 }
19895
19896 Value *V0, *V1;
19897 if (Instruction::isBinaryOp(E->getOpcode())) {
19898 V0 = Builder.CreateBinOp(
19899 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19900 V1 = Builder.CreateBinOp(
19901 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19902 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19903 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19904 auto *AltCI = cast<CmpInst>(E->getAltOp());
19905 CmpInst::Predicate AltPred = AltCI->getPredicate();
19906 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19907 } else {
19908 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19909 unsigned SrcBWSz = DL->getTypeSizeInBits(
19910 cast<VectorType>(LHS->getType())->getElementType());
19911 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19912 if (BWSz <= SrcBWSz) {
19913 if (BWSz < SrcBWSz)
19914 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
19915 assert(LHS->getType() == VecTy &&
19916 "Expected same type as operand.");
19917 if (auto *I = dyn_cast<Instruction>(LHS))
19918 LHS = ::propagateMetadata(I, E->Scalars);
19919 LHS = FinalShuffle(LHS, E);
19920 E->VectorizedValue = LHS;
19921 ++NumVectorInstructions;
19922 return LHS;
19923 }
19924 }
19925 V0 = Builder.CreateCast(
19926 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
19927 V1 = Builder.CreateCast(
19928 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
19929 }
19930 // Add V0 and V1 to later analysis to try to find and remove matching
19931 // instruction, if any.
19932 for (Value *V : {V0, V1}) {
19933 if (auto *I = dyn_cast<Instruction>(V)) {
19934 GatherShuffleExtractSeq.insert(I);
19935 CSEBlocks.insert(I->getParent());
19936 }
19937 }
19938
19939 // Create shuffle to take alternate operations from the vector.
19940 // Also, gather up main and alt scalar ops to propagate IR flags to
19941 // each vector operation.
19942 ValueList OpScalars, AltScalars;
19943 SmallVector<int> Mask;
19944 E->buildAltOpShuffleMask(
19945 [E, this](Instruction *I) {
19946 assert(E->getMatchingMainOpOrAltOp(I) &&
19947 "Unexpected main/alternate opcode");
19948 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
19949 *TLI);
19950 },
19951 Mask, &OpScalars, &AltScalars);
19952
19953 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
19954 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
19955 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
19956 // Drop nuw flags for abs(sub(commutative), true).
19957 if (auto *I = dyn_cast<Instruction>(Vec);
19958 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
19959 any_of(E->Scalars, [](Value *V) {
19960 if (isa<PoisonValue>(V))
19961 return false;
19962 auto *IV = cast<Instruction>(V);
19963 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19964 }))
19965 I->setHasNoUnsignedWrap(/*b=*/false);
19966 };
19967 DropNuwFlag(V0, E->getOpcode());
19968 DropNuwFlag(V1, E->getAltOpcode());
19969
19970 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19971 assert(SLPReVec && "FixedVectorType is not expected.");
19972 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
19973 }
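// For example, for an alternating add/sub bundle of 4 scalars, V0 holds the
// vector add, V1 the vector sub, and Mask is <0, 5, 2, 7>, so the shuffle
// below picks even lanes from V0 and odd lanes from V1.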
19974 V = Builder.CreateShuffleVector(V0, V1, Mask);
19975 if (auto *I = dyn_cast<Instruction>(V)) {
19976 V = ::propagateMetadata(I, E->Scalars);
19977 GatherShuffleExtractSeq.insert(I);
19978 CSEBlocks.insert(I->getParent());
19979 }
19980 }
19981
19982 E->VectorizedValue = V;
19983 ++NumVectorInstructions;
19984
19985 return V;
19986 }
19987 default:
19988 llvm_unreachable("unknown inst");
19989 }
19990 return nullptr;
19991}
19992
19993 Value *BoUpSLP::vectorizeTree() {
19994 ExtraValueToDebugLocsMap ExternallyUsedValues;
19995 return vectorizeTree(ExternallyUsedValues);
19996}
19997
19998 Value *BoUpSLP::vectorizeTree(
19999 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20000 Instruction *ReductionRoot,
20001 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20002 // Clear the Entry-to-LastInstruction table. It can be affected by scheduling,
20003 // so it needs to be rebuilt.
20004 EntryToLastInstruction.clear();
20005 // All blocks must be scheduled before any instructions are inserted.
20006 for (auto &BSIter : BlocksSchedules)
20007 scheduleBlock(*this, BSIter.second.get());
20008 // Cache last instructions for the nodes to avoid side effects, which may
20009 // appear during vectorization, like extra uses, etc.
20010 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20011 if (TE->isGather())
20012 continue;
20013 (void)getLastInstructionInBundle(TE.get());
20014 }
20015
20016 if (ReductionRoot)
20017 Builder.SetInsertPoint(ReductionRoot->getParent(),
20018 ReductionRoot->getIterator());
20019 else
20020 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20021
20022 // Vectorize gather operands of the nodes with the external uses only.
20023 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20024 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20025 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20026 TE->UserTreeIndex.UserTE->hasState() &&
20027 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20028 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20029 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20030 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20031 all_of(TE->UserTreeIndex.UserTE->Scalars,
20032 [](Value *V) { return isUsedOutsideBlock(V); })) {
20033 Instruction &LastInst =
20034 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20035 GatherEntries.emplace_back(TE.get(), &LastInst);
20036 }
20037 }
20038 for (auto &Entry : GatherEntries) {
20039 IRBuilderBase::InsertPointGuard Guard(Builder);
20040 Builder.SetInsertPoint(Entry.second);
20041 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20042 (void)vectorizeTree(Entry.first);
20043 }
20044 // Emit gathered loads first to emit better code for the users of those
20045 // gathered loads.
20046 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20047 if (GatheredLoadsEntriesFirst.has_value() &&
20048 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20049 (!TE->isGather() || TE->UserTreeIndex)) {
20050 assert((TE->UserTreeIndex ||
20051 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20052 "Expected gathered load node.");
20053 (void)vectorizeTree(TE.get());
20054 }
20055 }
20056 (void)vectorizeTree(VectorizableTree[0].get());
20057 // Run through the list of postponed gathers and emit them, replacing the temp
20058 // emitted allocas with actual vector instructions.
20059 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20060 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20061 for (const TreeEntry *E : PostponedNodes) {
20062 auto *TE = const_cast<TreeEntry *>(E);
20063 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20064 TE->VectorizedValue = nullptr;
20065 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20066 // If the user is a PHI node, its vector code has to be inserted right before
20067 // the block terminator. Since the node was delayed, there were some unresolved
20068 // dependencies at the moment when the stub instruction was emitted. If any of
20069 // these dependencies turns out to be an operand of another PHI coming from
20070 // this same block, the position of the stub instruction becomes invalid. This
20071 // is because the source vector that is supposed to feed this gather node was
20072 // inserted at the end of the block [after the stub instruction]. So we need
20073 // to adjust the insertion point again to the end of the block.
20074 if (isa<PHINode>(UserI)) {
20075 // Insert before all users.
20076 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20077 for (User *U : PrevVec->users()) {
20078 if (U == UserI)
20079 continue;
20080 auto *UI = dyn_cast<Instruction>(U);
20081 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20082 continue;
20083 if (UI->comesBefore(InsertPt))
20084 InsertPt = UI;
20085 }
20086 Builder.SetInsertPoint(InsertPt);
20087 } else {
20088 Builder.SetInsertPoint(PrevVec);
20089 }
20090 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20091 Value *Vec = vectorizeTree(TE);
20092 if (auto *VecI = dyn_cast<Instruction>(Vec);
20093 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20094 Builder.GetInsertPoint()->comesBefore(VecI))
20095 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20096 Builder.GetInsertPoint());
20097 if (Vec->getType() != PrevVec->getType()) {
20098 assert(Vec->getType()->isIntOrIntVectorTy() &&
20099 PrevVec->getType()->isIntOrIntVectorTy() &&
20100 "Expected integer vector types only.");
20101 std::optional<bool> IsSigned;
20102 for (Value *V : TE->Scalars) {
20103 if (isVectorized(V)) {
20104 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20105 auto It = MinBWs.find(MNTE);
20106 if (It != MinBWs.end()) {
20107 IsSigned = IsSigned.value_or(false) || It->second.second;
20108 if (*IsSigned)
20109 break;
20110 }
20111 }
20112 if (IsSigned.value_or(false))
20113 break;
20114 // Scan through gather nodes.
20115 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20116 auto It = MinBWs.find(BVE);
20117 if (It != MinBWs.end()) {
20118 IsSigned = IsSigned.value_or(false) || It->second.second;
20119 if (*IsSigned)
20120 break;
20121 }
20122 }
20123 if (IsSigned.value_or(false))
20124 break;
20125 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20126 IsSigned =
20127 IsSigned.value_or(false) ||
20128 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20129 continue;
20130 }
20131 if (IsSigned.value_or(false))
20132 break;
20133 }
20134 }
20135 if (IsSigned.value_or(false)) {
20136 // Final attempt - check user node.
20137 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20138 if (It != MinBWs.end())
20139 IsSigned = It->second.second;
20140 }
20141 assert(IsSigned &&
20142 "Expected user node or perfect diamond match in MinBWs.");
20143 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20144 }
20145 PrevVec->replaceAllUsesWith(Vec);
20146 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20147 // Replace the stub vector node, if it was used before for one of the
20148 // buildvector nodes already.
20149 auto It = PostponedValues.find(PrevVec);
20150 if (It != PostponedValues.end()) {
20151 for (TreeEntry *VTE : It->getSecond())
20152 VTE->VectorizedValue = Vec;
20153 }
20154 eraseInstruction(PrevVec);
20155 }
20156
20157 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20158 << " values .\n");
20159
20160 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20161 // Maps vector instruction to original insertelement instruction
20162 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20163 // Maps extract Scalar to the corresponding extractelement instruction in the
20164 // basic block. Only one extractelement per block should be emitted.
20165 SmallDenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20166 ScalarToEEs;
20167 SmallDenseSet<Value *, 4> UsedInserts;
20168 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20169 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20170 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20171 // Extract all of the elements with the external uses.
20172 for (const auto &ExternalUse : ExternalUses) {
20173 Value *Scalar = ExternalUse.Scalar;
20174 llvm::User *User = ExternalUse.User;
20175
20176 // Skip users that we already RAUW. This happens when one instruction
20177 // has multiple uses of the same value.
20178 if (User && !is_contained(Scalar->users(), User))
20179 continue;
20180 const TreeEntry *E = &ExternalUse.E;
20181 assert(E && "Invalid scalar");
20182 assert(!E->isGather() && "Extracting from a gather list");
20183 // Non-instruction pointers are not deleted, just skip them.
20184 if (E->getOpcode() == Instruction::GetElementPtr &&
20185 !isa<GetElementPtrInst>(Scalar))
20186 continue;
20187
20188 Value *Vec = E->VectorizedValue;
20189 assert(Vec && "Can't find vectorizable value");
20190
20191 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20192 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20193 if (Scalar->getType() != Vec->getType()) {
20194 Value *Ex = nullptr;
20195 Value *ExV = nullptr;
20196 auto *Inst = dyn_cast<Instruction>(Scalar);
20197 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20198 auto It = ScalarToEEs.find(Scalar);
20199 if (It != ScalarToEEs.end()) {
20200 // No need to emit many extracts, just move the only one in the
20201 // current block.
20202 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20203 : Builder.GetInsertBlock());
20204 if (EEIt != It->second.end()) {
20205 Value *PrevV = EEIt->second.first;
20206 if (auto *I = dyn_cast<Instruction>(PrevV);
20207 I && !ReplaceInst &&
20208 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20209 Builder.GetInsertPoint()->comesBefore(I)) {
20210 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20211 Builder.GetInsertPoint());
20212 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20213 CI->moveAfter(I);
20214 }
20215 Ex = PrevV;
20216 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20217 }
20218 }
20219 if (!Ex) {
20220 // "Reuse" the existing extract to improve final codegen.
20221 if (ReplaceInst) {
20222 // Leave the instruction as is if its extracts are cheaper and all of its
20223 // operands are scalar.
20224 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20225 IgnoredExtracts.insert(EE);
20226 Ex = EE;
20227 } else {
20228 auto *CloneInst = Inst->clone();
20229 CloneInst->insertBefore(Inst->getIterator());
20230 if (Inst->hasName())
20231 CloneInst->takeName(Inst);
20232 Ex = CloneInst;
20233 }
20234 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20235 ES && isa<Instruction>(Vec)) {
20236 Value *V = ES->getVectorOperand();
20237 auto *IVec = cast<Instruction>(Vec);
20238 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20239 V = ETEs.front()->VectorizedValue;
20240 if (auto *IV = dyn_cast<Instruction>(V);
20241 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20242 IV->comesBefore(IVec))
20243 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20244 else
20245 Ex = Builder.CreateExtractElement(Vec, Lane);
20246 } else if (auto *VecTy =
20247 dyn_cast<FixedVectorType>(Scalar->getType())) {
20248 assert(SLPReVec && "FixedVectorType is not expected.");
20249 unsigned VecTyNumElements = VecTy->getNumElements();
20250 // When REVEC is enabled, we need to extract a vector.
20251 // Note: The element size of Scalar may be different from the
20252 // element size of Vec.
20253 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20254 ExternalUse.Lane * VecTyNumElements);
20255 } else {
20256 Ex = Builder.CreateExtractElement(Vec, Lane);
20257 }
20258 // If necessary, sign-extend or zero-extend ScalarRoot
20259 // to the larger type.
20260 ExV = Ex;
20261 if (Scalar->getType() != Ex->getType())
20262 ExV = Builder.CreateIntCast(
20263 Ex, Scalar->getType(),
20264 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20265 auto *I = dyn_cast<Instruction>(Ex);
20266 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20267 : &F->getEntryBlock(),
20268 std::make_pair(Ex, ExV));
20269 }
20270 // The then branch of the previous if may produce constants, since operand
20271 // 0 might be a constant.
20272 if (auto *ExI = dyn_cast<Instruction>(Ex);
20273 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20274 GatherShuffleExtractSeq.insert(ExI);
20275 CSEBlocks.insert(ExI->getParent());
20276 }
20277 return ExV;
20278 }
20279 assert(isa<FixedVectorType>(Scalar->getType()) &&
20280 isa<InsertElementInst>(Scalar) &&
20281 "In-tree scalar of vector type is not insertelement?");
20282 auto *IE = cast<InsertElementInst>(Scalar);
20283 VectorToInsertElement.try_emplace(Vec, IE);
20284 return Vec;
20285 };
20286 // If User == nullptr, the Scalar remains as scalar in vectorized
20287 // instructions or is used as extra arg. Generate ExtractElement instruction
20288 // and update the record for this scalar in ExternallyUsedValues.
20289 if (!User) {
20290 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20291 continue;
20292 assert(
20293 (ExternallyUsedValues.count(Scalar) ||
20294 ExternalUsesWithNonUsers.count(Scalar) ||
20295 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20296 any_of(
20297 Scalar->users(),
20298 [&, TTI = TTI](llvm::User *U) {
20299 if (ExternalUsesAsOriginalScalar.contains(U))
20300 return true;
20301 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20302 return !UseEntries.empty() &&
20303 (E->State == TreeEntry::Vectorize ||
20304 E->State == TreeEntry::StridedVectorize ||
20305 E->State == TreeEntry::CompressVectorize) &&
20306 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20307 return (UseEntry->State == TreeEntry::Vectorize ||
20308 UseEntry->State ==
20309 TreeEntry::StridedVectorize ||
20310 UseEntry->State ==
20311 TreeEntry::CompressVectorize) &&
20312 doesInTreeUserNeedToExtract(
20313 Scalar, getRootEntryInstruction(*UseEntry),
20314 TLI, TTI);
20315 });
20316 })) &&
20317 "Scalar with nullptr User must be registered in "
20318 "ExternallyUsedValues map or remain as scalar in vectorized "
20319 "instructions");
20320 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20321 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20322 if (PHI->getParent()->isLandingPad())
20323 Builder.SetInsertPoint(
20324 PHI->getParent(),
20325 std::next(
20326 PHI->getParent()->getLandingPadInst()->getIterator()));
20327 else
20328 Builder.SetInsertPoint(PHI->getParent(),
20329 PHI->getParent()->getFirstNonPHIIt());
20330 } else {
20331 Builder.SetInsertPoint(VecI->getParent(),
20332 std::next(VecI->getIterator()));
20333 }
20334 } else {
20335 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20336 }
20337 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20338 // Required to update internally referenced instructions.
20339 if (Scalar != NewInst) {
20340 assert((!isa<ExtractElementInst>(Scalar) ||
20341 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20342 "Extractelements should not be replaced.");
20343 Scalar->replaceAllUsesWith(NewInst);
20344 }
20345 continue;
20346 }
20347
20348 if (auto *VU = dyn_cast<InsertElementInst>(User);
20349 VU && VU->getOperand(1) == Scalar) {
20350 // Skip if the scalar is another vector op or Vec is not an instruction.
20351 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20352 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20353 if (!UsedInserts.insert(VU).second)
20354 continue;
20355 // Need to use original vector, if the root is truncated.
20356 auto BWIt = MinBWs.find(E);
20357 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20358 auto *ScalarTy = FTy->getElementType();
20359 auto Key = std::make_pair(Vec, ScalarTy);
20360 auto VecIt = VectorCasts.find(Key);
20361 if (VecIt == VectorCasts.end()) {
20362 IRBuilderBase::InsertPointGuard Guard(Builder);
20363 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20364 if (IVec->getParent()->isLandingPad())
20365 Builder.SetInsertPoint(IVec->getParent(),
20366 std::next(IVec->getParent()
20367 ->getLandingPadInst()
20368 ->getIterator()));
20369 else
20370 Builder.SetInsertPoint(
20371 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20372 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20373 Builder.SetInsertPoint(IVec->getNextNode());
20374 }
20375 Vec = Builder.CreateIntCast(
20376 Vec,
20377 getWidenedType(
20378 ScalarTy,
20379 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20380 BWIt->second.second);
20381 VectorCasts.try_emplace(Key, Vec);
20382 } else {
20383 Vec = VecIt->second;
20384 }
20385 }
20386
20387 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20388 if (InsertIdx) {
20389 auto *It = find_if(
20390 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20391 // Checks if 2 insertelements are from the same buildvector.
20392 InsertElementInst *VecInsert = Data.InsertElements.front();
20393 return areTwoInsertFromSameBuildVector(
20394 VU, VecInsert,
20395 [](InsertElementInst *II) { return II->getOperand(0); });
20396 });
20397 unsigned Idx = *InsertIdx;
20398 if (It == ShuffledInserts.end()) {
20399 (void)ShuffledInserts.emplace_back();
20400 It = std::next(ShuffledInserts.begin(),
20401 ShuffledInserts.size() - 1);
20402 }
20403 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20404 if (Mask.empty())
20405 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20406 Mask[Idx] = ExternalUse.Lane;
20407 It->InsertElements.push_back(cast<InsertElementInst>(User));
20408 continue;
20409 }
20410 }
20411 }
20412 }
20413
20414 // Generate extracts for out-of-tree users.
20415 // Find the insertion point for the extractelement lane.
20416 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20417 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20418 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20419 if (PH->getIncomingValue(I) == Scalar) {
20420 Instruction *IncomingTerminator =
20421 PH->getIncomingBlock(I)->getTerminator();
20422 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20423 Builder.SetInsertPoint(VecI->getParent(),
20424 std::next(VecI->getIterator()));
20425 } else {
20426 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20427 }
20428 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20429 PH->setOperand(I, NewInst);
20430 }
20431 }
20432 } else {
20433 Builder.SetInsertPoint(cast<Instruction>(User));
20434 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20435 User->replaceUsesOfWith(Scalar, NewInst);
20436 }
20437 } else {
20438 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20439 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20440 User->replaceUsesOfWith(Scalar, NewInst);
20441 }
20442
20443 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20444 }
20445
20446 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20447 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20448 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20449 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20450 for (int I = 0, E = Mask.size(); I < E; ++I) {
20451 if (Mask[I] < VF)
20452 CombinedMask1[I] = Mask[I];
20453 else
20454 CombinedMask2[I] = Mask[I] - VF;
20455 }
20456 ShuffleInstructionBuilder ShuffleBuilder(
20457 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20458 ShuffleBuilder.add(V1, CombinedMask1);
20459 if (V2)
20460 ShuffleBuilder.add(V2, CombinedMask2);
20461 return ShuffleBuilder.finalize({}, {}, {});
20462 };
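// CreateShuffle splits a combined two-source mask into per-operand masks;
// e.g. with VF == 4, Mask <0, 5, 2, 7> becomes CombinedMask1
// <0, poison, 2, poison> and CombinedMask2 <poison, 1, poison, 3> before both
// parts are handed to the shuffle builder.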
20463
20464 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20465 bool ForSingleMask) {
20466 unsigned VF = Mask.size();
20467 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20468 if (VF != VecVF) {
20469 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20470 Vec = CreateShuffle(Vec, nullptr, Mask);
20471 return std::make_pair(Vec, true);
20472 }
20473 if (!ForSingleMask) {
20474 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20475 for (unsigned I = 0; I < VF; ++I) {
20476 if (Mask[I] != PoisonMaskElem)
20477 ResizeMask[Mask[I]] = Mask[I];
20478 }
20479 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20480 }
20481 }
20482
20483 return std::make_pair(Vec, false);
20484 };
20485 // Perform shuffling of the vectorize tree entries for better handling of
20486 // external extracts.
20487 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20488 // Find the first and the last instruction in the list of insertelements.
20489 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20490 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20491 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20492 Builder.SetInsertPoint(LastInsert);
20493 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20494 Value *NewInst = performExtractsShuffleAction<Value>(
20495 MutableArrayRef(Vector.data(), Vector.size()),
20496 FirstInsert->getOperand(0),
20497 [](Value *Vec) {
20498 return cast<VectorType>(Vec->getType())
20499 ->getElementCount()
20500 .getKnownMinValue();
20501 },
20502 ResizeToVF,
20503 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20504 ArrayRef<Value *> Vals) {
20505 assert((Vals.size() == 1 || Vals.size() == 2) &&
20506 "Expected exactly 1 or 2 input values.");
20507 if (Vals.size() == 1) {
20508 // Do not create shuffle if the mask is a simple identity
20509 // non-resizing mask.
20510 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20511 ->getNumElements() ||
20512 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20513 return CreateShuffle(Vals.front(), nullptr, Mask);
20514 return Vals.front();
20515 }
20516 return CreateShuffle(Vals.front() ? Vals.front()
20517 : FirstInsert->getOperand(0),
20518 Vals.back(), Mask);
20519 });
20520 auto It = ShuffledInserts[I].InsertElements.rbegin();
20521 // Rebuild buildvector chain.
20522 InsertElementInst *II = nullptr;
20523 if (It != ShuffledInserts[I].InsertElements.rend())
20524 II = *It;
20525 SmallVector<Instruction *> Inserts;
20526 while (It != ShuffledInserts[I].InsertElements.rend()) {
20527 assert(II && "Must be an insertelement instruction.");
20528 if (*It == II)
20529 ++It;
20530 else
20531 Inserts.push_back(cast<Instruction>(II));
20532 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20533 }
20534 for (Instruction *II : reverse(Inserts)) {
20535 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20536 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20537 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20538 II->moveAfter(NewI);
20539 NewInst = II;
20540 }
20541 LastInsert->replaceAllUsesWith(NewInst);
20542 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20543 IE->replaceUsesOfWith(IE->getOperand(0),
20544 PoisonValue::get(IE->getOperand(0)->getType()));
20545 IE->replaceUsesOfWith(IE->getOperand(1),
20546 PoisonValue::get(IE->getOperand(1)->getType()));
20547 eraseInstruction(IE);
20548 }
20549 CSEBlocks.insert(LastInsert->getParent());
20550 }
20551
20552 SmallVector<Instruction *> RemovedInsts;
20553 // For each vectorized value:
20554 for (auto &TEPtr : VectorizableTree) {
20555 TreeEntry *Entry = TEPtr.get();
20556
20557 // No need to handle users of gathered values.
20558 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20559 continue;
20560
20561 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20562
20563 // For each lane:
20564 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20565 Value *Scalar = Entry->Scalars[Lane];
20566
20567 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20568 !isa<GetElementPtrInst>(Scalar))
20569 continue;
20570 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20571 EE && IgnoredExtracts.contains(EE))
20572 continue;
20573 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20574 continue;
20575#ifndef NDEBUG
20576 Type *Ty = Scalar->getType();
20577 if (!Ty->isVoidTy()) {
20578 for (User *U : Scalar->users()) {
20579 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20580
20581 // It is legal to delete users in the ignorelist.
20582 assert((isVectorized(U) ||
20583 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20586 "Deleting out-of-tree value");
20587 }
20588 }
20589#endif
20590 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20591 auto *I = cast<Instruction>(Scalar);
20592 RemovedInsts.push_back(I);
20593 }
20594 }
20595
20596 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20597 // new vector instruction.
20598 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20599 V->mergeDIAssignID(RemovedInsts);
20600
20601 // Clear up reduction references, if any.
20602 if (UserIgnoreList) {
20603 for (Instruction *I : RemovedInsts) {
20604 const TreeEntry *IE = getTreeEntries(I).front();
20605 if (IE->Idx != 0 &&
20606 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20607 (ValueToGatherNodes.lookup(I).contains(
20608 VectorizableTree.front().get()) ||
20609 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20610 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20611 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20612 IE->UserTreeIndex &&
20613 is_contained(VectorizableTree.front()->Scalars, I)) &&
20614 !(GatheredLoadsEntriesFirst.has_value() &&
20615 IE->Idx >= *GatheredLoadsEntriesFirst &&
20616 VectorizableTree.front()->isGather() &&
20617 is_contained(VectorizableTree.front()->Scalars, I)) &&
20618 !(!VectorizableTree.front()->isGather() &&
20619 VectorizableTree.front()->isCopyableElement(I)))
20620 continue;
20621 SmallVector<SelectInst *> LogicalOpSelects;
20622 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20623 // Do not replace the condition operand of a logical and/or expressed as a select.
20624 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20625 (match(U.getUser(), m_LogicalAnd()) ||
20626 match(U.getUser(), m_LogicalOr())) &&
20627 U.getOperandNo() == 0;
20628 if (IsPoisoningLogicalOp) {
20629 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20630 return false;
20631 }
20632 return UserIgnoreList->contains(U.getUser());
20633 });
20634 // Replace conditions of the poisoning logical ops with the non-poison
20635 // constant value.
20636 for (SelectInst *SI : LogicalOpSelects)
20637 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20638 }
20639 }
20640 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20641 // cache correctness.
20642 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
20643 // - instructions are not deleted until later.
20644 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20645
20646 Builder.ClearInsertionPoint();
20647 InstrElementSize.clear();
20648
20649 const TreeEntry &RootTE = *VectorizableTree.front();
20650 Value *Vec = RootTE.VectorizedValue;
20651 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20652 It != MinBWs.end() &&
20653 ReductionBitWidth != It->second.first) {
20654 IRBuilder<>::InsertPointGuard Guard(Builder);
20655 Builder.SetInsertPoint(ReductionRoot->getParent(),
20656 ReductionRoot->getIterator());
20657 Vec = Builder.CreateIntCast(
20658 Vec,
20659 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20660 cast<VectorType>(Vec->getType())->getElementCount()),
20661 It->second.second);
20662 }
20663 return Vec;
20664}
20665
20666 void BoUpSLP::optimizeGatherSequence() {
20667 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20668 << " gather sequences instructions.\n");
20669 // LICM InsertElementInst sequences.
20670 for (Instruction *I : GatherShuffleExtractSeq) {
20671 if (isDeleted(I))
20672 continue;
20673
20674 // Check if this block is inside a loop.
20675 Loop *L = LI->getLoopFor(I->getParent());
20676 if (!L)
20677 continue;
20678
20679 // Check if it has a preheader.
20680 BasicBlock *PreHeader = L->getLoopPreheader();
20681 if (!PreHeader)
20682 continue;
20683
20684 // If the vector or the element that we insert into it are
20685 // instructions that are defined in this basic block then we can't
20686 // hoist this instruction.
20687 if (any_of(I->operands(), [L](Value *V) {
20688 auto *OpI = dyn_cast<Instruction>(V);
20689 return OpI && L->contains(OpI);
20690 }))
20691 continue;
20692
20693 // We can hoist this instruction. Move it to the pre-header.
20694 I->moveBefore(PreHeader->getTerminator()->getIterator());
20695 CSEBlocks.insert(PreHeader);
20696 }
20697
20698 // Make a list of all reachable blocks in our CSE queue.
20699 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20700 CSEWorkList.reserve(CSEBlocks.size());
20701 for (BasicBlock *BB : CSEBlocks)
20702 if (DomTreeNode *N = DT->getNode(BB)) {
20703 assert(DT->isReachableFromEntry(N));
20704 CSEWorkList.push_back(N);
20705 }
20706
20707 // Sort blocks by domination. This ensures we visit a block after all blocks
20708 // dominating it are visited.
20709 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20710 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20711 "Different nodes should have different DFS numbers");
20712 return A->getDFSNumIn() < B->getDFSNumIn();
20713 });
20714
20715 // Less defined shuffles can be replaced by the more defined copies.
20716 // Of two shuffles, one is less defined if it has the same vector operands
20717 // and each of its mask indices either matches the corresponding index of the
20718 // other mask or is undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less
20719 // defined than shuffle %0, poison, <0, 0, 0, 0>.
20720 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20721 Instruction *I2,
20722 SmallVectorImpl<int> &NewMask) {
20723 if (I1->getType() != I2->getType())
20724 return false;
20725 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20726 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20727 if (!SI1 || !SI2)
20728 return I1->isIdenticalTo(I2);
20729 if (SI1->isIdenticalTo(SI2))
20730 return true;
20731 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20732 if (SI1->getOperand(I) != SI2->getOperand(I))
20733 return false;
20734 // Check if the second instruction is more defined than the first one.
20735 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20736 ArrayRef<int> SM1 = SI1->getShuffleMask();
20737 // Count trailing undefs in the mask to check the final number of used
20738 // registers.
20739 unsigned LastUndefsCnt = 0;
20740 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20741 if (SM1[I] == PoisonMaskElem)
20742 ++LastUndefsCnt;
20743 else
20744 LastUndefsCnt = 0;
20745 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20746 NewMask[I] != SM1[I])
20747 return false;
20748 if (NewMask[I] == PoisonMaskElem)
20749 NewMask[I] = SM1[I];
20750 }
20751 // Check if the last undefs actually change the final number of used vector
20752 // registers.
20753 return SM1.size() - LastUndefsCnt > 1 &&
20754 ::getNumberOfParts(*TTI, SI1->getType()) ==
20755 ::getNumberOfParts(
20756 *TTI, getWidenedType(SI1->getType()->getElementType(),
20757 SM1.size() - LastUndefsCnt));
20758 };
20759 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20760 // instructions. TODO: We can further optimize this scan if we split the
20761 // instructions into different buckets based on the insert lane.
20762 SmallVector<Instruction *, 16> Visited;
20763 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20764 assert(*I &&
20765 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20766 "Worklist not sorted properly!");
20767 BasicBlock *BB = (*I)->getBlock();
20768 // For all instructions in blocks containing gather sequences:
20769 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20770 if (isDeleted(&In))
20771 continue;
20772 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20773 !GatherShuffleExtractSeq.contains(&In))
20774 continue;
20775
20776 // Check if we can replace this instruction with any of the
20777 // visited instructions.
20778 bool Replaced = false;
20779 for (Instruction *&V : Visited) {
20780 SmallVector<int> NewMask;
20781 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20782 DT->dominates(V->getParent(), In.getParent())) {
20783 In.replaceAllUsesWith(V);
20784 eraseInstruction(&In);
20785 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20786 if (!NewMask.empty())
20787 SI->setShuffleMask(NewMask);
20788 Replaced = true;
20789 break;
20790 }
20791 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20792 GatherShuffleExtractSeq.contains(V) &&
20793 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20794 DT->dominates(In.getParent(), V->getParent())) {
20795 In.moveAfter(V);
20796 V->replaceAllUsesWith(&In);
20797 eraseInstruction(V);
20798 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20799 if (!NewMask.empty())
20800 SI->setShuffleMask(NewMask);
20801 V = &In;
20802 Replaced = true;
20803 break;
20804 }
20805 }
20806 if (!Replaced) {
20807 assert(!is_contained(Visited, &In));
20808 Visited.push_back(&In);
20809 }
20810 }
20811 }
20812 CSEBlocks.clear();
20813 GatherShuffleExtractSeq.clear();
20814}
20815
20816BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20817 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20818 auto &BundlePtr =
20819 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20820 for (Value *V : VL) {
20821 if (S.isNonSchedulable(V))
20822 continue;
20823 auto *I = cast<Instruction>(V);
20824 if (S.isCopyableElement(V)) {
20825 // Add a copyable element model.
20826 ScheduleCopyableData &SD =
20827 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20828 // Group the instructions to a bundle.
20829 BundlePtr->add(&SD);
20830 continue;
20831 }
20832 ScheduleData *BundleMember = getScheduleData(V);
20833 assert(BundleMember && "no ScheduleData for bundle member "
20834 "(maybe not in same basic block)");
20835 // Group the instructions to a bundle.
20836 BundlePtr->add(BundleMember);
20837 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20838 BundlePtr.get());
20839 }
20840 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20841 return *BundlePtr;
20842}
20843
20844// Groups the instructions to a bundle (which is then a single scheduling entity)
20845// and schedules instructions until the bundle gets ready.
20846std::optional<BoUpSLP::ScheduleBundle *>
20847BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20848 const InstructionsState &S,
20849 const EdgeInfo &EI) {
20850 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20851 // instructions.
20852 if (isa<PHINode>(S.getMainOp()) ||
20853 isVectorLikeInstWithConstOps(S.getMainOp()))
20854 return nullptr;
20855 bool HasCopyables = S.areInstructionsWithCopyableElements();
20856 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
20857 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
20858 // If all operands were replaced by copyables, the operands of this node
20859 // might not have been, so we need to recalculate dependencies for the
20860 // schedule data that was replaced by copyable schedule data.
20861 SmallVector<ScheduleData *> ControlDependentMembers;
20862 for (Value *V : VL) {
20863 auto *I = dyn_cast<Instruction>(V);
20864 if (!I || (HasCopyables && S.isCopyableElement(V)))
20865 continue;
20866 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20867 for (const Use &U : I->operands()) {
20868 unsigned &NumOps =
20869 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
20870 .first->getSecond();
20871 ++NumOps;
20872 if (auto *Op = dyn_cast<Instruction>(U.get());
20873 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
20874 if (ScheduleData *OpSD = getScheduleData(Op);
20875 OpSD && OpSD->hasValidDependencies()) {
20876 OpSD->clearDirectDependencies();
20877 if (RegionHasStackSave ||
20878 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20879 ControlDependentMembers.push_back(OpSD);
20880 }
20881 }
20882 }
20883 }
20884 if (!ControlDependentMembers.empty()) {
20885 ScheduleBundle Invalid = ScheduleBundle::invalid();
20886 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
20887 ControlDependentMembers);
20888 }
20889 return nullptr;
20890 }
20891
20892 // Initialize the instruction bundle.
20893 Instruction *OldScheduleEnd = ScheduleEnd;
20894 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20895
20896 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20897 // Clear deps or recalculate the region, if the memory instruction is a
20898 // copyable. It may have memory deps, which must be recalculated.
20899 SmallVector<ScheduleData *> ControlDependentMembers;
20900 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20901 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20902 for (ScheduleEntity *SE : Bundle.getBundle()) {
20903 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20904 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20905 BundleMember && BundleMember->hasValidDependencies()) {
20906 BundleMember->clearDirectDependencies();
20907 if (RegionHasStackSave ||
20908 !isGuaranteedToTransferExecutionToSuccessor(
20909 BundleMember->getInst()))
20910 ControlDependentMembers.push_back(BundleMember);
20911 }
20912 continue;
20913 }
20914 auto *SD = cast<ScheduleData>(SE);
20915 if (SD->hasValidDependencies() &&
20916 (!S.areInstructionsWithCopyableElements() ||
20917 !S.isCopyableElement(SD->getInst())) &&
20918 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
20919 EI.UserTE->hasState() &&
20920 (!EI.UserTE->hasCopyableElements() ||
20921 !EI.UserTE->isCopyableElement(SD->getInst())))
20922 SD->clearDirectDependencies();
20923 for (const Use &U : SD->getInst()->operands()) {
20924 unsigned &NumOps =
20925 UserOpToNumOps
20926 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
20927 .first->getSecond();
20928 ++NumOps;
20929 if (auto *Op = dyn_cast<Instruction>(U.get());
20930 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
20931 *SLP, NumOps)) {
20932 if (ScheduleData *OpSD = getScheduleData(Op);
20933 OpSD && OpSD->hasValidDependencies()) {
20934 OpSD->clearDirectDependencies();
20935 if (RegionHasStackSave ||
20936 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20937 ControlDependentMembers.push_back(OpSD);
20938 }
20939 }
20940 }
20941 }
20942 };
20943 // The scheduling region got new instructions at the lower end (or it is a
20944 // new region for the first bundle). This makes it necessary to
20945 // recalculate all dependencies.
20946 // It is seldom that this needs to be done a second time after adding the
20947 // initial bundle to the region.
20948 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
20949 for_each(ScheduleDataMap, [&](auto &P) {
20950 if (BB != P.first->getParent())
20951 return;
20952 ScheduleData *SD = P.second;
20953 if (isInSchedulingRegion(*SD))
20954 SD->clearDependencies();
20955 });
20956 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
20957 for_each(P.second, [&](ScheduleCopyableData *SD) {
20958 if (isInSchedulingRegion(*SD))
20959 SD->clearDependencies();
20960 });
20961 });
20962 ReSchedule = true;
20963 }
20964 // Check if the bundle data already has deps for copyable elements. In
20965 // this case we need to reset the deps and recalculate them.
20966 if (Bundle && !Bundle.getBundle().empty()) {
20967 if (S.areInstructionsWithCopyableElements() ||
20968 !ScheduleCopyableDataMap.empty())
20969 CheckIfNeedToClearDeps(Bundle);
20970 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
20971 << BB->getName() << "\n");
20972 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
20973 ControlDependentMembers);
20974 } else if (!ControlDependentMembers.empty()) {
20975 ScheduleBundle Invalid = ScheduleBundle::invalid();
20976 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
20977 ControlDependentMembers);
20978 }
20979
20980 if (ReSchedule) {
20981 resetSchedule();
20982 initialFillReadyList(ReadyInsts);
20983 }
20984
20985 // Now try to schedule the new bundle or (if no bundle) just calculate
20986 // dependencies. As soon as the bundle is "ready" it means that there are no
20987 // cyclic dependencies and we can schedule it. Note that it's important that
20988 // we don't "schedule" the bundle yet.
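// E.g., for a candidate bundle {A, B} where some out-of-bundle instruction C
// uses A and B in turn uses C, the bundle can never become "ready": it would
// have to be scheduled both before C (as the producer of A) and after C (as
// the consumer feeding B). Draining the ready list without the bundle ever
// becoming ready detects exactly this situation and the bundle is cancelled
// afterwards.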
20989 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
20990 !ReadyInsts.empty()) {
20991 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
20992 assert(Picked->isReady() && "must be ready to schedule");
20993 schedule(*SLP, S, EI, Picked, ReadyInsts);
20994 if (Picked == &Bundle)
20995 break;
20996 }
20997 };
20998
20999 // Make sure that the scheduling region contains all
21000 // instructions of the bundle.
21001 for (Value *V : VL) {
21002 if (S.isNonSchedulable(V))
21003 continue;
21004 if (!extendSchedulingRegion(V, S)) {
21005 // If the scheduling region got new instructions at the lower end (or it
21006 // is a new region for the first bundle), it is necessary to
21007 // recalculate all dependencies.
21008 // Otherwise the compiler may crash trying to incorrectly calculate
21009 // dependencies and emit instructions in the wrong order at the actual
21010 // scheduling.
21011 ScheduleBundle Invalid = ScheduleBundle::invalid();
21012 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21013 return std::nullopt;
21014 }
21015 }
21016
21017 bool ReSchedule = false;
21018 for (Value *V : VL) {
21019 if (S.isNonSchedulable(V))
21020 continue;
21021 SmallVector<ScheduleCopyableData *> CopyableData =
21022 getScheduleCopyableData(cast<Instruction>(V));
21023 if (!CopyableData.empty()) {
21024 for (ScheduleCopyableData *SD : CopyableData)
21025 ReadyInsts.remove(SD);
21026 }
21027 ScheduleData *BundleMember = getScheduleData(V);
21028 assert((BundleMember || S.isCopyableElement(V)) &&
21029 "no ScheduleData for bundle member (maybe not in same basic block)");
21030 if (!BundleMember)
21031 continue;
21032
21033 // Make sure we don't leave the pieces of the bundle in the ready list when
21034 // the whole bundle might not be ready.
21035 ReadyInsts.remove(BundleMember);
21036 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21037 !Bundles.empty()) {
21038 for (ScheduleBundle *B : Bundles)
21039 ReadyInsts.remove(B);
21040 }
21041
21042 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21043 continue;
21044 // A bundle member was scheduled as a single instruction before and now
21045 // needs to be scheduled as part of the bundle. We just get rid of the
21046 // existing schedule.
21047 // A bundle member had its deps calculated before it became a copyable
21048 // element - we need to reschedule.
21049 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21050 << " was already scheduled\n");
21051 ReSchedule = true;
21052 }
21053
21054 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21055 TryScheduleBundleImpl(ReSchedule, Bundle);
21056 if (!Bundle.isReady()) {
21057 for (ScheduleEntity *BD : Bundle.getBundle()) {
21058 // Copyable data scheduling is just removed.
21059 if (isa<ScheduleCopyableData>(BD))
21060 continue;
21061 if (BD->isReady()) {
21062 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21063 if (Bundles.empty()) {
21064 ReadyInsts.insert(BD);
21065 continue;
21066 }
21067 for (ScheduleBundle *B : Bundles)
21068 if (B->isReady())
21069 ReadyInsts.insert(B);
21070 }
21071 }
21072 ScheduledBundlesList.pop_back();
21073 SmallVector<ScheduleData *> ControlDependentMembers;
21074 SmallPtrSet<Instruction *, 4> Visited;
21075 for (Value *V : VL) {
21076 if (S.isNonSchedulable(V))
21077 continue;
21078 auto *I = cast<Instruction>(V);
21079 if (S.isCopyableElement(I)) {
21080 // Remove the copyable data from the scheduling region and restore
21081 // previous mappings.
21082 auto KV = std::make_pair(EI, I);
21083 assert(ScheduleCopyableDataMap.contains(KV) &&
21084 "no ScheduleCopyableData for copyable element");
21085 ScheduleCopyableData *SD =
21086 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21087 ScheduleCopyableDataMapByUsers[I].remove(SD);
21088 if (EI.UserTE) {
21089 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21090 const auto *It = find(Op, I);
21091 assert(It != Op.end() && "Lane not set");
21092 SmallPtrSet<Instruction *, 4> Visited;
21093 do {
21094 int Lane = std::distance(Op.begin(), It);
21095 assert(Lane >= 0 && "Lane not set");
21096 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21097 !EI.UserTE->ReorderIndices.empty())
21098 Lane = EI.UserTE->ReorderIndices[Lane];
21099 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21100 "Couldn't find extract lane");
21101 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21102 if (!Visited.insert(In).second) {
21103 It = find(make_range(std::next(It), Op.end()), I);
21104 break;
21105 }
21106 ScheduleCopyableDataMapByInstUser
21107 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21108 .pop_back();
21109 It = find(make_range(std::next(It), Op.end()), I);
21110 } while (It != Op.end());
21111 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21112 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21113 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21114 }
21115 if (ScheduleCopyableDataMapByUsers[I].empty())
21116 ScheduleCopyableDataMapByUsers.erase(I);
21117 ScheduleCopyableDataMap.erase(KV);
21118 // Need to recalculate dependencies for the actual schedule data.
21119 if (ScheduleData *OpSD = getScheduleData(I);
21120 OpSD && OpSD->hasValidDependencies()) {
21121 OpSD->clearDirectDependencies();
21122 if (RegionHasStackSave ||
21123 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
21124 ControlDependentMembers.push_back(OpSD);
21125 }
21126 continue;
21127 }
21128 ScheduledBundles.find(I)->getSecond().pop_back();
21129 }
21130 if (!ControlDependentMembers.empty()) {
21131 ScheduleBundle Invalid = ScheduleBundle::invalid();
21132 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21133 ControlDependentMembers);
21134 }
21135 return std::nullopt;
21136 }
21137 return &Bundle;
21138}
21139
21140BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21141 // Allocate a new ScheduleData for the instruction.
21142 if (ChunkPos >= ChunkSize) {
21143 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21144 ChunkPos = 0;
21145 }
21146 return &(ScheduleDataChunks.back()[ChunkPos++]);
21147}
21148
21149bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21150 Value *V, const InstructionsState &S) {
21151 Instruction *I = dyn_cast<Instruction>(V);
21152 assert(I && "bundle member must be an instruction");
21153 if (getScheduleData(I))
21154 return true;
21155 if (!ScheduleStart) {
21156 // It's the first instruction in the new region.
21157 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21158 ScheduleStart = I;
21159 ScheduleEnd = I->getNextNode();
21160 assert(ScheduleEnd && "tried to vectorize a terminator?");
21161 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21162 return true;
21163 }
21164 // Search up and down at the same time, because we don't know if the new
21165 // instruction is above or below the existing scheduling region.
21166 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
21167 // counted against the budget. Otherwise debug info could affect codegen.
21168 BasicBlock::reverse_iterator UpIter =
21169 ++ScheduleStart->getIterator().getReverse();
21170 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21171 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21172 BasicBlock::iterator LowerEnd = BB->end();
21173 auto IsAssumeLikeIntr = [](const Instruction &I) {
21174 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21175 return II->isAssumeLikeIntrinsic();
21176 return false;
21177 };
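// E.g., with the current region [ScheduleStart, ScheduleEnd) and a new
// instruction I, both iterators advance one step per round (skipping
// assume-like intrinsics), and each round counts against
// ScheduleRegionSizeLimit. If the upward walk reaches I first (or the
// downward walk falls off the end of the block), the region start is moved
// up to I; otherwise the region end is moved down past I.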
21178 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21179 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21180 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21181 &*DownIter != I) {
21182 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21183 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21184 return false;
21185 }
21186
21187 ++UpIter;
21188 ++DownIter;
21189
21190 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21191 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21192 }
21193 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21194 assert(I->getParent() == ScheduleStart->getParent() &&
21195 "Instruction is in wrong basic block.");
21196 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21197 ScheduleStart = I;
21198 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21199 << "\n");
21200 return true;
21201 }
21202 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21203 "Expected to reach top of the basic block or instruction down the "
21204 "lower end.");
21205 assert(I->getParent() == ScheduleEnd->getParent() &&
21206 "Instruction is in wrong basic block.");
21207 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21208 nullptr);
21209 ScheduleEnd = I->getNextNode();
21210 assert(ScheduleEnd && "tried to vectorize a terminator?");
21211 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21212 return true;
21213}
21214
21215void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21216 Instruction *ToI,
21217 ScheduleData *PrevLoadStore,
21218 ScheduleData *NextLoadStore) {
21219 ScheduleData *CurrentLoadStore = PrevLoadStore;
21220 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21221 // No need to allocate data for non-schedulable instructions.
21222 if (isa<PHINode>(I))
21223 continue;
21224 ScheduleData *SD = ScheduleDataMap.lookup(I);
21225 if (!SD) {
21226 SD = allocateScheduleDataChunks();
21227 ScheduleDataMap[I] = SD;
21228 }
21229 assert(!isInSchedulingRegion(*SD) &&
21230 "new ScheduleData already in scheduling region");
21231 SD->init(SchedulingRegionID, I);
21232
21233 if (I->mayReadOrWriteMemory() &&
21234 (!isa<IntrinsicInst>(I) ||
21235 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21236 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21237 Intrinsic::pseudoprobe))) {
21238 // Update the linked list of memory accessing instructions.
21239 if (CurrentLoadStore) {
21240 CurrentLoadStore->setNextLoadStore(SD);
21241 } else {
21242 FirstLoadStoreInRegion = SD;
21243 }
21244 CurrentLoadStore = SD;
21245 }
21246
21247 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21248 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21249 RegionHasStackSave = true;
21250 }
21251 if (NextLoadStore) {
21252 if (CurrentLoadStore)
21253 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21254 } else {
21255 LastLoadStoreInRegion = CurrentLoadStore;
21256 }
21257}
21258
21259void BoUpSLP::BlockScheduling::calculateDependencies(
21260 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21261 ArrayRef<ScheduleData *> ControlDeps) {
21262 SmallVector<ScheduleEntity *> WorkList;
21263 auto ProcessNode = [&](ScheduleEntity *SE) {
21264 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21265 if (CD->hasValidDependencies())
21266 return;
21267 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21268 CD->initDependencies();
21269 CD->resetUnscheduledDeps();
21270 const EdgeInfo &EI = CD->getEdgeInfo();
21271 if (EI.UserTE) {
21272 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21273 const auto *It = find(Op, CD->getInst());
21274 assert(It != Op.end() && "Lane not set");
21275 SmallPtrSet<Instruction *, 4> Visited;
21276 do {
21277 int Lane = std::distance(Op.begin(), It);
21278 assert(Lane >= 0 && "Lane not set");
21279 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21280 !EI.UserTE->ReorderIndices.empty())
21281 Lane = EI.UserTE->ReorderIndices[Lane];
21282 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21283 "Couldn't find extract lane");
21284 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21285 if (EI.UserTE->isCopyableElement(In)) {
21286 // We may not have related copyable scheduling data if the
21287 // instruction is non-schedulable.
21288 if (ScheduleCopyableData *UseSD =
21289 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21290 CD->incDependencies();
21291 if (!UseSD->isScheduled())
21292 CD->incrementUnscheduledDeps(1);
21293 if (!UseSD->hasValidDependencies() ||
21294 (InsertInReadyList && UseSD->isReady()))
21295 WorkList.push_back(UseSD);
21296 }
21297 } else if (Visited.insert(In).second) {
21298 if (ScheduleData *UseSD = getScheduleData(In)) {
21299 CD->incDependencies();
21300 if (!UseSD->isScheduled())
21301 CD->incrementUnscheduledDeps(1);
21302 if (!UseSD->hasValidDependencies() ||
21303 (InsertInReadyList && UseSD->isReady()))
21304 WorkList.push_back(UseSD);
21305 }
21306 }
21307 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21308 } while (It != Op.end());
21309 if (CD->isReady() && CD->getDependencies() == 0 &&
21310 (EI.UserTE->hasState() &&
21311 (EI.UserTE->getMainOp()->getParent() !=
21312 CD->getInst()->getParent() ||
21313 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21314 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21315 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21316 auto *IU = dyn_cast<Instruction>(U);
21317 if (!IU)
21318 return true;
21319 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21320 })))))) {
21321 // If no uses in the block - mark as having pseudo-use, which cannot
21322 // be scheduled.
21323 // Prevents incorrect def-use tracking between external user and
21324 // actual instruction.
21325 CD->incDependencies();
21326 CD->incrementUnscheduledDeps(1);
21327 }
21328 }
21329 return;
21330 }
21331 auto *BundleMember = cast<ScheduleData>(SE);
21332 if (BundleMember->hasValidDependencies())
21333 return;
21334 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21335 BundleMember->initDependencies();
21336 BundleMember->resetUnscheduledDeps();
21337 // Handle def-use chain dependencies.
21338 SmallDenseMap<Value *, unsigned> UserToNumOps;
21339 for (User *U : BundleMember->getInst()->users()) {
21340 if (isa<PHINode>(U))
21341 continue;
21342 if (ScheduleData *UseSD = getScheduleData(U)) {
21343 // The operand is a copyable element - skip.
21344 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21345 ++NumOps;
21346 if (areAllOperandsReplacedByCopyableData(
21347 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21348 continue;
21349 BundleMember->incDependencies();
21350 if (!UseSD->isScheduled())
21351 BundleMember->incrementUnscheduledDeps(1);
21352 if (!UseSD->hasValidDependencies() ||
21353 (InsertInReadyList && UseSD->isReady()))
21354 WorkList.push_back(UseSD);
21355 }
21356 }
21357 for (ScheduleCopyableData *UseSD :
21358 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21359 BundleMember->incDependencies();
21360 if (!UseSD->isScheduled())
21361 BundleMember->incrementUnscheduledDeps(1);
21362 if (!UseSD->hasValidDependencies() ||
21363 (InsertInReadyList && UseSD->isReady()))
21364 WorkList.push_back(UseSD);
21365 }
21366
21367 SmallPtrSet<const Instruction *, 4> Visited;
21368 auto MakeControlDependent = [&](Instruction *I) {
21369 // Do not mark control dependent twice.
21370 if (!Visited.insert(I).second)
21371 return;
21372 auto *DepDest = getScheduleData(I);
21373 assert(DepDest && "must be in schedule window");
21374 DepDest->addControlDependency(BundleMember);
21375 BundleMember->incDependencies();
21376 if (!DepDest->isScheduled())
21377 BundleMember->incrementUnscheduledDeps(1);
21378 if (!DepDest->hasValidDependencies() ||
21379 (InsertInReadyList && DepDest->isReady()))
21380 WorkList.push_back(DepDest);
21381 };
21382
21383 // Any instruction which isn't safe to speculate at the beginning of the
21384 // block is control dependent on any early exit or non-willreturn call
21385 // which precedes it.
21386 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21387 for (Instruction *I = BundleMember->getInst()->getNextNode();
21388 I != ScheduleEnd; I = I->getNextNode()) {
21389 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21390 continue;
21391
21392 // Add the dependency
21393 MakeControlDependent(I);
21394
21395 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21396 // Everything past here must be control dependent on I.
21397 break;
21398 }
21399 }
21400
21401 if (RegionHasStackSave) {
21402 // If we have an inalloca alloca instruction, it needs to be scheduled
21403 // after any preceding stacksave. We also need to prevent any alloca
21404 // from reordering above a preceding stackrestore.
21405 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21406 match(BundleMember->getInst(),
21407 m_Intrinsic<Intrinsic::stackrestore>())) {
21408 for (Instruction *I = BundleMember->getInst()->getNextNode();
21409 I != ScheduleEnd; I = I->getNextNode()) {
21410 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21411 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21412 // Any allocas past here must be control dependent on I, and I
21413 // must be memory dependent on BundleMember->Inst.
21414 break;
21415
21416 if (!isa<AllocaInst>(I))
21417 continue;
21418
21419 // Add the dependency
21420 MakeControlDependent(I);
21421 }
21422 }
21423
21424 // In addition to the cases handled just above, we need to prevent
21425 // allocas and loads/stores from moving below a stacksave or a
21426 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21427 // believed to be merely conservative. Moving loads/stores below a stackrestore
21428 // can lead to incorrect code.
21429 if (isa<AllocaInst>(BundleMember->getInst()) ||
21430 BundleMember->getInst()->mayReadOrWriteMemory()) {
21431 for (Instruction *I = BundleMember->getInst()->getNextNode();
21432 I != ScheduleEnd; I = I->getNextNode()) {
21433 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21434 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21435 continue;
21436
21437 // Add the dependency
21438 MakeControlDependent(I);
21439 break;
21440 }
21441 }
21442 }
21443
21444 // Handle the memory dependencies (if any).
21445 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21446 if (!NextLoadStore)
21447 return;
21448 Instruction *SrcInst = BundleMember->getInst();
21449 assert(SrcInst->mayReadOrWriteMemory() &&
21450 "NextLoadStore list for non memory affecting bundle?");
21451 MemoryLocation SrcLoc = getLocation(SrcInst);
21452 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21453 unsigned NumAliased = 0;
21454 unsigned DistToSrc = 1;
21455 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21456
21457 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21458 DepDest = DepDest->getNextLoadStore()) {
21459 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21460
21461 // We have two limits to reduce the complexity:
21462 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21463 // SLP->isAliased (which is the expensive part in this loop).
21464 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21465 // the whole loop (even if the loop is fast, it's quadratic).
21466 // It's important for the loop break condition (see below) to
21467 // check this limit even between two read-only instructions.
21468 if (DistToSrc >= MaxMemDepDistance ||
21469 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21470 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21471 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21472
21473 // We increment the counter only if the locations are aliased
21474 // (instead of counting all alias checks). This gives a better
21475 // balance between reduced runtime and accurate dependencies.
21476 NumAliased++;
21477
21478 DepDest->addMemoryDependency(BundleMember);
21479 BundleMember->incDependencies();
21480 if (!DepDest->isScheduled())
21481 BundleMember->incrementUnscheduledDeps(1);
21482 if (!DepDest->hasValidDependencies() ||
21483 (InsertInReadyList && DepDest->isReady()))
21484 WorkList.push_back(DepDest);
21485 }
21486
21487 // Example, explaining the loop break condition: Let's assume our
21488 // starting instruction is i0 and MaxMemDepDistance = 3.
21489 //
21490 // +--------v--v--v
21491 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21492 // +--------^--^--^
21493 //
21494 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21495 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21496 // Previously we already added dependencies from i3 to i6,i7,i8
21497 // (because of MaxMemDepDistance). As we added a dependency from
21498 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21499 // and we can abort this loop at i6.
21500 if (DistToSrc >= 2 * MaxMemDepDistance)
21501 break;
21502 DistToSrc++;
21503 }
21504 };
21505
21506 assert((Bundle || !ControlDeps.empty()) &&
21507 "expected at least one instruction to schedule");
21508 if (Bundle)
21509 WorkList.push_back(Bundle.getBundle().front());
21510 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21511 SmallPtrSet<ScheduleBundle *, 16> Visited;
21512 while (!WorkList.empty()) {
21513 ScheduleEntity *SD = WorkList.pop_back_val();
21514 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21515 ArrayRef<ScheduleBundle *> Bundles;
21516 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21517 CopyableBundle.push_back(&CD->getBundle());
21518 Bundles = CopyableBundle;
21519 } else {
21520 Bundles = getScheduleBundles(SD->getInst());
21521 }
21522 if (Bundles.empty()) {
21523 if (!SD->hasValidDependencies())
21524 ProcessNode(SD);
21525 if (InsertInReadyList && SD->isReady()) {
21526 ReadyInsts.insert(SD);
21527 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21528 }
21529 continue;
21530 }
21531 for (ScheduleBundle *Bundle : Bundles) {
21532 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21533 continue;
21534 assert(isInSchedulingRegion(*Bundle) &&
21535 "ScheduleData not in scheduling region");
21536 for_each(Bundle->getBundle(), ProcessNode);
21537 }
21538 if (InsertInReadyList && SD->isReady()) {
21539 for (ScheduleBundle *Bundle : Bundles) {
21540 assert(isInSchedulingRegion(*Bundle) &&
21541 "ScheduleData not in scheduling region");
21542 if (!Bundle->isReady())
21543 continue;
21544 ReadyInsts.insert(Bundle);
21545 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21546 << "\n");
21547 }
21548 }
21549 }
21550}
21551
21552void BoUpSLP::BlockScheduling::resetSchedule() {
21553 assert(ScheduleStart &&
21554 "tried to reset schedule on block which has not been scheduled");
21555 for_each(ScheduleDataMap, [&](auto &P) {
21556 if (BB != P.first->getParent())
21557 return;
21558 ScheduleData *SD = P.second;
21559 if (isInSchedulingRegion(*SD)) {
21560 SD->setScheduled(/*Scheduled=*/false);
21561 SD->resetUnscheduledDeps();
21562 }
21563 });
21564 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21565 for_each(P.second, [&](ScheduleCopyableData *SD) {
21566 if (isInSchedulingRegion(*SD)) {
21567 SD->setScheduled(/*Scheduled=*/false);
21568 SD->resetUnscheduledDeps();
21569 }
21570 });
21571 });
21572 for_each(ScheduledBundles, [&](auto &P) {
21573 for_each(P.second, [&](ScheduleBundle *Bundle) {
21574 if (isInSchedulingRegion(*Bundle))
21575 Bundle->setScheduled(/*Scheduled=*/false);
21576 });
21577 });
21578 // Reset schedule data for copyable elements.
21579 for (auto &P : ScheduleCopyableDataMap) {
21580 if (isInSchedulingRegion(*P.second)) {
21581 P.second->setScheduled(/*Scheduled=*/false);
21582 P.second->resetUnscheduledDeps();
21583 }
21584 }
21585 ReadyInsts.clear();
21586}
21587
21588void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21589 if (!BS->ScheduleStart)
21590 return;
21591
21592 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21593
21594 // A key point - if we got here, pre-scheduling was able to find a valid
21595 // scheduling of the sub-graph of the scheduling window which consists
21596 // of all vector bundles and their transitive users. As such, we do not
21597 // need to reschedule anything *outside of* that subgraph.
21598
21599 BS->resetSchedule();
21600
21601 // For the real scheduling we use a more sophisticated ready-list: it is
21602 // sorted by the original instruction location. This lets the final schedule
21603 // be as close as possible to the original instruction order.
21604 // WARNING: If changing this order causes a correctness issue, that means
21605 // there is some missing dependence edge in the schedule data graph.
21606 struct ScheduleDataCompare {
21607 bool operator()(const ScheduleEntity *SD1,
21608 const ScheduleEntity *SD2) const {
21609 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21610 }
21611 };
21612 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
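// Note: scheduling priorities are assigned below in source order and the
// comparator above prefers the larger priority, so *ReadyInsts.begin() is the
// ready entity whose instructions appear latest in the region. Since the
// schedule is materialized bottom-up starting at ScheduleEnd, this keeps the
// emitted order close to the original instruction order.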
21613
21614 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21615 // and fill the ready-list with initial instructions.
21616 int Idx = 0;
21617 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21618 I = I->getNextNode()) {
21619 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21620 if (!Bundles.empty()) {
21621 for (ScheduleBundle *Bundle : Bundles) {
21622 Bundle->setSchedulingPriority(Idx++);
21623 if (!Bundle->hasValidDependencies())
21624 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21625 }
21626 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21627 for (ScheduleCopyableData *SD : reverse(SDs)) {
21628 ScheduleBundle &Bundle = SD->getBundle();
21629 Bundle.setSchedulingPriority(Idx++);
21630 if (!Bundle.hasValidDependencies())
21631 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21632 }
21633 continue;
21634 }
21635 SmallVector<ScheduleCopyableData *> CopyableData =
21636 BS->getScheduleCopyableDataUsers(I);
21637 if (ScheduleData *SD = BS->getScheduleData(I)) {
21638 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21639 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21640 SDTEs.front()->doesNotNeedToSchedule() ||
21641 doesNotNeedToSchedule(SDTEs.front()->Scalars)) &&
21642 "scheduler and vectorizer bundle mismatch");
21643 SD->setSchedulingPriority(Idx++);
21644 if (!SD->hasValidDependencies() &&
21645 (!CopyableData.empty() ||
21646 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21647 assert(TE->isGather() && "expected gather node");
21648 return TE->hasState() && TE->hasCopyableElements() &&
21649 TE->isCopyableElement(I);
21650 }))) {
21651 // Need to calculate deps for these nodes to correctly handle copyable
21652 // dependencies, even if they were cancelled.
21653 // If the copyables bundle was cancelled, the deps are cleared and we
21654 // need to recalculate them.
21655 ScheduleBundle Bundle;
21656 Bundle.add(SD);
21657 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21658 }
21659 }
21660 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21661 ScheduleBundle &Bundle = SD->getBundle();
21662 Bundle.setSchedulingPriority(Idx++);
21663 if (!Bundle.hasValidDependencies())
21664 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21665 }
21666 }
21667 BS->initialFillReadyList(ReadyInsts);
21668
21669 Instruction *LastScheduledInst = BS->ScheduleEnd;
21670
21671 // Do the "real" scheduling.
21672 SmallPtrSet<Instruction *, 16> Scheduled;
21673 while (!ReadyInsts.empty()) {
21674 auto *Picked = *ReadyInsts.begin();
21675 ReadyInsts.erase(ReadyInsts.begin());
21676
21677 // Move the scheduled instruction(s) to their dedicated places, if not
21678 // there yet.
21679 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21680 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21681 Instruction *PickedInst = BundleMember->getInst();
21682 // If a copyable must be scheduled as part of something else, skip it.
21683 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21684 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21685 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21686 continue;
21687 if (PickedInst->getNextNode() != LastScheduledInst)
21688 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21689 LastScheduledInst = PickedInst;
21690 }
21691 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21692 LastScheduledInst);
21693 } else {
21694 auto *SD = cast<ScheduleData>(Picked);
21695 Instruction *PickedInst = SD->getInst();
21696 if (PickedInst->getNextNode() != LastScheduledInst)
21697 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21698 LastScheduledInst = PickedInst;
21699 }
21700 auto Invalid = InstructionsState::invalid();
21701 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21702 }
21703
21704 // Check that we didn't break any of our invariants.
21705#ifdef EXPENSIVE_CHECKS
21706 BS->verify();
21707#endif
21708
21709#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21710 // Check that all schedulable entities got scheduled
21711 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21712 I = I->getNextNode()) {
21713 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21714 assert(all_of(Bundles,
21715 [](const ScheduleBundle *Bundle) {
21716 return Bundle->isScheduled();
21717 }) &&
21718 "must be scheduled at this point");
21719 }
21720#endif
21721
21722 // Avoid duplicate scheduling of the block.
21723 BS->ScheduleStart = nullptr;
21724}
21725
21726unsigned BoUpSLP::getVectorElementSize(Value *V) {
21727 // If V is a store, just return the width of the stored value (or value
21728 // truncated just before storing) without traversing the expression tree.
21729 // This is the common case.
21730 if (auto *Store = dyn_cast<StoreInst>(V))
21731 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21732
21733 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21734 return getVectorElementSize(IEI->getOperand(1));
21735
21736 auto E = InstrElementSize.find(V);
21737 if (E != InstrElementSize.end())
21738 return E->second;
21739
21740 // If V is not a store, we can traverse the expression tree to find loads
21741 // that feed it. The type of the loaded value may indicate a more suitable
21742 // width than V's type. We want to base the vector element size on the width
21743 // of memory operations where possible.
21744 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21745 SmallPtrSet<Instruction *, 16> Visited;
21746 if (auto *I = dyn_cast<Instruction>(V)) {
21747 Worklist.emplace_back(I, I->getParent(), 0);
21748 Visited.insert(I);
21749 }
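// For instance, if V is an `add i32` whose operands are zero-extensions of
// two i8 loads in the same block, the walk below reaches both loads and the
// reported element size is 8 bits rather than the 32 bits of V's type.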
21750
21751 // Traverse the expression tree in bottom-up order looking for loads. If we
21752 // encounter an instruction we don't yet handle, we give up.
21753 auto Width = 0u;
21754 Value *FirstNonBool = nullptr;
21755 while (!Worklist.empty()) {
21756 auto [I, Parent, Level] = Worklist.pop_back_val();
21757
21758 // We should only be looking at scalar instructions here. If the current
21759 // instruction has a vector type, skip.
21760 auto *Ty = I->getType();
21761 if (isa<VectorType>(Ty))
21762 continue;
21763 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21764 FirstNonBool = I;
21765 if (Level > RecursionMaxDepth)
21766 continue;
21767
21768 // If the current instruction is a load, update MaxWidth to reflect the
21769 // width of the loaded value.
21770 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
21771 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21772
21773 // Otherwise, we need to visit the operands of the instruction. We only
21774 // handle the interesting cases from buildTree here. If an operand is an
21775 // instruction we haven't yet visited and from the same basic block as the
21776 // user or the use is a PHI node, we add it to the worklist.
21777 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
21778 BinaryOperator, UnaryOperator>(I)) {
21779 for (Use &U : I->operands()) {
21780 if (auto *J = dyn_cast<Instruction>(U.get()))
21781 if (Visited.insert(J).second &&
21782 (isa<PHINode>(I) || J->getParent() == Parent)) {
21783 Worklist.emplace_back(J, J->getParent(), Level + 1);
21784 continue;
21785 }
21786 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21787 FirstNonBool = U.get();
21788 }
21789 } else {
21790 break;
21791 }
21792 }
21793
21794 // If we didn't encounter a memory access in the expression tree, or if we
21795 // gave up for some reason, just return the width of V. Otherwise, return the
21796 // maximum width we found.
21797 if (!Width) {
21798 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21799 V = FirstNonBool;
21800 Width = DL->getTypeSizeInBits(V->getType());
21801 }
21802
21803 for (Instruction *I : Visited)
21804 InstrElementSize[I] = Width;
21805
21806 return Width;
21807}
21808
21809bool BoUpSLP::collectValuesToDemote(
21810 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21811 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21812 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21813 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21814 // We can always demote constants.
21815 if (all_of(E.Scalars, IsaPred<Constant>))
21816 return true;
21817
21818 unsigned OrigBitWidth =
21819 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21820 if (OrigBitWidth == BitWidth) {
21821 MaxDepthLevel = 1;
21822 return true;
21823 }
21824
21825 // Check if the node was analyzed already and must keep its original bitwidth.
21826 if (NodesToKeepBWs.contains(E.Idx))
21827 return false;
21828
21829 // If the value is not a vectorized instruction in the expression and not used
21830 // by the insertelement instruction and not used in multiple vector nodes, it
21831 // cannot be demoted.
21832 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21833 if (isa<PoisonValue>(R))
21834 return false;
21835 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21836 });
21837 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21838 if (isa<PoisonValue>(V))
21839 return true;
21840 if (getTreeEntries(V).size() > 1)
21841 return false;
21842 // For the last shuffle of sext/zext with many uses, we need to check the
21843 // extra bit for unsigned values, otherwise we may have incorrect casting
21844 // for reused scalars.
21845 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21846 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21847 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21848 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21849 return true;
21850 }
21851 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21852 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21853 if (IsSignedNode)
21854 ++BitWidth1;
21855 if (auto *I = dyn_cast<Instruction>(V)) {
21856 APInt Mask = DB->getDemandedBits(I);
21857 unsigned BitWidth2 =
21858 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21859 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21860 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21861 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21862 break;
21863 BitWidth2 *= 2;
21864 }
21865 BitWidth1 = std::min(BitWidth1, BitWidth2);
21866 }
21867 BitWidth = std::max(BitWidth, BitWidth1);
21868 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21869 };
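// For instance, with OrigBitWidth = 32 and V = `and i32 %x, 255` in an
// unsigned node, ComputeNumSignBits gives 24, so BitWidth1 = 32 - 24 = 8;
// the demanded-bits refinement can only shrink this further, so BitWidth
// becomes at least 8 and the check 32 >= 2 * 8 reports the value as
// potentially truncatable.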
21870 auto FinalAnalysis = [&, TTI = TTI]() {
21871 if (!IsProfitableToDemote)
21872 return false;
21873 bool Res = all_of(
21874 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21875 // Demote gathers.
21876 if (Res && E.isGather()) {
21877 if (E.hasState()) {
21878 if (const TreeEntry *SameTE =
21879 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21880 SameTE)
21881 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21882 ToDemote, Visited, NodesToKeepBWs,
21883 MaxDepthLevel, IsProfitableToDemote,
21884 IsTruncRoot)) {
21885 ToDemote.push_back(E.Idx);
21886 return true;
21887 }
21888 }
21889 // Check the possible extractelement instruction bases and the final
21890 // vector length.
21891 SmallPtrSet<Value *, 4> UniqueBases;
21892 for (Value *V : E.Scalars) {
21893 auto *EE = dyn_cast<ExtractElementInst>(V);
21894 if (!EE)
21895 continue;
21896 UniqueBases.insert(EE->getVectorOperand());
21897 }
21898 const unsigned VF = E.Scalars.size();
21899 Type *OrigScalarTy = E.Scalars.front()->getType();
21900 if (UniqueBases.size() <= 2 ||
21901 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21902 ::getNumberOfParts(
21903 *TTI,
21904 getWidenedType(
21905 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21906 VF))) {
21907 ToDemote.push_back(E.Idx);
21908 return true;
21909 }
21910 }
21911 return Res;
21912 };
21913 if (E.isGather() || !Visited.insert(&E).second ||
21914 any_of(E.Scalars, [&](Value *V) {
21915 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
21916 return isa<InsertElementInst>(U) && !isVectorized(U);
21917 });
21918 }))
21919 return FinalAnalysis();
21920
21921 if (any_of(E.Scalars, [&](Value *V) {
21922 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
21923 return isVectorized(U) ||
21924 (E.Idx == 0 && UserIgnoreList &&
21925 UserIgnoreList->contains(U)) ||
21926 (!isa<CmpInst>(U) && U->getType()->isSized() &&
21927 !U->getType()->isScalableTy() &&
21928 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
21929 }) && !IsPotentiallyTruncated(V, BitWidth);
21930 }))
21931 return false;
21932
21933 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
21934 bool &NeedToExit) {
21935 NeedToExit = false;
21936 unsigned InitLevel = MaxDepthLevel;
21937 for (const TreeEntry *Op : Operands) {
21938 unsigned Level = InitLevel;
21939 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
21940 ToDemote, Visited, NodesToKeepBWs, Level,
21941 IsProfitableToDemote, IsTruncRoot)) {
21942 if (!IsProfitableToDemote)
21943 return false;
21944 NeedToExit = true;
21945 if (!FinalAnalysis())
21946 return false;
21947 continue;
21948 }
21949 MaxDepthLevel = std::max(MaxDepthLevel, Level);
21950 }
21951 return true;
21952 };
21953 auto AttemptCheckBitwidth =
21954 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
21955 // Try all bitwidth < OrigBitWidth.
21956 NeedToExit = false;
21957 unsigned BestFailBitwidth = 0;
21958 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
21959 if (Checker(BitWidth, OrigBitWidth))
21960 return true;
21961 if (BestFailBitwidth == 0 && FinalAnalysis())
21962 BestFailBitwidth = BitWidth;
21963 }
21964 if (BitWidth >= OrigBitWidth) {
21965 if (BestFailBitwidth == 0) {
21966 BitWidth = OrigBitWidth;
21967 return false;
21968 }
21969 MaxDepthLevel = 1;
21970 BitWidth = BestFailBitwidth;
21971 NeedToExit = true;
21972 return true;
21973 }
21974 return false;
21975 };
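// E.g., starting from BitWidth = 8 with OrigBitWidth = 64, the helper above
// tries 8, 16 and 32 and stops at the first width the checker accepts. If
// none is accepted, it either restores OrigBitWidth or falls back to the
// smallest width for which FinalAnalysis succeeded and asks the caller to
// stop descending via NeedToExit.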
21976 auto TryProcessInstruction =
21977 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
21978 function_ref<bool(unsigned, unsigned)> Checker = {}) {
21979 if (Operands.empty()) {
21980 if (!IsTruncRoot)
21981 MaxDepthLevel = 1;
21982 for (Value *V : E.Scalars)
21983 (void)IsPotentiallyTruncated(V, BitWidth);
21984 } else {
21985 // Several vectorized uses? Check if we can truncate it, otherwise -
21986 // exit.
21987 if (any_of(E.Scalars, [&](Value *V) {
21988 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
21989 }))
21990 return false;
21991 bool NeedToExit = false;
21992 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
21993 return false;
21994 if (NeedToExit)
21995 return true;
21996 if (!ProcessOperands(Operands, NeedToExit))
21997 return false;
21998 if (NeedToExit)
21999 return true;
22000 }
22001
22002 ++MaxDepthLevel;
22003 // Record the entry that we can demote.
22004 ToDemote.push_back(E.Idx);
22005 return IsProfitableToDemote;
22006 };
22007
22008 if (E.State == TreeEntry::SplitVectorize)
22009 return TryProcessInstruction(
22010 BitWidth,
22011 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22012 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22013
22014 switch (E.getOpcode()) {
22015
22016 // We can always demote truncations and extensions. Since truncations can
22017 // seed additional demotion, we save the truncated value.
22018 case Instruction::Trunc:
22019 if (IsProfitableToDemoteRoot)
22020 IsProfitableToDemote = true;
22021 return TryProcessInstruction(BitWidth);
22022 case Instruction::ZExt:
22023 case Instruction::SExt:
22024 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22025 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22026 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22027 return false;
22028 IsProfitableToDemote = true;
22029 return TryProcessInstruction(BitWidth);
22030
22031 // We can demote certain binary operations if we can demote both of their
22032 // operands.
22033 case Instruction::Add:
22034 case Instruction::Sub:
22035 case Instruction::Mul:
22036 case Instruction::And:
22037 case Instruction::Or:
22038 case Instruction::Xor: {
22039 return TryProcessInstruction(
22040 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22041 }
22042 case Instruction::Freeze:
22043 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22044 case Instruction::Shl: {
22045 // If we are truncating the result of this SHL, and if it's a shift of an
22046 // inrange amount, we can always perform a SHL in a smaller type.
22047 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22048 return all_of(E.Scalars, [&](Value *V) {
22049 if (isa<PoisonValue>(V))
22050 return true;
22051 if (E.isCopyableElement(V))
22052 return true;
22053 auto *I = cast<Instruction>(V);
22054 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22055 return AmtKnownBits.getMaxValue().ult(BitWidth);
22056 });
22057 };
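// E.g., demoting `shl i32 %x, 3` to 16 bits is fine because the shift amount
// 3 is known to be less than 16, so trunc(shl i32 %x, 3) equals
// shl i16 (trunc %x), 3.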
22058 return TryProcessInstruction(
22059 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22060 }
22061 case Instruction::LShr: {
22062 // If this is a truncate of a logical shr, we can truncate it to a smaller
22063 // lshr iff we know that the bits we would otherwise be shifting in are
22064 // already zeros.
22065 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22066 return all_of(E.Scalars, [&](Value *V) {
22067 if (isa<PoisonValue>(V))
22068 return true;
22069 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22070 if (E.isCopyableElement(V))
22071 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22072 auto *I = cast<Instruction>(V);
22073 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22074 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22075 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22076 SimplifyQuery(*DL));
22077 });
22078 };
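// E.g., `lshr i32 %x, 4` can be demoted to 16 bits when bits 16..31 of %x are
// known to be zero: the bits shifted in are zero in both widths, so the
// narrow lshr produces the same low 16 bits.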
22079 return TryProcessInstruction(
22080 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22081 LShrChecker);
22082 }
22083 case Instruction::AShr: {
22084 // If this is a truncate of an arithmetic shr, we can truncate it to a
22085 // smaller ashr iff we know that all the bits from the sign bit of the
22086 // original type and the sign bit of the truncate type are similar.
22087 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22088 return all_of(E.Scalars, [&](Value *V) {
22089 if (isa<PoisonValue>(V))
22090 return true;
22091 auto *I = cast<Instruction>(V);
22092 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22093 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22094 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22095 ShiftedBits <
22096 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22097 });
22098 };
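// E.g., `ashr i32 %x, 2` can be demoted to 16 bits when %x has more than 16
// sign bits (i.e. it already fits in i16 as a signed value): the bits shifted
// in then equal the sign bit in both the wide and the narrow shift.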
22099 return TryProcessInstruction(
22100 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22101 AShrChecker);
22102 }
22103 case Instruction::UDiv:
22104 case Instruction::URem: {
22105 // UDiv and URem can be truncated if all the truncated bits are zero.
22106 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22107 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22108 return all_of(E.Scalars, [&](Value *V) {
22109 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22110 if (E.hasCopyableElements() && E.isCopyableElement(V))
22111 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22112 auto *I = cast<Instruction>(V);
22113 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22114 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22115 });
22116 };
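// E.g., `udiv i32 %a, %b` can be demoted to 16 bits when bits 16..31 of both
// operands are known zero: the quotient (and remainder) of two values below
// 2^16 also fits in 16 bits, so the narrow division gives the same result.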
22117 return TryProcessInstruction(
22118 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22119 }
22120
22121 // We can demote selects if we can demote their true and false values.
22122 case Instruction::Select: {
22123 return TryProcessInstruction(
22124 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22125 }
22126
22127 // We can demote phis if we can demote all their incoming operands.
22128 case Instruction::PHI: {
22129 const unsigned NumOps = E.getNumOperands();
22130 SmallVector<const TreeEntry *> Ops(NumOps);
22131 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22132 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22133
22134 return TryProcessInstruction(BitWidth, Ops);
22135 }
22136
22137 case Instruction::Call: {
22138 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22139 if (!IC)
22140 break;
22141 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22142 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22143 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22144 break;
22145 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22146 function_ref<bool(unsigned, unsigned)> CallChecker;
22147 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22148 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22149 return all_of(E.Scalars, [&](Value *V) {
22150 auto *I = cast<Instruction>(V);
22151 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22152 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22153 return MaskedValueIsZero(I->getOperand(0), Mask,
22154 SimplifyQuery(*DL)) &&
22155 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22156 }
22157 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22158 "Expected min/max intrinsics only.");
22159 unsigned SignBits = OrigBitWidth - BitWidth;
22160 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22161 unsigned Op0SignBits =
22162 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22163 unsigned Op1SignBits =
22164 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22165 return SignBits <= Op0SignBits &&
22166 ((SignBits != Op0SignBits &&
22167 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22168 MaskedValueIsZero(I->getOperand(0), Mask,
22169 SimplifyQuery(*DL))) &&
22170 SignBits <= Op1SignBits &&
22171 ((SignBits != Op1SignBits &&
22172 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22173 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22174 });
22175 };
22176 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22177 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22178 return all_of(E.Scalars, [&](Value *V) {
22179 auto *I = cast<Instruction>(V);
22180 unsigned SignBits = OrigBitWidth - BitWidth;
22181 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22182 unsigned Op0SignBits =
22183 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22184 return SignBits <= Op0SignBits &&
22185 ((SignBits != Op0SignBits &&
22186 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22187 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22188 });
22189 };
22190 if (ID != Intrinsic::abs) {
22191 Operands.push_back(getOperandEntry(&E, 1));
22192 CallChecker = CompChecker;
22193 } else {
22194 CallChecker = AbsChecker;
22195 }
22196 InstructionCost BestCost =
22197 std::numeric_limits<InstructionCost::CostType>::max();
22198 unsigned BestBitWidth = BitWidth;
22199 unsigned VF = E.Scalars.size();
22200 // Choose the best bitwidth based on cost estimations.
22201 auto Checker = [&](unsigned BitWidth, unsigned) {
22202 unsigned MinBW = PowerOf2Ceil(BitWidth);
22203 SmallVector<Type *> ArgTys =
22204 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22205 auto VecCallCosts = getVectorCallCosts(
22206 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22207 TTI, TLI, ArgTys);
22208 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22209 if (Cost < BestCost) {
22210 BestCost = Cost;
22211 BestBitWidth = BitWidth;
22212 }
22213 return false;
22214 };
22215 [[maybe_unused]] bool NeedToExit;
22216 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22217 BitWidth = BestBitWidth;
22218 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22219 }
22220
22221 // Otherwise, conservatively give up.
22222 default:
22223 break;
22224 }
22225 MaxDepthLevel = 1;
22226 return FinalAnalysis();
22227}
22228
22229static RecurKind getRdxKind(Value *V);
22230
22231 void BoUpSLP::computeMinimumValueSizes() {
22232 // We only attempt to truncate integer expressions.
22233 bool IsStoreOrInsertElt =
22234 VectorizableTree.front()->hasState() &&
22235 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22236 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22237 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22238 ExtraBitWidthNodes.size() <= 1 &&
22239 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22240 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22241 return;
22242
22243 unsigned NodeIdx = 0;
22244 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22245 NodeIdx = 1;
22246
22247 // Ensure the roots of the vectorizable tree don't form a cycle.
22248 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22249 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22250 "Unexpected cycle in the vectorizable tree.");
22251
22252 // If the first value node for a store/insertelement is a sext/zext/trunc,
22253 // skip it and resize to the final type.
22254 bool IsTruncRoot = false;
22255 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22256 SmallVector<unsigned> RootDemotes;
22257 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22258 if (NodeIdx != 0 &&
22259 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22260 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22261 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22262 IsTruncRoot = true;
22263 RootDemotes.push_back(NodeIdx);
22264 IsProfitableToDemoteRoot = true;
22265 ++NodeIdx;
22266 }
22267
22268 // The reduction was already analyzed and found not profitable - exit.
22269 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22270 return;
22271
22272 SmallVector<unsigned> ToDemote;
22273 auto ComputeMaxBitWidth =
22274 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22275 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22276 ToDemote.clear();
22277 // If the root is a trunc and the next node is a gather/buildvector, keep the
22278 // trunc in scalars, which is free in most cases.
22279 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22280 !NodesToKeepBWs.contains(E.Idx) &&
22281 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22282 all_of(E.Scalars, [&](Value *V) {
22283 return V->hasOneUse() || isa<Constant>(V) ||
22284 (!V->hasNUsesOrMore(UsesLimit) &&
22285 none_of(V->users(), [&](User *U) {
22286 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22287 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22288 if (TEs.empty() || is_contained(TEs, UserTE))
22289 return false;
22290 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22291 SelectInst>(U) ||
22292 isa<SIToFPInst, UIToFPInst>(U) ||
22293 (UserTE->hasState() &&
22294 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22295 SelectInst>(UserTE->getMainOp()) ||
22296 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22297 return true;
22298 unsigned UserTESz = DL->getTypeSizeInBits(
22299 UserTE->Scalars.front()->getType());
22300 if (all_of(TEs, [&](const TreeEntry *TE) {
22301 auto It = MinBWs.find(TE);
22302 return It != MinBWs.end() &&
22303 It->second.first > UserTESz;
22304 }))
22305 return true;
22306 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22307 }));
22308 })) {
22309 ToDemote.push_back(E.Idx);
22310 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22311 auto It = MinBWs.find(UserTE);
22312 if (It != MinBWs.end())
22313 return It->second.first;
22314 unsigned MaxBitWidth =
22315 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22316 MaxBitWidth = bit_ceil(MaxBitWidth);
22317 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22318 MaxBitWidth = 8;
22319 return MaxBitWidth;
22320 }
22321
22322 if (!E.hasState())
22323 return 0u;
22324
22325 unsigned VF = E.getVectorFactor();
22326 Type *ScalarTy = E.Scalars.front()->getType();
22327 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22328 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22329 if (!TreeRootIT)
22330 return 0u;
22331
22332 if (any_of(E.Scalars,
22333 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22334 return 0u;
22335
22336 unsigned NumParts = ::getNumberOfParts(
22337 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22338
22339 // The maximum bit width required to represent all the values that can be
22340 // demoted without loss of precision. It would be safe to truncate the roots
22341 // of the expression to this width.
22342 unsigned MaxBitWidth = 1u;
22343
22344 // True if the roots can be zero-extended back to their original type,
22345 // rather than sign-extended. We know that if the leading bits are not
22346 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22347 // True.
22348 // Determine if the sign bit of all the roots is known to be zero. If not,
22349 // IsKnownPositive is set to False.
22350 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22351 if (isa<PoisonValue>(R))
22352 return true;
22353 KnownBits Known = computeKnownBits(R, *DL);
22354 return Known.isNonNegative();
22355 });
22356
22357 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22358 E.UserTreeIndex.UserTE->hasState() &&
22359 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22360 MaxBitWidth =
22361 std::min(DL->getTypeSizeInBits(
22362 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22363 DL->getTypeSizeInBits(ScalarTy));
22364
22365 // We first check if all the bits of the roots are demanded. If they're not,
22366 // we can truncate the roots to this narrower type.
22367 for (Value *Root : E.Scalars) {
22368 if (isa<PoisonValue>(Root))
22369 continue;
22370 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22371 TypeSize NumTypeBits =
22372 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22373 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22374 // If we can't prove that the sign bit is zero, we must add one to the
22375 // maximum bit width to account for the unknown sign bit. This preserves
22376 // the existing sign bit so we can safely sign-extend the root back to the
22377 // original type. Otherwise, if we know the sign bit is zero, we will
22378 // zero-extend the root instead.
22379 //
22380 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22381 // one to the maximum bit width will yield a larger-than-necessary
22382 // type. In general, we need to add an extra bit only if we can't
22383 // prove that the upper bit of the original type is equal to the
22384 // upper bit of the proposed smaller type. If these two bits are
22385 // the same (either zero or one) we know that sign-extending from
22386 // the smaller type will result in the same value. Here, since we
22387 // can't yet prove this, we are just making the proposed smaller
22388 // type larger to ensure correctness.
22389 if (!IsKnownPositive)
22390 ++BitWidth1;
22391
22392 auto *I = dyn_cast<Instruction>(Root);
22393 if (!I) {
22394 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22395 continue;
22396 }
22397 APInt Mask = DB->getDemandedBits(I);
22398 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22399 MaxBitWidth =
22400 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22401 }
22402
22403 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22404 MaxBitWidth = 8;
22405
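// Illustrative example (not part of the pass): assume a single i32 root for
// which ComputeNumSignBits reports 20 sign bits and DemandedBits reports the
// mask 0x000000FF. Then
//   BitWidth1 = 32 - 20 = 12 (+1 if the sign bit is not known zero),
//   BitWidth2 = 32 - countl_zero(0x000000FF) = 32 - 24 = 8,
// so MaxBitWidth = max(min(BitWidth1, BitWidth2), MaxBitWidth) and the root
// could be represented in 8 bits (the clamp above rounds any width between 2
// and 7 up to 8).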
22406 // If the original type is large but the reduced type does not improve
22407 // register usage, ignore it.
22408 if (NumParts > 1 &&
22409 NumParts ==
22410 ::getNumberOfParts(
22411 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22412 bit_ceil(MaxBitWidth)),
22413 VF)))
22414 return 0u;
22415
22416 unsigned Opcode = E.getOpcode();
22417 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22418 Opcode == Instruction::SExt ||
22419 Opcode == Instruction::ZExt || NumParts > 1;
22420 // Conservatively determine if we can actually truncate the roots of the
22421 // expression. Collect the values that can be demoted in ToDemote and
22422 // additional roots that require investigating in Roots.
22423 DenseSet<const TreeEntry *> Visited;
22424 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22425 bool NeedToDemote = IsProfitableToDemote;
22426
22427 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22428 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22429 NeedToDemote, IsTruncRoot) ||
22430 (MaxDepthLevel <= Limit &&
22431 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22432 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22433 DL->getTypeSizeInBits(TreeRootIT) /
22434 DL->getTypeSizeInBits(
22435 E.getMainOp()->getOperand(0)->getType()) >
22436 2)))))
22437 return 0u;
22438 // Round MaxBitWidth up to the next power-of-two.
22439 MaxBitWidth = bit_ceil(MaxBitWidth);
22440
22441 return MaxBitWidth;
22442 };
22443
22444 // If we can truncate the root, we must collect additional values that might
22445 // be demoted as a result. That is, those seeded by truncations we will
22446 // modify.
22447 // Add reduction ops sizes, if any.
22448 if (UserIgnoreList &&
22449 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22450 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22451 // x i1> to iN)).
22452 if (all_of(*UserIgnoreList,
22453 [](Value *V) {
22454 return isa<PoisonValue>(V) ||
22455 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22456 }) &&
22457 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22458 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22459 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22460 Builder.getInt1Ty()) {
22461 ReductionBitWidth = 1;
22462 } else {
22463 for (Value *V : *UserIgnoreList) {
22464 if (isa<PoisonValue>(V))
22465 continue;
22466 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22467 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22468 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22469 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22470 ++BitWidth1;
22471 unsigned BitWidth2 = BitWidth1;
22474 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22475 }
22476 ReductionBitWidth =
22477 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22478 }
22479 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22480 ReductionBitWidth = 8;
22481
22482 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22483 }
22484 }
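// Illustrative example (not part of the pass): for the special case handled
// above, a reduction such as
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// only ever needs one bit per lane, so ReductionBitWidth is set to 1 and the
// sum can later be formed as a population count of the bitcast mask:
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %c to i32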
22485 bool IsTopRoot = NodeIdx == 0;
22486 while (NodeIdx < VectorizableTree.size() &&
22487 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22488 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22489 RootDemotes.push_back(NodeIdx);
22490 ++NodeIdx;
22491 IsTruncRoot = true;
22492 }
22493 bool IsSignedCmp = false;
22494 if (UserIgnoreList &&
22495 all_of(*UserIgnoreList,
22497 m_SMax(m_Value(), m_Value())))))
22498 IsSignedCmp = true;
22499 while (NodeIdx < VectorizableTree.size()) {
22500 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22501 unsigned Limit = 2;
22502 if (IsTopRoot &&
22503 ReductionBitWidth ==
22504 DL->getTypeSizeInBits(
22505 VectorizableTree.front()->Scalars.front()->getType()))
22506 Limit = 3;
22507 unsigned MaxBitWidth = ComputeMaxBitWidth(
22508 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22509 IsTruncRoot, IsSignedCmp);
22510 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22511 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22512 ReductionBitWidth = bit_ceil(MaxBitWidth);
22513 else if (MaxBitWidth == 0)
22514 ReductionBitWidth = 0;
22515 }
22516
22517 for (unsigned Idx : RootDemotes) {
22518 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22519 uint32_t OrigBitWidth =
22520 DL->getTypeSizeInBits(V->getType()->getScalarType());
22521 if (OrigBitWidth > MaxBitWidth) {
22522 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22523 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22524 }
22525 return false;
22526 }))
22527 ToDemote.push_back(Idx);
22528 }
22529 RootDemotes.clear();
22530 IsTopRoot = false;
22531 IsProfitableToDemoteRoot = true;
22532
22533 if (ExtraBitWidthNodes.empty()) {
22534 NodeIdx = VectorizableTree.size();
22535 } else {
22536 unsigned NewIdx = 0;
22537 do {
22538 NewIdx = *ExtraBitWidthNodes.begin();
22539 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22540 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22541 NodeIdx = NewIdx;
22542 IsTruncRoot =
22543 NodeIdx < VectorizableTree.size() &&
22544 VectorizableTree[NodeIdx]->UserTreeIndex &&
22545 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22546 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22547 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22548 Instruction::Trunc &&
22549 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22550 IsSignedCmp =
22551 NodeIdx < VectorizableTree.size() &&
22552 VectorizableTree[NodeIdx]->UserTreeIndex &&
22553 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22554 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22555 Instruction::ICmp &&
22556 any_of(
22557 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22558 [&](Value *V) {
22559 auto *IC = dyn_cast<ICmpInst>(V);
22560 return IC && (IC->isSigned() ||
22561 !isKnownNonNegative(IC->getOperand(0),
22562 SimplifyQuery(*DL)) ||
22563 !isKnownNonNegative(IC->getOperand(1),
22564 SimplifyQuery(*DL)));
22565 });
22566 }
22567
22568 // If the maximum bit width we compute is less than the width of the roots'
22569 // type, we can proceed with the narrowing. Otherwise, do nothing.
22570 if (MaxBitWidth == 0 ||
22571 MaxBitWidth >=
22572 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22573 ->getBitWidth()) {
22574 if (UserIgnoreList)
22575 AnalyzedMinBWVals.insert_range(TreeRoot);
22576 NodesToKeepBWs.insert_range(ToDemote);
22577 continue;
22578 }
22579
22580 // Finally, map the values we can demote to the maximum bit width we
22581 // computed.
22582 for (unsigned Idx : ToDemote) {
22583 TreeEntry *TE = VectorizableTree[Idx].get();
22584 if (MinBWs.contains(TE))
22585 continue;
22586 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22587 if (isa<PoisonValue>(R))
22588 return false;
22589 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22590 });
22591 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22592 }
22593 }
22594}
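// Illustrative summary (not part of the pass): after computeMinimumValueSizes
// runs, MinBWs maps a demotable TreeEntry to the pair (MaxBitWidth, IsSigned),
// e.g. an entry whose i32 scalars only ever carry 8 meaningful bits and may be
// negative would be recorded as (8, /*IsSigned=*/true). Roughly speaking, code
// generation can then emit the vector operations in the narrow type and extend
// back to the original width at the tree boundary.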
22595
22596 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22597 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22598 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22599 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22600 auto *AA = &AM.getResult<AAManager>(F);
22601 auto *LI = &AM.getResult<LoopAnalysis>(F);
22602 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22603 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22604 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22605 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22606
22607 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22608 if (!Changed)
22609 return PreservedAnalyses::all();
22610
22611 PreservedAnalyses PA;
22612 PA.preserveSet<CFGAnalyses>();
22613 return PA;
22614}
22615
22616 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22617 TargetTransformInfo *TTI_,
22618 TargetLibraryInfo *TLI_, AAResults *AA_,
22619 LoopInfo *LI_, DominatorTree *DT_,
22620 AssumptionCache *AC_, DemandedBits *DB_,
22621 OptimizationRemarkEmitter *ORE_) {
22622 if (!RunSLPVectorization)
22623 return false;
22624 SE = SE_;
22625 TTI = TTI_;
22626 TLI = TLI_;
22627 AA = AA_;
22628 LI = LI_;
22629 DT = DT_;
22630 AC = AC_;
22631 DB = DB_;
22632 DL = &F.getDataLayout();
22633
22634 Stores.clear();
22635 GEPs.clear();
22636 bool Changed = false;
22637
22638 // If the target claims to have no vector registers, don't attempt
22639 // vectorization.
22640 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22641 LLVM_DEBUG(
22642 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22643 return false;
22644 }
22645
22646 // Don't vectorize when the attribute NoImplicitFloat is used.
22647 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22648 return false;
22649
22650 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22651
22652 // Use the bottom-up SLP vectorizer to construct chains that start with
22653 // store instructions.
22654 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22655
22656 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22657 // delete instructions.
22658
22659 // Update DFS numbers now so that we can use them for ordering.
22660 DT->updateDFSNumbers();
22661
22662 // Scan the blocks in the function in post order.
22663 for (auto *BB : post_order(&F.getEntryBlock())) {
22664 if (!DT->isReachableFromEntry(BB))
22665 continue;
22666
22667 // Start new block - clear the list of reduction roots.
22668 R.clearReductionData();
22669 collectSeedInstructions(BB);
22670
22671 // Vectorize trees that end at stores.
22672 if (!Stores.empty()) {
22673 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22674 << " underlying objects.\n");
22675 Changed |= vectorizeStoreChains(R);
22676 }
22677
22678 // Vectorize trees that end at reductions.
22679 Changed |= vectorizeChainsInBlock(BB, R);
22680
22681 // Vectorize the index computations of getelementptr instructions. This
22682 // is primarily intended to catch gather-like idioms ending at
22683 // non-consecutive loads.
22684 if (!GEPs.empty()) {
22685 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22686 << " underlying objects.\n");
22687 Changed |= vectorizeGEPIndices(BB, R);
22688 }
22689 }
22690
22691 if (Changed) {
22692 R.optimizeGatherSequence();
22693 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22694 }
22695 return Changed;
22696}
22697
22698std::optional<bool>
22699SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22700 unsigned Idx, unsigned MinVF,
22701 unsigned &Size) {
22702 Size = 0;
22703 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22704 << "\n");
22705 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22706 unsigned VF = Chain.size();
22707
22708 if (!has_single_bit(Sz) ||
22709 !hasFullVectorsOrPowerOf2(
22710 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22711 VF) ||
22712 VF < 2 || VF < MinVF) {
22713 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22714 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22715 // all vector lanes are used.
22716 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22717 return false;
22718 }
22719
22720 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22721 << "\n");
22722
22723 SetVector<Value *> ValOps;
22724 for (Value *V : Chain)
22725 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22726 // Exit if the operands are not same/alternate-opcode instructions or the number of unique values is not a power of 2.
22727 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22728 InstructionsState S = Analysis.buildInstructionsState(
22729 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22730 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22731 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22732 bool IsAllowedSize =
22733 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22734 ValOps.size()) ||
22735 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22736 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22737 (!S.getMainOp()->isSafeToRemove() ||
22738 any_of(ValOps.getArrayRef(),
22739 [&](Value *V) {
22740 return !isa<ExtractElementInst>(V) &&
22741 (V->getNumUses() > Chain.size() ||
22742 any_of(V->users(), [&](User *U) {
22743 return !Stores.contains(U);
22744 }));
22745 }))) ||
22746 (ValOps.size() > Chain.size() / 2 && !S)) {
22747 Size = (!IsAllowedSize && S) ? 1 : 2;
22748 return false;
22749 }
22750 }
22751 if (R.isLoadCombineCandidate(Chain))
22752 return true;
22753 R.buildTree(Chain);
22754 // Check if the tree is tiny and the store itself or its value is not vectorized.
22755 if (R.isTreeTinyAndNotFullyVectorizable()) {
22756 if (R.isGathered(Chain.front()) ||
22757 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22758 return std::nullopt;
22759 Size = R.getCanonicalGraphSize();
22760 return false;
22761 }
22762 if (R.isProfitableToReorder()) {
22763 R.reorderTopToBottom();
22764 R.reorderBottomToTop();
22765 }
22766 R.transformNodes();
22767 R.buildExternalUses();
22768
22769 R.computeMinimumValueSizes();
22770
22771 Size = R.getCanonicalGraphSize();
22772 if (S && S.getOpcode() == Instruction::Load)
22773 Size = 2; // cut off masked gather small trees
22774 InstructionCost Cost = R.getTreeCost();
22775
22776 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22777 if (Cost < -SLPCostThreshold) {
22778 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22779
22780 using namespace ore;
22781
22782 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22783 cast<StoreInst>(Chain[0]))
22784 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22785 << " and with tree size "
22786 << NV("TreeSize", R.getTreeSize()));
22787
22788 R.vectorizeTree();
22789 return true;
22790 }
22791
22792 return false;
22793}
22794
22795 /// Checks that the quadratic mean deviation of the (non-unit) tree sizes is small relative to the mean size, i.e. Dev * 96 < Mean * Mean.
22796static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22797 bool First) {
22798 unsigned Num = 0;
22799 uint64_t Sum = std::accumulate(
22800 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22801 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22802 unsigned Size = First ? Val.first : Val.second;
22803 if (Size == 1)
22804 return V;
22805 ++Num;
22806 return V + Size;
22807 });
22808 if (Num == 0)
22809 return true;
22810 uint64_t Mean = Sum / Num;
22811 if (Mean == 0)
22812 return true;
22813 uint64_t Dev = std::accumulate(
22814 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22815 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22816 unsigned P = First ? Val.first : Val.second;
22817 if (P == 1)
22818 return V;
22819 return V + (P - Mean) * (P - Mean);
22820 }) /
22821 Num;
22822 return Dev * 96 / (Mean * Mean) == 0;
22823}
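// Illustrative example (not part of the pass): for recorded sizes {4, 4, 4, 4}
// the mean is 4 and the deviation term is 0, so checkTreeSizes returns true.
// For sizes {2, 8} the mean is 5 and Dev = ((2-5)^2 + (8-5)^2) / 2 = 9, so
// Dev * 96 / (Mean * Mean) = 864 / 25 = 34 != 0 and the check fails, i.e. the
// previously built trees are considered too dissimilar and the caller skips
// this slice.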
22824
22825namespace {
22826
22827/// A group of stores that we'll try to bundle together using vector ops.
22828/// They are ordered using the signed distance of their address operand to the
22829/// address of this group's BaseInstr.
22830class RelatedStoreInsts {
22831public:
22832 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22833 : AllStores(AllStores) {
22834 reset(BaseInstrIdx);
22835 }
22836
22837 void reset(unsigned NewBaseInstr) {
22838 assert(NewBaseInstr < AllStores.size() &&
22839 "Instruction index out of bounds");
22840 BaseInstrIdx = NewBaseInstr;
22841 Instrs.clear();
22842 insertOrLookup(NewBaseInstr, 0);
22843 }
22844
22845 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22846 /// \p PtrDist.
22847 /// Does nothing if there is already a store with that \p PtrDist.
22848 /// \returns The previously associated Instruction index, or std::nullopt
22849 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22850 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22851 return Inserted ? std::nullopt : std::make_optional(It->second);
22852 }
22853
22854 using DistToInstMap = std::map<int64_t, unsigned>;
22855 const DistToInstMap &getStores() const { return Instrs; }
22856
22857 /// If \p SI is related to this group of stores, return the distance of its
22858 /// pointer operand to that of the group's BaseInstr.
22859 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22860 ScalarEvolution &SE) const {
22861 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22862 return getPointersDiff(
22863 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22864 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22865 /*StrictCheck=*/true);
22866 }
22867
22868 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22869 /// Stores whose index is less than \p MinSafeIdx will be dropped.
22870 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22871 int64_t DistFromCurBase) {
22872 DistToInstMap PrevSet = std::move(Instrs);
22873 reset(NewBaseInstIdx);
22874
22875 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22876 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22877 // reference.
22878 for (auto [Dist, InstIdx] : PrevSet) {
22879 if (InstIdx >= MinSafeIdx)
22880 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22881 }
22882 }
22883
22884 /// Remove all stores that have been vectorized from this group.
22885 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22886 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22887 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22888 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22889 });
22890
22891 // Get a forward iterator pointing after the last vectorized store and erase
22892 // all stores before it so we don't try to vectorize them again.
22893 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22894 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22895 }
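// Illustrative note (not part of the pass): find_if over reverse(Instrs)
// yields a reverse_iterator whose .base() points one element *after* the
// match in forward order. E.g. with distances {-2, -1, 0, 3} and the store at
// distance 0 being the last vectorized one, base() points at the {3, ...}
// entry, so the erase above removes {-2, -1, 0} and keeps only {3, ...}.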
22896
22897private:
22898 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22899 unsigned BaseInstrIdx;
22900
22901 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22902 DistToInstMap Instrs;
22903
22904 /// Reference to all the stores in the BB being analyzed.
22905 ArrayRef<StoreInst *> AllStores;
22906};
22907
22908} // end anonymous namespace
22909
22910bool SLPVectorizerPass::vectorizeStores(
22911 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22912 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
22913 &Visited) {
22914 // We may run into multiple chains that merge into a single chain. We mark the
22915 // stores that we vectorized so that we don't visit the same store twice.
22916 BoUpSLP::ValueSet VectorizedStores;
22917 bool Changed = false;
22918
22919 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
22920 int64_t PrevDist = -1;
22921 SmallVector<Value *> Operands;
22922 // Collect the chain into a list.
22923 for (auto [Idx, Data] : enumerate(StoreSeq)) {
22924 auto &[Dist, InstIdx] = Data;
22925 if (Operands.empty() || Dist - PrevDist == 1) {
22926 Operands.push_back(Stores[InstIdx]);
22927 PrevDist = Dist;
22928 if (Idx != StoreSeq.size() - 1)
22929 continue;
22930 }
22931 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
22932 Operands.clear();
22933 Operands.push_back(Stores[InstIdx]);
22934 PrevDist = Dist;
22935 });
22936
22937 if (Operands.size() <= 1 ||
22938 !Visited
22939 .insert({Operands.front(),
22940 cast<StoreInst>(Operands.front())->getValueOperand(),
22941 Operands.back(),
22942 cast<StoreInst>(Operands.back())->getValueOperand(),
22943 Operands.size()})
22944 .second)
22945 continue;
22946
22947 unsigned MaxVecRegSize = R.getMaxVecRegSize();
22948 unsigned EltSize = R.getVectorElementSize(Operands[0]);
22949 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
22950
22951 unsigned MaxVF =
22952 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
22953 auto *Store = cast<StoreInst>(Operands[0]);
22954 Type *StoreTy = Store->getValueOperand()->getType();
22955 Type *ValueTy = StoreTy;
22956 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
22957 ValueTy = Trunc->getSrcTy();
22958 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
22959 // getStoreMinimumVF only supports scalar types as arguments. As a result,
22960 // we need to use the element types of StoreTy and ValueTy to retrieve the
22961 // VF and then transform it back.
22962 // Remember: VF is defined as the number of values we want to vectorize, not
22963 // the number of elements in the final vector.
22964 Type *StoreScalarTy = StoreTy->getScalarType();
22965 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
22966 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
22967 ValueTy->getScalarType()));
22968 MinVF /= getNumElements(StoreTy);
22969 MinVF = std::max<unsigned>(2, MinVF);
22970
22971 if (MaxVF < MinVF) {
22972 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22973 << ") < "
22974 << "MinVF (" << MinVF << ")\n");
22975 continue;
22976 }
22977
22978 unsigned NonPowerOf2VF = 0;
22979 if (VectorizeNonPowerOf2) {
22980 // First try vectorizing with a non-power-of-2 VF. At the moment, only
22981 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
22982 // lanes are used.
22983 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
22984 if (has_single_bit(CandVF + 1)) {
22985 NonPowerOf2VF = CandVF;
22986 assert(NonPowerOf2VF != MaxVF &&
22987 "Non-power-of-2 VF should not be equal to MaxVF");
22988 }
22989 }
22990
22991 // MaxRegVF represents the number of instructions (scalar, or vector in
22992 // case of revec) that can be vectorized to naturally fit in a vector
22993 // register.
22994 unsigned MaxRegVF = MaxVF;
22995
22996 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
22997 if (MaxVF < MinVF) {
22998 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22999 << ") < "
23000 << "MinVF (" << MinVF << ")\n");
23001 continue;
23002 }
23003
23004 SmallVector<unsigned> CandidateVFs;
23005 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23006 VF = divideCeil(VF, 2))
23007 CandidateVFs.push_back(VF);
23008
23009 unsigned End = Operands.size();
23010 unsigned Repeat = 0;
23011 constexpr unsigned MaxAttempts = 4;
23012 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23013 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23014 P.first = P.second = 1;
23015 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23016 auto IsNotVectorized = [](bool First,
23017 const std::pair<unsigned, unsigned> &P) {
23018 return First ? P.first > 0 : P.second > 0;
23019 };
23020 auto IsVectorized = [](bool First,
23021 const std::pair<unsigned, unsigned> &P) {
23022 return First ? P.first == 0 : P.second == 0;
23023 };
23024 auto VFIsProfitable = [](bool First, unsigned Size,
23025 const std::pair<unsigned, unsigned> &P) {
23026 return First ? Size >= P.first : Size >= P.second;
23027 };
23028 auto FirstSizeSame = [](unsigned Size,
23029 const std::pair<unsigned, unsigned> &P) {
23030 return Size == P.first;
23031 };
23032 while (true) {
23033 ++Repeat;
23034 bool RepeatChanged = false;
23035 bool AnyProfitableGraph = false;
23036 for (unsigned VF : CandidateVFs) {
23037 AnyProfitableGraph = false;
23038 unsigned FirstUnvecStore =
23039 std::distance(RangeSizes.begin(),
23040 find_if(RangeSizes, std::bind(IsNotVectorized,
23041 VF >= MaxRegVF, _1)));
23042
23043 // Form slices of size VF starting from FirstUnvecStore and try to
23044 // vectorize them.
23045 while (FirstUnvecStore < End) {
23046 unsigned FirstVecStore = std::distance(
23047 RangeSizes.begin(),
23048 find_if(RangeSizes.drop_front(FirstUnvecStore),
23049 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23050 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23051 for (unsigned SliceStartIdx = FirstUnvecStore;
23052 SliceStartIdx + VF <= MaxSliceEnd;) {
23053 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23054 VF >= MaxRegVF)) {
23055 ++SliceStartIdx;
23056 continue;
23057 }
23058 ArrayRef<Value *> Slice =
23059 ArrayRef(Operands).slice(SliceStartIdx, VF);
23060 assert(all_of(Slice,
23061 [&](Value *V) {
23062 return cast<StoreInst>(V)
23063 ->getValueOperand()
23064 ->getType() ==
23065 cast<StoreInst>(Slice.front())
23066 ->getValueOperand()
23067 ->getType();
23068 }) &&
23069 "Expected all operands of same type.");
23070 if (!NonSchedulable.empty()) {
23071 auto [NonSchedSizeMax, NonSchedSizeMin] =
23072 NonSchedulable.lookup(Slice.front());
23073 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23074 // VF is too ambitious. Try to vectorize another slice before
23075 // trying a smaller VF.
23076 SliceStartIdx += NonSchedSizeMax;
23077 continue;
23078 }
23079 }
23080 unsigned TreeSize;
23081 std::optional<bool> Res =
23082 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23083 if (!Res) {
23084 // Update the range of non schedulable VFs for slices starting
23085 // at SliceStartIdx.
23086 NonSchedulable
23087 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23088 .first->getSecond()
23089 .second = VF;
23090 } else if (*Res) {
23091 // Mark the vectorized stores so that we don't vectorize them
23092 // again.
23093 VectorizedStores.insert_range(Slice);
23096 AnyProfitableGraph = RepeatChanged = Changed = true;
23097 // If we vectorized initial block, no need to try to vectorize
23098 // it again.
23099 for (std::pair<unsigned, unsigned> &P :
23100 RangeSizes.slice(SliceStartIdx, VF))
23101 P.first = P.second = 0;
23102 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23103 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23104 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23105 P.first = P.second = 0;
23106 FirstUnvecStore = SliceStartIdx + VF;
23107 }
23108 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23109 for (std::pair<unsigned, unsigned> &P :
23110 RangeSizes.slice(SliceStartIdx + VF,
23111 MaxSliceEnd - (SliceStartIdx + VF)))
23112 P.first = P.second = 0;
23113 if (MaxSliceEnd == End)
23114 End = SliceStartIdx;
23115 MaxSliceEnd = SliceStartIdx;
23116 }
23117 SliceStartIdx += VF;
23118 continue;
23119 }
23120 if (VF > 2 && Res &&
23121 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23122 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23123 _1))) {
23124 SliceStartIdx += VF;
23125 continue;
23126 }
23127 // For very big VFs, check that we are not rebuilding the same
23128 // trees, just with a larger number of elements.
23129 if (VF > MaxRegVF && TreeSize > 1 &&
23130 all_of(RangeSizes.slice(SliceStartIdx, VF),
23131 std::bind(FirstSizeSame, TreeSize, _1))) {
23132 SliceStartIdx += VF;
23133 while (SliceStartIdx != MaxSliceEnd &&
23134 RangeSizes[SliceStartIdx].first == TreeSize)
23135 ++SliceStartIdx;
23136 continue;
23137 }
23138 if (TreeSize > 1) {
23139 for (std::pair<unsigned, unsigned> &P :
23140 RangeSizes.slice(SliceStartIdx, VF)) {
23141 if (VF >= MaxRegVF)
23142 P.second = std::max(P.second, TreeSize);
23143 else
23144 P.first = std::max(P.first, TreeSize);
23145 }
23146 }
23147 ++SliceStartIdx;
23148 AnyProfitableGraph = true;
23149 }
23150 if (FirstUnvecStore >= End)
23151 break;
23152 if (MaxSliceEnd - FirstUnvecStore < VF &&
23153 MaxSliceEnd - FirstUnvecStore >= MinVF)
23154 AnyProfitableGraph = true;
23155 FirstUnvecStore = std::distance(
23156 RangeSizes.begin(),
23157 find_if(RangeSizes.drop_front(MaxSliceEnd),
23158 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23159 }
23160 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23161 break;
23162 }
23163 // All values vectorized - exit.
23164 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23165 return P.first == 0 && P.second == 0;
23166 }))
23167 break;
23168 // Stop if all attempts were tried or the last attempts are not needed at all.
23169 if (Repeat >= MaxAttempts ||
23170 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23171 break;
23172 constexpr unsigned StoresLimit = 64;
23173 const unsigned MaxTotalNum = std::min<unsigned>(
23174 Operands.size(),
23175 static_cast<unsigned>(
23176 End -
23177 std::distance(
23178 RangeSizes.begin(),
23179 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23180 1));
23181 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23182 unsigned Limit =
23183 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23184 CandidateVFs.clear();
23185 if (bit_floor(Limit) == VF)
23186 CandidateVFs.push_back(Limit);
23187 if (VF > MaxTotalNum || VF >= StoresLimit)
23188 break;
23189 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23190 if (P.first != 0)
23191 P.first = std::max(P.second, P.first);
23192 }
23193 // Make a last attempt to vectorize the maximum number of elements, if all
23194 // previous attempts were unsuccessful because of cost issues.
23195 CandidateVFs.push_back(VF);
23196 }
23197 }
23198 };
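// Illustrative example (not part of the pass): the candidate VFs tried above
// form a halving ladder, "VF = divideCeil(VF, 2)", from the starting VF down
// to MinVF. With 7 pending stores, MinVF = 2 and VectorizeNonPowerOf2 enabled,
// the candidates are {7, 4, 2} (7 qualifies because 7 + 1 is a power of two);
// with the option disabled they are {4, 2}. Each VF is applied to slices of
// not-yet-vectorized stores, and a later repeat may retry with a larger VF if
// the previous attempts failed only for cost reasons.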
23199
23200 /// Groups of stores to vectorize
23201 SmallVector<RelatedStoreInsts> SortedStores;
23202
23203 // Inserts the specified store SI with the given index Idx into the set of
23204 // stores. If a store with the same distance has already been found, stop the
23205 // insertion and try to vectorize the stores found so far. If some stores from
23206 // this sequence were not vectorized, try to vectorize them together with the
23207 // new store later. This logic is applied only to the stores that come before
23208 // the previous store with the same distance.
23209 // Example:
23210 // 1. store x, %p
23211 // 2. store y, %p+1
23212 // 3. store z, %p+2
23213 // 4. store a, %p
23214 // 5. store b, %p+3
23215 // - Scan this from the last to first store. The very first bunch of stores is
23216 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23217 // vector).
23218 // - The next store in the list - #1 - has the same distance from store #5 as
23219 // the store #4.
23220 // - Try to vectorize sequence of stores 4,2,3,5.
23221 // - If all these stores are vectorized - just drop them.
23222 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23223 // - Start new stores sequence.
23224 // The new bunch of stores is {1, {1, 0}}.
23225 // - Add the stores from the previous sequence that were not vectorized.
23226 // Here we consider the stores in reversed order, rather than in the order
23227 // they appear in the IR (Stores are reversed already, see vectorizeStoreChains()).
23228 // Store #3 can be added -> it comes after store #4 with the same distance as
23229 // store #1.
23230 // Store #5 cannot be added - it comes before store #4.
23231 // This logic helps to improve compile time: we assume that the stores after
23232 // the previous store with the same distance most likely have memory
23233 // dependencies, so there is no need to waste compile time trying to vectorize them.
23234 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23235 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23236 std::optional<int64_t> PtrDist;
23237 auto *RelatedStores = find_if(
23238 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23239 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23240 return PtrDist.has_value();
23241 });
23242
23243 // We did not find a comparable store, start a new group.
23244 if (RelatedStores == SortedStores.end()) {
23245 SortedStores.emplace_back(Idx, Stores);
23246 return;
23247 }
23248
23249 // If there is already a store in the group with the same PtrDiff, try to
23250 // vectorize the existing instructions before adding the current store.
23251 // Otherwise, insert this store and keep collecting.
23252 if (std::optional<unsigned> PrevInst =
23253 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23254 TryToVectorize(RelatedStores->getStores());
23255 RelatedStores->clearVectorizedStores(VectorizedStores);
23256 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23257 /*NewBaseInstIdx=*/Idx,
23258 /*DistFromCurBase=*/*PtrDist);
23259 }
23260 };
23261 Type *PrevValTy = nullptr;
23262 for (auto [I, SI] : enumerate(Stores)) {
23263 if (R.isDeleted(SI))
23264 continue;
23265 if (!PrevValTy)
23266 PrevValTy = SI->getValueOperand()->getType();
23267 // Check that we do not try to vectorize stores of different types.
23268 if (PrevValTy != SI->getValueOperand()->getType()) {
23269 for (RelatedStoreInsts &StoreSeq : SortedStores)
23270 TryToVectorize(StoreSeq.getStores());
23271 SortedStores.clear();
23272 PrevValTy = SI->getValueOperand()->getType();
23273 }
23274 FillStoresSet(I, SI);
23275 }
23276
23277 // Final vectorization attempt.
23278 for (RelatedStoreInsts &StoreSeq : SortedStores)
23279 TryToVectorize(StoreSeq.getStores());
23280
23281 return Changed;
23282}
23283
23284void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23285 // Initialize the collections. We will make a single pass over the block.
23286 Stores.clear();
23287 GEPs.clear();
23288
23289 // Visit the store and getelementptr instructions in BB and organize them in
23290 // Stores and GEPs according to the underlying objects of their pointer
23291 // operands.
23292 for (Instruction &I : *BB) {
23293 // Ignore store instructions that are volatile or have a pointer operand
23294 // that doesn't point to a scalar type.
23295 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23296 if (!SI->isSimple())
23297 continue;
23298 if (!isValidElementType(SI->getValueOperand()->getType()))
23299 continue;
23300 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23301 }
23302
23303 // Ignore getelementptr instructions that have more than one index, a
23304 // constant index, or a pointer operand that doesn't point to a scalar
23305 // type.
23306 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23307 if (GEP->getNumIndices() != 1)
23308 continue;
23309 Value *Idx = GEP->idx_begin()->get();
23310 if (isa<Constant>(Idx))
23311 continue;
23312 if (!isValidElementType(Idx->getType()))
23313 continue;
23314 if (GEP->getType()->isVectorTy())
23315 continue;
23316 GEPs[GEP->getPointerOperand()].push_back(GEP);
23317 }
23318 }
23319}
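// Illustrative example (not part of the pass): given the block
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   store i32 %a, ptr %p
//   store i32 %b, ptr %p1
//   %g = getelementptr inbounds i32, ptr %q, i64 %i
// both stores end up in Stores[getUnderlyingObject(%p)], %g ends up in
// GEPs[%q] (its index %i is not a constant), and %p1 itself is skipped because
// its only index is a constant.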
23320
23321bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23322 bool MaxVFOnly) {
23323 if (VL.size() < 2)
23324 return false;
23325
23326 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23327 << VL.size() << ".\n");
23328
23329 // Check that all of the parts are instructions of the same type;
23330 // we permit an alternate opcode via InstructionsState.
23331 InstructionsState S = getSameOpcode(VL, *TLI);
23332 if (!S)
23333 return false;
23334
23335 Instruction *I0 = S.getMainOp();
23336 // Make sure invalid types (including vector type) are rejected before
23337 // determining vectorization factor for scalar instructions.
23338 for (Value *V : VL) {
23339 Type *Ty = V->getType();
23340 if (!isValidElementType(Ty)) {
23341 // NOTE: the following will give the user an internal LLVM type name,
23342 // which may not be useful.
23343 R.getORE()->emit([&]() {
23344 std::string TypeStr;
23345 llvm::raw_string_ostream OS(TypeStr);
23346 Ty->print(OS);
23347 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23348 << "Cannot SLP vectorize list: type "
23349 << TypeStr + " is unsupported by vectorizer";
23350 });
23351 return false;
23352 }
23353 }
23354
23355 Type *ScalarTy = getValueType(VL[0]);
23356 unsigned Sz = R.getVectorElementSize(I0);
23357 unsigned MinVF = R.getMinVF(Sz);
23358 unsigned MaxVF = std::max<unsigned>(
23359 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23360 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23361 if (MaxVF < 2) {
23362 R.getORE()->emit([&]() {
23363 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23364 << "Cannot SLP vectorize list: vectorization factor "
23365 << "less than 2 is not supported";
23366 });
23367 return false;
23368 }
23369
23370 bool Changed = false;
23371 bool CandidateFound = false;
23372 InstructionCost MinCost = SLPCostThreshold.getValue();
23373
23374 unsigned NextInst = 0, MaxInst = VL.size();
23375 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23376 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23377 // No actual vectorization should happen if the number of parts is the same
23378 // as the provided vectorization factor (i.e. the scalar type is used for
23379 // vector code during codegen).
23380 auto *VecTy = getWidenedType(ScalarTy, VF);
23381 if (TTI->getNumberOfParts(VecTy) == VF)
23382 continue;
23383 for (unsigned I = NextInst; I < MaxInst; ++I) {
23384 unsigned ActualVF = std::min(MaxInst - I, VF);
23385
23386 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23387 continue;
23388
23389 if (MaxVFOnly && ActualVF < MaxVF)
23390 break;
23391 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23392 break;
23393
23394 SmallVector<Value *> Ops(ActualVF, nullptr);
23395 unsigned Idx = 0;
23396 for (Value *V : VL.drop_front(I)) {
23397 // Check that a previous iteration of this loop did not delete the
23398 // Value.
23399 if (auto *Inst = dyn_cast<Instruction>(V);
23400 !Inst || !R.isDeleted(Inst)) {
23401 Ops[Idx] = V;
23402 ++Idx;
23403 if (Idx == ActualVF)
23404 break;
23405 }
23406 }
23407 // Not enough vectorizable instructions - exit.
23408 if (Idx != ActualVF)
23409 break;
23410
23411 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23412 << "\n");
23413
23414 R.buildTree(Ops);
23415 if (R.isTreeTinyAndNotFullyVectorizable())
23416 continue;
23417 if (R.isProfitableToReorder()) {
23418 R.reorderTopToBottom();
23419 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23420 }
23421 R.transformNodes();
23422 R.buildExternalUses();
23423
23424 R.computeMinimumValueSizes();
23425 InstructionCost Cost = R.getTreeCost();
23426 CandidateFound = true;
23427 MinCost = std::min(MinCost, Cost);
23428
23429 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23430 << " for VF=" << ActualVF << "\n");
23431 if (Cost < -SLPCostThreshold) {
23432 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23433 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23434 cast<Instruction>(Ops[0]))
23435 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23436 << " and with tree size "
23437 << ore::NV("TreeSize", R.getTreeSize()));
23438
23439 R.vectorizeTree();
23440 // Move to the next bundle.
23441 I += VF - 1;
23442 NextInst = I + 1;
23443 Changed = true;
23444 }
23445 }
23446 }
23447
23448 if (!Changed && CandidateFound) {
23449 R.getORE()->emit([&]() {
23450 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23451 << "List vectorization was possible but not beneficial with cost "
23452 << ore::NV("Cost", MinCost) << " >= "
23453 << ore::NV("Threshold", -SLPCostThreshold);
23454 });
23455 } else if (!Changed) {
23456 R.getORE()->emit([&]() {
23457 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23458 << "Cannot SLP vectorize list: vectorization was impossible"
23459 << " with available vectorization factors";
23460 });
23461 }
23462 return Changed;
23463}
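// Illustrative example (not part of the pass): for a list of 7 i32 values on a
// target with full 4-element vectors, the loop above starts at MaxVF = 4 and
// tries slices of ActualVF = min(remaining, VF) elements, moving NextInst past
// each successfully vectorized bundle; if nothing profitable is found at VF=4
// it retries at the next smaller full-vector count (eventually VF=2) before
// giving up and emitting the "NotBeneficial"/"NotPossible" remark.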
23464
23465namespace {
23466
23467/// Model horizontal reductions.
23468///
23469/// A horizontal reduction is a tree of reduction instructions that has values
23470/// that can be put into a vector as its leaves. For example:
23471///
23472/// mul mul mul mul
23473/// \ / \ /
23474/// + +
23475/// \ /
23476/// +
23477/// This tree has "mul" as its leaf values and "+" as its reduction
23478/// instructions. A reduction can feed into a store or a binary operation
23479/// feeding a phi.
23480/// ...
23481/// \ /
23482/// +
23483/// |
23484/// phi +=
23485///
23486/// Or:
23487/// ...
23488/// \ /
23489/// +
23490/// |
23491/// *p =
23492///
23493class HorizontalReduction {
23494 using ReductionOpsType = SmallVector<Value *, 16>;
23495 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23496 ReductionOpsListType ReductionOps;
23497 /// List of possibly reduced values.
23498 SmallVector<SmallVector<Value *>> ReducedVals;
23499 /// Maps reduced value to the corresponding reduction operation.
23500 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23501 WeakTrackingVH ReductionRoot;
23502 /// The type of reduction operation.
23503 RecurKind RdxKind;
23504 /// Checks if the optimization of original scalar identity operations on
23505 /// matched horizontal reductions is enabled and allowed.
23506 bool IsSupportedHorRdxIdentityOp = false;
23507 /// The minimum number of the reduced values.
23508 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23509 /// Contains vector values for reduction including their scale factor and
23510 /// signedness.
23512
23513 static bool isCmpSelMinMax(Instruction *I) {
23514 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23516 }
23517
23518 // And/or are potentially poison-safe logical patterns like:
23519 // select x, y, false
23520 // select x, true, y
23521 static bool isBoolLogicOp(Instruction *I) {
23522 return isa<SelectInst>(I) &&
23523 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23524 }
23525
23526 /// Checks if instruction is associative and can be vectorized.
23527 static bool isVectorizable(RecurKind Kind, Instruction *I,
23528 bool TwoElementReduction = false) {
23529 if (Kind == RecurKind::None)
23530 return false;
23531
23532 // Integer ops that map to select instructions or intrinsics are fine.
23533 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23534 isBoolLogicOp(I))
23535 return true;
23536
23537 // No need to check for associativity if there are only 2 reduced values.
23538 if (TwoElementReduction)
23539 return true;
23540
23541 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23542 // FP min/max are associative except for NaN and -0.0. We do not
23543 // have to rule out -0.0 here because the intrinsic semantics do not
23544 // specify a fixed result for it.
23545 return I->getFastMathFlags().noNaNs();
23546 }
23547
23548 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23549 return true;
23550
23551 return I->isAssociative();
23552 }
23553
23554 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23555 // Poison-safe 'or' takes the form: select X, true, Y
23556 // To make that work with the normal operand processing, we skip the
23557 // true value operand.
23558 // TODO: Change the code and data structures to handle this without a hack.
23559 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23560 return I->getOperand(2);
23561 return I->getOperand(Index);
23562 }
23563
23564 /// Creates reduction operation with the current opcode.
23565 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23566 Value *RHS, const Twine &Name, bool UseSelect) {
23567 Type *OpTy = LHS->getType();
23568 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23569 switch (Kind) {
23570 case RecurKind::Or: {
23571 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23572 return Builder.CreateSelect(
23573 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23574 RHS, Name);
23575 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23576 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23577 Name);
23578 }
23579 case RecurKind::And: {
23580 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23581 return Builder.CreateSelect(
23582 LHS, RHS,
23583 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
23584 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23585 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23586 Name);
23587 }
23588 case RecurKind::Add:
23589 case RecurKind::Mul:
23590 case RecurKind::Xor:
23591 case RecurKind::FAdd:
23592 case RecurKind::FMul: {
23593 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23594 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23595 Name);
23596 }
23597 case RecurKind::SMax:
23598 case RecurKind::SMin:
23599 case RecurKind::UMax:
23600 case RecurKind::UMin:
23601 if (UseSelect) {
23603 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23604 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
23605 }
23606 [[fallthrough]];
23607 case RecurKind::FMax:
23608 case RecurKind::FMin:
23609 case RecurKind::FMaximum:
23610 case RecurKind::FMinimum:
23611 case RecurKind::FMaximumNum:
23612 case RecurKind::FMinimumNum: {
23614 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23615 }
23616 default:
23617 llvm_unreachable("Unknown reduction operation.");
23618 }
23619 }
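// Illustrative example (not part of the pass): with UseSelect set and i1
// operands, the poison-safe forms produced above are
//   RecurKind::Or : %r = select i1 %lhs, i1 true, i1 %rhs
//   RecurKind::And: %r = select i1 %lhs, i1 %rhs, i1 false
// while the integer min/max kinds fall back to an icmp + select pair, e.g.
//   %c = icmp sgt i32 %lhs, %rhs
//   %r = select i1 %c, i32 %lhs, i32 %rhs   ; RecurKind::SMax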
23620
23621 /// Creates reduction operation with the current opcode with the IR flags
23622 /// from \p ReductionOps, dropping nuw/nsw flags.
23623 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23624 Value *RHS, const Twine &Name,
23625 const ReductionOpsListType &ReductionOps) {
23626 bool UseSelect = ReductionOps.size() == 2 ||
23627 // Logical or/and.
23628 (ReductionOps.size() == 1 &&
23629 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23630 assert((!UseSelect || ReductionOps.size() != 2 ||
23631 isa<SelectInst>(ReductionOps[1][0])) &&
23632 "Expected cmp + select pairs for reduction");
23633 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23635 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23636 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23637 /*IncludeWrapFlags=*/false);
23638 propagateIRFlags(Op, ReductionOps[1], nullptr,
23639 /*IncludeWrapFlags=*/false);
23640 return Op;
23641 }
23642 }
23643 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23644 return Op;
23645 }
23646
23647public:
23648 static RecurKind getRdxKind(Value *V) {
23649 auto *I = dyn_cast<Instruction>(V);
23650 if (!I)
23651 return RecurKind::None;
23652 if (match(I, m_Add(m_Value(), m_Value())))
23653 return RecurKind::Add;
23654 if (match(I, m_Mul(m_Value(), m_Value())))
23655 return RecurKind::Mul;
23656 if (match(I, m_And(m_Value(), m_Value())) ||
23657 match(I, m_LogicalAnd(m_Value(), m_Value())))
23658 return RecurKind::And;
23659 if (match(I, m_Or(m_Value(), m_Value())) ||
23660 match(I, m_LogicalOr(m_Value(), m_Value())))
23661 return RecurKind::Or;
23662 if (match(I, m_Xor(m_Value(), m_Value())))
23663 return RecurKind::Xor;
23664 if (match(I, m_FAdd(m_Value(), m_Value())))
23665 return RecurKind::FAdd;
23666 if (match(I, m_FMul(m_Value(), m_Value())))
23667 return RecurKind::FMul;
23668
23670 return RecurKind::FMax;
23672 return RecurKind::FMin;
23673
23674 if (match(I, m_FMaximum(m_Value(), m_Value())))
23675 return RecurKind::FMaximum;
23676 if (match(I, m_FMinimum(m_Value(), m_Value())))
23677 return RecurKind::FMinimum;
23678 // This matches either cmp+select or intrinsics. SLP is expected to handle
23679 // either form.
23680 // TODO: If we are canonicalizing to intrinsics, we can remove several
23681 // special-case paths that deal with selects.
23682 if (match(I, m_SMax(m_Value(), m_Value())))
23683 return RecurKind::SMax;
23684 if (match(I, m_SMin(m_Value(), m_Value())))
23685 return RecurKind::SMin;
23686 if (match(I, m_UMax(m_Value(), m_Value())))
23687 return RecurKind::UMax;
23688 if (match(I, m_UMin(m_Value(), m_Value())))
23689 return RecurKind::UMin;
23690
23691 if (auto *Select = dyn_cast<SelectInst>(I)) {
23692 // Try harder: look for min/max pattern based on instructions producing
23693 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23694 // During the intermediate stages of SLP, it's very common to have
23695 // a pattern like this (since optimizeGatherSequence is run only once
23696 // at the end):
23697 // %1 = extractelement <2 x i32> %a, i32 0
23698 // %2 = extractelement <2 x i32> %a, i32 1
23699 // %cond = icmp sgt i32 %1, %2
23700 // %3 = extractelement <2 x i32> %a, i32 0
23701 // %4 = extractelement <2 x i32> %a, i32 1
23702 // %select = select i1 %cond, i32 %3, i32 %4
23703 CmpPredicate Pred;
23704 Instruction *L1;
23705 Instruction *L2;
23706
23707 Value *LHS = Select->getTrueValue();
23708 Value *RHS = Select->getFalseValue();
23709 Value *Cond = Select->getCondition();
23710
23711 // TODO: Support inverse predicates.
23712 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23713 if (!isa<ExtractElementInst>(RHS) ||
23714 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23715 return RecurKind::None;
23716 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23717 if (!isa<ExtractElementInst>(LHS) ||
23718 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23719 return RecurKind::None;
23720 } else {
23721 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23722 return RecurKind::None;
23723 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23724 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23725 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23726 return RecurKind::None;
23727 }
23728
23729 switch (Pred) {
23730 default:
23731 return RecurKind::None;
23732 case CmpInst::ICMP_SGT:
23733 case CmpInst::ICMP_SGE:
23734 return RecurKind::SMax;
23735 case CmpInst::ICMP_SLT:
23736 case CmpInst::ICMP_SLE:
23737 return RecurKind::SMin;
23738 case CmpInst::ICMP_UGT:
23739 case CmpInst::ICMP_UGE:
23740 return RecurKind::UMax;
23741 case CmpInst::ICMP_ULT:
23742 case CmpInst::ICMP_ULE:
23743 return RecurKind::UMin;
23744 }
23745 }
23746 return RecurKind::None;
23747 }
23748
23749 /// Get the index of the first operand.
23750 static unsigned getFirstOperandIndex(Instruction *I) {
23751 return isCmpSelMinMax(I) ? 1 : 0;
23752 }
23753
23754private:
23755 /// Total number of operands in the reduction operation.
23756 static unsigned getNumberOfOperands(Instruction *I) {
23757 return isCmpSelMinMax(I) ? 3 : 2;
23758 }
23759
23760 /// Checks if the instruction is in basic block \p BB.
23761 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23762 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23763 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23764 auto *Sel = cast<SelectInst>(I);
23765 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23766 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23767 }
23768 return I->getParent() == BB;
23769 }
23770
23771 /// Expected number of uses for reduction operations/reduced values.
23772 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23773 if (IsCmpSelMinMax) {
23774 // SelectInst must be used twice while the condition op must have a single
23775 // use only.
23776 if (auto *Sel = dyn_cast<SelectInst>(I))
23777 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23778 return I->hasNUses(2);
23779 }
23780
23781 // Arithmetic reduction operation must be used once only.
23782 return I->hasOneUse();
23783 }
23784
23785 /// Initializes the list of reduction operations.
23786 void initReductionOps(Instruction *I) {
23787 if (isCmpSelMinMax(I))
23788 ReductionOps.assign(2, ReductionOpsType());
23789 else
23790 ReductionOps.assign(1, ReductionOpsType());
23791 }
23792
23793 /// Add all reduction operations for the reduction instruction \p I.
23794 void addReductionOps(Instruction *I) {
23795 if (isCmpSelMinMax(I)) {
23796 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23797 ReductionOps[1].emplace_back(I);
23798 } else {
23799 ReductionOps[0].emplace_back(I);
23800 }
23801 }
23802
23803 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23804 int Sz = Data.size();
23805 auto *I = dyn_cast<Instruction>(Data.front());
23806 return Sz > 1 || isConstant(Data.front()) ||
23807 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23808 }
23809
23810public:
23811 HorizontalReduction() = default;
23812 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23813 : ReductionRoot(I), ReductionLimit(2) {
23814 RdxKind = HorizontalReduction::getRdxKind(I);
23815 ReductionOps.emplace_back().push_back(I);
23816 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23817 for (Value *V : Ops)
23818 ReducedValsToOps[V].push_back(I);
23819 }
23820
23821 bool matchReductionForOperands() const {
23822 // Analyze "regular" integer/FP types for reductions - no target-specific
23823 // types or pointers.
23824 assert(ReductionRoot && "Reduction root is not set!");
23825 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23826 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23827 return Ops.size() == 2;
23828 })))
23829 return false;
23830
23831 return true;
23832 }
23833
23834 /// Try to find a reduction tree.
23835 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23836 ScalarEvolution &SE, const DataLayout &DL,
23837 const TargetLibraryInfo &TLI) {
23838 RdxKind = HorizontalReduction::getRdxKind(Root);
23839 if (!isVectorizable(RdxKind, Root))
23840 return false;
23841
23842 // Analyze "regular" integer/FP types for reductions - no target-specific
23843 // types or pointers.
23844 Type *Ty = Root->getType();
23845 if (!isValidElementType(Ty) || Ty->isPointerTy())
23846 return false;
23847
23848 // Though the ultimate reduction may have multiple uses, its condition must
23850 // have only a single use.
23850 if (auto *Sel = dyn_cast<SelectInst>(Root))
23851 if (!Sel->getCondition()->hasOneUse())
23852 return false;
23853
23854 ReductionRoot = Root;
23855
23856 // Iterate through all the operands of the possible reduction tree and
23857 // gather all the reduced values, sorting them by their value id.
23858 BasicBlock *BB = Root->getParent();
23859 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23860 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23861 1, std::make_pair(Root, 0));
23862 // Checks if the operands of the \p TreeN instruction are also reduction
23863 // operations or should be treated as reduced values or an extra argument,
23864 // which is not part of the reduction.
23865 auto CheckOperands = [&](Instruction *TreeN,
23866 SmallVectorImpl<Value *> &PossibleReducedVals,
23867 SmallVectorImpl<Instruction *> &ReductionOps,
23868 unsigned Level) {
23869 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23870 getNumberOfOperands(TreeN)))) {
23871 Value *EdgeVal = getRdxOperand(TreeN, I);
23872 ReducedValsToOps[EdgeVal].push_back(TreeN);
23873 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23874 // If the edge is not an instruction, or it is different from the main
23875 // reduction opcode or has too many uses, treat it as a possible reduced
23876 // value. Also, do not try to reduce constant values if the operation is
23877 // not foldable.
23878 if (!EdgeInst || Level > RecursionMaxDepth ||
23879 getRdxKind(EdgeInst) != RdxKind ||
23880 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23881 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23882 !isVectorizable(RdxKind, EdgeInst) ||
23883 (R.isAnalyzedReductionRoot(EdgeInst) &&
23884 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23885 PossibleReducedVals.push_back(EdgeVal);
23886 continue;
23887 }
23888 ReductionOps.push_back(EdgeInst);
23889 }
23890 };
23891 // Try to regroup the reduced values so that reducing them becomes more
23892 // profitable. Values are grouped by their value ids, instructions - by
23893 // instruction op id and/or alternate op id, plus do extra analysis for
23894 // loads (grouping them by the distance between pointers) and cmp
23895 // instructions (grouping them by the predicate).
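// Roughly, for reduced values {a[0], x, b[0], a[1], b[1]} where the a[i] and
// b[i] are loads from two unrelated base pointers, the load subkeys group
// {a[0], a[1]} and {b[0], b[1]} together (by constant pointer distance), while
// the unrelated value x ends up in its own group; each group then forms a
// candidate slice of the reduced values.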
23896 SmallMapVector<
23897 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23898 8>
23899 PossibleReducedVals;
23900 initReductionOps(Root);
23901 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23902 SmallSet<size_t, 2> LoadKeyUsed;
23903
23904 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23905 Key = hash_combine(hash_value(LI->getParent()), Key);
23906 Value *Ptr =
23907 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
23908 if (!LoadKeyUsed.insert(Key).second) {
23909 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23910 if (LIt != LoadsMap.end()) {
23911 for (LoadInst *RLI : LIt->second) {
23912 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
23913 LI->getType(), LI->getPointerOperand(), DL, SE,
23914 /*StrictCheck=*/true))
23915 return hash_value(RLI->getPointerOperand());
23916 }
23917 for (LoadInst *RLI : LIt->second) {
23918 if (arePointersCompatible(RLI->getPointerOperand(),
23919 LI->getPointerOperand(), TLI)) {
23920 hash_code SubKey = hash_value(RLI->getPointerOperand());
23921 return SubKey;
23922 }
23923 }
23924 if (LIt->second.size() > 2) {
23925 hash_code SubKey =
23926 hash_value(LIt->second.back()->getPointerOperand());
23927 return SubKey;
23928 }
23929 }
23930 }
23931 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
23932 .first->second.push_back(LI);
23933 return hash_value(LI->getPointerOperand());
23934 };
23935
23936 while (!Worklist.empty()) {
23937 auto [TreeN, Level] = Worklist.pop_back_val();
23938 SmallVector<Value *> PossibleRedVals;
23939 SmallVector<Instruction *> PossibleReductionOps;
23940 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23941 addReductionOps(TreeN);
23942 // Add reduction values. The values are sorted for better vectorization
23943 // results.
23944 for (Value *V : PossibleRedVals) {
23945 size_t Key, Idx;
23946 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
23947 /*AllowAlternate=*/false);
23948 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23949 }
23950 for (Instruction *I : reverse(PossibleReductionOps))
23951 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23952 }
23953 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23954 // Sort values by the total number of value kinds so that the reduction
23955 // starts from the longest possible sequences of reduced values.
23956 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23957 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23958 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
23959 for (auto &Slice : PossibleRedVals) {
23960 PossibleRedValsVect.emplace_back();
23961 auto RedValsVect = Slice.second.takeVector();
23962 stable_sort(RedValsVect, llvm::less_second());
23963 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23964 PossibleRedValsVect.back().append(Data.second, Data.first);
23965 }
23966 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23967 return P1.size() > P2.size();
23968 });
23969 bool First = true;
23970 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
23971 if (First) {
23972 First = false;
23973 ReducedVals.emplace_back();
23974 } else if (!isGoodForReduction(Data)) {
23975 auto *LI = dyn_cast<LoadInst>(Data.front());
23976 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
23977 if (!LI || !LastLI ||
23978 getUnderlyingObject(LI->getPointerOperand()) !=
23979 getUnderlyingObject(LastLI->getPointerOperand()))
23980 ReducedVals.emplace_back();
23981 }
23982 ReducedVals.back().append(Data.rbegin(), Data.rend());
23983 }
23984 }
23985 // Sort the reduced values by the number of same/alternate opcodes and/or
23986 // pointer operands.
23987 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
23988 return P1.size() > P2.size();
23989 });
23990 return true;
23991 }
23992
23993 /// Attempt to vectorize the tree found by matchAssociativeReduction.
23994 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23995 const TargetLibraryInfo &TLI, AssumptionCache *AC,
23996 DominatorTree &DT) {
23997 constexpr unsigned RegMaxNumber = 4;
23998 constexpr unsigned RedValsMaxNumber = 128;
23999 // If there are a sufficient number of reduction values, reduce
24000 // to a nearby power-of-2. We can safely generate oversized
24001 // vectors and rely on the backend to split them to legal sizes.
24002 if (unsigned NumReducedVals = std::accumulate(
24003 ReducedVals.begin(), ReducedVals.end(), 0,
24004 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24005 if (!isGoodForReduction(Vals))
24006 return Num;
24007 return Num + Vals.size();
24008 });
24009 NumReducedVals < ReductionLimit &&
24010 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24011 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24012 })) {
24013 for (ReductionOpsType &RdxOps : ReductionOps)
24014 for (Value *RdxOp : RdxOps)
24015 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24016 return nullptr;
24017 }
24018
24019 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24020 TargetFolder(DL));
24021 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24022
24023 // Track the reduced values in case they are replaced by extractelement
24024 // because of the vectorization.
24025 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24026 ReducedVals.front().size());
24027
24028 // The compare instruction of a min/max is the insertion point for new
24029 // instructions and may be replaced with a new compare instruction.
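// For example, for the min/max idiom (illustrative IR):
//   %cmp = icmp slt i32 %a, %b
//   %root = select i1 %cmp, i32 %a, i32 %b
// the vector code is inserted before %cmp rather than before %root.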
24030 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24031 assert(isa<SelectInst>(RdxRootInst) &&
24032 "Expected min/max reduction to have select root instruction");
24033 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24034 assert(isa<Instruction>(ScalarCond) &&
24035 "Expected min/max reduction to have compare condition");
24036 return cast<Instruction>(ScalarCond);
24037 };
24038
24039 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24040 return isBoolLogicOp(cast<Instruction>(V));
24041 });
24042 // Return new VectorizedTree, based on previous value.
24043 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24044 if (VectorizedTree) {
24045 // Update the final value in the reduction.
24046 Builder.SetCurrentDebugLocation(
24047 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24048 if (AnyBoolLogicOp) {
24049 auto It = ReducedValsToOps.find(VectorizedTree);
24050 auto It1 = ReducedValsToOps.find(Res);
24051 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24052 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24053 (It != ReducedValsToOps.end() &&
24054 any_of(It->getSecond(), [&](Instruction *I) {
24055 return isBoolLogicOp(I) &&
24056 getRdxOperand(I, 0) == VectorizedTree;
24057 }))) {
24058 ;
24059 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24060 (It1 != ReducedValsToOps.end() &&
24061 any_of(It1->getSecond(), [&](Instruction *I) {
24062 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24063 }))) {
24064 std::swap(VectorizedTree, Res);
24065 } else {
24066 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24067 }
24068 }
24069
24070 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24071 ReductionOps);
24072 }
24073 // Initialize the final value in the reduction.
24074 return Res;
24075 };
24076 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24077 ReductionOps.front().size());
24078 for (ReductionOpsType &RdxOps : ReductionOps)
24079 for (Value *RdxOp : RdxOps) {
24080 if (!RdxOp)
24081 continue;
24082 IgnoreList.insert(RdxOp);
24083 }
24084 // Intersect the fast-math-flags from all reduction operations.
24085 FastMathFlags RdxFMF;
24086 RdxFMF.set();
24087 for (Value *U : IgnoreList)
24088 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24089 RdxFMF &= FPMO->getFastMathFlags();
24090 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24091
24092 // Need to track reduced vals, they may be changed during vectorization of
24093 // subvectors.
24094 for (ArrayRef<Value *> Candidates : ReducedVals)
24095 for (Value *V : Candidates)
24096 TrackedVals.try_emplace(V, V);
24097
24098 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24099 Value *V) -> unsigned & {
24100 auto *It = MV.find(V);
24101 assert(It != MV.end() && "Unable to find given key.");
24102 return It->second;
24103 };
24104
24105 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24106 // List of the values that were reduced in other trees as part of gather
24107 // nodes and thus requiring extract if fully vectorized in other trees.
24108 SmallPtrSet<Value *, 4> RequiredExtract;
24109 WeakTrackingVH VectorizedTree = nullptr;
24110 bool CheckForReusedReductionOps = false;
24111 // Try to vectorize elements based on their type.
24112 SmallVector<InstructionsState> States;
24113 for (ArrayRef<Value *> RV : ReducedVals)
24114 States.push_back(getSameOpcode(RV, TLI));
24115 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24116 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24117 InstructionsState S = States[I];
24118 SmallVector<Value *> Candidates;
24119 Candidates.reserve(2 * OrigReducedVals.size());
24120 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24121 for (Value *ReducedVal : OrigReducedVals) {
24122 Value *RdxVal = TrackedVals.at(ReducedVal);
24123 // Check if the reduction value was not overridden by the extractelement
24124 // instruction because of the vectorization and exclude it, if it is not
24125 // compatible with other values.
24126 // Also check if the instruction was folded to constant/other value.
24127 auto *Inst = dyn_cast<Instruction>(RdxVal);
24128 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24129 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24130 (S && !Inst))
24131 continue;
24132 Candidates.push_back(RdxVal);
24133 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24134 }
24135 bool ShuffledExtracts = false;
24136 // Try to handle shuffled extractelements.
24137 if (S && S.getOpcode() == Instruction::ExtractElement &&
24138 !S.isAltShuffle() && I + 1 < E) {
24139 SmallVector<Value *> CommonCandidates(Candidates);
24140 for (Value *RV : ReducedVals[I + 1]) {
24141 Value *RdxVal = TrackedVals.at(RV);
24142 // Check if the reduction value was not overridden by the
24143 // extractelement instruction because of the vectorization and
24144 // exclude it, if it is not compatible with other values.
24145 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24146 if (!Inst)
24147 continue;
24148 CommonCandidates.push_back(RdxVal);
24149 TrackedToOrig.try_emplace(RdxVal, RV);
24150 }
24151 SmallVector<int> Mask;
24152 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24153 ++I;
24154 Candidates.swap(CommonCandidates);
24155 ShuffledExtracts = true;
24156 }
24157 }
24158
24159 // Emit code for constant values.
24160 if (Candidates.size() > 1 && allConstant(Candidates)) {
24161 Value *Res = Candidates.front();
24162 Value *OrigV = TrackedToOrig.at(Candidates.front());
24163 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24164 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24165 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24166 Value *OrigV = TrackedToOrig.at(VC);
24167 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24168 if (auto *ResI = dyn_cast<Instruction>(Res))
24169 V.analyzedReductionRoot(ResI);
24170 }
24171 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24172 continue;
24173 }
24174
24175 unsigned NumReducedVals = Candidates.size();
24176 if (NumReducedVals < ReductionLimit &&
24177 (NumReducedVals < 2 || !isSplat(Candidates)))
24178 continue;
24179
24180 // Check if we support repeated scalar values processing (optimization of
24181 // original scalar identity operations on matched horizontal reductions).
24182 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24183 RdxKind != RecurKind::FMul &&
24184 RdxKind != RecurKind::FMulAdd;
24185 // Gather same values.
24186 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24187 if (IsSupportedHorRdxIdentityOp)
24188 for (Value *V : Candidates) {
24189 Value *OrigV = TrackedToOrig.at(V);
24190 ++SameValuesCounter.try_emplace(OrigV).first->second;
24191 }
24192 // Used to check if the reduced values are used the same number of times. In this
24193 // case the compiler may produce better code. E.g. if reduced values are
24194 // aabbccdd (8 x values), then the first node of the tree will have a node
24195 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24196 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24197 // Instead, the compiler may build the <4 x abcd> tree immediately and then
24198 // multiply the reduction of <4 x abcd> by 2.
24199 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24200 // this analysis, other operations may require an extra estimation of
24201 // the profitability.
24202 bool SameScaleFactor = false;
24203 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24204 SameValuesCounter.size() != Candidates.size();
24205 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24206 if (OptReusedScalars) {
24207 SameScaleFactor =
24208 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24209 RdxKind == RecurKind::Xor) &&
24210 all_of(drop_begin(SameValuesCounter),
24211 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24212 return P.second == SameValuesCounter.front().second;
24213 });
24214 Candidates.resize(SameValuesCounter.size());
24215 transform(SameValuesCounter, Candidates.begin(),
24216 [&](const auto &P) { return TrackedVals.at(P.first); });
24217 NumReducedVals = Candidates.size();
24218 // Have a reduction of the same element.
24219 if (NumReducedVals == 1) {
24220 Value *OrigV = TrackedToOrig.at(Candidates.front());
24221 unsigned Cnt = At(SameValuesCounter, OrigV);
24222 Value *RedVal =
24223 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24224 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24225 VectorizedVals.try_emplace(OrigV, Cnt);
24226 ExternallyUsedValues.insert(OrigV);
24227 continue;
24228 }
24229 }
24230
24231 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24232 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24233 const unsigned MaxElts = std::clamp<unsigned>(
24234 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24235 RegMaxNumber * RedValsMaxNumber);
24236
24237 unsigned ReduxWidth = NumReducedVals;
24238 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24239 unsigned NumParts, NumRegs;
24240 Type *ScalarTy = Candidates.front()->getType();
24241 ReduxWidth =
24242 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24243 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24244 NumParts = ::getNumberOfParts(TTI, Tp);
24245 NumRegs =
24246 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24247 while (NumParts > NumRegs) {
24248 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24249 ReduxWidth = bit_floor(ReduxWidth - 1);
24250 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24251 NumParts = ::getNumberOfParts(TTI, Tp);
24252 NumRegs =
24253 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24254 }
24255 if (NumParts > NumRegs / 2)
24256 ReduxWidth = bit_floor(ReduxWidth);
24257 return ReduxWidth;
24258 };
24259 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24260 ReduxWidth = GetVectorFactor(ReduxWidth);
24261 ReduxWidth = std::min(ReduxWidth, MaxElts);
24262
24263 unsigned Start = 0;
24264 unsigned Pos = Start;
24265 // Restarts vectorization attempt with lower vector factor.
24266 unsigned PrevReduxWidth = ReduxWidth;
24267 bool CheckForReusedReductionOpsLocal = false;
24268 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24269 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24270 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24271 // Check if any of the reduction ops are gathered. If so, it is worth
24272 // trying again with a smaller number of reduction ops.
24273 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24274 }
24275 ++Pos;
24276 if (Pos < NumReducedVals - ReduxWidth + 1)
24277 return IsAnyRedOpGathered;
24278 Pos = Start;
24279 --ReduxWidth;
24280 if (ReduxWidth > 1)
24281 ReduxWidth = GetVectorFactor(ReduxWidth);
24282 return IsAnyRedOpGathered;
24283 };
24284 bool AnyVectorized = false;
24285 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24286 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24287 ReduxWidth >= ReductionLimit) {
24288 // Dependency in tree of the reduction ops - drop this attempt, try
24289 // later.
24290 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24291 Start == 0) {
24292 CheckForReusedReductionOps = true;
24293 break;
24294 }
24295 PrevReduxWidth = ReduxWidth;
24296 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24297 // Been analyzed already - skip.
24298 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24299 (!has_single_bit(ReduxWidth) &&
24300 (IgnoredCandidates.contains(
24301 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24302 IgnoredCandidates.contains(
24303 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24304 bit_floor(ReduxWidth))))) ||
24305 V.areAnalyzedReductionVals(VL)) {
24306 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24307 continue;
24308 }
24309 // Early exit if any of the reduction values were deleted during
24310 // previous vectorization attempts.
24311 if (any_of(VL, [&V](Value *RedVal) {
24312 auto *RedValI = dyn_cast<Instruction>(RedVal);
24313 return RedValI && V.isDeleted(RedValI);
24314 }))
24315 break;
24316 V.buildTree(VL, IgnoreList);
24317 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24318 if (!AdjustReducedVals())
24319 V.analyzedReductionVals(VL);
24320 continue;
24321 }
24322 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24323 if (!AdjustReducedVals())
24324 V.analyzedReductionVals(VL);
24325 continue;
24326 }
24327 V.reorderTopToBottom();
24328 // No need to reorder the root node at all for reassociative reduction.
24329 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24330 VL.front()->getType()->isIntOrIntVectorTy() ||
24331 ReductionLimit > 2);
24332 // Keep other extracted reduction values, if they are used in the
24333 // vectorization trees.
24334 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24335 ExternallyUsedValues);
24336 // The reduction root is used as the insertion point for new
24337 // instructions, so set it as externally used to prevent it from being
24338 // deleted.
24339 LocalExternallyUsedValues.insert(ReductionRoot);
24340 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24341 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24342 continue;
24343 for (Value *V : ReducedVals[Cnt])
24344 if (isa<Instruction>(V))
24345 LocalExternallyUsedValues.insert(TrackedVals[V]);
24346 }
24347 if (!IsSupportedHorRdxIdentityOp) {
24348 // Number of uses of the candidates in the vector of values.
24349 assert(SameValuesCounter.empty() &&
24350 "Reused values counter map is not empty");
24351 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24352 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24353 continue;
24354 Value *V = Candidates[Cnt];
24355 Value *OrigV = TrackedToOrig.at(V);
24356 ++SameValuesCounter.try_emplace(OrigV).first->second;
24357 }
24358 }
24359 V.transformNodes();
24360 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24361 // Gather externally used values.
24362 SmallPtrSet<Value *, 4> Visited;
24363 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24364 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24365 continue;
24366 Value *RdxVal = Candidates[Cnt];
24367 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24368 RdxVal = It->second;
24369 if (!Visited.insert(RdxVal).second)
24370 continue;
24371 // Check if the scalar was vectorized as part of the vectorization
24372 // tree but not the top node.
24373 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24374 LocalExternallyUsedValues.insert(RdxVal);
24375 continue;
24376 }
24377 Value *OrigV = TrackedToOrig.at(RdxVal);
24378 unsigned NumOps =
24379 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24380 if (NumOps != ReducedValsToOps.at(OrigV).size())
24381 LocalExternallyUsedValues.insert(RdxVal);
24382 }
24383 // Do not need the list of reused scalars in regular mode anymore.
24384 if (!IsSupportedHorRdxIdentityOp)
24385 SameValuesCounter.clear();
24386 for (Value *RdxVal : VL)
24387 if (RequiredExtract.contains(RdxVal))
24388 LocalExternallyUsedValues.insert(RdxVal);
24389 V.buildExternalUses(LocalExternallyUsedValues);
24390
24391 V.computeMinimumValueSizes();
24392
24393 // Estimate cost.
24394 InstructionCost ReductionCost =
24395 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24396 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24397 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24398 << " for reduction\n");
24399 if (!Cost.isValid())
24400 break;
24401 if (Cost >= -SLPCostThreshold) {
24402 V.getORE()->emit([&]() {
24403 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24404 ReducedValsToOps.at(VL[0]).front())
24405 << "Vectorizing horizontal reduction is possible "
24406 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24407 << " and threshold "
24408 << ore::NV("Threshold", -SLPCostThreshold);
24409 });
24410 if (!AdjustReducedVals()) {
24411 V.analyzedReductionVals(VL);
24412 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24413 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24414 // Add subvectors of VL to the list of the analyzed values.
24415 for (unsigned VF = getFloorFullVectorNumberOfElements(
24416 *TTI, VL.front()->getType(), ReduxWidth - 1);
24417 VF >= ReductionLimit;
24418 VF = getFloorFullVectorNumberOfElements(
24419 *TTI, VL.front()->getType(), VF - 1)) {
24420 if (has_single_bit(VF) &&
24421 V.getCanonicalGraphSize() != V.getTreeSize())
24422 continue;
24423 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24424 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24425 }
24426 }
24427 }
24428 continue;
24429 }
24430
24431 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24432 << Cost << ". (HorRdx)\n");
24433 V.getORE()->emit([&]() {
24434 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24435 ReducedValsToOps.at(VL[0]).front())
24436 << "Vectorized horizontal reduction with cost "
24437 << ore::NV("Cost", Cost) << " and with tree size "
24438 << ore::NV("TreeSize", V.getTreeSize());
24439 });
24440
24441 Builder.setFastMathFlags(RdxFMF);
24442
24443 // Emit a reduction. If the root is a select (min/max idiom), the insert
24444 // point is the compare condition of that select.
24445 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24446 Instruction *InsertPt = RdxRootInst;
24447 if (IsCmpSelMinMax)
24448 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24449
24450 // Vectorize a tree.
24451 Value *VectorizedRoot = V.vectorizeTree(
24452 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24453 // Update TrackedToOrig mapping, since the tracked values might be
24454 // updated.
24455 for (Value *RdxVal : Candidates) {
24456 Value *OrigVal = TrackedToOrig.at(RdxVal);
24457 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24458 if (TransformedRdxVal != RdxVal)
24459 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24460 }
24461
24462 Builder.SetInsertPoint(InsertPt);
24463
24464 // To prevent poison from leaking across what used to be sequential,
24465 // safe, scalar boolean logic operations, the reduction operand must be
24466 // frozen.
24467 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24468 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24469
24470 // Emit code to correctly handle reused reduced values, if required.
24471 if (OptReusedScalars && !SameScaleFactor) {
24472 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24473 SameValuesCounter, TrackedToOrig);
24474 }
24475
24476 Type *ScalarTy = VL.front()->getType();
24477 Type *VecTy = VectorizedRoot->getType();
24478 Type *RedScalarTy = VecTy->getScalarType();
24479 VectorValuesAndScales.emplace_back(
24480 VectorizedRoot,
24481 OptReusedScalars && SameScaleFactor
24482 ? SameValuesCounter.front().second
24483 : 1,
24484 RedScalarTy != ScalarTy->getScalarType()
24485 ? V.isSignedMinBitwidthRootNode()
24486 : true);
24487
24488 // Count vectorized reduced values to exclude them from final reduction.
24489 for (Value *RdxVal : VL) {
24490 Value *OrigV = TrackedToOrig.at(RdxVal);
24491 if (IsSupportedHorRdxIdentityOp) {
24492 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24493 continue;
24494 }
24495 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24496 if (!V.isVectorized(RdxVal))
24497 RequiredExtract.insert(RdxVal);
24498 }
24499 Pos += ReduxWidth;
24500 Start = Pos;
24501 ReduxWidth = NumReducedVals - Pos;
24502 if (ReduxWidth > 1)
24503 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24504 AnyVectorized = true;
24505 }
24506 if (OptReusedScalars && !AnyVectorized) {
24507 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24508 Value *RdxVal = TrackedVals.at(P.first);
24509 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24510 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24511 VectorizedVals.try_emplace(P.first, P.second);
24512 }
24513 continue;
24514 }
24515 }
24516 if (!VectorValuesAndScales.empty())
24517 VectorizedTree = GetNewVectorizedTree(
24518 VectorizedTree,
24519 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24520
24521 if (!VectorizedTree) {
24522 if (!CheckForReusedReductionOps) {
24523 for (ReductionOpsType &RdxOps : ReductionOps)
24524 for (Value *RdxOp : RdxOps)
24525 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24526 }
24527 return nullptr;
24528 }
24529
24530 // Reorder operands of bool logical op in the natural order to avoid
24531 // possible problem with poison propagation. If not possible to reorder
24532 // (both operands are originally RHS), emit an extra freeze instruction
24533 // for the LHS operand.
24534 // I.e., if we have original code like this:
24535 // RedOp1 = select i1 ?, i1 LHS, i1 false
24536 // RedOp2 = select i1 RHS, i1 ?, i1 false
24537
24538 // Then, we swap LHS/RHS to create a new op that matches the poison
24539 // semantics of the original code.
24540
24541 // If we have original code like this and both values could be poison:
24542 // RedOp1 = select i1 ?, i1 LHS, i1 false
24543 // RedOp2 = select i1 ?, i1 RHS, i1 false
24544
24545 // Then, we must freeze LHS in the new op.
24546 auto FixBoolLogicalOps =
24547 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24548 Instruction *RedOp2, bool InitStep) {
24549 if (!AnyBoolLogicOp)
24550 return;
24551 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24552 getRdxOperand(RedOp1, 0) == LHS ||
24553 isGuaranteedNotToBePoison(LHS, AC)))
24554 return;
24555 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24556 getRdxOperand(RedOp2, 0) == RHS ||
24557 isGuaranteedNotToBePoison(RHS, AC))) {
24558 std::swap(LHS, RHS);
24559 return;
24560 }
24561 if (LHS != VectorizedTree)
24562 LHS = Builder.CreateFreeze(LHS);
24563 };
24564 // Finish the reduction.
24565 // Need to add the extra arguments and the possible reduction values that were not vectorized.
24566 // Try to avoid dependencies between the scalar remainders after reductions.
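// FinalGen below combines the remaining scalars pairwise: for leftovers
// r0, r1, r2, r3 the first pass emits (r0 op r1) and (r2 op r3), and the next
// pass combines those two results, giving a log-depth tree rather than a
// single serial chain.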
24567 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24568 bool InitStep) {
24569 unsigned Sz = InstVals.size();
24570 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24571 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24572 Instruction *RedOp = InstVals[I + 1].first;
24573 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24574 Value *RdxVal1 = InstVals[I].second;
24575 Value *StableRdxVal1 = RdxVal1;
24576 auto It1 = TrackedVals.find(RdxVal1);
24577 if (It1 != TrackedVals.end())
24578 StableRdxVal1 = It1->second;
24579 Value *RdxVal2 = InstVals[I + 1].second;
24580 Value *StableRdxVal2 = RdxVal2;
24581 auto It2 = TrackedVals.find(RdxVal2);
24582 if (It2 != TrackedVals.end())
24583 StableRdxVal2 = It2->second;
24584 // To prevent poison from leaking across what used to be sequential,
24585 // safe, scalar boolean logic operations, the reduction operand must be
24586 // frozen.
24587 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24588 RedOp, InitStep);
24589 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24590 StableRdxVal2, "op.rdx", ReductionOps);
24591 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24592 }
24593 if (Sz % 2 == 1)
24594 ExtraReds[Sz / 2] = InstVals.back();
24595 return ExtraReds;
24596 };
24597 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24598 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24599 VectorizedTree);
24600 SmallPtrSet<Value *, 8> Visited;
24601 for (ArrayRef<Value *> Candidates : ReducedVals) {
24602 for (Value *RdxVal : Candidates) {
24603 if (!Visited.insert(RdxVal).second)
24604 continue;
24605 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24606 for (Instruction *RedOp :
24607 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24608 ExtraReductions.emplace_back(RedOp, RdxVal);
24609 }
24610 }
24611 // Iterate through all not-vectorized reduction values/extra arguments.
24612 bool InitStep = true;
24613 while (ExtraReductions.size() > 1) {
24614 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24615 FinalGen(ExtraReductions, InitStep);
24616 ExtraReductions.swap(NewReds);
24617 InitStep = false;
24618 }
24619 VectorizedTree = ExtraReductions.front().second;
24620
24621 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24622
24623 // The original scalar reduction is expected to have no remaining
24624 // uses outside the reduction tree itself. Assert that we got this
24625 // correct, replace internal uses with poison, and mark for eventual
24626 // deletion.
24627#ifndef NDEBUG
24628 SmallPtrSet<Value *, 4> IgnoreSet;
24629 for (ArrayRef<Value *> RdxOps : ReductionOps)
24630 IgnoreSet.insert_range(RdxOps);
24631#endif
24632 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24633 for (Value *Ignore : RdxOps) {
24634 if (!Ignore)
24635 continue;
24636#ifndef NDEBUG
24637 for (auto *U : Ignore->users()) {
24638 assert(IgnoreSet.count(U) &&
24639 "All users must be in the reduction ops list.");
24640 }
24641#endif
24642 if (!Ignore->use_empty()) {
24643 Value *P = PoisonValue::get(Ignore->getType());
24644 Ignore->replaceAllUsesWith(P);
24645 }
24646 }
24647 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24648 }
24649 return VectorizedTree;
24650 }
24651
24652private:
24653 /// Creates the reduction from the given \p Vec vector value with the given
24654 /// scale \p Scale and signedness \p IsSigned.
24655 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24656 Value *Vec, unsigned Scale, bool IsSigned,
24657 Type *DestTy) {
24658 Value *Rdx;
24659 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24660 unsigned DestTyNumElements = getNumElements(VecTy);
24661 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24662 Rdx = PoisonValue::get(
24663 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24664 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24665 // Do reduction for each lane.
24666 // e.g., do reduce add for
24667 // VL[0] = <4 x Ty> <a, b, c, d>
24668 // VL[1] = <4 x Ty> <e, f, g, h>
24669 // Lane[0] = <2 x Ty> <a, e>
24670 // Lane[1] = <2 x Ty> <b, f>
24671 // Lane[2] = <2 x Ty> <c, g>
24672 // Lane[3] = <2 x Ty> <d, h>
24673 // result[0] = reduce add Lane[0]
24674 // result[1] = reduce add Lane[1]
24675 // result[2] = reduce add Lane[2]
24676 // result[3] = reduce add Lane[3]
24677 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24678 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24679 Rdx = Builder.CreateInsertElement(
24680 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24681 }
24682 } else {
24683 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24684 }
24685 if (Rdx->getType() != DestTy)
24686 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24687 // Improved analysis for add/fadd/xor reductions with same scale
24688 // factor for all operands of reductions. We can emit scalar ops for
24689 // them instead.
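// For example, if every reduced value appears exactly twice (Scale == 2), the
// add reduction is computed once over the deduplicated vector and the scalar
// result is then multiplied by 2 in emitScaleForReusedOps.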
24690 if (Scale > 1)
24691 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24692 return Rdx;
24693 }
24694
24695 /// Calculate the cost of a reduction.
24696 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24697 ArrayRef<Value *> ReducedVals,
24698 bool IsCmpSelMinMax, FastMathFlags FMF,
24699 const BoUpSLP &R, DominatorTree &DT,
24700 const DataLayout &DL,
24701 const TargetLibraryInfo &TLI) {
24702 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24703 Type *ScalarTy = ReducedVals.front()->getType();
24704 unsigned ReduxWidth = ReducedVals.size();
24705 FixedVectorType *VectorTy = R.getReductionType();
24706 InstructionCost VectorCost = 0, ScalarCost;
24707 // If all of the reduced values are constant, the vector cost is 0, since
24708 // the reduction value can be calculated at the compile time.
24709 bool AllConsts = allConstant(ReducedVals);
24710 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24711 InstructionCost Cost = 0;
24712 // Scalar cost is repeated for N-1 elements.
24713 int Cnt = ReducedVals.size();
24714 for (Value *RdxVal : ReducedVals) {
24715 if (Cnt == 1)
24716 break;
24717 --Cnt;
24718 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24719 Cost += GenCostFn();
24720 continue;
24721 }
24722 InstructionCost ScalarCost = 0;
24723 for (User *U : RdxVal->users()) {
24724 auto *RdxOp = cast<Instruction>(U);
24725 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24726 if (RdxKind == RecurKind::FAdd) {
24727 InstructionCost FMACost = canConvertToFMA(
24728 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24729 if (FMACost.isValid()) {
24730 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24731 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24732 // Also, exclude scalar fmul cost.
24733 InstructionCost FMulCost =
24734 TTI->getInstructionCost(I, CostKind);
24735 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24736 FMACost -= FMulCost;
24737 }
24738 ScalarCost += FMACost;
24739 continue;
24740 }
24741 }
24742 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24743 continue;
24744 }
24745 ScalarCost = InstructionCost::getInvalid();
24746 break;
24747 }
24748 if (ScalarCost.isValid())
24749 Cost += ScalarCost;
24750 else
24751 Cost += GenCostFn();
24752 }
24753 return Cost;
24754 };
24755 // Require reduction cost if:
24756 // 1. This type is not a full register type and there is no other vector
24757 // with the same type in the storage (first vector with a small type).
24758 // 2. The storage does not have any vector with full vector use (first
24759 // vector with full register use).
24760 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24761 switch (RdxKind) {
24762 case RecurKind::Add:
24763 case RecurKind::Mul:
24764 case RecurKind::Or:
24765 case RecurKind::And:
24766 case RecurKind::Xor:
24767 case RecurKind::FAdd:
24768 case RecurKind::FMul: {
24769 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24770 if (!AllConsts) {
24771 if (DoesRequireReductionOp) {
24772 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24773 assert(SLPReVec && "FixedVectorType is not expected.");
24774 unsigned ScalarTyNumElements = VecTy->getNumElements();
24775 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24776 VectorCost += TTI->getShuffleCost(
24777 TTI::SK_PermuteSingleSrc,
24778 getWidenedType(VecTy->getElementType(), ScalarTyNumElements *
24779 ReducedVals.size()),
24780 VectorTy,
24781 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24782 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24783 FMF, CostKind);
24784 }
24785 VectorCost += TTI->getScalarizationOverhead(
24786 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24787 /*Extract*/ false, TTI::TCK_RecipThroughput);
24788 } else {
24789 Type *RedTy = VectorTy->getElementType();
24790 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24791 std::make_pair(RedTy, true));
24792 if (RType == RedTy) {
24793 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24794 FMF, CostKind);
24795 } else {
24796 VectorCost = TTI->getExtendedReductionCost(
24797 RdxOpcode, !IsSigned, RedTy,
24798 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24799 }
24800 }
24801 } else {
24802 Type *RedTy = VectorTy->getElementType();
24803 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24804 std::make_pair(RedTy, true));
24805 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24806 InstructionCost FMACost = InstructionCost::getInvalid();
24807 if (RdxKind == RecurKind::FAdd) {
24808 // Check if the reduction operands can be converted to FMA.
24809 SmallVector<Value *> Ops;
24810 FastMathFlags FMF;
24811 FMF.set();
24812 for (Value *RdxVal : ReducedVals) {
24813 if (!RdxVal->hasOneUse()) {
24814 Ops.clear();
24815 break;
24816 }
24817 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24818 FMF &= FPCI->getFastMathFlags();
24819 Ops.push_back(RdxVal->user_back());
24820 }
24821 if (!Ops.empty()) {
24822 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24823 *TTI, TLI);
24824 if (FMACost.isValid()) {
24825 // Calculate actual FMAD cost.
24826 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
24827 {RVecTy, RVecTy, RVecTy}, FMF);
24828 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
24829
24830 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
24831 // Also, exclude vector fmul cost.
24832 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
24833 Instruction::FMul, RVecTy, CostKind);
24834 LLVM_DEBUG(dbgs()
24835 << "Minus vector FMul cost: " << FMulCost << "\n");
24836 FMACost -= FMulCost;
24837 }
24838 }
24839 }
24840 if (FMACost.isValid())
24841 VectorCost += FMACost;
24842 else
24843 VectorCost +=
24844 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24845 if (RType != RedTy) {
24846 unsigned Opcode = Instruction::Trunc;
24847 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24848 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24849 VectorCost += TTI->getCastInstrCost(
24850 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24851 }
24852 }
24853 }
24854 ScalarCost = EvaluateScalarCost([&]() {
24855 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24856 });
24857 break;
24858 }
24859 case RecurKind::FMax:
24860 case RecurKind::FMin:
24861 case RecurKind::FMaximum:
24862 case RecurKind::FMinimum:
24863 case RecurKind::SMax:
24864 case RecurKind::SMin:
24865 case RecurKind::UMax:
24866 case RecurKind::UMin: {
24867 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24868 if (!AllConsts) {
24869 if (DoesRequireReductionOp) {
24870 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24871 } else {
24872 // Check if the previous reduction already exists and account for it as a
24873 // series of operations plus a single reduction.
24874 Type *RedTy = VectorTy->getElementType();
24875 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24876 std::make_pair(RedTy, true));
24877 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24878 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24879 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24880 if (RType != RedTy) {
24881 unsigned Opcode = Instruction::Trunc;
24882 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24883 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24884 VectorCost += TTI->getCastInstrCost(
24885 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24886 }
24887 }
24888 }
24889 ScalarCost = EvaluateScalarCost([&]() {
24890 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24891 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24892 });
24893 break;
24894 }
24895 default:
24896 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24897 }
24898
24899 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24900 << " for reduction of " << shortBundleName(ReducedVals)
24901 << " (It is a splitting reduction)\n");
24902 return VectorCost - ScalarCost;
24903 }
24904
24905 /// Splits the values, stored in VectorValuesAndScales, into registers/free
24906 /// sub-registers, combines them with the given reduction operation as a
24907 /// vector operation and then performs single (small enough) reduction.
24908 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24909 Type *DestTy) {
24910 Value *ReducedSubTree = nullptr;
24911 // Creates reduction and combines with the previous reduction.
24912 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24913 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24914 if (ReducedSubTree)
24915 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24916 "op.rdx", ReductionOps);
24917 else
24918 ReducedSubTree = Rdx;
24919 };
24920 if (VectorValuesAndScales.size() == 1) {
24921 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24922 CreateSingleOp(Vec, Scale, IsSigned);
24923 return ReducedSubTree;
24924 }
24925 // Scales Vec using given Cnt scale factor and then performs vector combine
24926 // with previous value of VecOp.
24927 Value *VecRes = nullptr;
24928 bool VecResSignedness = false;
24929 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24930 Type *ScalarTy = Vec->getType()->getScalarType();
24931 // Scale Vec using given Cnt scale factor.
24932 if (Cnt > 1) {
24933 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
24934 switch (RdxKind) {
24935 case RecurKind::Add: {
24936 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24937 unsigned VF = getNumElements(Vec->getType());
24938 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
24939 << ". (HorRdx)\n");
24940 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
24941 for (unsigned I : seq<unsigned>(Cnt))
24942 std::iota(std::next(Mask.begin(), VF * I),
24943 std::next(Mask.begin(), VF * (I + 1)), 0);
24944 ++NumVectorInstructions;
24945 Vec = Builder.CreateShuffleVector(Vec, Mask);
24946 break;
24947 }
24948 // res = mul vv, n
24949 if (ScalarTy != DestTy->getScalarType())
24950 Vec = Builder.CreateIntCast(
24951 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24952 IsSigned);
24953 Value *Scale = ConstantVector::getSplat(
24954 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
24955 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24956 << ". (HorRdx)\n");
24957 ++NumVectorInstructions;
24958 Vec = Builder.CreateMul(Vec, Scale);
24959 break;
24960 }
24961 case RecurKind::Xor: {
24962 // res = n % 2 ? 0 : vv
24964 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24965 if (Cnt % 2 == 0)
24966 Vec = Constant::getNullValue(Vec->getType());
24967 break;
24968 }
24969 case RecurKind::FAdd: {
24970 // res = fmul v, n
24971 Value *Scale =
24972 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
24973 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24974 << ". (HorRdx)\n");
24975 ++NumVectorInstructions;
24976 Vec = Builder.CreateFMul(Vec, Scale);
24977 break;
24978 }
24979 case RecurKind::And:
24980 case RecurKind::Or:
24981 case RecurKind::SMax:
24982 case RecurKind::SMin:
24983 case RecurKind::UMax:
24984 case RecurKind::UMin:
24985 case RecurKind::FMax:
24986 case RecurKind::FMin:
24987 case RecurKind::FMaximum:
24988 case RecurKind::FMinimum:
24989 // res = vv
24990 break;
24991 case RecurKind::Sub:
24992 case RecurKind::AddChainWithSubs:
24993 case RecurKind::Mul:
24994 case RecurKind::FMul:
24995 case RecurKind::FMulAdd:
24996 case RecurKind::AnyOf:
24997 case RecurKind::FindFirstIVSMin:
24998 case RecurKind::FindFirstIVUMin:
24999 case RecurKind::FindLastIVSMax:
25000 case RecurKind::FindLastIVUMax:
25001 case RecurKind::FMaxNum:
25002 case RecurKind::FMinNum:
25003 case RecurKind::FMaximumNum:
25004 case RecurKind::FMinimumNum:
25005 case RecurKind::None:
25006 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25007 }
25008 }
25009 // Combine Vec with the previous VecOp.
25010 if (!VecRes) {
25011 VecRes = Vec;
25012 VecResSignedness = IsSigned;
25013 } else {
25014 ++NumVectorInstructions;
25015 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25016 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
25017 // Handle ctpop.
25018 unsigned VecResVF = getNumElements(VecRes->getType());
25019 unsigned VecVF = getNumElements(Vec->getType());
25020 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
25021 std::iota(Mask.begin(), Mask.end(), 0);
25022 // Ensure that VecRes is always larger than Vec
25023 if (VecResVF < VecVF) {
25024 std::swap(VecRes, Vec);
25025 std::swap(VecResVF, VecVF);
25026 }
25027 if (VecResVF != VecVF) {
25028 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
25029 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25030 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
25031 }
25032 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
25033 return;
25034 }
25035 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
25036 VecRes = Builder.CreateIntCast(
25037 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
25038 VecResSignedness);
25039 if (ScalarTy != DestTy->getScalarType())
25040 Vec = Builder.CreateIntCast(
25041 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25042 IsSigned);
25043 unsigned VecResVF = getNumElements(VecRes->getType());
25044 unsigned VecVF = getNumElements(Vec->getType());
25045 // Ensure that VecRes is always larger than Vec
25046 if (VecResVF < VecVF) {
25047 std::swap(VecRes, Vec);
25048 std::swap(VecResVF, VecVF);
25049 }
25050 // extract + op + insert
25051 Value *Op = VecRes;
25052 if (VecResVF != VecVF)
25053 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25054 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25055 if (VecResVF != VecVF)
25056 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25057 VecRes = Op;
25058 }
25059 };
25060 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25061 CreateVecOp(Vec, Scale, IsSigned);
25062 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25063
25064 return ReducedSubTree;
25065 }
25066
25067 /// Emit a horizontal reduction of the vectorized value.
25068 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25069 const TargetTransformInfo *TTI, Type *DestTy) {
25070 assert(VectorizedValue && "Need to have a vectorized tree node");
25071 assert(RdxKind != RecurKind::FMulAdd &&
25072 "A call to the llvm.fmuladd intrinsic is not handled yet");
25073
25074 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25075 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25076 RdxKind == RecurKind::Add &&
25077 DestTy->getScalarType() != FTy->getScalarType()) {
25078 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25079 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
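// For example, reducing <8 x i1> %m with add into an i32 becomes
// (illustrative IR):
//   %int = bitcast <8 x i1> %m to i8
//   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
//   %res = zext i8 %cnt to i32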
25080 Value *V = Builder.CreateBitCast(
25081 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25082 ++NumVectorInstructions;
25083 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25084 }
25085 ++NumVectorInstructions;
25086 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25087 }
25088
25089 /// Emits optimized code for unique scalar value reused \p Cnt times.
25090 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25091 unsigned Cnt) {
25092 assert(IsSupportedHorRdxIdentityOp &&
25093 "The optimization of matched scalar identity horizontal reductions "
25094 "must be supported.");
25095 if (Cnt == 1)
25096 return VectorizedValue;
25097 switch (RdxKind) {
25098 case RecurKind::Add: {
25099 // res = mul vv, n
25100 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25101 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25102 << VectorizedValue << ". (HorRdx)\n");
25103 return Builder.CreateMul(VectorizedValue, Scale);
25104 }
25105 case RecurKind::Xor: {
25106 // res = n % 2 ? 0 : vv
25107 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25108 << ". (HorRdx)\n");
25109 if (Cnt % 2 == 0)
25110 return Constant::getNullValue(VectorizedValue->getType());
25111 return VectorizedValue;
25112 }
25113 case RecurKind::FAdd: {
25114 // res = fmul v, n
25115 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25116 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25117 << VectorizedValue << ". (HorRdx)\n");
25118 return Builder.CreateFMul(VectorizedValue, Scale);
25119 }
25120 case RecurKind::And:
25121 case RecurKind::Or:
25122 case RecurKind::SMax:
25123 case RecurKind::SMin:
25124 case RecurKind::UMax:
25125 case RecurKind::UMin:
25126 case RecurKind::FMax:
25127 case RecurKind::FMin:
25128 case RecurKind::FMaximum:
25129 case RecurKind::FMinimum:
25130 // res = vv
25131 return VectorizedValue;
25132 case RecurKind::Sub:
25133 case RecurKind::AddChainWithSubs:
25134 case RecurKind::Mul:
25135 case RecurKind::FMul:
25136 case RecurKind::FMulAdd:
25137 case RecurKind::AnyOf:
25138 case RecurKind::FindFirstIVSMin:
25139 case RecurKind::FindFirstIVUMin:
25140 case RecurKind::FindLastIVSMax:
25141 case RecurKind::FindLastIVUMax:
25142 case RecurKind::FMaxNum:
25143 case RecurKind::FMinNum:
25144 case RecurKind::FMaximumNum:
25145 case RecurKind::FMinimumNum:
25146 case RecurKind::None:
25147 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25148 }
25149 return nullptr;
25150 }
25151
25152 /// Emits actual operation for the scalar identity values, found during
25153 /// horizontal reduction analysis.
25154 Value *
25155 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25156 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25157 const DenseMap<Value *, Value *> &TrackedToOrig) {
25158 assert(IsSupportedHorRdxIdentityOp &&
25159 "The optimization of matched scalar identity horizontal reductions "
25160 "must be supported.");
25161 ArrayRef<Value *> VL = R.getRootNodeScalars();
25162 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25163 if (VTy->getElementType() != VL.front()->getType()) {
25164 VectorizedValue = Builder.CreateIntCast(
25165 VectorizedValue,
25166 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25167 R.isSignedMinBitwidthRootNode());
25168 }
25169 switch (RdxKind) {
25170 case RecurKind::Add: {
25171 // root = mul prev_root, <1, 1, n, 1>
25172 SmallVector<Constant *> Vals;
25173 for (Value *V : VL) {
25174 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25175 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25176 }
25177 auto *Scale = ConstantVector::get(Vals);
25178 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25179 << VectorizedValue << ". (HorRdx)\n");
25180 return Builder.CreateMul(VectorizedValue, Scale);
25181 }
25182 case RecurKind::And:
25183 case RecurKind::Or:
25184 // No need for multiple or/and(s).
25185 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25186 << ". (HorRdx)\n");
25187 return VectorizedValue;
25188 case RecurKind::SMax:
25189 case RecurKind::SMin:
25190 case RecurKind::UMax:
25191 case RecurKind::UMin:
25192 case RecurKind::FMax:
25193 case RecurKind::FMin:
25194 case RecurKind::FMaximum:
25195 case RecurKind::FMinimum:
25196 // No need for multiple min/max(s) of the same value.
25197 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25198 << ". (HorRdx)\n");
25199 return VectorizedValue;
25200 case RecurKind::Xor: {
25201 // Replace values with even number of repeats with 0, since
25202 // x xor x = 0.
 25203 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
 25204 // 7>, if the 4th and 6th elements have an even number of repeats.
25205 SmallVector<int> Mask(
25206 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
 25207 PoisonMaskElem);
 25208 std::iota(Mask.begin(), Mask.end(), 0);
25209 bool NeedShuffle = false;
25210 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25211 Value *V = VL[I];
25212 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25213 if (Cnt % 2 == 0) {
25214 Mask[I] = VF;
25215 NeedShuffle = true;
25216 }
25217 }
25218 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25219 : Mask) dbgs()
25220 << I << " ";
25221 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25222 if (NeedShuffle)
25223 VectorizedValue = Builder.CreateShuffleVector(
25224 VectorizedValue,
25225 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25226 return VectorizedValue;
25227 }
25228 case RecurKind::FAdd: {
25229 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
 25230 SmallVector<Constant *> Vals;
 25231 for (Value *V : VL) {
25232 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25233 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25234 }
25235 auto *Scale = ConstantVector::get(Vals);
25236 return Builder.CreateFMul(VectorizedValue, Scale);
25237 }
25238 case RecurKind::Sub:
25239 case RecurKind::AddChainWithSubs:
25240 case RecurKind::Mul:
25241 case RecurKind::FMul:
25242 case RecurKind::FMulAdd:
25243 case RecurKind::AnyOf:
25244 case RecurKind::FindFirstIVSMin:
25245 case RecurKind::FindFirstIVUMin:
25246 case RecurKind::FindLastIVSMax:
25247 case RecurKind::FindLastIVUMax:
25248 case RecurKind::FMaxNum:
25249 case RecurKind::FMinNum:
25250 case RecurKind::FMaximumNum:
25251 case RecurKind::FMinimumNum:
25252 case RecurKind::None:
25253 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25254 }
25255 return nullptr;
25256 }
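// Illustrative sketch (editor's note, not part of the pass): for an add
// reduction over lanes <%a, %b, %c, %d> where the scalar in lane 2 is reused
// 3 times, emitReusedOps scales the vector once, roughly:
//   %scaled = mul <4 x i32> %vec, <i32 1, i32 1, i32 3, i32 1>
// and the final reduction (emitted separately) then sums %scaled.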
25257};
25258} // end anonymous namespace
25259
25260/// Gets recurrence kind from the specified value.
 25261 static RecurKind getRdxKind(Value *V) {
 25262 return HorizontalReduction::getRdxKind(V);
25263}
25264static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25265 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25266 return cast<FixedVectorType>(IE->getType())->getNumElements();
25267
25268 unsigned AggregateSize = 1;
25269 auto *IV = cast<InsertValueInst>(InsertInst);
25270 Type *CurrentType = IV->getType();
25271 do {
25272 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25273 for (auto *Elt : ST->elements())
25274 if (Elt != ST->getElementType(0)) // check homogeneity
25275 return std::nullopt;
25276 AggregateSize *= ST->getNumElements();
25277 CurrentType = ST->getElementType(0);
25278 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25279 AggregateSize *= AT->getNumElements();
25280 CurrentType = AT->getElementType();
25281 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25282 AggregateSize *= VT->getNumElements();
25283 return AggregateSize;
25284 } else if (CurrentType->isSingleValueType()) {
25285 return AggregateSize;
25286 } else {
25287 return std::nullopt;
25288 }
25289 } while (true);
25290}
25291
25292static void findBuildAggregateRec(Instruction *LastInsertInst,
 25293 TargetTransformInfo *TTI,
 25294 SmallVectorImpl<Value *> &BuildVectorOpds,
25295 SmallVectorImpl<Value *> &InsertElts,
25296 unsigned OperandOffset, const BoUpSLP &R) {
25297 do {
25298 Value *InsertedOperand = LastInsertInst->getOperand(1);
25299 std::optional<unsigned> OperandIndex =
25300 getElementIndex(LastInsertInst, OperandOffset);
25301 if (!OperandIndex || R.isDeleted(LastInsertInst))
25302 return;
25303 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
 25304 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
 25305 BuildVectorOpds, InsertElts, *OperandIndex, R);
25306
25307 } else {
25308 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25309 InsertElts[*OperandIndex] = LastInsertInst;
25310 }
25311 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25312 } while (LastInsertInst != nullptr &&
 25313 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
 25314 LastInsertInst->hasOneUse());
25315}
25316
25317/// Recognize construction of vectors like
25318/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25319/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25320/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25321/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25322/// starting from the last insertelement or insertvalue instruction.
25323///
25324/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25325/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25326/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25327///
25328/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25329///
25330/// \return true if it matches.
25331static bool findBuildAggregate(Instruction *LastInsertInst,
 25332 TargetTransformInfo *TTI,
 25333 SmallVectorImpl<Value *> &BuildVectorOpds,
25334 SmallVectorImpl<Value *> &InsertElts,
25335 const BoUpSLP &R) {
25336
25337 assert((isa<InsertElementInst>(LastInsertInst) ||
25338 isa<InsertValueInst>(LastInsertInst)) &&
25339 "Expected insertelement or insertvalue instruction!");
25340
25341 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25342 "Expected empty result vectors!");
25343
25344 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25345 if (!AggregateSize)
25346 return false;
25347 BuildVectorOpds.resize(*AggregateSize);
25348 InsertElts.resize(*AggregateSize);
25349
25350 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25351 llvm::erase(BuildVectorOpds, nullptr);
25352 llvm::erase(InsertElts, nullptr);
25353 if (BuildVectorOpds.size() >= 2)
25354 return true;
25355
25356 return false;
25357}
25358
25359/// Try and get a reduction instruction from a phi node.
25360///
25361/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25362/// if they come from either \p ParentBB or a containing loop latch.
25363///
25364/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25365/// if not possible.
 25366 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
 25367 BasicBlock *ParentBB, LoopInfo *LI) {
25368 // There are situations where the reduction value is not dominated by the
25369 // reduction phi. Vectorizing such cases has been reported to cause
25370 // miscompiles. See PR25787.
25371 auto DominatedReduxValue = [&](Value *R) {
25372 return isa<Instruction>(R) &&
25373 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25374 };
25375
25376 Instruction *Rdx = nullptr;
25377
25378 // Return the incoming value if it comes from the same BB as the phi node.
25379 if (P->getIncomingBlock(0) == ParentBB) {
25380 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25381 } else if (P->getIncomingBlock(1) == ParentBB) {
25382 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25383 }
25384
25385 if (Rdx && DominatedReduxValue(Rdx))
25386 return Rdx;
25387
25388 // Otherwise, check whether we have a loop latch to look at.
25389 Loop *BBL = LI->getLoopFor(ParentBB);
25390 if (!BBL)
25391 return nullptr;
25392 BasicBlock *BBLatch = BBL->getLoopLatch();
25393 if (!BBLatch)
25394 return nullptr;
25395
25396 // There is a loop latch, return the incoming value if it comes from
25397 // that. This reduction pattern occasionally turns up.
25398 if (P->getIncomingBlock(0) == BBLatch) {
25399 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25400 } else if (P->getIncomingBlock(1) == BBLatch) {
25401 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25402 }
25403
25404 if (Rdx && DominatedReduxValue(Rdx))
25405 return Rdx;
25406
25407 return nullptr;
25408}
25409
25410static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25411 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25412 return true;
25413 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25414 return true;
25415 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25416 return true;
25417 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25418 return true;
25419 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25420 return true;
 25421 if (match(I, m_SMax(m_Value(V0), m_Value(V1))))
 25422 return true;
 25423 if (match(I, m_SMin(m_Value(V0), m_Value(V1))))
 25424 return true;
 25425 if (match(I, m_UMax(m_Value(V0), m_Value(V1))))
 25426 return true;
 25427 if (match(I, m_UMin(m_Value(V0), m_Value(V1))))
 25428 return true;
25429 return false;
25430}
25431
25432/// We could have an initial reduction that is not an add.
25433/// r *= v1 + v2 + v3 + v4
25434/// In such a case start looking for a tree rooted in the first '+'.
25435/// \Returns the new root if found, which may be nullptr if not an instruction.
 25436 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
 25437 Instruction *Root) {
25438 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25439 isa<IntrinsicInst>(Root)) &&
25440 "Expected binop, select, or intrinsic for reduction matching");
25441 Value *LHS =
25442 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25443 Value *RHS =
25444 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25445 if (LHS == Phi)
25446 return dyn_cast<Instruction>(RHS);
25447 if (RHS == Phi)
25448 return dyn_cast<Instruction>(LHS);
25449 return nullptr;
25450}
25451
 25452 /// \returns the first operand of \p I that does not match \p Phi. If the
 25453 /// operand is not an instruction, returns nullptr.
 25454 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
 25455 Value *Op0 = nullptr;
25456 Value *Op1 = nullptr;
25457 if (!matchRdxBop(I, Op0, Op1))
25458 return nullptr;
25459 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25460}
25461
25462/// \Returns true if \p I is a candidate instruction for reduction vectorization.
 25463 static bool isReductionCandidate(Instruction *I) {
 25464 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25465 Value *B0 = nullptr, *B1 = nullptr;
25466 bool IsBinop = matchRdxBop(I, B0, B1);
25467 return IsBinop || IsSelect;
25468}
25469
25470bool SLPVectorizerPass::vectorizeHorReduction(
25471 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25472 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25473 if (!ShouldVectorizeHor)
25474 return false;
25475 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25476
25477 if (Root->getParent() != BB || isa<PHINode>(Root))
25478 return false;
25479
25480 // If we can find a secondary reduction root, use that instead.
25481 auto SelectRoot = [&]() {
25482 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25483 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25484 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25485 return NewRoot;
25486 return Root;
25487 };
25488
 25489 // Start the analysis from the Root instruction. If a horizontal reduction is
25490 // found, try to vectorize it. If it is not a horizontal reduction or
25491 // vectorization is not possible or not effective, and currently analyzed
25492 // instruction is a binary operation, try to vectorize the operands, using
25493 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25494 // the same procedure considering each operand as a possible root of the
25495 // horizontal reduction.
25496 // Interrupt the process if the Root instruction itself was vectorized or all
 25497 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
 25498 // If a horizontal reduction was not matched or vectorized, we collect
25499 // instructions for possible later attempts for vectorization.
25500 std::queue<std::pair<Instruction *, unsigned>> Stack;
25501 Stack.emplace(SelectRoot(), 0);
25502 SmallPtrSet<Value *, 8> VisitedInstrs;
25503 bool Res = false;
25504 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25505 if (R.isAnalyzedReductionRoot(Inst))
25506 return nullptr;
25507 if (!isReductionCandidate(Inst))
25508 return nullptr;
25509 HorizontalReduction HorRdx;
25510 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25511 return nullptr;
25512 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25513 };
25514 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25515 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25516 FutureSeed = getNonPhiOperand(Root, P);
25517 if (!FutureSeed)
25518 return false;
25519 }
25520 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25521 // analysis is done separately.
 25522 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
 25523 PostponedInsts.push_back(FutureSeed);
25524 return true;
25525 };
25526
25527 while (!Stack.empty()) {
25528 Instruction *Inst;
25529 unsigned Level;
25530 std::tie(Inst, Level) = Stack.front();
25531 Stack.pop();
25532 // Do not try to analyze instruction that has already been vectorized.
25533 // This may happen when we vectorize instruction operands on a previous
25534 // iteration while stack was populated before that happened.
25535 if (R.isDeleted(Inst))
25536 continue;
25537 if (Value *VectorizedV = TryToReduce(Inst)) {
25538 Res = true;
25539 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25540 // Try to find another reduction.
25541 Stack.emplace(I, Level);
25542 continue;
25543 }
25544 if (R.isDeleted(Inst))
25545 continue;
25546 } else {
25547 // We could not vectorize `Inst` so try to use it as a future seed.
25548 if (!TryAppendToPostponedInsts(Inst)) {
25549 assert(Stack.empty() && "Expected empty stack");
25550 break;
25551 }
25552 }
25553
25554 // Try to vectorize operands.
25555 // Continue analysis for the instruction from the same basic block only to
25556 // save compile time.
25557 if (++Level < RecursionMaxDepth)
25558 for (auto *Op : Inst->operand_values())
25559 if (VisitedInstrs.insert(Op).second)
25560 if (auto *I = dyn_cast<Instruction>(Op))
25561 // Do not try to vectorize CmpInst operands, this is done
25562 // separately.
 25563 if (!isa<PHINode>(I) && !isa<CmpInst>(I) &&
 25564 !R.isDeleted(I) && I->getParent() == BB)
25565 Stack.emplace(I, Level);
25566 }
25567 return Res;
25568}
25569
25570bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25571 if (!I)
25572 return false;
25573
25574 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25575 return false;
25576 // Skip potential FMA candidates.
25577 if ((I->getOpcode() == Instruction::FAdd ||
25578 I->getOpcode() == Instruction::FSub) &&
25579 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25580 .isValid())
25581 return false;
25582
25583 Value *P = I->getParent();
25584
25585 // Vectorize in current basic block only.
25586 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25587 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25588 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25589 R.isDeleted(Op0) || R.isDeleted(Op1))
25590 return false;
25591
25592 // First collect all possible candidates
 25593 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
 25594 Candidates.emplace_back(Op0, Op1);
25595
25596 auto *A = dyn_cast<BinaryOperator>(Op0);
25597 auto *B = dyn_cast<BinaryOperator>(Op1);
25598 // Try to skip B.
25599 if (A && B && B->hasOneUse()) {
25600 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25601 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25602 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25603 Candidates.emplace_back(A, B0);
25604 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25605 Candidates.emplace_back(A, B1);
25606 }
25607 // Try to skip A.
25608 if (B && A && A->hasOneUse()) {
25609 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25610 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25611 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25612 Candidates.emplace_back(A0, B);
25613 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25614 Candidates.emplace_back(A1, B);
25615 }
25616
25617 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
 25618 ArrayRef<Value *> Ops) {
 25619 if (!isReductionCandidate(Inst))
25620 return false;
25621 Type *Ty = Inst->getType();
25622 if (!isValidElementType(Ty) || Ty->isPointerTy())
25623 return false;
25624 HorizontalReduction HorRdx(Inst, Ops);
25625 if (!HorRdx.matchReductionForOperands())
25626 return false;
25627 // Check the cost of operations.
25628 VectorType *VecTy = getWidenedType(Ty, Ops.size());
 25629 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 25630 InstructionCost ScalarCost =
25631 TTI.getScalarizationOverhead(
25632 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25633 /*Extract=*/true, CostKind) +
25634 TTI.getInstructionCost(Inst, CostKind);
25635 InstructionCost RedCost;
25636 switch (::getRdxKind(Inst)) {
25637 case RecurKind::Add:
25638 case RecurKind::Mul:
25639 case RecurKind::Or:
25640 case RecurKind::And:
25641 case RecurKind::Xor:
25642 case RecurKind::FAdd:
25643 case RecurKind::FMul: {
25644 FastMathFlags FMF;
25645 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25646 FMF = FPCI->getFastMathFlags();
25647 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25648 CostKind);
25649 break;
25650 }
25651 default:
25652 return false;
25653 }
25654 if (RedCost >= ScalarCost)
25655 return false;
25656
25657 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25658 };
25659 if (Candidates.size() == 1)
25660 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25661
25662 // We have multiple options. Try to pick the single best.
25663 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25664 if (!BestCandidate)
25665 return false;
25666 return (*BestCandidate == 0 &&
25667 TryToReduce(I, {Candidates[*BestCandidate].first,
25668 Candidates[*BestCandidate].second})) ||
25669 tryToVectorizeList({Candidates[*BestCandidate].first,
25670 Candidates[*BestCandidate].second},
25671 R);
25672}
25673
25674bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25675 BasicBlock *BB, BoUpSLP &R) {
25676 SmallVector<WeakTrackingVH> PostponedInsts;
25677 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25678 Res |= tryToVectorize(PostponedInsts, R);
25679 return Res;
25680}
25681
25682bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25683 BoUpSLP &R) {
25684 bool Res = false;
25685 for (Value *V : Insts)
25686 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25687 Res |= tryToVectorize(Inst, R);
25688 return Res;
25689}
25690
25691bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25692 BasicBlock *BB, BoUpSLP &R,
25693 bool MaxVFOnly) {
25694 if (!R.canMapToVector(IVI->getType()))
25695 return false;
25696
25697 SmallVector<Value *, 16> BuildVectorOpds;
25698 SmallVector<Value *, 16> BuildVectorInsts;
25699 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25700 return false;
25701
25702 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25703 R.getORE()->emit([&]() {
25704 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25705 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25706 "trying reduction first.";
25707 });
25708 return false;
25709 }
25710 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
 25711 // The aggregate value is unlikely to be processed in a vector register.
25712 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25713}
25714
25715bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25716 BasicBlock *BB, BoUpSLP &R,
25717 bool MaxVFOnly) {
25718 SmallVector<Value *, 16> BuildVectorInsts;
25719 SmallVector<Value *, 16> BuildVectorOpds;
25720 SmallVector<int> Mask;
25721 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
 25722 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
 25723 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25724 return false;
25725
25726 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25727 R.getORE()->emit([&]() {
25728 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25729 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25730 "trying reduction first.";
25731 });
25732 return false;
25733 }
25734 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25735 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25736}
25737
25738template <typename T>
 25739 static bool tryToVectorizeSequence(
 25740 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25741 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25742 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25743 bool MaxVFOnly, BoUpSLP &R) {
25744 bool Changed = false;
25745 // Sort by type, parent, operands.
25746 stable_sort(Incoming, Comparator);
25747
 25748 // Try to vectorize elements based on their type.
 25749 SmallVector<T *> Candidates;
 25750 SmallVector<T *> VL;
 25751 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25752 VL.clear()) {
25753 // Look for the next elements with the same type, parent and operand
25754 // kinds.
25755 auto *I = dyn_cast<Instruction>(*IncIt);
25756 if (!I || R.isDeleted(I)) {
25757 ++IncIt;
25758 continue;
25759 }
25760 auto *SameTypeIt = IncIt;
25761 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25762 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25763 AreCompatible(VL, *SameTypeIt))) {
25764 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25765 ++SameTypeIt;
25766 if (I && !R.isDeleted(I))
25767 VL.push_back(cast<T>(I));
25768 }
25769
25770 // Try to vectorize them.
25771 unsigned NumElts = VL.size();
25772 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25773 << NumElts << ")\n");
25774 // The vectorization is a 3-state attempt:
25775 // 1. Try to vectorize instructions with the same/alternate opcodes with the
25776 // size of maximal register at first.
 25777 // 2. Try to vectorize remaining instructions with the same type, if
 25778 // possible. This may produce better results than vectorizing only the
 25779 // instructions with the same/alternate opcodes.
 25780 // 3. A final attempt to vectorize all instructions with the
 25781 // same/alternate ops only; this may result in some extra final
25782 // vectorization.
25783 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
 25784 // Success, start over because instructions might have been changed.
25785 Changed = true;
25786 VL.swap(Candidates);
25787 Candidates.clear();
25788 for (T *V : VL) {
25789 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25790 Candidates.push_back(V);
25791 }
25792 } else {
25793 /// \Returns the minimum number of elements that we will attempt to
25794 /// vectorize.
25795 auto GetMinNumElements = [&R](Value *V) {
25796 unsigned EltSize = R.getVectorElementSize(V);
25797 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25798 };
25799 if (NumElts < GetMinNumElements(*IncIt) &&
25800 (Candidates.empty() ||
25801 Candidates.front()->getType() == (*IncIt)->getType())) {
25802 for (T *V : VL) {
25803 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25804 Candidates.push_back(V);
25805 }
25806 }
25807 }
25808 // Final attempt to vectorize instructions with the same types.
25809 if (Candidates.size() > 1 &&
25810 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25811 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
 25812 // Success, start over because instructions might have been changed.
25813 Changed = true;
25814 } else if (MaxVFOnly) {
25815 // Try to vectorize using small vectors.
 25816 SmallVector<T *> VL;
 25817 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25818 VL.clear()) {
25819 auto *I = dyn_cast<Instruction>(*It);
25820 if (!I || R.isDeleted(I)) {
25821 ++It;
25822 continue;
25823 }
25824 auto *SameTypeIt = It;
25825 while (SameTypeIt != End &&
25826 (!isa<Instruction>(*SameTypeIt) ||
25827 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25828 AreCompatible(*SameTypeIt, *It))) {
25829 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25830 ++SameTypeIt;
25831 if (I && !R.isDeleted(I))
25832 VL.push_back(cast<T>(I));
25833 }
25834 unsigned NumElts = VL.size();
25835 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25836 /*MaxVFOnly=*/false))
25837 Changed = true;
25838 It = SameTypeIt;
25839 }
25840 }
25841 Candidates.clear();
25842 }
25843
25844 // Start over at the next instruction of a different type (or the end).
25845 IncIt = SameTypeIt;
25846 }
25847 return Changed;
25848}
25849
25850/// Compare two cmp instructions. If IsCompatibility is true, function returns
 25851 /// true if 2 cmps have same/swapped predicates and compatible corresponding
25852/// operands. If IsCompatibility is false, function implements strict weak
25853/// ordering relation between two cmp instructions, returning true if the first
25854/// instruction is "less" than the second, i.e. its predicate is less than the
25855/// predicate of the second or the operands IDs are less than the operands IDs
25856/// of the second cmp instruction.
25857template <bool IsCompatibility>
25858static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25859 const DominatorTree &DT) {
25860 assert(isValidElementType(V->getType()) &&
25861 isValidElementType(V2->getType()) &&
25862 "Expected valid element types only.");
25863 if (V == V2)
25864 return IsCompatibility;
25865 auto *CI1 = cast<CmpInst>(V);
25866 auto *CI2 = cast<CmpInst>(V2);
25867 if (CI1->getOperand(0)->getType()->getTypeID() <
25868 CI2->getOperand(0)->getType()->getTypeID())
25869 return !IsCompatibility;
25870 if (CI1->getOperand(0)->getType()->getTypeID() >
25871 CI2->getOperand(0)->getType()->getTypeID())
25872 return false;
25873 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
 25874 CI2->getOperand(0)->getType()->getScalarSizeInBits())
 25875 return !IsCompatibility;
25876 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
 25877 CI2->getOperand(0)->getType()->getScalarSizeInBits())
 25878 return false;
25879 CmpInst::Predicate Pred1 = CI1->getPredicate();
25880 CmpInst::Predicate Pred2 = CI2->getPredicate();
 25881 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
 25882 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
 25883 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25884 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25885 if (BasePred1 < BasePred2)
25886 return !IsCompatibility;
25887 if (BasePred1 > BasePred2)
25888 return false;
25889 // Compare operands.
25890 bool CI1Preds = Pred1 == BasePred1;
25891 bool CI2Preds = Pred2 == BasePred1;
25892 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25893 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25894 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25895 if (Op1 == Op2)
25896 continue;
25897 if (Op1->getValueID() < Op2->getValueID())
25898 return !IsCompatibility;
25899 if (Op1->getValueID() > Op2->getValueID())
25900 return false;
25901 if (auto *I1 = dyn_cast<Instruction>(Op1))
25902 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25903 if (IsCompatibility) {
25904 if (I1->getParent() != I2->getParent())
25905 return false;
25906 } else {
25907 // Try to compare nodes with same parent.
25908 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25909 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25910 if (!NodeI1)
25911 return NodeI2 != nullptr;
25912 if (!NodeI2)
25913 return false;
25914 assert((NodeI1 == NodeI2) ==
25915 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25916 "Different nodes should have different DFS numbers");
25917 if (NodeI1 != NodeI2)
25918 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25919 }
25920 InstructionsState S = getSameOpcode({I1, I2}, TLI);
25921 if (S && (IsCompatibility || !S.isAltShuffle()))
25922 continue;
25923 if (IsCompatibility)
25924 return false;
25925 if (I1->getOpcode() != I2->getOpcode())
25926 return I1->getOpcode() < I2->getOpcode();
25927 }
25928 }
25929 return IsCompatibility;
25930}
25931
25932template <typename ItT>
25933bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
25934 BasicBlock *BB, BoUpSLP &R) {
25935 bool Changed = false;
25936 // Try to find reductions first.
25937 for (CmpInst *I : CmpInsts) {
25938 if (R.isDeleted(I))
25939 continue;
25940 for (Value *Op : I->operands())
25941 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
25942 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25943 if (R.isDeleted(I))
25944 break;
25945 }
25946 }
25947 // Try to vectorize operands as vector bundles.
25948 for (CmpInst *I : CmpInsts) {
25949 if (R.isDeleted(I))
25950 continue;
25951 Changed |= tryToVectorize(I, R);
25952 }
25953 // Try to vectorize list of compares.
25954 // Sort by type, compare predicate, etc.
25955 auto CompareSorter = [&](Value *V, Value *V2) {
25956 if (V == V2)
25957 return false;
25958 return compareCmp<false>(V, V2, *TLI, *DT);
25959 };
25960
25961 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
25962 if (VL.empty() || VL.back() == V1)
25963 return true;
25964 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
25965 };
25966
 25967 SmallVector<Value *> Vals;
 25968 for (Instruction *V : CmpInsts)
25969 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
25970 Vals.push_back(V);
25971 if (Vals.size() <= 1)
25972 return Changed;
 25973 Changed |= tryToVectorizeSequence<Value>(
 25974 Vals, CompareSorter, AreCompatibleCompares,
25975 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
25976 // Exclude possible reductions from other blocks.
25977 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25978 return any_of(V->users(), [V](User *U) {
25979 auto *Select = dyn_cast<SelectInst>(U);
25980 return Select &&
25981 Select->getParent() != cast<Instruction>(V)->getParent();
25982 });
25983 });
25984 if (ArePossiblyReducedInOtherBlock)
25985 return false;
25986 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25987 },
25988 /*MaxVFOnly=*/true, R);
25989 return Changed;
25990}
25991
25992bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25993 BasicBlock *BB, BoUpSLP &R) {
25995 "This function only accepts Insert instructions");
25996 bool OpsChanged = false;
25997 SmallVector<WeakTrackingVH> PostponedInsts;
25998 for (auto *I : reverse(Instructions)) {
25999 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
26000 if (R.isDeleted(I) || isa<CmpInst>(I))
26001 continue;
26002 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26003 OpsChanged |=
26004 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
26005 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26006 OpsChanged |=
26007 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
26008 }
26009 // pass2 - try to vectorize reductions only
26010 if (R.isDeleted(I))
26011 continue;
26012 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26013 if (R.isDeleted(I) || isa<CmpInst>(I))
26014 continue;
26015 // pass3 - try to match and vectorize a buildvector sequence.
26016 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26017 OpsChanged |=
26018 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
26019 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26020 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26021 /*MaxVFOnly=*/false);
26022 }
26023 }
26024 // Now try to vectorize postponed instructions.
26025 OpsChanged |= tryToVectorize(PostponedInsts, R);
26026
26027 Instructions.clear();
26028 return OpsChanged;
26029}
26030
26031bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26032 bool Changed = false;
26033 SmallVector<Value *, 4> Incoming;
26034 SmallPtrSet<Value *, 16> VisitedInstrs;
26035 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
 26036 // node. This makes it easier to identify the chains that can be
 26037 // vectorized in a better way.
26038 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26039 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
 26040 assert(isValidElementType(V1->getType()) &&
 26041 isValidElementType(V2->getType()) &&
26042 "Expected vectorizable types only.");
26043 if (V1 == V2)
26044 return false;
 26045 // It is fine to compare type IDs here, since we expect only vectorizable
 26046 // types, like ints, floats and pointers; we don't care about other types.
26047 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
26048 return true;
26049 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
26050 return false;
26051 if (V1->getType()->getScalarSizeInBits() <
26052 V2->getType()->getScalarSizeInBits())
26053 return true;
26054 if (V1->getType()->getScalarSizeInBits() >
26055 V2->getType()->getScalarSizeInBits())
26056 return false;
26057 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26058 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26059 if (Opcodes1.size() < Opcodes2.size())
26060 return true;
26061 if (Opcodes1.size() > Opcodes2.size())
26062 return false;
26063 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26064 {
26065 // Instructions come first.
26066 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26067 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26068 if (I1 && I2) {
26069 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26070 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26071 if (!NodeI1)
26072 return NodeI2 != nullptr;
26073 if (!NodeI2)
26074 return false;
26075 assert((NodeI1 == NodeI2) ==
26076 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26077 "Different nodes should have different DFS numbers");
26078 if (NodeI1 != NodeI2)
26079 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26080 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26081 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26082 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26083 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26084 if (!E1 || !E2)
26085 continue;
26086
26087 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26088 // program order of the vector operands.
26089 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26090 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26091 if (V1 != V2) {
26092 if (V1 && !V2)
26093 return true;
26094 if (!V1 && V2)
26095 return false;
 26096 DomTreeNodeBase<BasicBlock> *NodeI1 =
 26097 DT->getNode(V1->getParent());
 26098 DomTreeNodeBase<BasicBlock> *NodeI2 =
 26099 DT->getNode(V2->getParent());
26100 if (!NodeI1)
26101 return NodeI2 != nullptr;
26102 if (!NodeI2)
26103 return false;
26104 assert((NodeI1 == NodeI2) ==
26105 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26106 "Different nodes should have different DFS numbers");
26107 if (NodeI1 != NodeI2)
26108 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26109 return V1->comesBefore(V2);
26110 }
26111 // If we have the same vector operand, try to sort by constant
26112 // index.
26113 std::optional<unsigned> Id1 = getExtractIndex(E1);
26114 std::optional<unsigned> Id2 = getExtractIndex(E2);
26115 // Bring constants to the top
26116 if (Id1 && !Id2)
26117 return true;
26118 if (!Id1 && Id2)
26119 return false;
26120 // First elements come first.
26121 if (Id1 && Id2)
26122 return *Id1 < *Id2;
26123
26124 continue;
26125 }
26126 if (I1->getOpcode() == I2->getOpcode())
26127 continue;
26128 return I1->getOpcode() < I2->getOpcode();
26129 }
26130 if (I1)
26131 return true;
26132 if (I2)
26133 return false;
26134 }
26135 {
26136 // Non-undef constants come next.
26137 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26138 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26139 if (C1 && C2)
26140 continue;
26141 if (C1)
26142 return true;
26143 if (C2)
26144 return false;
26145 }
26146 bool U1 = isa<UndefValue>(Opcodes1[I]);
26147 bool U2 = isa<UndefValue>(Opcodes2[I]);
26148 {
26149 // Non-constant non-instructions come next.
26150 if (!U1 && !U2) {
26151 auto ValID1 = Opcodes1[I]->getValueID();
26152 auto ValID2 = Opcodes2[I]->getValueID();
26153 if (ValID1 == ValID2)
26154 continue;
26155 if (ValID1 < ValID2)
26156 return true;
26157 if (ValID1 > ValID2)
26158 return false;
26159 }
26160 if (!U1)
26161 return true;
26162 if (!U2)
26163 return false;
26164 }
26165 // Undefs come last.
26166 assert(U1 && U2 && "The only thing left should be undef & undef.");
26167 }
26168 return false;
26169 };
26170 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26171 Value *V1) {
26172 if (VL.empty() || V1 == VL.back())
26173 return true;
26174 Value *V2 = VL.back();
26175 if (V1->getType() != V2->getType())
26176 return false;
26177 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26178 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26179 if (Opcodes1.size() != Opcodes2.size())
26180 return false;
26181 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26182 // Undefs are compatible with any other value.
26183 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26184 continue;
26185 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26186 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26187 if (R.isDeleted(I1) || R.isDeleted(I2))
26188 return false;
26189 if (I1->getParent() != I2->getParent())
26190 return false;
26191 if (getSameOpcode({I1, I2}, *TLI))
26192 continue;
26193 return false;
26194 }
26195 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26196 continue;
26197 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26198 return false;
26199 }
26200 return true;
26201 };
26202
26203 bool HaveVectorizedPhiNodes = false;
26204 do {
26205 // Collect the incoming values from the PHIs.
26206 Incoming.clear();
26207 for (Instruction &I : *BB) {
26208 auto *P = dyn_cast<PHINode>(&I);
26209 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26210 break;
26211
26212 // No need to analyze deleted, vectorized and non-vectorizable
26213 // instructions.
26214 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26215 isValidElementType(P->getType()))
26216 Incoming.push_back(P);
26217 }
26218
26219 if (Incoming.size() <= 1)
26220 break;
26221
26222 // Find the corresponding non-phi nodes for better matching when trying to
26223 // build the tree.
26224 for (Value *V : Incoming) {
26225 SmallVectorImpl<Value *> &Opcodes =
26226 PHIToOpcodes.try_emplace(V).first->getSecond();
26227 if (!Opcodes.empty())
26228 continue;
26229 SmallVector<Value *, 4> Nodes(1, V);
26230 SmallPtrSet<Value *, 4> Visited;
26231 while (!Nodes.empty()) {
26232 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26233 if (!Visited.insert(PHI).second)
26234 continue;
26235 for (Value *V : PHI->incoming_values()) {
26236 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26237 Nodes.push_back(PHI1);
26238 continue;
26239 }
26240 Opcodes.emplace_back(V);
26241 }
26242 }
26243 }
26244
26245 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26246 Incoming, PHICompare, AreCompatiblePHIs,
26247 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26248 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26249 },
26250 /*MaxVFOnly=*/true, R);
26251 Changed |= HaveVectorizedPhiNodes;
26252 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26253 auto *PHI = dyn_cast<PHINode>(P.first);
26254 return !PHI || R.isDeleted(PHI);
26255 }))
26256 PHIToOpcodes.clear();
26257 VisitedInstrs.insert_range(Incoming);
26258 } while (HaveVectorizedPhiNodes);
26259
26260 VisitedInstrs.clear();
26261
26262 InstSetVector PostProcessInserts;
26263 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26264 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26265 // also vectorizes `PostProcessCmps`.
26266 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26267 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26268 if (VectorizeCmps) {
26269 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26270 PostProcessCmps.clear();
26271 }
26272 PostProcessInserts.clear();
26273 return Changed;
26274 };
26275 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26276 auto IsInPostProcessInstrs = [&](Instruction *I) {
26277 if (auto *Cmp = dyn_cast<CmpInst>(I))
26278 return PostProcessCmps.contains(Cmp);
 26279 return isa<InsertElementInst, InsertValueInst>(I) &&
 26280 PostProcessInserts.contains(I);
26281 };
 26282 // Returns true if `I` is an instruction without users, like a terminator,
 26283 // a function call with an ignored return value, or a store. Unused
 26284 // instructions are detected by instruction type, except for CallInst and InvokeInst.
26285 auto HasNoUsers = [](Instruction *I) {
26286 return I->use_empty() &&
26287 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26288 };
26289 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
 26290 // Skip instructions with scalable type. The number of elements is unknown
 26291 // at compile time for scalable types.
26292 if (isa<ScalableVectorType>(It->getType()))
26293 continue;
26294
 26295 // Skip instructions marked for deletion.
26296 if (R.isDeleted(&*It))
26297 continue;
 26298 // We may go through BB multiple times, so skip instructions we have already checked.
26299 if (!VisitedInstrs.insert(&*It).second) {
26300 if (HasNoUsers(&*It) &&
26301 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26302 // We would like to start over since some instructions are deleted
 26303 // and the iterator may become invalid.
26304 Changed = true;
26305 It = BB->begin();
26306 E = BB->end();
26307 }
26308 continue;
26309 }
26310
26311 // Try to vectorize reductions that use PHINodes.
26312 if (PHINode *P = dyn_cast<PHINode>(It)) {
26313 // Check that the PHI is a reduction PHI.
26314 if (P->getNumIncomingValues() == 2) {
26315 // Try to match and vectorize a horizontal reduction.
26316 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26317 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26318 Changed = true;
26319 It = BB->begin();
26320 E = BB->end();
26321 continue;
26322 }
26323 }
26324 // Try to vectorize the incoming values of the PHI, to catch reductions
26325 // that feed into PHIs.
26326 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26327 // Skip if the incoming block is the current BB for now. Also, bypass
26328 // unreachable IR for efficiency and to avoid crashing.
26329 // TODO: Collect the skipped incoming values and try to vectorize them
26330 // after processing BB.
26331 if (BB == P->getIncomingBlock(I) ||
26332 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26333 continue;
26334
26335 // Postponed instructions should not be vectorized here, delay their
26336 // vectorization.
26337 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26338 PI && !IsInPostProcessInstrs(PI)) {
26339 bool Res =
26340 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26341 Changed |= Res;
26342 if (Res && R.isDeleted(P)) {
26343 It = BB->begin();
26344 E = BB->end();
26345 break;
26346 }
26347 }
26348 }
26349 continue;
26350 }
26351
26352 if (HasNoUsers(&*It)) {
26353 bool OpsChanged = false;
26354 auto *SI = dyn_cast<StoreInst>(It);
26355 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26356 if (SI) {
26357 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26358 // Try to vectorize chain in store, if this is the only store to the
26359 // address in the block.
 26360 // TODO: This is just a temporary solution to save compile time. Need
26361 // to investigate if we can safely turn on slp-vectorize-hor-store
26362 // instead to allow lookup for reduction chains in all non-vectorized
26363 // stores (need to check side effects and compile time).
26364 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26365 SI->getValueOperand()->hasOneUse();
26366 }
26367 if (TryToVectorizeRoot) {
26368 for (auto *V : It->operand_values()) {
26369 // Postponed instructions should not be vectorized here, delay their
26370 // vectorization.
26371 if (auto *VI = dyn_cast<Instruction>(V);
26372 VI && !IsInPostProcessInstrs(VI))
26373 // Try to match and vectorize a horizontal reduction.
26374 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26375 }
26376 }
26377 // Start vectorization of post-process list of instructions from the
26378 // top-tree instructions to try to vectorize as many instructions as
26379 // possible.
26380 OpsChanged |=
26381 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26382 if (OpsChanged) {
26383 // We would like to start over since some instructions are deleted
 26384 // and the iterator may become invalid.
26385 Changed = true;
26386 It = BB->begin();
26387 E = BB->end();
26388 continue;
26389 }
26390 }
26391
 26392 if (isa<InsertElementInst, InsertValueInst>(It))
 26393 PostProcessInserts.insert(&*It);
26394 else if (isa<CmpInst>(It))
26395 PostProcessCmps.insert(cast<CmpInst>(&*It));
26396 }
26397
26398 return Changed;
26399}
26400
26401bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26402 auto Changed = false;
26403 for (auto &Entry : GEPs) {
26404 // If the getelementptr list has fewer than two elements, there's nothing
26405 // to do.
26406 if (Entry.second.size() < 2)
26407 continue;
26408
26409 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26410 << Entry.second.size() << ".\n");
26411
26412 // Process the GEP list in chunks suitable for the target's supported
26413 // vector size. If a vector register can't hold 1 element, we are done. We
26414 // are trying to vectorize the index computations, so the maximum number of
26415 // elements is based on the size of the index expression, rather than the
26416 // size of the GEP itself (the target's pointer size).
26417 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26418 return !R.isDeleted(GEP);
26419 });
26420 if (It == Entry.second.end())
26421 continue;
26422 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26423 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26424 if (MaxVecRegSize < EltSize)
26425 continue;
26426
26427 unsigned MaxElts = MaxVecRegSize / EltSize;
26428 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26429 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26430 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26431
 26432 // Initialize a set of candidate getelementptrs. Note that we use a
26433 // SetVector here to preserve program order. If the index computations
26434 // are vectorizable and begin with loads, we want to minimize the chance
26435 // of having to reorder them later.
26436 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26437
26438 // Some of the candidates may have already been vectorized after we
 26439 // initially collected them, or their index was optimized to a constant value.
26440 // If so, they are marked as deleted, so remove them from the set of
26441 // candidates.
26442 Candidates.remove_if([&R](Value *I) {
26443 return R.isDeleted(cast<Instruction>(I)) ||
26444 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26445 });
26446
26447 // Remove from the set of candidates all pairs of getelementptrs with
26448 // constant differences. Such getelementptrs are likely not good
26449 // candidates for vectorization in a bottom-up phase since one can be
26450 // computed from the other. We also ensure all candidate getelementptr
26451 // indices are unique.
26452 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26453 auto *GEPI = GEPList[I];
26454 if (!Candidates.count(GEPI))
26455 continue;
26456 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26457 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26458 auto *GEPJ = GEPList[J];
26459 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26460 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26461 Candidates.remove(GEPI);
26462 Candidates.remove(GEPJ);
26463 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26464 Candidates.remove(GEPJ);
26465 }
26466 }
26467 }
26468
26469 // We break out of the above computation as soon as we know there are
26470 // fewer than two candidates remaining.
26471 if (Candidates.size() < 2)
26472 continue;
26473
26474 // Add the single, non-constant index of each candidate to the bundle. We
26475 // ensured the indices met these constraints when we originally collected
26476 // the getelementptrs.
26477 SmallVector<Value *, 16> Bundle(Candidates.size());
26478 auto BundleIndex = 0u;
26479 for (auto *V : Candidates) {
26480 auto *GEP = cast<GetElementPtrInst>(V);
26481 auto *GEPIdx = GEP->idx_begin()->get();
26482 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26483 Bundle[BundleIndex++] = GEPIdx;
26484 }
26485
26486 // Try and vectorize the indices. We are currently only interested in
26487 // gather-like cases of the form:
26488 //
26489 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26490 //
26491 // where the loads of "a", the loads of "b", and the subtractions can be
26492 // performed in parallel. It's likely that detecting this pattern in a
26493 // bottom-up phase will be simpler and less costly than building a
26494 // full-blown top-down phase beginning at the consecutive loads.
26495 Changed |= tryToVectorizeList(Bundle, R);
26496 }
26497 }
26498 return Changed;
26499}
26500
26501bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26502 bool Changed = false;
26503 // Sort by type, base pointers and values operand. Value operands must be
26504 // compatible (have the same opcode, same parent), otherwise it is
26505 // definitely not profitable to try to vectorize them.
26506 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26507 if (V->getValueOperand()->getType()->getTypeID() <
26508 V2->getValueOperand()->getType()->getTypeID())
26509 return true;
26510 if (V->getValueOperand()->getType()->getTypeID() >
26511 V2->getValueOperand()->getType()->getTypeID())
26512 return false;
26513 if (V->getPointerOperandType()->getTypeID() <
26514 V2->getPointerOperandType()->getTypeID())
26515 return true;
26516 if (V->getPointerOperandType()->getTypeID() >
26517 V2->getPointerOperandType()->getTypeID())
26518 return false;
26519 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26520 V2->getValueOperand()->getType()->getScalarSizeInBits())
26521 return true;
26522 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26523 V2->getValueOperand()->getType()->getScalarSizeInBits())
26524 return false;
26525 // UndefValues are compatible with all other values.
26526 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26527 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26528 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26529 DT->getNode(I1->getParent());
26530 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26531 DT->getNode(I2->getParent());
26532 assert(NodeI1 && "Should only process reachable instructions");
26533 assert(NodeI2 && "Should only process reachable instructions");
26534 assert((NodeI1 == NodeI2) ==
26535 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26536 "Different nodes should have different DFS numbers");
26537 if (NodeI1 != NodeI2)
26538 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26539 return I1->getOpcode() < I2->getOpcode();
26540 }
26541 return V->getValueOperand()->getValueID() <
26542 V2->getValueOperand()->getValueID();
26543 };
26544
26545 bool SameParent = true;
26546 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26547 if (VL.empty()) {
26548 SameParent = true;
26549 return true;
26550 }
26551 StoreInst *V2 = VL.back();
26552 if (V1 == V2)
26553 return true;
26554 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26555 return false;
26556 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26557 return false;
26558 // Undefs are compatible with any other value.
26559 if (isa<UndefValue>(V1->getValueOperand()) ||
 26560 isa<UndefValue>(V2->getValueOperand()))
 26561 return true;
26562 if (isa<Constant>(V1->getValueOperand()) &&
 26563 isa<Constant>(V2->getValueOperand()))
 26564 return true;
26565 // Check if the operands of the stores can be vectorized. They can be
26566 // vectorized, if they have compatible operands or have operands, which can
26567 // be vectorized as copyables.
26568 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26569 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26570 if (I1 || I2) {
26571 // Accept only tail-following non-compatible values for now.
26572 // TODO: investigate if it is possible to vectorize incompatible values,
26573 // if the copyables are first in the list.
26574 if (I1 && !I2)
26575 return false;
26576 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26577 SmallVector<Value *> NewVL(VL.size() + 1);
26578 for (auto [SI, V] : zip(VL, NewVL))
26579 V = SI->getValueOperand();
26580 NewVL.back() = V1->getValueOperand();
26581 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26582 InstructionsState S = Analysis.buildInstructionsState(
26583 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26584 /*SkipSameCodeCheck=*/!SameParent);
26585 if (S)
26586 return true;
26587 if (!SameParent)
26588 return false;
26589 }
26590 return V1->getValueOperand()->getValueID() ==
26591 V2->getValueOperand()->getValueID();
26592 };
26593
26594 // Attempt to sort and vectorize each of the store-groups.
26595 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26596 for (auto &Pair : Stores) {
26597 if (Pair.second.size() < 2)
26598 continue;
26599
26600 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26601 << Pair.second.size() << ".\n");
26602
26603 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26604 continue;
26605
26606 // Reverse stores to do bottom-to-top analysis. This is important if the
 26607 // there are stores to the same addresses several times; in this case we need
 26608 // to follow the store order (reversed to meet the memory dependencies).
26609 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26610 Pair.second.rend());
 26611 Changed |= tryToVectorizeSequence<StoreInst>(
 26612 ReversedStores, StoreSorter, AreCompatibleStores,
26613 [&](ArrayRef<StoreInst *> Candidates, bool) {
26614 return vectorizeStores(Candidates, R, Attempted);
26615 },
26616 /*MaxVFOnly=*/false, R);
26617 }
26618 return Changed;
26619}
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void negate()
Negate this APInt in place.
Definition APInt.h:1468
unsigned logBase2() const
Definition APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:162
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
bool erase(const KeyT &Val)
Definition DenseMap.h:311
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:163
iterator end()
Definition DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:213
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:158
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2574
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2640
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2596
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:149
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:111
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
T & front() const
front - Get the first element.
Definition ArrayRef.h:354
iterator end() const
Definition ArrayRef.h:348
iterator begin() const
Definition ArrayRef.h:347
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:131
void insert_range(Range &&R)
Definition SetVector.h:175
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:93
void clear()
Completely clear the SetVector.
Definition SetVector.h:266
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:251
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:181
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
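A small, hypothetical example of the Value use-list API summarized above (the helper is illustrative, not taken from this file):

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/Casting.h"

  using namespace llvm;

  // Count how many users of V are stores, walking Value::users().
  static unsigned countStoreUsers(const Value *V) {
    unsigned NumStores = 0;
    for (const User *U : V->users())
      if (isa<StoreInst>(U))
        ++NumStores;
    return NumStores;
  }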
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and, if so, returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
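The matchers above are meant to be nested inside a single match() call. A hedged sketch (the predicate is invented purely for illustration):

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Assumed example predicate: does V compute (X + Y) << C for some constant C?
  static bool isShiftedAdd(Value *V) {
    Value *X, *Y;
    const APInt *C;
    return match(V, m_Shl(m_Add(m_Value(X), m_Value(Y)), m_APInt(C)));
  }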
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
Zip iterator for two or more iterable types.
Definition STLExtras.h:831
void stable_sort(R &&Range)
Definition STLExtras.h:2060
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1747
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1720
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
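A hedged usage sketch for enumerate() (debug-only helper with assumed names):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/Debug.h"

  using namespace llvm;

  // Print each scalar together with its lane index.
  static void dumpLanes(ArrayRef<Value *> Scalars) {
    for (auto [Lane, V] : enumerate(Scalars))
      dbgs() << "lane " << Lane << ": " << *V << "\n";
  }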
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1725
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:733
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2233
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
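make_early_inc_range is what makes it safe to erase the current element while iterating. A minimal sketch under the assumption that a TargetLibraryInfo is available (the helper name is invented):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"

  using namespace llvm;

  // Erase trivially dead instructions; the iterator advances before the body
  // runs, so eraseFromParent() on the current instruction is safe.
  static void dropDeadInsts(BasicBlock &BB, const TargetLibraryInfo *TLI) {
    for (Instruction &I : make_early_inc_range(BB))
      if (isInstructionTriviallyDead(&I, TLI))
        I.eraseFromParent();
  }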
auto cast_or_null(const Y &Val)
Definition Casting.h:715
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:677
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1983
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:331
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2130
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1970
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1624
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
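An assumed worked example of the two shuffle-mask helpers above; the values follow their documented semantics:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"

  using namespace llvm;

  void maskExamples() {
    // <0, 2, 4, 6>: every second lane starting at lane 0, four elements.
    SmallVector<int, 16> Strided =
        createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
    // <0, 0, 1, 1, 2, 2>: each of the three source lanes replicated twice.
    SmallVector<int, 16> Replicated =
        createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);
    (void)Strided;
    (void)Replicated;
  }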
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1765
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:670
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:339
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1399
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
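A minimal sketch of how getPointersDiff answers "are these two scalar loads adjacent?" (the helper name is assumed; the returned distance is measured in elements of the given element type):

  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Instructions.h"
  #include <optional>

  using namespace llvm;

  // True if L1 loads the element immediately after L0.
  static bool areConsecutiveLoads(LoadInst *L0, LoadInst *L1,
                                  const DataLayout &DL, ScalarEvolution &SE) {
    std::optional<int64_t> Diff =
        getPointersDiff(L0->getType(), L0->getPointerOperand(), L1->getType(),
                        L1->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
    return Diff && *Diff == 1;
  }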
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1922
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
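A hedged worked example of the size arithmetic helpers listed in this index (divideCeil, PowerOf2Ceil, alignTo), using assumed values rather than anything computed by the pass:

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  using namespace llvm;

  void sizeMathExamples() {
    assert(divideCeil(22u, 8u) == 3);    // 22 scalars in 8-lane chunks -> 3 parts
    assert(PowerOf2Ceil(22) == 32);      // next power-of-two vector factor
    assert(alignTo(22, Align(8)) == 24); // pad 22 bytes up to 8-byte alignment
  }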
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1956
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2032
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1837
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
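hash_combine is the usual building block for a DenseMapInfo::getHashValue implementation. A hedged sketch with an illustrative key type (not taken from this file):

  #include "llvm/ADT/Hashing.h"

  using namespace llvm;

  // Illustrative key: a node pointer plus an operand index.
  struct ExampleKey {
    const void *Node;
    unsigned Index;
  };

  static unsigned getExampleKeyHash(const ExampleKey &K) {
    return static_cast<unsigned>(hash_combine(K.Node, K.Index));
  }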
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2090
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:316
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:831
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for a lane mask phi as a machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:257
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1427
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1436
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const