1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
127 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
131static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
140static cl::opt<bool> ShouldStartVectorizeHorAtStore(
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
150 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
153static cl::opt<unsigned>
154 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
165static cl::opt<int> MinVectorRegSizeOption(
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
169static cl::opt<unsigned> RecursionMaxDepth(
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
173static cl::opt<unsigned> MinTreeSize(
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
179static cl::opt<int> LookAheadMaxDepth(
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead, but unlike
186// the similar limit for operand reordering this is less frequently used, hence
187// the impact of a higher value is less noticeable.
188static cl::opt<int> RootLookAheadMaxDepth(
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
192static cl::opt<unsigned> MinProfitableStridedLoads(
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
197static cl::opt<unsigned> MaxProfitableLoadStride(
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
215static cl::opt<bool> VectorizeNonPowerOf2(
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
220static cl::opt<bool> VectorizeCopyableElements(
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit of the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important thing to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
254 if (SLPReVec && isa<FixedVectorType>(Ty))
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand, for Cmp - the types of the compare
262/// operands and for insertelement - the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
264static Type *getValueType(Value *V) {
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
276 assert(!isa<ScalableVectorType>(Ty) &&
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
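// For example, getWidenedType(i32, 4) yields <4 x i32>; under REVEC, where
// ScalarTy may itself be a vector, getWidenedType(<2 x i32>, 4) yields
// <8 x i32> because the element count is VF * getNumElements(ScalarTy).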
288
289/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
290/// which forms a type that \p TTI splits into whole vector types during
291/// legalization.
292static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
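// Worked example: for Ty = i32 and Sz = 6 on a target where the widened type
// <6 x i32> legalizes into NumParts = 2 registers, the result is
// bit_ceil(divideCeil(6, 2)) * 2 = 4 * 2 = 8 elements.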
302
303/// Returns the number of elements of the given type \p Ty, not greater than \p
304/// Sz, which forms a type that \p TTI splits into whole vector types during
305/// legalization.
306static unsigned
307getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
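// Worked example: if Sz = 7 and the widened type legalizes into NumParts = 2,
// then RegVF = bit_ceil(divideCeil(7, 2)) = 4 and the result is
// (7 / 4) * 4 = 4, i.e. the largest multiple of the per-register VF that fits.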
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
324 // But an element has a different meaning for SLP (scalar) and REVEC
325 // (vector). We need to expand Mask into masks which shufflevector can use
326 // directly.
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
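// For example, with VecTyNumElements = 2 the scalar mask [1, 0] expands to the
// vector mask [2, 3, 0, 1], and a PoisonMaskElem entry expands to
// VecTyNumElements poison entries.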
335
336/// \returns the number of groups of shufflevectors.
337/// A group has the following properties:
338/// 1. All values in a group are shufflevectors.
339/// 2. The mask of each shufflevector is an extract-subvector mask.
340/// 3. Together, the masks of the shufflevectors use all elements of the source.
341/// e.g., this is 1 group (%0):
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// this is 2 groups (%3 and %4):
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// this is 0 groups:
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
361 if (VL.empty())
362 return 0;
363 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
412static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
432 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
438static bool isVectorLikeInstWithConstOps(Value *V) {
439 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
440 !isa<ExtractValueInst, UndefValue>(V))
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
447 if (isa<ExtractElementInst>(I))
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns correct remaining number of elements, considering total amount \p
461/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
462/// and current register (part) \p Part.
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
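// For example, with Size = 6 and NumParts = 2, getPartNumElems returns
// min(6, bit_ceil(3)) = 4; the first part then covers getNumElems(6, 4, 0) = 4
// elements and the second part getNumElems(6, 4, 1) = 2 elements.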
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
482static bool allSameBlock(ArrayRef<Value *> VL) {
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
486 Instruction *I0 = cast<Instruction>(*It);
487 if (all_of(VL, isVectorLikeInstWithConstOps))
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions.
506static bool allConstant(ArrayRef<Value *> VL) {
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
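// E.g., {%x, undef, %x, %x} is a splat of %x, while {undef, undef} is not
// considered a splat because no non-undef value is present.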
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
540static bool isCommutative(Instruction *I, Value *ValWithUses) {
541 if (auto *Cmp = dyn_cast<CmpInst>(I))
542 return Cmp->isCommutative();
543 if (auto *BO = dyn_cast<BinaryOperator>(I))
544 return BO->isCommutative() ||
545 (BO->getOpcode() == Instruction::Sub &&
546 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
547 all_of(
548 ValWithUses->uses(),
549 [](const Use &U) {
550 // Commutative, if icmp eq/ne sub, 0
551 CmpPredicate Pred;
552 if (match(U.getUser(),
553 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
554 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
555 return true;
556 // Commutative, if abs(sub nsw, true) or abs(sub, false).
557 ConstantInt *Flag;
558 return match(U.getUser(),
559 m_Intrinsic<Intrinsic::abs>(
560 m_Specific(U.get()), m_ConstantInt(Flag))) &&
561 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
562 Flag->isOne());
563 })) ||
564 (BO->getOpcode() == Instruction::FSub &&
565 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
566 all_of(ValWithUses->uses(), [](const Use &U) {
567 return match(U.getUser(),
568 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
569 }));
570 return I->isCommutative();
571}
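// E.g., %d = sub i32 %a, %b is treated as commutative when every user is an
// icmp eq/ne against zero or an @llvm.abs call satisfying the flag check above,
// since swapping %a and %b then yields the same observable result.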
572
573/// This is a helper function to check whether \p I is commutative.
574/// This is a convenience wrapper that calls the two-parameter version of
575/// isCommutative with the same instruction for both parameters. This is
576/// the common case where the instruction being checked for commutativity
577/// is the same as the instruction whose uses are analyzed for special
578/// patterns (see the two-parameter version above for details).
579/// \param I The instruction to check for commutativity
580/// \returns true if the instruction is commutative, false otherwise
581static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
582
583/// \returns number of operands of \p I, considering commutativity. Returns 2
584/// for commutative intrinsics.
585/// \param I The instruction to check for commutativity
588 // IntrinsicInst::isCommutative returns true if swapping the first "two"
589 // arguments to the intrinsic produces the same result.
590 constexpr unsigned IntrinsicNumOperands = 2;
591 return IntrinsicNumOperands;
592 }
593 return I->getNumOperands();
594}
595
596template <typename T>
597static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
598 unsigned Offset) {
599 static_assert(std::is_same_v<T, InsertElementInst> ||
600 std::is_same_v<T, ExtractElementInst>,
601 "unsupported T");
602 int Index = Offset;
603 if (const auto *IE = dyn_cast<T>(Inst)) {
604 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
605 if (!VT)
606 return std::nullopt;
607 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
608 if (!CI)
609 return std::nullopt;
610 if (CI->getValue().uge(VT->getNumElements()))
611 return std::nullopt;
612 Index *= VT->getNumElements();
613 Index += CI->getZExtValue();
614 return Index;
615 }
616 return std::nullopt;
617}
618
619/// \returns inserting or extracting index of InsertElement, ExtractElement or
620/// InsertValue instruction, using Offset as base offset for index.
621/// \returns std::nullopt if the index is not an immediate.
622static std::optional<unsigned> getElementIndex(const Value *Inst,
623 unsigned Offset = 0) {
624 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
625 return Index;
626 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
627 return Index;
628
629 int Index = Offset;
630
631 const auto *IV = dyn_cast<InsertValueInst>(Inst);
632 if (!IV)
633 return std::nullopt;
634
635 Type *CurrentType = IV->getType();
636 for (unsigned I : IV->indices()) {
637 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
638 Index *= ST->getNumElements();
639 CurrentType = ST->getElementType(I);
640 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
641 Index *= AT->getNumElements();
642 CurrentType = AT->getElementType();
643 } else {
644 return std::nullopt;
645 }
646 Index += I;
647 }
648 return Index;
649}
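// Worked example: for %r = insertvalue [2 x {i32, i32}] %agg, i32 %v, 1, 0 the
// returned index is (0 * 2 + 1) * 2 + 0 = 2, i.e. the position in the
// flattened aggregate.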
650
651/// \returns true if all of the values in \p VL use the same opcode.
652/// For comparison instructions, also checks if predicates match.
653/// PoisonValues are considered matching.
654/// Interchangeable instructions are not considered.
655static bool allSameOpcode(ArrayRef<Value *> VL) {
656 auto *It = find_if(VL, IsaPred<Instruction>);
657 if (It == VL.end())
658 return true;
659 Instruction *MainOp = cast<Instruction>(*It);
660 unsigned Opcode = MainOp->getOpcode();
661 bool IsCmpOp = isa<CmpInst>(MainOp);
662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
663 : CmpInst::BAD_ICMP_PREDICATE;
664 return std::all_of(It, VL.end(), [&](Value *V) {
665 if (auto *CI = dyn_cast<CmpInst>(V))
666 return BasePred == CI->getPredicate();
667 if (auto *I = dyn_cast<Instruction>(V))
668 return I->getOpcode() == Opcode;
669 return isa<PoisonValue>(V);
670 });
671}
672
673namespace {
674/// Specifies the way the mask should be analyzed for undefs/poisonous elements
675/// in the shuffle mask.
676enum class UseMask {
677 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
678 ///< check for the mask elements for the first argument (mask
679 ///< indices are in range [0:VF)).
680 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
681 ///< for the mask elements for the second argument (mask indices
682 ///< are in range [VF:2*VF))
683 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
684 ///< future shuffle elements and mark them as ones as being used
685 ///< in future. Non-undef elements are considered as unused since
686 ///< they're already marked as used in the mask.
687};
688} // namespace
689
690/// Prepares a use bitset for the given mask either for the first argument or
691/// for the second.
692static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
693 UseMask MaskArg) {
694 SmallBitVector UseMask(VF, true);
695 for (auto [Idx, Value] : enumerate(Mask)) {
696 if (Value == PoisonMaskElem) {
697 if (MaskArg == UseMask::UndefsAsMask)
698 UseMask.reset(Idx);
699 continue;
700 }
701 if (MaskArg == UseMask::FirstArg && Value < VF)
702 UseMask.reset(Value);
703 else if (MaskArg == UseMask::SecondArg && Value >= VF)
704 UseMask.reset(Value - VF);
705 }
706 return UseMask;
707}
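// For example, with VF = 4, Mask = {0, 5, PoisonMaskElem, 2} and
// UseMask::FirstArg, mask values 0 and 2 refer to the first argument, so bits
// 0 and 2 are cleared and bits 1 and 3 remain set; value 5 belongs to the
// second argument and is ignored.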
708
709/// Checks if the given value is actually an undefined constant vector.
710/// Also, if the \p UseMask is not empty, tries to check if the non-masked
711/// elements actually mask the insertelement buildvector, if any.
712template <bool IsPoisonOnly = false>
713static SmallBitVector isUndefVector(const Value *V,
714 const SmallBitVector &UseMask = {}) {
715 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
716 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
717 if (isa<T>(V))
718 return Res;
719 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
720 if (!VecTy)
721 return Res.reset();
722 auto *C = dyn_cast<Constant>(V);
723 if (!C) {
724 if (!UseMask.empty()) {
725 const Value *Base = V;
726 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
727 Base = II->getOperand(0);
728 if (isa<T>(II->getOperand(1)))
729 continue;
730 std::optional<unsigned> Idx = getElementIndex(II);
731 if (!Idx) {
732 Res.reset();
733 return Res;
734 }
735 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
736 Res.reset(*Idx);
737 }
738 // TODO: Add analysis for shuffles here too.
739 if (V == Base) {
740 Res.reset();
741 } else {
742 SmallBitVector SubMask(UseMask.size(), false);
743 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
744 }
745 } else {
746 Res.reset();
747 }
748 return Res;
749 }
750 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
751 if (Constant *Elem = C->getAggregateElement(I))
752 if (!isa<T>(Elem) &&
753 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
754 Res.reset(I);
755 }
756 return Res;
757}
758
759/// Checks if the vector of instructions can be represented as a shuffle, like:
760/// %x0 = extractelement <4 x i8> %x, i32 0
761/// %x3 = extractelement <4 x i8> %x, i32 3
762/// %y1 = extractelement <4 x i8> %y, i32 1
763/// %y2 = extractelement <4 x i8> %y, i32 2
764/// %x0x0 = mul i8 %x0, %x0
765/// %x3x3 = mul i8 %x3, %x3
766/// %y1y1 = mul i8 %y1, %y1
767/// %y2y2 = mul i8 %y2, %y2
768/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
769/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
770/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
771/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
772/// ret <4 x i8> %ins4
773/// can be transformed into:
774/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
775/// i32 6>
776/// %2 = mul <4 x i8> %1, %1
777/// ret <4 x i8> %2
778/// Mask will return the Shuffle Mask equivalent to the extracted elements.
779/// TODO: Can we split off and reuse the shuffle mask detection from
780/// ShuffleVectorInst/getShuffleCost?
781static std::optional<TargetTransformInfo::ShuffleKind>
782isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
783 AssumptionCache *AC) {
784 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
785 if (It == VL.end())
786 return std::nullopt;
787 unsigned Size =
788 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
789 auto *EI = dyn_cast<ExtractElementInst>(V);
790 if (!EI)
791 return S;
792 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
793 if (!VTy)
794 return S;
795 return std::max(S, VTy->getNumElements());
796 });
797
798 Value *Vec1 = nullptr;
799 Value *Vec2 = nullptr;
800 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
801 auto *EE = dyn_cast<ExtractElementInst>(V);
802 if (!EE)
803 return false;
804 Value *Vec = EE->getVectorOperand();
805 if (isa<UndefValue>(Vec))
806 return false;
807 return isGuaranteedNotToBePoison(Vec, AC);
808 });
809 enum ShuffleMode { Unknown, Select, Permute };
810 ShuffleMode CommonShuffleMode = Unknown;
811 Mask.assign(VL.size(), PoisonMaskElem);
812 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
813 // Undef can be represented as an undef element in a vector.
814 if (isa<UndefValue>(VL[I]))
815 continue;
816 auto *EI = cast<ExtractElementInst>(VL[I]);
817 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
818 return std::nullopt;
819 auto *Vec = EI->getVectorOperand();
820 // We can extractelement from undef or poison vector.
822 continue;
823 // All vector operands must have the same number of vector elements.
824 if (isa<UndefValue>(Vec)) {
825 Mask[I] = I;
826 } else {
827 if (isa<UndefValue>(EI->getIndexOperand()))
828 continue;
829 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
830 if (!Idx)
831 return std::nullopt;
832 // Undefined behavior if Idx is negative or >= Size.
833 if (Idx->getValue().uge(Size))
834 continue;
835 unsigned IntIdx = Idx->getValue().getZExtValue();
836 Mask[I] = IntIdx;
837 }
838 if (isUndefVector(Vec).all() && HasNonUndefVec)
839 continue;
840 // For correct shuffling we have to have at most 2 different vector operands
841 // in all extractelement instructions.
842 if (!Vec1 || Vec1 == Vec) {
843 Vec1 = Vec;
844 } else if (!Vec2 || Vec2 == Vec) {
845 Vec2 = Vec;
846 Mask[I] += Size;
847 } else {
848 return std::nullopt;
849 }
850 if (CommonShuffleMode == Permute)
851 continue;
852 // If the extract index is not the same as the operation number, it is a
853 // permutation.
854 if (Mask[I] % Size != I) {
855 CommonShuffleMode = Permute;
856 continue;
857 }
858 CommonShuffleMode = Select;
859 }
860 // If we're not crossing lanes in different vectors, consider it as blending.
861 if (CommonShuffleMode == Select && Vec2)
862 return TargetTransformInfo::SK_Select;
863 // If Vec2 was never used, we have a permutation of a single vector, otherwise
864 // we have a permutation of 2 vectors.
865 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
866 : TargetTransformInfo::SK_PermuteSingleSrc;
867}
868
869/// \returns True if Extract{Value,Element} instruction extracts element Idx.
870static std::optional<unsigned> getExtractIndex(const Instruction *E) {
871 unsigned Opcode = E->getOpcode();
872 assert((Opcode == Instruction::ExtractElement ||
873 Opcode == Instruction::ExtractValue) &&
874 "Expected extractelement or extractvalue instruction.");
875 if (Opcode == Instruction::ExtractElement) {
876 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
877 if (!CI)
878 return std::nullopt;
879 return CI->getZExtValue();
880 }
881 auto *EI = cast<ExtractValueInst>(E);
882 if (EI->getNumIndices() != 1)
883 return std::nullopt;
884 return *EI->idx_begin();
885}
886
887namespace llvm {
888/// Checks if the provided value does not require scheduling. It does not
889/// require scheduling if this is not an instruction or it is an instruction
890/// that does not read/write memory and all operands are either not instructions
891/// or phi nodes or instructions from different blocks.
892static bool areAllOperandsNonInsts(Value *V);
893/// Checks if the provided value does not require scheduling. It does not
894/// require scheduling if this is not an instruction or it is an instruction
895/// that does not read/write memory and all users are phi nodes or instructions
896/// from the different blocks.
897static bool isUsedOutsideBlock(Value *V);
898/// Checks if the specified value does not require scheduling. It does not
899/// require scheduling if all operands and all users do not need to be scheduled
900/// in the current basic block.
901static bool doesNotNeedToBeScheduled(Value *V);
902} // namespace llvm
903
904namespace {
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914/// Helper class that determines whether VL can use the same opcode.
915/// Alternate instruction is supported. In addition, it supports interchangeable
916/// instruction. An interchangeable instruction is an instruction that can be
917/// converted to another instruction with same semantics. For example, x << 1 is
918/// equal to x * 2. x * 1 is equal to x | 0.
919class BinOpSameOpcodeHelper {
920 using MaskType = std::uint_fast16_t;
921 /// Sort SupportedOp because it is used by binary_search.
922 constexpr static std::initializer_list<unsigned> SupportedOp = {
923 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
924 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
925 enum : MaskType {
926 ShlBIT = 0b1,
927 AShrBIT = 0b10,
928 MulBIT = 0b100,
929 AddBIT = 0b1000,
930 SubBIT = 0b10000,
931 AndBIT = 0b100000,
932 OrBIT = 0b1000000,
933 XorBIT = 0b10000000,
934 MainOpBIT = 0b100000000,
936 };
937 /// Return a non-nullptr if either operand of I is a ConstantInt.
938 /// The second return value represents the operand position. We check the
939 /// right-hand side first (1). If the right hand side is not a ConstantInt and
940 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
941 /// side (0).
942 static std::pair<ConstantInt *, unsigned>
943 isBinOpWithConstantInt(const Instruction *I) {
944 unsigned Opcode = I->getOpcode();
945 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
946 (void)SupportedOp;
947 auto *BinOp = cast<BinaryOperator>(I);
948 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
949 return {CI, 1};
950 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
951 Opcode == Instruction::AShr)
952 return {nullptr, 0};
953 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
954 return {CI, 0};
955 return {nullptr, 0};
956 }
957 struct InterchangeableInfo {
958 const Instruction *I = nullptr;
959 /// The bit it sets represents whether MainOp can be converted to.
960 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
961 MulBIT | AShrBIT | ShlBIT;
962 /// We cannot create an interchangeable instruction that does not exist in
963 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
964 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
965 /// 1]. SeenBefore is used to know what operations have been seen before.
966 MaskType SeenBefore = 0;
967 InterchangeableInfo(const Instruction *I) : I(I) {}
968 /// Return false allows BinOpSameOpcodeHelper to find an alternate
969 /// instruction. Directly setting the mask will destroy the mask state,
970 /// preventing us from determining which instruction it should convert to.
971 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
972 if (Mask & InterchangeableMask) {
973 SeenBefore |= OpcodeInMaskForm;
974 Mask &= InterchangeableMask;
975 return true;
976 }
977 return false;
978 }
979 bool equal(unsigned Opcode) {
980 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
981 }
982 unsigned getOpcode() const {
983 MaskType Candidate = Mask & SeenBefore;
984 if (Candidate & MainOpBIT)
985 return I->getOpcode();
986 if (Candidate & ShlBIT)
987 return Instruction::Shl;
988 if (Candidate & AShrBIT)
989 return Instruction::AShr;
990 if (Candidate & MulBIT)
991 return Instruction::Mul;
992 if (Candidate & AddBIT)
993 return Instruction::Add;
994 if (Candidate & SubBIT)
995 return Instruction::Sub;
996 if (Candidate & AndBIT)
997 return Instruction::And;
998 if (Candidate & OrBIT)
999 return Instruction::Or;
1000 if (Candidate & XorBIT)
1001 return Instruction::Xor;
1002 llvm_unreachable("Cannot find interchangeable instruction.");
1003 }
1004
1005 /// Return true if the instruction can be converted to \p Opcode.
1006 bool hasCandidateOpcode(unsigned Opcode) const {
1007 MaskType Candidate = Mask & SeenBefore;
1008 switch (Opcode) {
1009 case Instruction::Shl:
1010 return Candidate & ShlBIT;
1011 case Instruction::AShr:
1012 return Candidate & AShrBIT;
1013 case Instruction::Mul:
1014 return Candidate & MulBIT;
1015 case Instruction::Add:
1016 return Candidate & AddBIT;
1017 case Instruction::Sub:
1018 return Candidate & SubBIT;
1019 case Instruction::And:
1020 return Candidate & AndBIT;
1021 case Instruction::Or:
1022 return Candidate & OrBIT;
1023 case Instruction::Xor:
1024 return Candidate & XorBIT;
1025 case Instruction::LShr:
1026 case Instruction::FAdd:
1027 case Instruction::FSub:
1028 case Instruction::FMul:
1029 case Instruction::SDiv:
1030 case Instruction::UDiv:
1031 case Instruction::FDiv:
1032 case Instruction::SRem:
1033 case Instruction::URem:
1034 case Instruction::FRem:
1035 return false;
1036 default:
1037 break;
1038 }
1039 llvm_unreachable("Cannot find interchangeable instruction.");
1040 }
1041
1042 SmallVector<Value *> getOperand(const Instruction *To) const {
1043 unsigned ToOpcode = To->getOpcode();
1044 unsigned FromOpcode = I->getOpcode();
1045 if (FromOpcode == ToOpcode)
1046 return SmallVector<Value *>(I->operands());
1047 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1048 auto [CI, Pos] = isBinOpWithConstantInt(I);
1049 const APInt &FromCIValue = CI->getValue();
1050 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1051 APInt ToCIValue;
1052 switch (FromOpcode) {
1053 case Instruction::Shl:
1054 if (ToOpcode == Instruction::Mul) {
1055 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1056 FromCIValue.getZExtValue());
1057 } else {
1058 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1059 ToCIValue = ToOpcode == Instruction::And
1060 ? APInt::getAllOnes(FromCIValueBitWidth)
1061 : APInt::getZero(FromCIValueBitWidth);
1062 }
1063 break;
1064 case Instruction::Mul:
1065 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1066 if (ToOpcode == Instruction::Shl) {
1067 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1068 } else {
1069 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1070 ToCIValue = ToOpcode == Instruction::And
1071 ? APInt::getAllOnes(FromCIValueBitWidth)
1072 : APInt::getZero(FromCIValueBitWidth);
1073 }
1074 break;
1075 case Instruction::Add:
1076 case Instruction::Sub:
1077 if (FromCIValue.isZero()) {
1078 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1079 } else {
1080 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1081 "Cannot convert the instruction.");
1082 ToCIValue = FromCIValue;
1083 ToCIValue.negate();
1084 }
1085 break;
1086 case Instruction::And:
1087 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1088 ToCIValue = ToOpcode == Instruction::Mul
1089 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1090 : APInt::getZero(FromCIValueBitWidth);
1091 break;
1092 default:
1093 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1094 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1095 break;
1096 }
1097 Value *LHS = I->getOperand(1 - Pos);
1098 Constant *RHS =
1099 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1100 // constant + x cannot be -constant - x
1101 // instead, it should be x - -constant
1102 if (Pos == 1 ||
1103 (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
1104 return SmallVector<Value *>({LHS, RHS});
1105 return SmallVector<Value *>({RHS, LHS});
1106 }
1107 };
1108 InterchangeableInfo MainOp;
1109 InterchangeableInfo AltOp;
1110 bool isValidForAlternation(const Instruction *I) const {
1111 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1112 ::isValidForAlternation(I->getOpcode());
1113 }
1114 bool initializeAltOp(const Instruction *I) {
1115 if (AltOp.I)
1116 return true;
1117 if (!isValidForAlternation(I))
1118 return false;
1119 AltOp.I = I;
1120 return true;
1121 }
1122
1123public:
1124 BinOpSameOpcodeHelper(const Instruction *MainOp,
1125 const Instruction *AltOp = nullptr)
1126 : MainOp(MainOp), AltOp(AltOp) {
1127 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1128 }
1129 bool add(const Instruction *I) {
1130 assert(isa<BinaryOperator>(I) &&
1131 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1132 unsigned Opcode = I->getOpcode();
1133 MaskType OpcodeInMaskForm;
1134 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1135 switch (Opcode) {
1136 case Instruction::Shl:
1137 OpcodeInMaskForm = ShlBIT;
1138 break;
1139 case Instruction::AShr:
1140 OpcodeInMaskForm = AShrBIT;
1141 break;
1142 case Instruction::Mul:
1143 OpcodeInMaskForm = MulBIT;
1144 break;
1145 case Instruction::Add:
1146 OpcodeInMaskForm = AddBIT;
1147 break;
1148 case Instruction::Sub:
1149 OpcodeInMaskForm = SubBIT;
1150 break;
1151 case Instruction::And:
1152 OpcodeInMaskForm = AndBIT;
1153 break;
1154 case Instruction::Or:
1155 OpcodeInMaskForm = OrBIT;
1156 break;
1157 case Instruction::Xor:
1158 OpcodeInMaskForm = XorBIT;
1159 break;
1160 default:
1161 return MainOp.equal(Opcode) ||
1162 (initializeAltOp(I) && AltOp.equal(Opcode));
1163 }
1164 MaskType InterchangeableMask = OpcodeInMaskForm;
1165 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1166 if (CI) {
1167 constexpr MaskType CanBeAll =
1168 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1169 const APInt &CIValue = CI->getValue();
1170 switch (Opcode) {
1171 case Instruction::Shl:
1172 if (CIValue.ult(CIValue.getBitWidth()))
1173 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1174 break;
1175 case Instruction::Mul:
1176 if (CIValue.isOne()) {
1177 InterchangeableMask = CanBeAll;
1178 break;
1179 }
1180 if (CIValue.isPowerOf2())
1181 InterchangeableMask = MulBIT | ShlBIT;
1182 break;
1183 case Instruction::Add:
1184 case Instruction::Sub:
1185 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1186 break;
1187 case Instruction::And:
1188 if (CIValue.isAllOnes())
1189 InterchangeableMask = CanBeAll;
1190 break;
1191 default:
1192 if (CIValue.isZero())
1193 InterchangeableMask = CanBeAll;
1194 break;
1195 }
1196 }
1197 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1198 (initializeAltOp(I) &&
1199 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1200 }
1201 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1202 /// Checks if the list of potential opcodes includes \p Opcode.
1203 bool hasCandidateOpcode(unsigned Opcode) const {
1204 return MainOp.hasCandidateOpcode(Opcode);
1205 }
1206 bool hasAltOp() const { return AltOp.I; }
1207 unsigned getAltOpcode() const {
1208 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1209 }
1210 SmallVector<Value *> getOperand(const Instruction *I) const {
1211 return MainOp.getOperand(I);
1212 }
1213};
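// E.g., after add(%a = shl i32 %x, 1) and add(%b = mul i32 %y, 4) the common
// mask narrows to {Shl, Mul}; getMainOpcode() prefers Shl, and getOperand()
// converts %b's constant 4 into the shift amount 2 so that both values can be
// vectorized as shl.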
1214
1215/// Main data required for vectorization of instructions.
1216class InstructionsState {
1217 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1218 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1219 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1220 /// isAltShuffle).
1221 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1222 /// from getMainAltOpsNoStateVL.
1223 /// For those InstructionsState that use alternate instructions, the resulting
1224 /// vectorized output ultimately comes from a shufflevector. For example,
1225 /// given a vector list (VL):
1226 /// VL[0] = add i32 a, e
1227 /// VL[1] = sub i32 b, f
1228 /// VL[2] = add i32 c, g
1229 /// VL[3] = sub i32 d, h
1230 /// The vectorized result would be:
1231 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1232 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1233 /// result = shufflevector <4 x i32> intermediated_0,
1234 /// <4 x i32> intermediated_1,
1235 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1236 /// Since shufflevector is used in the final result, when calculating the cost
1237 /// (getEntryCost), we must account for the usage of shufflevector in
1238 /// GetVectorCost.
1239 Instruction *MainOp = nullptr;
1240 Instruction *AltOp = nullptr;
1241 /// Whether the instruction state represents copyable instructions.
1242 bool HasCopyables = false;
1243
1244public:
1245 Instruction *getMainOp() const {
1246 assert(valid() && "InstructionsState is invalid.");
1247 return MainOp;
1248 }
1249
1250 Instruction *getAltOp() const {
1251 assert(valid() && "InstructionsState is invalid.");
1252 return AltOp;
1253 }
1254
1255 /// The main/alternate opcodes for the list of instructions.
1256 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1257
1258 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1259
1260 /// Some of the instructions in the list have alternate opcodes.
1261 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1262
1263 /// Checks if the instruction matches either the main or alternate opcode.
1264 /// \returns
1265 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1266 /// to it
1267 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1268 /// it
1269 /// - nullptr if \param I cannot be matched or converted to either opcode
1270 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1271 assert(MainOp && "MainOp cannot be nullptr.");
1272 if (I->getOpcode() == MainOp->getOpcode())
1273 return MainOp;
1274 // Prefer AltOp instead of interchangeable instruction of MainOp.
1275 assert(AltOp && "AltOp cannot be nullptr.");
1276 if (I->getOpcode() == AltOp->getOpcode())
1277 return AltOp;
1278 if (!I->isBinaryOp())
1279 return nullptr;
1280 BinOpSameOpcodeHelper Converter(MainOp);
1281 if (!Converter.add(I) || !Converter.add(MainOp))
1282 return nullptr;
1283 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1284 BinOpSameOpcodeHelper AltConverter(AltOp);
1285 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1286 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1287 return AltOp;
1288 }
1289 if (Converter.hasAltOp() && !isAltShuffle())
1290 return nullptr;
1291 return Converter.hasAltOp() ? AltOp : MainOp;
1292 }
1293
1294 /// Checks if main/alt instructions are shift operations.
1295 bool isShiftOp() const {
1296 return getMainOp()->isShift() && getAltOp()->isShift();
1297 }
1298
1299 /// Checks if main/alt instructions are bitwise logic operations.
1300 bool isBitwiseLogicOp() const {
1301 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1302 }
1303
1304 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1305 bool isMulDivLikeOp() const {
1306 constexpr std::array<unsigned, 8> MulDiv = {
1307 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1308 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1309 Instruction::URem, Instruction::FRem};
1310 return is_contained(MulDiv, getOpcode()) &&
1311 is_contained(MulDiv, getAltOpcode());
1312 }
1313
1314 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1315 bool isAddSubLikeOp() const {
1316 constexpr std::array<unsigned, 4> AddSub = {
1317 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1318 Instruction::FSub};
1319 return is_contained(AddSub, getOpcode()) &&
1320 is_contained(AddSub, getAltOpcode());
1321 }
1322
1323 /// Checks if main/alt instructions are cmp operations.
1324 bool isCmpOp() const {
1325 return (getOpcode() == Instruction::ICmp ||
1326 getOpcode() == Instruction::FCmp) &&
1327 getAltOpcode() == getOpcode();
1328 }
1329
1330 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1331 bool valid() const { return MainOp && AltOp; }
1332
1333 explicit operator bool() const { return valid(); }
1334
1335 InstructionsState() = delete;
1336 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1337 bool HasCopyables = false)
1338 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1339 static InstructionsState invalid() { return {nullptr, nullptr}; }
1340
1341 /// Checks if the value is a copyable element.
1342 bool isCopyableElement(Value *V) const {
1343 assert(valid() && "InstructionsState is invalid.");
1344 if (!HasCopyables)
1345 return false;
1346 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1347 return false;
1348 auto *I = dyn_cast<Instruction>(V);
1349 if (!I)
1350 return !isa<PoisonValue>(V);
1351 if (I->getParent() != MainOp->getParent() &&
1354 return true;
1355 if (I->getOpcode() == MainOp->getOpcode())
1356 return false;
1357 if (!I->isBinaryOp())
1358 return true;
1359 BinOpSameOpcodeHelper Converter(MainOp);
1360 return !Converter.add(I) || !Converter.add(MainOp) ||
1361 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1362 }
1363
1364 /// Checks if the value is non-schedulable.
1365 bool isNonSchedulable(Value *V) const {
1366 assert(valid() && "InstructionsState is invalid.");
1367 auto *I = dyn_cast<Instruction>(V);
1368 if (!HasCopyables)
1369 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1370 doesNotNeedToBeScheduled(I);
1371 // MainOp for copyables is always schedulable to correctly identify
1372 // non-schedulable copyables.
1373 if (getMainOp() == V)
1374 return false;
1375 if (isCopyableElement(V)) {
1376 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1377 auto *I = dyn_cast<Instruction>(V);
1378 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1380 // If the copyable instruction comes after MainOp
1381 // (non-schedulable, but used in the block) - cannot vectorize
1382 // it, will possibly generate use before def.
1383 !MainOp->comesBefore(I));
1384 };
1385
1386 return IsNonSchedulableCopyableElement(V);
1387 }
1388 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1389 doesNotNeedToBeScheduled(I);
1390 }
1391
1392 /// Checks if the state represents copyable instructions.
1393 bool areInstructionsWithCopyableElements() const {
1394 assert(valid() && "InstructionsState is invalid.");
1395 return HasCopyables;
1396 }
1397};
1398
1399std::pair<Instruction *, SmallVector<Value *>>
1400convertTo(Instruction *I, const InstructionsState &S) {
1401 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1402 assert(SelectedOp && "Cannot convert the instruction.");
1403 if (I->isBinaryOp()) {
1404 BinOpSameOpcodeHelper Converter(I);
1405 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1406 }
1407 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1408}
1409
1410} // end anonymous namespace
1411
1412static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1413 const TargetLibraryInfo &TLI);
1414
1415/// Find an instruction with a specific opcode in VL.
1416/// \param VL Array of values to search through. Must contain only Instructions
1417/// and PoisonValues.
1418/// \param Opcode The instruction opcode to search for
1419/// \returns
1420/// - The first instruction found with matching opcode
1421/// - nullptr if no matching instruction is found
1422static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1423 unsigned Opcode) {
1424 for (Value *V : VL) {
1425 if (isa<PoisonValue>(V))
1426 continue;
1427 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1428 auto *Inst = cast<Instruction>(V);
1429 if (Inst->getOpcode() == Opcode)
1430 return Inst;
1431 }
1432 return nullptr;
1433}
1434
1435/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1436/// compatible instructions or constants, or just some other regular values.
1437static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1438 Value *Op1, const TargetLibraryInfo &TLI) {
1439 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1440 (isConstant(BaseOp1) && isConstant(Op1)) ||
1441 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1442 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1443 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1444 getSameOpcode({BaseOp0, Op0}, TLI) ||
1445 getSameOpcode({BaseOp1, Op1}, TLI);
1446}
1447
1448/// \returns true if a compare instruction \p CI has similar "look" and
1449/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1450/// swapped, false otherwise.
1451static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1452 const TargetLibraryInfo &TLI) {
1453 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1454 "Assessing comparisons of different types?");
1455 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1456 CmpInst::Predicate Pred = CI->getPredicate();
1457 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1458
1459 Value *BaseOp0 = BaseCI->getOperand(0);
1460 Value *BaseOp1 = BaseCI->getOperand(1);
1461 Value *Op0 = CI->getOperand(0);
1462 Value *Op1 = CI->getOperand(1);
1463
1464 return (BasePred == Pred &&
1465 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1466 (BasePred == SwappedPred &&
1467 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1468}
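// E.g., %p = icmp sgt i32 %a, %b and %q = icmp slt i32 %b, %a are considered
// the same comparison here: %q's predicate swapped is sgt and its operands are
// swapped relative to %p.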
1469
1470/// \returns analysis of the Instructions in \p VL described in
1471/// InstructionsState, the Opcode that we suppose the whole list
1472/// could be vectorized even if its structure is diverse.
1473static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1474 const TargetLibraryInfo &TLI) {
1475 // Make sure these are all Instructions.
1476 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1477 return InstructionsState::invalid();
1478
1479 auto *It = find_if(VL, IsaPred<Instruction>);
1480 if (It == VL.end())
1481 return InstructionsState::invalid();
1482
1483 Instruction *MainOp = cast<Instruction>(*It);
1484 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1485 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1486 (VL.size() == 2 && InstCnt < 2))
1487 return InstructionsState::invalid();
1488
1489 bool IsCastOp = isa<CastInst>(MainOp);
1490 bool IsBinOp = isa<BinaryOperator>(MainOp);
1491 bool IsCmpOp = isa<CmpInst>(MainOp);
1492 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1493 : CmpInst::BAD_ICMP_PREDICATE;
1494 Instruction *AltOp = MainOp;
1495 unsigned Opcode = MainOp->getOpcode();
1496 unsigned AltOpcode = Opcode;
1497
1498 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1499 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1500 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1501 UniquePreds.insert(BasePred);
1502 UniqueNonSwappedPreds.insert(BasePred);
1503 for (Value *V : VL) {
1504 auto *I = dyn_cast<CmpInst>(V);
1505 if (!I)
1506 return false;
1507 CmpInst::Predicate CurrentPred = I->getPredicate();
1508 CmpInst::Predicate SwappedCurrentPred =
1509 CmpInst::getSwappedPredicate(CurrentPred);
1510 UniqueNonSwappedPreds.insert(CurrentPred);
1511 if (!UniquePreds.contains(CurrentPred) &&
1512 !UniquePreds.contains(SwappedCurrentPred))
1513 UniquePreds.insert(CurrentPred);
1514 }
1515 // If the total number of predicates is > 2, but only 2 remain once swapped
1516 // predicates are treated as compatible, consider the swappable predicates
1517 // as compatible opcodes rather than alternates.
1518 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1519 }();
1520 // Check for one alternate opcode from another BinaryOperator.
1521 // TODO - generalize to support all operators (types, calls etc.).
1522 Intrinsic::ID BaseID = 0;
1523 SmallVector<VFInfo> BaseMappings;
1524 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1525 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1526 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1527 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1528 return InstructionsState::invalid();
1529 }
1530 bool AnyPoison = InstCnt != VL.size();
1531 // Check MainOp too to be sure that it matches the requirements for the
1532 // instructions.
1533 for (Value *V : iterator_range(It, VL.end())) {
1534 auto *I = dyn_cast<Instruction>(V);
1535 if (!I)
1536 continue;
1537
1538 // Cannot combine poison and divisions.
1539 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1540 // intrinsics/functions only.
1541 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1542 return InstructionsState::invalid();
1543 unsigned InstOpcode = I->getOpcode();
1544 if (IsBinOp && isa<BinaryOperator>(I)) {
1545 if (BinOpHelper.add(I))
1546 continue;
1547 } else if (IsCastOp && isa<CastInst>(I)) {
1548 Value *Op0 = MainOp->getOperand(0);
1549 Type *Ty0 = Op0->getType();
1550 Value *Op1 = I->getOperand(0);
1551 Type *Ty1 = Op1->getType();
1552 if (Ty0 == Ty1) {
1553 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1554 continue;
1555 if (Opcode == AltOpcode) {
1556 assert(isValidForAlternation(Opcode) &&
1557 isValidForAlternation(InstOpcode) &&
1558 "Cast isn't safe for alternation, logic needs to be updated!");
1559 AltOpcode = InstOpcode;
1560 AltOp = I;
1561 continue;
1562 }
1563 }
1564 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1565 auto *BaseInst = cast<CmpInst>(MainOp);
1566 Type *Ty0 = BaseInst->getOperand(0)->getType();
1567 Type *Ty1 = Inst->getOperand(0)->getType();
1568 if (Ty0 == Ty1) {
1569 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1570 assert(InstOpcode == AltOpcode &&
1571 "Alternate instructions are only supported by BinaryOperator "
1572 "and CastInst.");
1573 // Check for compatible operands. If the corresponding operands are not
1574 // compatible - need to perform alternate vectorization.
1575 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1576 CmpInst::Predicate SwappedCurrentPred =
1577 CmpInst::getSwappedPredicate(CurrentPred);
1578
1579 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1580 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1581 continue;
1582
1583 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1584 continue;
1585 auto *AltInst = cast<CmpInst>(AltOp);
1586 if (MainOp != AltOp) {
1587 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1588 continue;
1589 } else if (BasePred != CurrentPred) {
1590 assert(
1591 isValidForAlternation(InstOpcode) &&
1592 "CmpInst isn't safe for alternation, logic needs to be updated!");
1593 AltOp = I;
1594 continue;
1595 }
1596 CmpInst::Predicate AltPred = AltInst->getPredicate();
1597 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1598 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1599 continue;
1600 }
1601 } else if (InstOpcode == Opcode) {
1602 assert(InstOpcode == AltOpcode &&
1603 "Alternate instructions are only supported by BinaryOperator and "
1604 "CastInst.");
1605 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1606 if (Gep->getNumOperands() != 2 ||
1607 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1608 return InstructionsState::invalid();
1609 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1610 if (!isVectorLikeInstWithConstOps(EI))
1611 return InstructionsState::invalid();
1612 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1613 auto *BaseLI = cast<LoadInst>(MainOp);
1614 if (!LI->isSimple() || !BaseLI->isSimple())
1615 return InstructionsState::invalid();
1616 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1617 auto *CallBase = cast<CallInst>(MainOp);
1618 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1619 return InstructionsState::invalid();
1620 if (Call->hasOperandBundles() &&
1621 (!CallBase->hasOperandBundles() ||
1622 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1623 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1624 CallBase->op_begin() +
1625 CallBase->getBundleOperandsStartIndex())))
1626 return InstructionsState::invalid();
1627 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1628 if (ID != BaseID)
1629 return InstructionsState::invalid();
1630 if (!ID) {
1631 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1632 if (Mappings.size() != BaseMappings.size() ||
1633 Mappings.front().ISA != BaseMappings.front().ISA ||
1634 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1635 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1636 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1637 Mappings.front().Shape.Parameters !=
1638 BaseMappings.front().Shape.Parameters)
1639 return InstructionsState::invalid();
1640 }
1641 }
1642 continue;
1643 }
1644 return InstructionsState::invalid();
1645 }
1646
1647 if (IsBinOp) {
1648 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1649 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1650 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1651 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1652 }
1653 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1654 "Incorrect implementation of allSameOpcode.");
1655 InstructionsState S(MainOp, AltOp);
1656 assert(all_of(VL,
1657 [&](Value *V) {
1658 return isa<PoisonValue>(V) ||
1659 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1660 }) &&
1661 "Invalid InstructionsState.");
1662 return S;
1663}
1664
1665/// \returns true if all of the values in \p VL have the same type or false
1666/// otherwise.
1667 static bool allSameType(ArrayRef<Value *> VL) {
1668 Type *Ty = VL.consume_front()->getType();
1669 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1670}
1671
1672 /// \returns True if the in-tree use also needs an extract. This refers to a
1673 /// possible scalar operand in the vectorized instruction.
1674static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1675 TargetLibraryInfo *TLI,
1676 const TargetTransformInfo *TTI) {
1677 if (!UserInst)
1678 return false;
1679 unsigned Opcode = UserInst->getOpcode();
1680 switch (Opcode) {
1681 case Instruction::Load: {
1682 LoadInst *LI = cast<LoadInst>(UserInst);
1683 return (LI->getPointerOperand() == Scalar);
1684 }
1685 case Instruction::Store: {
1686 StoreInst *SI = cast<StoreInst>(UserInst);
1687 return (SI->getPointerOperand() == Scalar);
1688 }
1689 case Instruction::Call: {
1690 CallInst *CI = cast<CallInst>(UserInst);
1691 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1692 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1693 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1694 Arg.value().get() == Scalar;
1695 });
1696 }
1697 default:
1698 return false;
1699 }
1700}
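// Illustrative example (not part of the original source): for a user such as
//   %v = load i32, ptr %p
// where the vectorized Scalar is %p, the in-tree user still consumes %p as a
// scalar pointer operand, so doesInTreeUserNeedToExtract returns true and an
// extract is required. For a plain binary user such as `add i32 %x, %y` the
// switch falls through to the default case and no scalar extract is needed.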
1701
1702 /// \returns the AA location that is being accessed by the instruction.
1703 static MemoryLocation getLocation(Instruction *I) {
1704 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1705 return MemoryLocation::get(SI);
1706 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1707 return MemoryLocation::get(LI);
1708 return MemoryLocation();
1709}
1710
1711/// \returns True if the instruction is not a volatile or atomic load/store.
1712static bool isSimple(Instruction *I) {
1713 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1714 return LI->isSimple();
1715 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1716 return SI->isSimple();
1717 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1718 return !MI->isVolatile();
1719 return true;
1720}
1721
1722/// Shuffles \p Mask in accordance with the given \p SubMask.
1723/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1724/// one but two input vectors.
1725static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1726 bool ExtendingManyInputs = false) {
1727 if (SubMask.empty())
1728 return;
1729 assert(
1730 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1731 // Check if input scalars were extended to match the size of other node.
1732 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1733 "SubMask with many inputs support must be larger than the mask.");
1734 if (Mask.empty()) {
1735 Mask.append(SubMask.begin(), SubMask.end());
1736 return;
1737 }
1738 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1739 int TermValue = std::min(Mask.size(), SubMask.size());
1740 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1741 if (SubMask[I] == PoisonMaskElem ||
1742 (!ExtendingManyInputs &&
1743 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1744 continue;
1745 NewMask[I] = Mask[SubMask[I]];
1746 }
1747 Mask.swap(NewMask);
1748}
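// Worked example (illustrative, not part of the original source):
//   Mask    = {2, 3, 0, 1}
//   SubMask = {1, 0, 3, 2}
// With ExtendingManyInputs == false, NewMask[I] = Mask[SubMask[I]], so the
// result is {3, 2, 1, 0}, i.e. the composition of the two permutations.
// Poison elements in SubMask stay poison in the result.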
1749
1750/// Order may have elements assigned special value (size) which is out of
1751 /// bounds. Such indices only appear in places which correspond to undef values
1752 /// (see canReuseExtract for details) and are used in order to avoid letting
1753 /// undef values affect the operands ordering.
1754 /// The first loop below simply finds all unused indices and then the next loop
1755 /// nest assigns these indices to the undef value positions.
1756/// As an example below Order has two undef positions and they have assigned
1757/// values 3 and 7 respectively:
1758/// before: 6 9 5 4 9 2 1 0
1759/// after: 6 3 5 4 7 2 1 0
1760 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1761 const size_t Sz = Order.size();
1762 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1763 SmallBitVector MaskedIndices(Sz);
1764 for (unsigned I = 0; I < Sz; ++I) {
1765 if (Order[I] < Sz)
1766 UnusedIndices.reset(Order[I]);
1767 else
1768 MaskedIndices.set(I);
1769 }
1770 if (MaskedIndices.none())
1771 return;
1772 assert(UnusedIndices.count() == MaskedIndices.count() &&
1773 "Non-synced masked/available indices.");
1774 int Idx = UnusedIndices.find_first();
1775 int MIdx = MaskedIndices.find_first();
1776 while (MIdx >= 0) {
1777 assert(Idx >= 0 && "Indices must be synced.");
1778 Order[MIdx] = Idx;
1779 Idx = UnusedIndices.find_next(Idx);
1780 MIdx = MaskedIndices.find_next(MIdx);
1781 }
1782}
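// Illustrative sketch (not part of the original source): the same fix-up idea
// expressed with plain std:: containers; assumes <vector> and <cstddef> are
// available. The helper name is hypothetical and exists only for this example.
static void exampleFixupOrderingIndices(std::vector<unsigned> &Order) {
  const std::size_t Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  // Mark the in-bounds indices that are already taken.
  for (unsigned Idx : Order)
    if (Idx < Sz)
      Used[Idx] = true;
  // Hand out the remaining unused indices to the out-of-bounds (undef) slots.
  // The number of free indices always matches the number of such slots.
  std::size_t Free = 0;
  for (unsigned &Idx : Order) {
    if (Idx < Sz)
      continue;
    while (Free < Sz && Used[Free])
      ++Free;
    Idx = static_cast<unsigned>(Free);
    Used[Free] = true;
  }
}
// For the example in the comment above, {6, 9, 5, 4, 9, 2, 1, 0} becomes
// {6, 3, 5, 4, 7, 2, 1, 0}.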
1783
1784/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1785/// Opcode1.
1787 unsigned Opcode0, unsigned Opcode1) {
1788 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1789 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1790 for (unsigned Lane : seq<unsigned>(VL.size())) {
1791 if (isa<PoisonValue>(VL[Lane]))
1792 continue;
1793 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1794 OpcodeMask.set(Lane * ScalarTyNumElements,
1795 Lane * ScalarTyNumElements + ScalarTyNumElements);
1796 }
1797 return OpcodeMask;
1798}
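// Worked example (illustrative, not part of the original source): for
// VL = {add, sub, add, sub} with Opcode0 = Add, Opcode1 = Sub and a scalar
// element type (one element per lane), lanes 1 and 3 (the sub lanes) have
// their bits set in the returned bitset, while lanes 0 and 2 remain clear.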
1799
1800/// Replicates the given \p Val \p VF times.
1802 unsigned VF) {
1803 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1804 "Expected scalar constants.");
1805 SmallVector<Constant *> NewVal(Val.size() * VF);
1806 for (auto [I, V] : enumerate(Val))
1807 std::fill_n(NewVal.begin() + I * VF, VF, V);
1808 return NewVal;
1809}
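// Worked example (illustrative, not part of the original source): for
// Val = {C0, C1} and VF = 3 the result is {C0, C0, C0, C1, C1, C1}: each
// scalar constant is repeated VF times in place.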
1810
1811namespace llvm {
1812
1813 static void inversePermutation(ArrayRef<unsigned> Indices,
1814 SmallVectorImpl<int> &Mask) {
1815 Mask.clear();
1816 const unsigned E = Indices.size();
1817 Mask.resize(E, PoisonMaskElem);
1818 for (unsigned I = 0; I < E; ++I)
1819 Mask[Indices[I]] = I;
1820}
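// Worked example (illustrative, not part of the original source): for
// Indices = {2, 0, 1} the resulting Mask is {1, 2, 0}, since Mask[Indices[I]]
// is set to I. Applying Mask after the original permutation restores the
// identity order.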
1821
1822/// Reorders the list of scalars in accordance with the given \p Mask.
1823 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1824 ArrayRef<int> Mask) {
1825 assert(!Mask.empty() && "Expected non-empty mask.");
1826 SmallVector<Value *> Prev(Scalars.size(),
1827 PoisonValue::get(Scalars.front()->getType()));
1828 Prev.swap(Scalars);
1829 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1830 if (Mask[I] != PoisonMaskElem)
1831 Scalars[Mask[I]] = Prev[I];
1832}
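// Worked example (illustrative, not part of the original source): for
// Scalars = {a, b, c, d} and Mask = {3, 0, 1, 2}, element I moves to position
// Mask[I], giving {b, c, d, a}. Positions whose mask element is poison keep a
// poison placeholder.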
1833
1834/// Checks if the provided value does not require scheduling. It does not
1835/// require scheduling if this is not an instruction or it is an instruction
1836/// that does not read/write memory and all operands are either not instructions
1837/// or phi nodes or instructions from different blocks.
1838 static bool areAllOperandsNonInsts(Value *V) {
1839 auto *I = dyn_cast<Instruction>(V);
1840 if (!I)
1841 return true;
1842 return !mayHaveNonDefUseDependency(*I) &&
1843 all_of(I->operands(), [I](Value *V) {
1844 auto *IO = dyn_cast<Instruction>(V);
1845 if (!IO)
1846 return true;
1847 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1848 });
1849}
1850
1851/// Checks if the provided value does not require scheduling. It does not
1852/// require scheduling if this is not an instruction or it is an instruction
1853/// that does not read/write memory and all users are phi nodes or instructions
1854 /// from different blocks.
1855static bool isUsedOutsideBlock(Value *V) {
1856 auto *I = dyn_cast<Instruction>(V);
1857 if (!I)
1858 return true;
1859 // Limits the number of uses to save compile time.
1860 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1861 all_of(I->users(), [I](User *U) {
1862 auto *IU = dyn_cast<Instruction>(U);
1863 if (!IU)
1864 return true;
1865 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1866 });
1867}
1868
1869/// Checks if the specified value does not require scheduling. It does not
1870/// require scheduling if all operands and all users do not need to be scheduled
1871/// in the current basic block.
1872 static bool doesNotNeedToBeScheduled(Value *V) {
1873 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1874 }
1875
1876/// Checks if the specified array of instructions does not require scheduling.
1877 /// It is so if all instructions either have operands that do not require
1878 /// scheduling, or have users that do not require scheduling because they are
1879 /// phis or reside in other basic blocks.
1880 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1881 return !VL.empty() &&
1882 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1883 }
1884
1885 /// Returns true if the widened type of \p Ty elements with size \p Sz represents
1886 /// a full vector type, i.e. adding an extra element results in extra parts upon type
1887/// legalization.
1888 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1889 unsigned Sz) {
1890 if (Sz <= 1)
1891 return false;
1892 if (!isValidElementType(Ty))
1893 return false;
1894 if (has_single_bit(Sz))
1895 return true;
1896 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1897 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1898 Sz % NumParts == 0;
1899}
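// Illustrative example (assumes a target with 128-bit vector registers; not
// part of the original source): for i32 elements, Sz = 8 is accepted because
// it is a power of two, and Sz = 12 is accepted only if <12 x i32> legalizes
// into 3 parts of 4 elements each (12 % 3 == 0 and 4 is a power of two);
// Sz = 7 is rejected because adding one more element would not add parts, so
// the widened type does not fill whole registers.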
1900
1901 /// Returns the number of parts the type \p VecTy will be split into at the
1902 /// codegen phase. If the type is going to be scalarized or does not use whole
1903/// registers, returns 1.
1904static unsigned
1905 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1906 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1907 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1908 if (NumParts == 0 || NumParts >= Limit)
1909 return 1;
1910 unsigned Sz = getNumElements(VecTy);
1911 if (NumParts >= Sz || Sz % NumParts != 0 ||
1912 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1913 return 1;
1914 return NumParts;
1915}
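// Illustrative example (assumes a target with 128-bit vector registers; not
// part of the original source): <8 x i32> would be reported as 2 parts, while
// a type that gets scalarized, exceeds the Limit, or does not evenly fill
// whole registers falls back to 1.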
1916
1917namespace slpvectorizer {
1918
1919/// Bottom Up SLP Vectorizer.
1920class BoUpSLP {
1921 class TreeEntry;
1922 class ScheduleEntity;
1923 class ScheduleData;
1924 class ScheduleCopyableData;
1925 class ScheduleBundle;
1926 class ShuffleCostEstimator;
1927 class ShuffleInstructionBuilder;
1928
1929 /// If we decide to generate strided load / store, this struct contains all
1930 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1931 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1932 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1933 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1934 /// the size of an element of the FixedVectorType.
1935 struct StridedPtrInfo {
1936 Value *StrideVal = nullptr;
1937 const SCEV *StrideSCEV = nullptr;
1938 FixedVectorType *Ty = nullptr;
1939 };
1940 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1941
1942public:
1943 /// Tracks the state we can represent the loads in the given sequence.
1944 enum class LoadsState {
1945 Gather,
1946 Vectorize,
1947 ScatterVectorize,
1948 StridedVectorize,
1949 CompressVectorize
1950 };
1951
1958
1960 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1962 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1963 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1964 AC(AC), DB(DB), DL(DL), ORE(ORE),
1965 Builder(Se->getContext(), TargetFolder(*DL)) {
1966 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1967 // Use the vector register size specified by the target unless overridden
1968 // by a command-line option.
1969 // TODO: It would be better to limit the vectorization factor based on
1970 // data type rather than just register size. For example, x86 AVX has
1971 // 256-bit registers, but it does not support integer operations
1972 // at that width (that requires AVX2).
1973 if (MaxVectorRegSizeOption.getNumOccurrences())
1974 MaxVecRegSize = MaxVectorRegSizeOption;
1975 else
1976 MaxVecRegSize =
1977 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1978 .getFixedValue();
1979
1980 if (MinVectorRegSizeOption.getNumOccurrences())
1981 MinVecRegSize = MinVectorRegSizeOption;
1982 else
1983 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1984 }
1985
1986 /// Vectorize the tree that starts with the elements in \p VL.
1987 /// Returns the vectorized root.
1988 Value *vectorizeTree();
1989
1990 /// Vectorize the tree but with the list of externally used values \p
1991 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1992 /// generated extractvalue instructions.
1993 Value *vectorizeTree(
1994 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1995 Instruction *ReductionRoot = nullptr,
1996 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1997
1998 /// \returns the cost incurred by unwanted spills and fills, caused by
1999 /// holding live values over call sites.
2000 InstructionCost getSpillCost();
2001
2002 /// \returns the vectorization cost of the subtree that starts at \p VL.
2003 /// A negative number means that this is profitable.
2004 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2005 InstructionCost ReductionCost = TTI::TCC_Free);
2006
2007 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2008 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2009 void buildTree(ArrayRef<Value *> Roots,
2010 const SmallDenseSet<Value *> &UserIgnoreLst);
2011
2012 /// Construct a vectorizable tree that starts at \p Roots.
2013 void buildTree(ArrayRef<Value *> Roots);
2014
2015 /// Return the scalars of the root node.
2016 ArrayRef<Value *> getRootNodeScalars() const {
2017 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2018 return VectorizableTree.front()->Scalars;
2019 }
2020
2021 /// Returns the type/is-signed info for the root node in the graph without
2022 /// casting.
2023 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2024 const TreeEntry &Root = *VectorizableTree.front();
2025 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2026 !Root.Scalars.front()->getType()->isIntegerTy())
2027 return std::nullopt;
2028 auto It = MinBWs.find(&Root);
2029 if (It != MinBWs.end())
2030 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2031 It->second.first),
2032 It->second.second);
2033 if (Root.getOpcode() == Instruction::ZExt ||
2034 Root.getOpcode() == Instruction::SExt)
2035 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2036 Root.getOpcode() == Instruction::SExt);
2037 return std::nullopt;
2038 }
2039
2040 /// Checks if the root graph node can be emitted with narrower bitwidth at
2041 /// codegen and returns its signedness, if so.
2042 bool isSignedMinBitwidthRootNode() const {
2043 return MinBWs.at(VectorizableTree.front().get()).second;
2044 }
2045
2046 /// Returns the reduction type after minbitwidth analysis.
2047 FixedVectorType *getReductionType() const {
2048 if (ReductionBitWidth == 0 ||
2049 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2050 ReductionBitWidth >=
2051 DL->getTypeSizeInBits(
2052 VectorizableTree.front()->Scalars.front()->getType()))
2053 return getWidenedType(
2054 VectorizableTree.front()->Scalars.front()->getType(),
2055 VectorizableTree.front()->getVectorFactor());
2056 return getWidenedType(
2057 IntegerType::get(
2058 VectorizableTree.front()->Scalars.front()->getContext(),
2059 ReductionBitWidth),
2060 VectorizableTree.front()->getVectorFactor());
2061 }
2062
2063 /// Builds external uses of the vectorized scalars, i.e. the list of
2064 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2065 /// ExternallyUsedValues contains additional list of external uses to handle
2066 /// vectorization of reductions.
2067 void
2068 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2069
2070 /// Transforms graph nodes to target specific representations, if profitable.
2071 void transformNodes();
2072
2073 /// Clear the internal data structures that are created by 'buildTree'.
2074 void deleteTree() {
2075 VectorizableTree.clear();
2076 ScalarToTreeEntries.clear();
2077 OperandsToTreeEntry.clear();
2078 ScalarsInSplitNodes.clear();
2079 MustGather.clear();
2080 NonScheduledFirst.clear();
2081 EntryToLastInstruction.clear();
2082 LoadEntriesToVectorize.clear();
2083 IsGraphTransformMode = false;
2084 GatheredLoadsEntriesFirst.reset();
2085 CompressEntryToData.clear();
2086 ExternalUses.clear();
2087 ExternalUsesAsOriginalScalar.clear();
2088 ExternalUsesWithNonUsers.clear();
2089 for (auto &Iter : BlocksSchedules) {
2090 BlockScheduling *BS = Iter.second.get();
2091 BS->clear();
2092 }
2093 MinBWs.clear();
2094 ReductionBitWidth = 0;
2095 BaseGraphSize = 1;
2096 CastMaxMinBWSizes.reset();
2097 ExtraBitWidthNodes.clear();
2098 InstrElementSize.clear();
2099 UserIgnoreList = nullptr;
2100 PostponedGathers.clear();
2101 ValueToGatherNodes.clear();
2102 }
2103
2104 unsigned getTreeSize() const { return VectorizableTree.size(); }
2105
2106 /// Returns the base graph size, before any transformations.
2107 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2108
2109 /// Perform LICM and CSE on the newly generated gather sequences.
2110 void optimizeGatherSequence();
2111
2112 /// Does this non-empty order represent an identity order? Identity
2113 /// should be represented as an empty order, so this is used to
2114 /// decide if we can canonicalize a computed order. Undef elements
2115 /// (represented as size) are ignored.
2116 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2117 assert(!Order.empty() && "expected non-empty order");
2118 const unsigned Sz = Order.size();
2119 return all_of(enumerate(Order), [&](const auto &P) {
2120 return P.value() == P.index() || P.value() == Sz;
2121 });
2122 }
2123
2124 /// Checks if the specified gather tree entry \p TE can be represented as a
2125 /// shuffled vector entry + (possibly) permutation with other gathers. It
2126 /// implements the checks only for possibly ordered scalars (Loads,
2127 /// ExtractElement, ExtractValue), which can be part of the graph.
2128 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2129 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2130 /// node might be ignored.
2131 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2132 bool TopToBottom,
2133 bool IgnoreReorder);
2134
2135 /// Sort loads into increasing pointers offsets to allow greater clustering.
2136 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2137
2138 /// Gets reordering data for the given tree entry. If the entry is vectorized
2139 /// - just return ReorderIndices, otherwise check if the scalars can be
2140 /// reordered and return the most optimal order.
2141 /// \return std::nullopt if ordering is not important, empty order, if
2142 /// identity order is important, or the actual order.
2143 /// \param TopToBottom If true, include the order of vectorized stores and
2144 /// insertelement nodes, otherwise skip them.
2145 /// \param IgnoreReorder true, if the root node order can be ignored.
2146 std::optional<OrdersType>
2147 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2148
2149 /// Checks if it is profitable to reorder the current tree.
2150 /// If the tree does not contain many profitable reordable nodes, better to
2151 /// skip it to save compile time.
2152 bool isProfitableToReorder() const;
2153
2154 /// Reorders the current graph to the most profitable order starting from the
2155 /// root node to the leaf nodes. The best order is chosen only from the nodes
2156 /// of the same size (vectorization factor). Smaller nodes are considered
2157 /// parts of subgraph with smaller VF and they are reordered independently. We
2158 /// can make it because we still need to extend smaller nodes to the wider VF
2159 /// and we can merge reordering shuffles with the widening shuffles.
2160 void reorderTopToBottom();
2161
2162 /// Reorders the current graph to the most profitable order starting from
2163 /// leaves to the root. It allows rotating small subgraphs and reducing the
2164 /// number of reshuffles if the leaf nodes use the same order. In this case we
2165 /// can merge the orders and just shuffle the user node instead of shuffling
2166 /// its operands. Plus, even if the leaf nodes have different orders, it allows
2167 /// us to sink reordering in the graph closer to the root node and merge it later
2168 /// during analysis.
2169 void reorderBottomToTop(bool IgnoreReorder = false);
2170
2171 /// \return The vector element size in bits to use when vectorizing the
2172 /// expression tree ending at \p V. If V is a store, the size is the width of
2173 /// the stored value. Otherwise, the size is the width of the largest loaded
2174 /// value reaching V. This method is used by the vectorizer to calculate
2175 /// vectorization factors.
2176 unsigned getVectorElementSize(Value *V);
2177
2178 /// Compute the minimum type sizes required to represent the entries in a
2179 /// vectorizable tree.
2180 void computeMinimumValueSizes();
2181
2182 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2183 unsigned getMaxVecRegSize() const {
2184 return MaxVecRegSize;
2185 }
2186
2187 // \returns minimum vector register size as set by cl::opt.
2188 unsigned getMinVecRegSize() const {
2189 return MinVecRegSize;
2190 }
2191
2192 unsigned getMinVF(unsigned Sz) const {
2193 return std::max(2U, getMinVecRegSize() / Sz);
2194 }
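// Illustrative example (not part of the original source): with a minimum
// vector register size of 128 bits and 32-bit elements, getMinVF(32) returns
// max(2, 128 / 32) = 4.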
2195
2196 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2197 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2198 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2199 return MaxVF ? MaxVF : UINT_MAX;
2200 }
2201
2202 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2203 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2204 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2205 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2206 ///
2207 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2208 unsigned canMapToVector(Type *T) const;
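// Illustrative example (not part of the original source): for the homogeneous
// aggregate type {[4 x i16], [4 x i16]} an isomorphic vector type exists, so
// canMapToVector would report 8 elements (i.e. <8 x i16>); an aggregate mixing
// i16 and float members has no such isomorphism and yields 0.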
2209
2210 /// \returns True if the VectorizableTree is both tiny and not fully
2211 /// vectorizable. We do not vectorize such trees.
2212 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2213
2214 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2215 /// It may happen, if all gather nodes are loads and they cannot be
2216 /// "clusterized". In this case even subgraphs cannot be vectorized more
2217 /// effectively than the base graph.
2218 bool isTreeNotExtendable() const;
2219
2220 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2221 /// can be load combined in the backend. Load combining may not be allowed in
2222 /// the IR optimizer, so we do not want to alter the pattern. For example,
2223 /// partially transforming a scalar bswap() pattern into vector code is
2224 /// effectively impossible for the backend to undo.
2225 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2226 /// may not be necessary.
2227 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2228
2229 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2230 /// can be load combined in the backend. Load combining may not be allowed in
2231 /// the IR optimizer, so we do not want to alter the pattern. For example,
2232 /// partially transforming a scalar bswap() pattern into vector code is
2233 /// effectively impossible for the backend to undo.
2234 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2235 /// may not be necessary.
2236 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2239 const DataLayout &DL, ScalarEvolution &SE,
2240 const bool IsAnyPointerUsedOutGraph, const int64_t Diff,
2241 StridedPtrInfo &SPtrInfo) const;
2242
2243 /// Checks if the given array of loads can be represented as a vectorized,
2244 /// scatter or just simple gather.
2245 /// \param VL list of loads.
2246 /// \param VL0 main load value.
2247 /// \param Order returned order of load instructions.
2248 /// \param PointerOps returned list of pointer operands.
2249 /// \param BestVF return best vector factor, if recursive check found better
2250 /// vectorization sequences rather than masked gather.
2251 /// \param TryRecursiveCheck used to check if long masked gather can be
2252 /// represented as a series of loads/insert subvector operations, if profitable.
2253 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2254 SmallVectorImpl<unsigned> &Order,
2255 SmallVectorImpl<Value *> &PointerOps,
2256 StridedPtrInfo &SPtrInfo,
2257 unsigned *BestVF = nullptr,
2258 bool TryRecursiveCheck = true) const;
2259
2260 /// Registers non-vectorizable sequence of loads
2261 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2262 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2263 }
2264
2265 /// Checks if the given loads sequence is known as not vectorizable
2266 template <typename T>
2268 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2269 }
2270
2272
2273 /// This structure holds any data we need about the edges being traversed
2274 /// during buildTreeRec(). We keep track of:
2275 /// (i) the user TreeEntry index, and
2276 /// (ii) the index of the edge.
2277 struct EdgeInfo {
2278 EdgeInfo() = default;
2279 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2280 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2281 /// The user TreeEntry.
2282 TreeEntry *UserTE = nullptr;
2283 /// The operand index of the use.
2284 unsigned EdgeIdx = UINT_MAX;
2285#ifndef NDEBUG
2286 friend inline raw_ostream &operator<<(raw_ostream &OS,
2287 const BoUpSLP::EdgeInfo &EI) {
2288 EI.dump(OS);
2289 return OS;
2290 }
2291 /// Debug print.
2292 void dump(raw_ostream &OS) const {
2293 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2294 << " EdgeIdx:" << EdgeIdx << "}";
2295 }
2296 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2297#endif
2298 bool operator == (const EdgeInfo &Other) const {
2299 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2300 }
2301
2302 operator bool() const { return UserTE != nullptr; }
2303 };
2304 friend struct DenseMapInfo<EdgeInfo>;
2305
2306 /// A helper class used for scoring candidates for two consecutive lanes.
2307 class LookAheadHeuristics {
2308 const TargetLibraryInfo &TLI;
2309 const DataLayout &DL;
2310 ScalarEvolution &SE;
2311 const BoUpSLP &R;
2312 int NumLanes; // Total number of lanes (aka vectorization factor).
2313 int MaxLevel; // The maximum recursion depth for accumulating score.
2314
2315 public:
2316 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2317 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2318 int MaxLevel)
2319 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2320 MaxLevel(MaxLevel) {}
2321
2322 // The hard-coded scores listed here are not very important, though it shall
2323 // be higher for better matches to improve the resulting cost. When
2324 // computing the scores of matching one sub-tree with another, we are
2325 // basically counting the number of values that are matching. So even if all
2326 // scores are set to 1, we would still get a decent matching result.
2327 // However, sometimes we have to break ties. For example we may have to
2328 // choose between matching loads vs matching opcodes. This is what these
2329 // scores are helping us with: they provide the order of preference. Also,
2330 // this is important if the scalar is externally used or used in another
2331 // tree entry node in the different lane.
2332
2333 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2334 static const int ScoreConsecutiveLoads = 4;
2335 /// The same load multiple times. This should have a better score than
2336 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2337 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2338 /// a vector load and 1.0 for a broadcast.
2339 static const int ScoreSplatLoads = 3;
2340 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2341 static const int ScoreReversedLoads = 3;
2342 /// A load candidate for masked gather.
2343 static const int ScoreMaskedGatherCandidate = 1;
2344 /// ExtractElementInst from same vector and consecutive indexes.
2345 static const int ScoreConsecutiveExtracts = 4;
2346 /// ExtractElementInst from same vector and reversed indices.
2347 static const int ScoreReversedExtracts = 3;
2348 /// Constants.
2349 static const int ScoreConstants = 2;
2350 /// Instructions with the same opcode.
2351 static const int ScoreSameOpcode = 2;
2352 /// Instructions with alt opcodes (e.g, add + sub).
2353 static const int ScoreAltOpcodes = 1;
2354 /// Identical instructions (a.k.a. splat or broadcast).
2355 static const int ScoreSplat = 1;
2356 /// Matching with an undef is preferable to failing.
2357 static const int ScoreUndef = 1;
2358 /// Score for failing to find a decent match.
2359 static const int ScoreFail = 0;
2360 /// Score if all users are vectorized.
2361 static const int ScoreAllUserVectorized = 1;
2362
2363 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2364 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2365 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2366 /// MainAltOps.
2367 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2368 ArrayRef<Value *> MainAltOps) const {
2369 if (!isValidElementType(V1->getType()) ||
2370 !isValidElementType(V2->getType()))
2371 return LookAheadHeuristics::ScoreFail;
2372
2373 if (V1 == V2) {
2374 if (isa<LoadInst>(V1)) {
2375 // Returns true if the users of V1 and V2 won't need to be extracted.
2376 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2377 // Bail out if we have too many uses to save compilation time.
2378 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2379 return false;
2380
2381 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2382 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2383 return U == U1 || U == U2 || R.isVectorized(U);
2384 });
2385 };
2386 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2387 };
2388 // A broadcast of a load can be cheaper on some targets.
2389 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2390 ElementCount::getFixed(NumLanes)) &&
2391 ((int)V1->getNumUses() == NumLanes ||
2392 AllUsersAreInternal(V1, V2)))
2393 return LookAheadHeuristics::ScoreSplatLoads;
2394 }
2395 return LookAheadHeuristics::ScoreSplat;
2396 }
2397
2398 auto CheckSameEntryOrFail = [&]() {
2399 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2400 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2401 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2402 !TEs2.empty() &&
2403 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2404 return LookAheadHeuristics::ScoreSplatLoads;
2405 }
2406 return LookAheadHeuristics::ScoreFail;
2407 };
2408
2409 auto *LI1 = dyn_cast<LoadInst>(V1);
2410 auto *LI2 = dyn_cast<LoadInst>(V2);
2411 if (LI1 && LI2) {
2412 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2413 !LI2->isSimple())
2414 return CheckSameEntryOrFail();
2415
2416 std::optional<int64_t> Dist = getPointersDiff(
2417 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2418 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2419 if (!Dist || *Dist == 0) {
2420 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2421 getUnderlyingObject(LI2->getPointerOperand()) &&
2422 R.TTI->isLegalMaskedGather(
2423 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2424 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2425 return CheckSameEntryOrFail();
2426 }
2427 // The distance is too large - still may be profitable to use masked
2428 // loads/gathers.
2429 if (std::abs(*Dist) > NumLanes / 2)
2430 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2431 // This still will detect consecutive loads, but we might have "holes"
2432 // in some cases. It is ok for non-power-2 vectorization and may produce
2433 // better results. It should not affect current vectorization.
2434 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2435 : LookAheadHeuristics::ScoreReversedLoads;
2436 }
2437
2438 auto *C1 = dyn_cast<Constant>(V1);
2439 auto *C2 = dyn_cast<Constant>(V2);
2440 if (C1 && C2)
2441 return LookAheadHeuristics::ScoreConstants;
2442
2443 // Consider constants and buildvector compatible.
2444 if ((C1 && isa<InsertElementInst>(V2)) ||
2445 (C2 && isa<InsertElementInst>(V1)))
2446 return LookAheadHeuristics::ScoreConstants;
2447
2448 // Extracts from consecutive indexes of the same vector better score as
2449 // the extracts could be optimized away.
2450 Value *EV1;
2451 ConstantInt *Ex1Idx;
2452 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2453 // Undefs are always profitable for extractelements.
2454 // Compiler can easily combine poison and extractelement <non-poison> or
2455 // undef and extractelement <poison>. But combining undef +
2456 // extractelement <non-poison-but-may-produce-poison> requires some
2457 // extra operations.
2458 if (isa<UndefValue>(V2))
2459 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2460 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2461 : LookAheadHeuristics::ScoreSameOpcode;
2462 Value *EV2 = nullptr;
2463 ConstantInt *Ex2Idx = nullptr;
2464 if (match(V2,
2466 m_Undef())))) {
2467 // Undefs are always profitable for extractelements.
2468 if (!Ex2Idx)
2469 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2470 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2471 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2472 if (EV2 == EV1) {
2473 int Idx1 = Ex1Idx->getZExtValue();
2474 int Idx2 = Ex2Idx->getZExtValue();
2475 int Dist = Idx2 - Idx1;
2476 // The distance is too large - still may be profitable to use
2477 // shuffles.
2478 if (std::abs(Dist) == 0)
2480 if (std::abs(Dist) > NumLanes / 2)
2484 }
2486 }
2487 return CheckSameEntryOrFail();
2488 }
2489
2490 auto *I1 = dyn_cast<Instruction>(V1);
2491 auto *I2 = dyn_cast<Instruction>(V2);
2492 if (I1 && I2) {
2493 if (I1->getParent() != I2->getParent())
2494 return CheckSameEntryOrFail();
2495 SmallVector<Value *, 4> Ops(MainAltOps);
2496 Ops.push_back(I1);
2497 Ops.push_back(I2);
2498 InstructionsState S = getSameOpcode(Ops, TLI);
2499 // Note: Only consider instructions with <= 2 operands to avoid
2500 // complexity explosion.
2501 if (S &&
2502 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2503 !S.isAltShuffle()) &&
2504 all_of(Ops, [&S](Value *V) {
2505 return isa<PoisonValue>(V) ||
2506 cast<Instruction>(V)->getNumOperands() ==
2507 S.getMainOp()->getNumOperands();
2508 }))
2509 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2510 : LookAheadHeuristics::ScoreSameOpcode;
2511 }
2512
2513 if (I1 && isa<PoisonValue>(V2))
2514 return LookAheadHeuristics::ScoreSameOpcode;
2515
2516 if (isa<UndefValue>(V2))
2517 return LookAheadHeuristics::ScoreUndef;
2518
2519 return CheckSameEntryOrFail();
2520 }
2521
2522 /// Go through the operands of \p LHS and \p RHS recursively until
2523 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2524 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2525 /// of \p U1 and \p U2), except at the beginning of the recursion where
2526 /// these are set to nullptr.
2527 ///
2528 /// For example:
2529 /// \verbatim
2530 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2531 /// \ / \ / \ / \ /
2532 /// + + + +
2533 /// G1 G2 G3 G4
2534 /// \endverbatim
2535 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2536 /// each level recursively, accumulating the score. It starts from matching
2537 /// the additions at level 0, then moves on to the loads (level 1). The
2538 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2539 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2540 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2541 /// Please note that the order of the operands does not matter, as we
2542 /// evaluate the score of all profitable combinations of operands. In
2543 /// other words the score of G1 and G4 is the same as G1 and G2. This
2544 /// heuristic is based on ideas described in:
2545 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2546 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2547 /// Luís F. W. Góes
2548 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2549 Instruction *U2, int CurrLevel,
2550 ArrayRef<Value *> MainAltOps) const {
2551
2552 // Get the shallow score of V1 and V2.
2553 int ShallowScoreAtThisLevel =
2554 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2555
2556 // If reached MaxLevel,
2557 // or if V1 and V2 are not instructions,
2558 // or if they are SPLAT,
2559 // or if they are not consecutive,
2560 // or if profitable to vectorize loads or extractelements, early return
2561 // the current cost.
2562 auto *I1 = dyn_cast<Instruction>(LHS);
2563 auto *I2 = dyn_cast<Instruction>(RHS);
2564 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2565 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2566 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2567 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2568 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2569 ShallowScoreAtThisLevel))
2570 return ShallowScoreAtThisLevel;
2571 assert(I1 && I2 && "Should have early exited.");
2572
2573 // Contains the I2 operand indexes that got matched with I1 operands.
2574 SmallSet<unsigned, 4> Op2Used;
2575
2576 // Recursion towards the operands of I1 and I2. We are trying all possible
2577 // operand pairs, and keeping track of the best score.
2578 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2579 OpIdx1 != NumOperands1; ++OpIdx1) {
2580 // Try to pair op1I with the best operand of I2.
2581 int MaxTmpScore = 0;
2582 unsigned MaxOpIdx2 = 0;
2583 bool FoundBest = false;
2584 // If I2 is commutative try all combinations.
2585 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2586 unsigned ToIdx = isCommutative(I2)
2587 ? I2->getNumOperands()
2588 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2589 assert(FromIdx <= ToIdx && "Bad index");
2590 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2591 // Skip operands already paired with OpIdx1.
2592 if (Op2Used.count(OpIdx2))
2593 continue;
2594 // Recursively calculate the cost at each level
2595 int TmpScore =
2596 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2597 I1, I2, CurrLevel + 1, {});
2598 // Look for the best score.
2599 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2600 TmpScore > MaxTmpScore) {
2601 MaxTmpScore = TmpScore;
2602 MaxOpIdx2 = OpIdx2;
2603 FoundBest = true;
2604 }
2605 }
2606 if (FoundBest) {
2607 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2608 Op2Used.insert(MaxOpIdx2);
2609 ShallowScoreAtThisLevel += MaxTmpScore;
2610 }
2611 }
2612 return ShallowScoreAtThisLevel;
2613 }
2614 };
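// Worked example for the look-ahead scoring above (illustrative, not part of
// the original source; scores are approximate): matching G1 = A[0] + B[0]
// with G2 = A[1] + B[1] first credits the two additions as ScoreSameOpcode,
// then recurses and credits {A[0], A[1]} and {B[0], B[1]} as consecutive
// loads, so the accumulated score is clearly higher than matching G1 with
// G3 = C[0] + D[0], where the operand pairs come from unrelated objects and
// contribute ScoreFail.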
2615 /// A helper data structure to hold the operands of a vector of instructions.
2616 /// This supports a fixed vector length for all operand vectors.
2617 class VLOperands {
2618 /// For each operand we need (i) the value, and (ii) the opcode that it
2619 /// would be attached to if the expression was in a left-linearized form.
2620 /// This is required to avoid illegal operand reordering.
2621 /// For example:
2622 /// \verbatim
2623 /// 0 Op1
2624 /// |/
2625 /// Op1 Op2 Linearized + Op2
2626 /// \ / ----------> |/
2627 /// - -
2628 ///
2629 /// Op1 - Op2 (0 + Op1) - Op2
2630 /// \endverbatim
2631 ///
2632 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2633 ///
2634 /// Another way to think of this is to track all the operations across the
2635 /// path from the operand all the way to the root of the tree and to
2636 /// calculate the operation that corresponds to this path. For example, the
2637 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2638 /// corresponding operation is a '-' (which matches the one in the
2639 /// linearized tree, as shown above).
2640 ///
2641 /// For lack of a better term, we refer to this operation as Accumulated
2642 /// Path Operation (APO).
2643 struct OperandData {
2644 OperandData() = default;
2645 OperandData(Value *V, bool APO, bool IsUsed)
2646 : V(V), APO(APO), IsUsed(IsUsed) {}
2647 /// The operand value.
2648 Value *V = nullptr;
2649 /// TreeEntries only allow a single opcode, or an alternate sequence of
2650 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2651 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2652 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2653 /// (e.g., Add/Mul)
2654 bool APO = false;
2655 /// Helper data for the reordering function.
2656 bool IsUsed = false;
2657 };
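// Illustrative example (not part of the original source): for Op1 - Op2, Op1
// has APO == false (it is attached to the implicit leading '+') and Op2 has
// APO == true; for Op1 + Op2 both operands have APO == false. The reordering
// logic below only exchanges operands that carry the same APO, which keeps
// the linearized expression legal.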
2658
2659 /// During operand reordering, we are trying to select the operand at lane
2660 /// that matches best with the operand at the neighboring lane. Our
2661 /// selection is based on the type of value we are looking for. For example,
2662 /// if the neighboring lane has a load, we need to look for a load that is
2663 /// accessing a consecutive address. These strategies are summarized in the
2664 /// 'ReorderingMode' enumerator.
2665 enum class ReorderingMode {
2666 Load, ///< Matching loads to consecutive memory addresses
2667 Opcode, ///< Matching instructions based on opcode (same or alternate)
2668 Constant, ///< Matching constants
2669 Splat, ///< Matching the same instruction multiple times (broadcast)
2670 Failed, ///< We failed to create a vectorizable group
2671 };
2672
2673 using OperandDataVec = SmallVector<OperandData, 2>;
2674
2675 /// A vector of operand vectors.
2676 SmallVector<OperandDataVec, 4> OpsVec;
2677 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2678 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2679 unsigned ArgSize = 0;
2680
2681 const TargetLibraryInfo &TLI;
2682 const DataLayout &DL;
2683 ScalarEvolution &SE;
2684 const BoUpSLP &R;
2685 const Loop *L = nullptr;
2686
2687 /// \returns the operand data at \p OpIdx and \p Lane.
2688 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2689 return OpsVec[OpIdx][Lane];
2690 }
2691
2692 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2693 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2694 return OpsVec[OpIdx][Lane];
2695 }
2696
2697 /// Clears the used flag for all entries.
2698 void clearUsed() {
2699 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2700 OpIdx != NumOperands; ++OpIdx)
2701 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2702 ++Lane)
2703 OpsVec[OpIdx][Lane].IsUsed = false;
2704 }
2705
2706 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2707 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2708 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2709 }
2710
2711 /// \param Lane lane of the operands under analysis.
2712 /// \param OpIdx operand index in \p Lane lane we're looking the best
2713 /// candidate for.
2714 /// \param Idx operand index of the current candidate value.
2715 /// \returns The additional score due to possible broadcasting of the
2716 /// elements in the lane. It is more profitable to have power-of-2 unique
2717 /// elements in the lane, it will be vectorized with higher probability
2718 /// after removing duplicates. Currently the SLP vectorizer supports only
2719 /// vectorization of a power-of-2 number of unique scalars.
2720 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2721 const SmallBitVector &UsedLanes) const {
2722 Value *IdxLaneV = getData(Idx, Lane).V;
2723 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2724 isa<ExtractElementInst>(IdxLaneV))
2725 return 0;
2726 SmallDenseMap<Value *, unsigned, 4> Uniques;
2727 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2728 if (Ln == Lane)
2729 continue;
2730 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2731 if (!isa<Instruction>(OpIdxLnV))
2732 return 0;
2733 Uniques.try_emplace(OpIdxLnV, Ln);
2734 }
2735 unsigned UniquesCount = Uniques.size();
2736 auto IdxIt = Uniques.find(IdxLaneV);
2737 unsigned UniquesCntWithIdxLaneV =
2738 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2739 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2740 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2741 unsigned UniquesCntWithOpIdxLaneV =
2742 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2743 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2744 return 0;
2745 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2746 UniquesCntWithOpIdxLaneV,
2747 UniquesCntWithOpIdxLaneV -
2748 bit_floor(UniquesCntWithOpIdxLaneV)) -
2749 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2750 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2751 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2752 }
2753
2754 /// \param Lane lane of the operands under analysis.
2755 /// \param OpIdx operand index in \p Lane lane we're looking the best
2756 /// candidate for.
2757 /// \param Idx operand index of the current candidate value.
2758 /// \returns The additional score for the scalar which users are all
2759 /// vectorized.
2760 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2761 Value *IdxLaneV = getData(Idx, Lane).V;
2762 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2763 // Do not care about number of uses for vector-like instructions
2764 // (extractelement/extractvalue with constant indices), they are extracts
2765 // themselves and already externally used. Vectorization of such
2766 // instructions does not add extra extractelement instruction, just may
2767 // remove it.
2768 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2769 isVectorLikeInstWithConstOps(OpIdxLaneV))
2770 return LookAheadHeuristics::ScoreAllUserVectorized;
2771 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2772 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2773 return 0;
2774 return R.areAllUsersVectorized(IdxLaneI)
2775 ? LookAheadHeuristics::ScoreAllUserVectorized
2776 : 0;
2777 }
2778
2779 /// Score scaling factor for fully compatible instructions but with
2780 /// different number of external uses. Allows better selection of the
2781 /// instructions with less external uses.
2782 static const int ScoreScaleFactor = 10;
2783
2784 /// \Returns the look-ahead score, which tells us how much the sub-trees
2785 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2786 /// score. This helps break ties in an informed way when we cannot decide on
2787 /// the order of the operands by just considering the immediate
2788 /// predecessors.
2789 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2790 int Lane, unsigned OpIdx, unsigned Idx,
2791 bool &IsUsed, const SmallBitVector &UsedLanes) {
2792 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2793 LookAheadMaxDepth);
2794 // Keep track of the instruction stack as we recurse into the operands
2795 // during the look-ahead score exploration.
2796 int Score =
2797 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2798 /*CurrLevel=*/1, MainAltOps);
2799 if (Score) {
2800 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2801 if (Score <= -SplatScore) {
2802 // Failed score.
2803 Score = 0;
2804 } else {
2805 Score += SplatScore;
2806 // Scale score to see the difference between different operands
2807 // and similar operands but all vectorized/not all vectorized
2808 // uses. It does not affect actual selection of the best
2809 // compatible operand in general, just allows to select the
2810 // operand with all vectorized uses.
2811 Score *= ScoreScaleFactor;
2812 Score += getExternalUseScore(Lane, OpIdx, Idx);
2813 IsUsed = true;
2814 }
2815 }
2816 return Score;
2817 }
2818
2819 /// Best defined scores per lanes between the passes. Used to choose the
2820 /// best operand (with the highest score) between the passes.
2821 /// The key - {Operand Index, Lane}.
2822 /// The value - the best score between the passes for the lane and the
2823 /// operand.
2825 BestScoresPerLanes;
2826
2827 // Search all operands in Ops[*][Lane] for the one that matches best
2828 // Ops[OpIdx][LastLane] and return its operand index.
2829 // If no good match can be found, return std::nullopt.
2830 std::optional<unsigned>
2831 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2832 ArrayRef<ReorderingMode> ReorderingModes,
2833 ArrayRef<Value *> MainAltOps,
2834 const SmallBitVector &UsedLanes) {
2835 unsigned NumOperands = getNumOperands();
2836
2837 // The operand of the previous lane at OpIdx.
2838 Value *OpLastLane = getData(OpIdx, LastLane).V;
2839
2840 // Our strategy mode for OpIdx.
2841 ReorderingMode RMode = ReorderingModes[OpIdx];
2842 if (RMode == ReorderingMode::Failed)
2843 return std::nullopt;
2844
2845 // The linearized opcode of the operand at OpIdx, Lane.
2846 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2847
2848 // The best operand index and its score.
2849 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2850 // are using the score to differentiate between the two.
2851 struct BestOpData {
2852 std::optional<unsigned> Idx;
2853 unsigned Score = 0;
2854 } BestOp;
2855 BestOp.Score =
2856 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2857 .first->second;
2858
2859 // Track if the operand must be marked as used. If the operand is set to
2860 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2861 // we may want to re-estimate the operands again on the following iterations.
2862 bool IsUsed = RMode == ReorderingMode::Splat ||
2863 RMode == ReorderingMode::Constant ||
2864 RMode == ReorderingMode::Load;
2865 // Iterate through all unused operands and look for the best.
2866 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2867 // Get the operand at Idx and Lane.
2868 OperandData &OpData = getData(Idx, Lane);
2869 Value *Op = OpData.V;
2870 bool OpAPO = OpData.APO;
2871
2872 // Skip already selected operands.
2873 if (OpData.IsUsed)
2874 continue;
2875
2876 // Skip if we are trying to move the operand to a position with a
2877 // different opcode in the linearized tree form. This would break the
2878 // semantics.
2879 if (OpAPO != OpIdxAPO)
2880 continue;
2881
2882 // Look for an operand that matches the current mode.
2883 switch (RMode) {
2884 case ReorderingMode::Load:
2885 case ReorderingMode::Opcode: {
2886 bool LeftToRight = Lane > LastLane;
2887 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2888 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2889 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2890 OpIdx, Idx, IsUsed, UsedLanes);
2891 if (Score > static_cast<int>(BestOp.Score) ||
2892 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2893 Idx == OpIdx)) {
2894 BestOp.Idx = Idx;
2895 BestOp.Score = Score;
2896 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2897 }
2898 break;
2899 }
2900 case ReorderingMode::Constant:
2901 if (isa<Constant>(Op) ||
2902 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2903 BestOp.Idx = Idx;
2904 if (isa<Constant>(Op)) {
2905 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2906 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2907 LookAheadHeuristics::ScoreConstants;
2908 }
2909 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2910 IsUsed = false;
2911 }
2912 break;
2913 case ReorderingMode::Splat:
2914 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2915 IsUsed = Op == OpLastLane;
2916 if (Op == OpLastLane) {
2917 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2918 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2919 LookAheadHeuristics::ScoreSplat;
2920 }
2921 BestOp.Idx = Idx;
2922 }
2923 break;
2924 case ReorderingMode::Failed:
2925 llvm_unreachable("Not expected Failed reordering mode.");
2926 }
2927 }
2928
2929 if (BestOp.Idx) {
2930 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2931 return BestOp.Idx;
2932 }
2933 // If we could not find a good match return std::nullopt.
2934 return std::nullopt;
2935 }
2936
2937 /// Helper for reorderOperandVecs.
2938 /// \returns the lane that we should start reordering from. This is the one
2939 /// which has the least number of operands that can freely move about, or is
2940 /// less profitable because it already has the most optimal set of operands.
2941 unsigned getBestLaneToStartReordering() const {
2942 unsigned Min = UINT_MAX;
2943 unsigned SameOpNumber = 0;
2944 // std::pair<unsigned, unsigned> is used to implement a simple voting
2945 // algorithm and choose the lane with the least number of operands that
2946 // can freely move about or less profitable because it already has the
2947 // most optimal set of operands. The first unsigned is a counter for
2948 // voting, the second unsigned is the counter of lanes with instructions
2949 // with same/alternate opcodes and same parent basic block.
2951 // Try to be closer to the original results, if we have multiple lanes
2952 // with same cost. If 2 lanes have the same cost, use the one with the
2953 // highest index.
2954 for (int I = getNumLanes(); I > 0; --I) {
2955 unsigned Lane = I - 1;
2956 OperandsOrderData NumFreeOpsHash =
2957 getMaxNumOperandsThatCanBeReordered(Lane);
2958 // Compare the number of operands that can move and choose the one with
2959 // the least number.
2960 if (NumFreeOpsHash.NumOfAPOs < Min) {
2961 Min = NumFreeOpsHash.NumOfAPOs;
2962 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2963 HashMap.clear();
2964 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2965 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2966 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2967 // Select the most optimal lane in terms of number of operands that
2968 // should be moved around.
2969 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2970 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2971 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2972 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2973 auto [It, Inserted] =
2974 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2975 if (!Inserted)
2976 ++It->second.first;
2977 }
2978 }
2979 // Select the lane with the minimum counter.
2980 unsigned BestLane = 0;
2981 unsigned CntMin = UINT_MAX;
2982 for (const auto &Data : reverse(HashMap)) {
2983 if (Data.second.first < CntMin) {
2984 CntMin = Data.second.first;
2985 BestLane = Data.second.second;
2986 }
2987 }
2988 return BestLane;
2989 }
2990
2991 /// Data structure that helps to reorder operands.
2992 struct OperandsOrderData {
2993 /// The best number of operands with the same APOs, which can be
2994 /// reordered.
2995 unsigned NumOfAPOs = UINT_MAX;
2996 /// Number of operands with the same/alternate instruction opcode and
2997 /// parent.
2998 unsigned NumOpsWithSameOpcodeParent = 0;
2999 /// Hash for the actual operands ordering.
3000 /// Used to count operands, actually their position id and opcode
3001 /// value. It is used in the voting mechanism to find the lane with the
3002 /// least number of operands that can freely move about or less profitable
3003 /// because it already has the most optimal set of operands. Can be
3004 /// replaced with SmallVector<unsigned> instead but hash code is faster
3005 /// and requires less memory.
3006 unsigned Hash = 0;
3007 };
3008 /// \returns the maximum number of operands that are allowed to be reordered
3009 /// for \p Lane and the number of compatible instructions(with the same
3010 /// parent/opcode). This is used as a heuristic for selecting the first lane
3011 /// to start operand reordering.
3012 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3013 unsigned CntTrue = 0;
3014 unsigned NumOperands = getNumOperands();
3015 // Operands with the same APO can be reordered. We therefore need to count
3016 // how many of them we have for each APO, like this: Cnt[APO] = x.
3017 // Since we only have two APOs, namely true and false, we can avoid using
3018 // a map. Instead we can simply count the number of operands that
3019 // correspond to one of them (in this case the 'true' APO), and calculate
3020 // the other by subtracting it from the total number of operands.
3021 // Operands with the same instruction opcode and parent are more
3022 // profitable since we don't need to move them in many cases; with high
3023 // probability such a lane can already be vectorized effectively.
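// For example, with 2 operands in the lane and APOs {true, false},
// CntTrue == 1 and the reported NumOfAPOs is max(1, 2 - 1) == 1.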
3024 bool AllUndefs = true;
3025 unsigned NumOpsWithSameOpcodeParent = 0;
3026 Instruction *OpcodeI = nullptr;
3027 BasicBlock *Parent = nullptr;
3028 unsigned Hash = 0;
3029 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3030 const OperandData &OpData = getData(OpIdx, Lane);
3031 if (OpData.APO)
3032 ++CntTrue;
3033 // Use Boyer-Moore majority voting for finding the majority opcode and
3034 // the number of times it occurs.
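// E.g. for operand opcodes {add, add, mul, add} in one lane (all in the
// same block), the surviving candidate is 'add' with a final count of 2
// (add:1, add:2, mul:1, add:2).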
3035 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3036 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3037 I->getParent() != Parent) {
3038 if (NumOpsWithSameOpcodeParent == 0) {
3039 NumOpsWithSameOpcodeParent = 1;
3040 OpcodeI = I;
3041 Parent = I->getParent();
3042 } else {
3043 --NumOpsWithSameOpcodeParent;
3044 }
3045 } else {
3046 ++NumOpsWithSameOpcodeParent;
3047 }
3048 }
3049 Hash = hash_combine(
3050 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3051 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3052 }
3053 if (AllUndefs)
3054 return {};
3055 OperandsOrderData Data;
3056 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3057 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3058 Data.Hash = Hash;
3059 return Data;
3060 }
3061
3062 /// Go through the instructions in VL and append their operands.
3063 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3064 const InstructionsState &S) {
3065 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3066 assert((empty() || all_of(Operands,
3067 [this](const ValueList &VL) {
3068 return VL.size() == getNumLanes();
3069 })) &&
3070 "Expected same number of lanes");
3071 assert(S.valid() && "InstructionsState is invalid.");
3072 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3073 // arguments to the intrinsic produces the same result.
3074 Instruction *MainOp = S.getMainOp();
3075 unsigned NumOperands = MainOp->getNumOperands();
3076 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3077 OpsVec.resize(ArgSize);
3078 unsigned NumLanes = VL.size();
3079 for (OperandDataVec &Ops : OpsVec)
3080 Ops.resize(NumLanes);
3081 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3082 // Our tree has just 3 nodes: the root and two operands.
3083 // It is therefore trivial to get the APO. We only need to check the
3084 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3085 // operand. The LHS operand of both add and sub is never attached to an
3086 // inverse operation in the linearized form, therefore its APO is
3087 // false. The RHS is true only if V is an inverse operation.
3088
3089 // Since operand reordering is performed on groups of commutative
3090 // operations or alternating sequences (e.g., +, -), we can safely tell
3091 // the inverse operations by checking commutativity.
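// For example, for the lanes {A0 = B0 + C0, A1 = B1 - C1} the LHS
// operands B0/B1 get APO == false, C0 gets APO == false (add is
// commutative) and C1 gets APO == true (sub is the inverse,
// non-commutative operation).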
3092 auto *I = dyn_cast<Instruction>(VL[Lane]);
3093 if (!I && isa<PoisonValue>(VL[Lane])) {
3094 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3095 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3096 continue;
3097 }
3098 bool IsInverseOperation = false;
3099 if (S.isCopyableElement(VL[Lane])) {
3100 // The value is a copyable element.
3101 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3102 } else {
3103 assert(I && "Expected instruction");
3104 auto [SelectedOp, Ops] = convertTo(I, S);
3105 // We cannot check commutativity by the converted instruction
3106 // (SelectedOp) because isCommutative also examines def-use
3107 // relationships.
3108 IsInverseOperation = !isCommutative(SelectedOp, I);
3109 }
3110 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3111 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3112 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3113 }
3114 }
3115 }
3116
3117 /// \returns the number of operands.
3118 unsigned getNumOperands() const { return ArgSize; }
3119
3120 /// \returns the number of lanes.
3121 unsigned getNumLanes() const { return OpsVec[0].size(); }
3122
3123 /// \returns the operand value at \p OpIdx and \p Lane.
3124 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3125 return getData(OpIdx, Lane).V;
3126 }
3127
3128 /// \returns true if the data structure is empty.
3129 bool empty() const { return OpsVec.empty(); }
3130
3131 /// Clears the data.
3132 void clear() { OpsVec.clear(); }
3133
3134 /// \Returns true if there are enough operands identical to \p Op to fill
3135 /// the whole vector (possibly mixed with constants or loop-invariant values).
3136 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
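/// For example, an operand that is {%a, %a, %a, 7} across 4 lanes can still
/// be treated as a broadcast of %a, since the constant can be moved to any
/// lane and represented as a permutation of a constant and the broadcast.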
3137 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3138 assert(Op == getValue(OpIdx, Lane) &&
3139 "Op is expected to be getValue(OpIdx, Lane).");
3140 // Small number of loads - try load matching.
3141 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3142 return false;
3143 bool OpAPO = getData(OpIdx, Lane).APO;
3144 bool IsInvariant = L && L->isLoopInvariant(Op);
3145 unsigned Cnt = 0;
3146 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3147 if (Ln == Lane)
3148 continue;
3149 // This is set to true if we found a candidate for broadcast at Lane.
3150 bool FoundCandidate = false;
3151 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3152 OperandData &Data = getData(OpI, Ln);
3153 if (Data.APO != OpAPO || Data.IsUsed)
3154 continue;
3155 Value *OpILane = getValue(OpI, Lane);
3156 bool IsConstantOp = isa<Constant>(OpILane);
3157 // Consider the broadcast candidate if:
3158 // 1. Same value is found in one of the operands.
3159 if (Data.V == Op ||
3160 // 2. The operand in the given lane is not constant but there is a
3161 // constant operand in another lane (which can be moved to the
3162 // given lane). In this case we can represent it as a simple
3163 // permutation of constant and broadcast.
3164 (!IsConstantOp &&
3165 ((Lns > 2 && isa<Constant>(Data.V)) ||
3166 // 2.1. If we have only 2 lanes, need to check that the value in the
3167 // next lane does not build the same opcode sequence.
3168 (Lns == 2 &&
3169 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3170 isa<Constant>(Data.V)))) ||
3171 // 3. The operand in the current lane is loop invariant (can be
3172 // hoisted out) and another operand is also a loop invariant
3173 // (though not a constant). In this case the whole vector can be
3174 // hoisted out.
3175 // FIXME: need to teach the cost model about this case for better
3176 // estimation.
3177 (IsInvariant && !isa<Constant>(Data.V) &&
3178 !getSameOpcode({Op, Data.V}, TLI) &&
3179 L->isLoopInvariant(Data.V))) {
3180 FoundCandidate = true;
3181 Data.IsUsed = Data.V == Op;
3182 if (Data.V == Op)
3183 ++Cnt;
3184 break;
3185 }
3186 }
3187 if (!FoundCandidate)
3188 return false;
3189 }
3190 return getNumLanes() == 2 || Cnt > 1;
3191 }
3192
3193 /// Checks if there is at least a single operand in lanes other than
3194 /// \p Lane that is compatible with the operand \p Op.
3195 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3196 assert(Op == getValue(OpIdx, Lane) &&
3197 "Op is expected to be getValue(OpIdx, Lane).");
3198 bool OpAPO = getData(OpIdx, Lane).APO;
3199 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3200 if (Ln == Lane)
3201 continue;
3202 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3203 const OperandData &Data = getData(OpI, Ln);
3204 if (Data.APO != OpAPO || Data.IsUsed)
3205 return true;
3206 Value *OpILn = getValue(OpI, Ln);
3207 return (L && L->isLoopInvariant(OpILn)) ||
3208 (getSameOpcode({Op, OpILn}, TLI) &&
3209 allSameBlock({Op, OpILn}));
3210 }))
3211 return true;
3212 }
3213 return false;
3214 }
3215
3216 public:
3217 /// Initialize with all the operands of the instruction vector \p RootVL.
3218 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3219 const InstructionsState &S, const BoUpSLP &R)
3220 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3221 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3222 // Append all the operands of RootVL.
3223 appendOperands(RootVL, Operands, S);
3224 }
3225
3226 /// \Returns a value vector with the operands across all lanes for the
3227 /// operand at \p OpIdx.
3228 ValueList getVL(unsigned OpIdx) const {
3229 ValueList OpVL(OpsVec[OpIdx].size());
3230 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3231 "Expected same num of lanes across all operands");
3232 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3233 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3234 return OpVL;
3235 }
3236
3237 // Performs operand reordering for 2 or more operands.
3238 // The original operands are in OrigOps[OpIdx][Lane].
3239 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3240 void reorder() {
3241 unsigned NumOperands = getNumOperands();
3242 unsigned NumLanes = getNumLanes();
3243 // Each operand has its own mode. We are using this mode to help us select
3244 // the instructions for each lane, so that they match best with the ones
3245 // we have selected so far.
3246 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3247
3248 // This is a greedy single-pass algorithm. We are going over each lane
3249 // once and deciding on the best order right away with no back-tracking.
3250 // However, in order to increase its effectiveness, we start with the lane
3251 // that has operands that can move the least. For example, given the
3252 // following lanes:
3253 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3254 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3255 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3256 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3257 // we will start at Lane 1, since the operands of the subtraction cannot
3258 // be reordered. Then we will visit the rest of the lanes in a circular
3259 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3260
3261 // Find the first lane that we will start our search from.
3262 unsigned FirstLane = getBestLaneToStartReordering();
3263
3264 // Initialize the modes.
3265 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3266 Value *OpLane0 = getValue(OpIdx, FirstLane);
3267 // Keep track if we have instructions with all the same opcode on one
3268 // side.
3269 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3270 // Check if OpLane0 should be broadcast.
3271 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3272 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3273 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3274 else if (isa<LoadInst>(OpILane0))
3275 ReorderingModes[OpIdx] = ReorderingMode::Load;
3276 else
3277 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3278 } else if (isa<Constant>(OpLane0)) {
3279 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3280 } else if (isa<Argument>(OpLane0)) {
3281 // Our best hope is a Splat. It may save some cost in some cases.
3282 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3283 } else {
3284 llvm_unreachable("Unexpected value kind.");
3285 }
3286 }
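// At this point, e.g., a lane whose operand 0 is a load gets
// ReorderingModes[0] == ReorderingMode::Load, while a constant operand
// gets ReorderingMode::Constant.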
3287
3288 // Check that we don't have the same operands. There is no need to reorder
3289 // if the operands are just a perfect or a shuffled diamond match. Do not
3290 // skip it for possible broadcasts or a non-power-of-2 number of scalars
3291 // (just for now).
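// E.g. operand lists {A,B,C,D} on every operand index form a perfect
// diamond match, while {A,B,C,D} vs. {B,A,D,C} form a shuffled diamond
// match; reordering is skipped in these cases so that extra external-use
// cost is not counted for shuffled matches.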
3292 auto &&SkipReordering = [this]() {
3293 SmallPtrSet<Value *, 4> UniqueValues;
3294 ArrayRef<OperandData> Op0 = OpsVec.front();
3295 for (const OperandData &Data : Op0)
3296 UniqueValues.insert(Data.V);
3297 for (ArrayRef<OperandData> Op :
3298 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3299 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3300 return !UniqueValues.contains(Data.V);
3301 }))
3302 return false;
3303 }
3304 // TODO: Check if we can remove a check for non-power-2 number of
3305 // scalars after full support of non-power-2 vectorization.
3306 return UniqueValues.size() != 2 &&
3307 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3308 UniqueValues.size());
3309 };
3310
3311 // If the initial strategy fails for any of the operand indexes, then we
3312 // perform reordering again in a second pass. This helps avoid assigning
3313 // high priority to the failed strategy, and should improve reordering for
3314 // the non-failed operand indexes.
3315 for (int Pass = 0; Pass != 2; ++Pass) {
3316 // Check if there is no need to reorder the operands since they are a
3317 // perfect or shuffled diamond match.
3318 // Need to do it to avoid extra external use cost counting for
3319 // shuffled matches, which may cause regressions.
3320 if (SkipReordering())
3321 break;
3322 // Skip the second pass if the first pass did not fail.
3323 bool StrategyFailed = false;
3324 // Mark all operand data as free to use.
3325 clearUsed();
3326 // We keep the original operand order for the FirstLane, so reorder the
3327 // rest of the lanes. We are visiting the nodes in a circular fashion,
3328 // using FirstLane as the center point and increasing the radius
3329 // distance.
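// E.g. with 4 lanes and FirstLane == 1, the visiting order is lane 1
// (kept as is), then lanes 2, 0 and finally 3.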
3330 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3331 for (unsigned I = 0; I < NumOperands; ++I)
3332 MainAltOps[I].push_back(getData(I, FirstLane).V);
3333
3334 SmallBitVector UsedLanes(NumLanes);
3335 UsedLanes.set(FirstLane);
3336 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3337 // Visit the lane on the right and then the lane on the left.
3338 for (int Direction : {+1, -1}) {
3339 int Lane = FirstLane + Direction * Distance;
3340 if (Lane < 0 || Lane >= (int)NumLanes)
3341 continue;
3342 UsedLanes.set(Lane);
3343 int LastLane = Lane - Direction;
3344 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3345 "Out of bounds");
3346 // Look for a good match for each operand.
3347 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3348 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3349 std::optional<unsigned> BestIdx =
3350 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3351 MainAltOps[OpIdx], UsedLanes);
3352 // By not selecting a value, we allow the operands that follow to
3353 // select a better matching value. We will get a non-null value in
3354 // the next run of getBestOperand().
3355 if (BestIdx) {
3356 // Swap the current operand with the one returned by
3357 // getBestOperand().
3358 swap(OpIdx, *BestIdx, Lane);
3359 } else {
3360 // Enable the second pass.
3361 StrategyFailed = true;
3362 }
3363 // Try to get the alternate opcode and follow it during analysis.
3364 if (MainAltOps[OpIdx].size() != 2) {
3365 OperandData &AltOp = getData(OpIdx, Lane);
3366 InstructionsState OpS =
3367 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3368 if (OpS && OpS.isAltShuffle())
3369 MainAltOps[OpIdx].push_back(AltOp.V);
3370 }
3371 }
3372 }
3373 }
3374 // Skip second pass if the strategy did not fail.
3375 if (!StrategyFailed)
3376 break;
3377 }
3378 }
3379
3380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3381 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3382 switch (RMode) {
3383 case ReorderingMode::Load:
3384 return "Load";
3385 case ReorderingMode::Opcode:
3386 return "Opcode";
3387 case ReorderingMode::Constant:
3388 return "Constant";
3389 case ReorderingMode::Splat:
3390 return "Splat";
3391 case ReorderingMode::Failed:
3392 return "Failed";
3393 }
3394 llvm_unreachable("Unimplemented Reordering Type");
3395 }
3396
3397 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3398 raw_ostream &OS) {
3399 return OS << getModeStr(RMode);
3400 }
3401
3402 /// Debug print.
3403 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3404 printMode(RMode, dbgs());
3405 }
3406
3407 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3408 return printMode(RMode, OS);
3409 }
3410
3411 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3412 const unsigned Indent = 2;
3413 unsigned Cnt = 0;
3414 for (const OperandDataVec &OpDataVec : OpsVec) {
3415 OS << "Operand " << Cnt++ << "\n";
3416 for (const OperandData &OpData : OpDataVec) {
3417 OS.indent(Indent) << "{";
3418 if (Value *V = OpData.V)
3419 OS << *V;
3420 else
3421 OS << "null";
3422 OS << ", APO:" << OpData.APO << "}\n";
3423 }
3424 OS << "\n";
3425 }
3426 return OS;
3427 }
3428
3429 /// Debug print.
3430 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3431#endif
3432 };
3433
3434 /// Evaluate each pair in \p Candidates and return the index into
3435 /// \p Candidates of the pair with the highest score, deemed to have the best
3436 /// chance to form the root of a profitable tree to vectorize. Return
3437 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
3438 /// \param Limit Lower limit of the cost, considered to be a good enough score.
3439 std::optional<int>
3440 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3441 int Limit = LookAheadHeuristics::ScoreFail) const {
3442 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3443 RootLookAheadMaxDepth);
3444 int BestScore = Limit;
3445 std::optional<int> Index;
3446 for (int I : seq<int>(0, Candidates.size())) {
3447 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3448 Candidates[I].second,
3449 /*U1=*/nullptr, /*U2=*/nullptr,
3450 /*CurrLevel=*/1, {});
3451 if (Score > BestScore) {
3452 BestScore = Score;
3453 Index = I;
3454 }
3455 }
3456 return Index;
3457 }
3458
3459 /// Checks if the instruction is marked for deletion.
3460 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3461
3462 /// Removes an instruction from its block and eventually deletes it.
3463 /// It's like Instruction::eraseFromParent() except that the actual deletion
3464 /// is delayed until BoUpSLP is destructed.
3465 void eraseInstruction(Instruction *I) {
3466 DeletedInstructions.insert(I);
3467 }
3468
3469 /// Remove instructions from the parent function and clear the operands of \p
3470 /// DeadVals instructions, marking for deletion trivially dead operands.
3471 template <typename T>
3472 void removeInstructionsAndOperands(
3473 ArrayRef<T *> DeadVals,
3474 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3475 SmallVector<WeakTrackingVH> DeadInsts;
3476 for (T *V : DeadVals) {
3477 auto *I = cast<Instruction>(V);
3479 }
3480 DenseSet<Value *> Processed;
3481 for (T *V : DeadVals) {
3482 if (!V || !Processed.insert(V).second)
3483 continue;
3484 auto *I = cast<Instruction>(V);
3486 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3487 for (Use &U : I->operands()) {
3488 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3489 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3491 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3492 return Entry->VectorizedValue == OpI;
3493 })))
3494 DeadInsts.push_back(OpI);
3495 }
3496 I->dropAllReferences();
3497 }
3498 for (T *V : DeadVals) {
3499 auto *I = cast<Instruction>(V);
3500 if (!I->getParent())
3501 continue;
3502 assert((I->use_empty() || all_of(I->uses(),
3503 [&](Use &U) {
3504 return isDeleted(
3505 cast<Instruction>(U.getUser()));
3506 })) &&
3507 "trying to erase instruction with users.");
3508 I->removeFromParent();
3509 SE->forgetValue(I);
3510 }
3511 // Process the dead instruction list until empty.
3512 while (!DeadInsts.empty()) {
3513 Value *V = DeadInsts.pop_back_val();
3515 if (!VI || !VI->getParent())
3516 continue;
3518 "Live instruction found in dead worklist!");
3519 assert(VI->use_empty() && "Instructions with uses are not dead.");
3520
3521 // Don't lose the debug info while deleting the instructions.
3522 salvageDebugInfo(*VI);
3523
3524 // Null out all of the instruction's operands to see if any operand
3525 // becomes dead as we go.
3526 for (Use &OpU : VI->operands()) {
3527 Value *OpV = OpU.get();
3528 if (!OpV)
3529 continue;
3530 OpU.set(nullptr);
3531
3532 if (!OpV->use_empty())
3533 continue;
3534
3535 // If the operand is an instruction that became dead as we nulled out
3536 // the operand, and if it is 'trivially' dead, delete it in a future
3537 // loop iteration.
3538 if (auto *OpI = dyn_cast<Instruction>(OpV))
3539 if (!DeletedInstructions.contains(OpI) &&
3540 (!OpI->getType()->isVectorTy() ||
3541 none_of(VectorValuesAndScales,
3542 [&](const std::tuple<Value *, unsigned, bool> &V) {
3543 return std::get<0>(V) == OpI;
3544 })) &&
3546 DeadInsts.push_back(OpI);
3547 }
3548
3549 VI->removeFromParent();
3550 eraseInstruction(VI);
3551 SE->forgetValue(VI);
3552 }
3553 }
3554
3555 /// Checks if the instruction was already analyzed for being possible
3556 /// reduction root.
3557 bool isAnalyzedReductionRoot(Instruction *I) const {
3558 return AnalyzedReductionsRoots.count(I);
3559 }
3560 /// Register given instruction as already analyzed for being possible
3561 /// reduction root.
3562 void analyzedReductionRoot(Instruction *I) {
3563 AnalyzedReductionsRoots.insert(I);
3564 }
3565 /// Checks if the provided list of reduced values was checked already for
3566 /// vectorization.
3567 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3568 return AnalyzedReductionVals.contains(hash_value(VL));
3569 }
3570 /// Adds the list of reduced values to list of already checked values for the
3571 /// vectorization.
3572 void analyzedReductionVals(ArrayRef<Value *> VL) {
3573 AnalyzedReductionVals.insert(hash_value(VL));
3574 }
3575 /// Clear the list of the analyzed reduction root instructions.
3576 void clearReductionData() {
3577 AnalyzedReductionsRoots.clear();
3578 AnalyzedReductionVals.clear();
3579 AnalyzedMinBWVals.clear();
3580 }
3581 /// Checks if any of the given values is gathered in one of the nodes.
3582 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3583 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3584 }
3585 /// Checks if the given value is gathered in one of the nodes.
3586 bool isGathered(const Value *V) const {
3587 return MustGather.contains(V);
3588 }
3589 /// Checks if the specified value was not scheduled.
3590 bool isNotScheduled(const Value *V) const {
3591 return NonScheduledFirst.contains(V);
3592 }
3593
3594 /// Check if the value is vectorized in the tree.
3595 bool isVectorized(const Value *V) const {
3596 assert(V && "V cannot be nullptr.");
3597 return ScalarToTreeEntries.contains(V);
3598 }
3599
3600 ~BoUpSLP();
3601
3602private:
3603 /// Determine if a node \p E can be demoted to a smaller type with a
3604 /// truncation. We collect the entries that will be demoted in ToDemote.
3605 /// \param E Node for analysis
3606 /// \param ToDemote indices of the nodes to be demoted.
3607 bool collectValuesToDemote(
3608 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3610 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3611 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3612
3613 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3614 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3615 /// they have only one user and are reorderable).
3616 /// \param ReorderableGathers List of all gather nodes that require reordering
3617 /// (e.g., gather of extractelements or partially vectorizable loads).
3618 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3619 /// reordering, subset of \p NonVectorized.
3620 void buildReorderableOperands(
3621 TreeEntry *UserTE,
3622 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3623 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3624 SmallVectorImpl<TreeEntry *> &GatherOps);
3625
3626 /// Checks if the given \p TE is a gather node with clustered reused scalars
3627 /// and reorders it per given \p Mask.
3628 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3629
3630 /// Checks if all users of \p I are the part of the vectorization tree.
3631 bool areAllUsersVectorized(
3632 Instruction *I,
3633 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3634
3635 /// Return information about the vector formed for the specified index
3636 /// of a vector of (the same) instruction.
3638
3639 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3640 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3641 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3642 return const_cast<TreeEntry *>(
3643 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3644 }
3645
3646 /// Gets the root instruction for the given node. If the node is a strided
3647 /// load/store node with the reverse order, the root instruction is the last
3648 /// one.
3649 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3650
3651 /// \returns Cast context for the given graph node.
3652 TargetTransformInfo::CastContextHint
3653 getCastContextHint(const TreeEntry &TE) const;
3654
3655 /// \returns the cost of the vectorizable entry.
3656 InstructionCost getEntryCost(const TreeEntry *E,
3657 ArrayRef<Value *> VectorizedVals,
3658 SmallPtrSetImpl<Value *> &CheckedExtracts);
3659
3660 /// Checks if it is legal and profitable to build SplitVectorize node for the
3661 /// given \p VL.
3662 /// \param Op1 first homogeneous scalars.
3663 /// \param Op2 second homogeneous scalars.
3664 /// \param ReorderIndices indices to reorder the scalars.
3665 /// \returns true if the node was successfully built.
3666 bool canBuildSplitNode(ArrayRef<Value *> VL,
3667 const InstructionsState &LocalState,
3668 SmallVectorImpl<Value *> &Op1,
3669 SmallVectorImpl<Value *> &Op2,
3670 OrdersType &ReorderIndices) const;
3671
3672 /// This is the recursive part of buildTree.
3673 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3674 unsigned InterleaveFactor = 0);
3675
3676 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3677 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3678 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3679 /// returns false, setting \p CurrentOrder to either an empty vector or a
3680 /// non-identity permutation that allows reusing the extract instructions.
3681 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3682 /// extract order.
3683 bool canReuseExtract(ArrayRef<Value *> VL,
3684 SmallVectorImpl<unsigned> &CurrentOrder,
3685 bool ResizeAllowed = false) const;
3686
3687 /// Vectorize a single entry in the tree.
3688 Value *vectorizeTree(TreeEntry *E);
3689
3690 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3691 /// \p E.
3692 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3693
3694 /// Create a new vector from a list of scalar values. Produces a sequence
3695 /// which exploits values reused across lanes, and arranges the inserts
3696 /// for ease of later optimization.
3697 template <typename BVTy, typename ResTy, typename... Args>
3698 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3699
3700 /// Create a new vector from a list of scalar values. Produces a sequence
3701 /// which exploits values reused across lanes, and arranges the inserts
3702 /// for ease of later optimization.
3703 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3704
3705 /// Returns the instruction in the bundle, which can be used as a base point
3706 /// for scheduling. Usually it is the last instruction in the bundle, except
3707 /// for the case when all operands are external (in this case, it is the first
3708 /// instruction in the list).
3709 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3710
3711 /// Tries to find extractelement instructions with constant indices from fixed
3712 /// vector type and gather such instructions into a bunch, which is highly
3713 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3714 /// was successful, the matched scalars are replaced by poison values in \p VL
3715 /// for future analysis.
3716 std::optional<TargetTransformInfo::ShuffleKind>
3717 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3718 SmallVectorImpl<int> &Mask) const;
3719
3720 /// Tries to find extractelement instructions with constant indices from fixed
3721 /// vector type and gather such instructions into a bunch, which is highly
3722 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3723 /// was successful, the matched scalars are replaced by poison values in \p VL
3724 /// for future analysis.
3726 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3728 unsigned NumParts) const;
3729
3730 /// Checks if the gathered \p VL can be represented as a single register
3731 /// shuffle(s) of previous tree entries.
3732 /// \param TE Tree entry checked for permutation.
3733 /// \param VL List of scalars (a subset of the TE scalar), checked for
3734 /// permutations. Must form single-register vector.
3735 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3736 /// commands to build the mask using the original vector value, without
3737 /// relying on the potential reordering.
3738 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3739 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3740 std::optional<TargetTransformInfo::ShuffleKind>
3741 isGatherShuffledSingleRegisterEntry(
3742 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3743 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3744 bool ForOrder);
3745
3746 /// Checks if the gathered \p VL can be represented as multi-register
3747 /// shuffle(s) of previous tree entries.
3748 /// \param TE Tree entry checked for permutation.
3749 /// \param VL List of scalars (a subset of the TE scalar), checked for
3750 /// permutations.
3751 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3752 /// commands to build the mask using the original vector value, without
3753 /// relying on the potential reordering.
3754 /// \returns per-register series of ShuffleKind, if gathered values can be
3755 /// represented as shuffles of previous tree entries. \p Mask is filled with
3756 /// the shuffle mask (also on per-register base).
3758 isGatherShuffledEntry(
3759 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3761 unsigned NumParts, bool ForOrder = false);
3762
3763 /// \returns the cost of gathering (inserting) the values in \p VL into a
3764 /// vector.
3765 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3766 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3767 Type *ScalarTy) const;
3768
3769 /// Set the Builder insert point to one after the last instruction in
3770 /// the bundle
3771 void setInsertPointAfterBundle(const TreeEntry *E);
3772
3773 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3774 /// specified, the starting vector value is poison.
3775 Value *
3776 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3777 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3778
3779 /// \returns whether the VectorizableTree is fully vectorizable and will
3780 /// be beneficial even if the tree height is tiny.
3781 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3782
3783 /// Run through the list of all gathered loads in the graph and try to find
3784 /// vector loads/masked gathers instead of regular gathers. Later these loads
3785 /// are reshuffled to build the final gathered nodes.
3786 void tryToVectorizeGatheredLoads(
3787 const SmallMapVector<
3788 std::tuple<BasicBlock *, Value *, Type *>,
3789 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3790 &GatheredLoads);
3791
3792 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3793 /// users of \p TE and collects the stores. It returns the map from the store
3794 /// pointers to the collected stores.
3795 DenseMap<Value *, SmallVector<StoreInst *>>
3796 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3797
3798 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3799 /// stores in \p StoresVec can form a vector instruction. If so it returns
3800 /// true and populates \p ReorderIndices with the shuffle indices of the
3801 /// stores when compared to the sorted vector.
3802 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3803 OrdersType &ReorderIndices) const;
3804
3805 /// Iterates through the users of \p TE, looking for scalar stores that can be
3806 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3807 /// their order and builds an order index vector for each store bundle. It
3808 /// returns all these order vectors found.
3809 /// We run this after the tree has formed, otherwise we may come across user
3810 /// instructions that are not yet in the tree.
3811 SmallVector<OrdersType, 1>
3812 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3813
3814 /// Tries to reorder the gathering node for better vectorization
3815 /// opportunities.
3816 void reorderGatherNode(TreeEntry &TE);
3817
3818 class TreeEntry {
3819 public:
3820 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3821 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3822
3823 /// \returns Common mask for reorder indices and reused scalars.
3824 SmallVector<int> getCommonMask() const {
3825 if (State == TreeEntry::SplitVectorize)
3826 return {};
3827 SmallVector<int> Mask;
3828 inversePermutation(ReorderIndices, Mask);
3829 ::addMask(Mask, ReuseShuffleIndices);
3830 return Mask;
3831 }
3832
3833 /// \returns The mask for split nodes.
3834 SmallVector<int> getSplitMask() const {
3835 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3836 "Expected only split vectorize node.");
3837 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3838 unsigned CommonVF = std::max<unsigned>(
3839 CombinedEntriesWithIndices.back().second,
3840 Scalars.size() - CombinedEntriesWithIndices.back().second);
3841 for (auto [Idx, I] : enumerate(ReorderIndices))
3842 Mask[I] =
3843 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3844 ? CommonVF - CombinedEntriesWithIndices.back().second
3845 : 0);
3846 return Mask;
3847 }
3848
3849 /// Updates (reorders) SplitVectorize node according to the given mask \p
3850 /// Mask and order \p MaskOrder.
3851 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3852 ArrayRef<int> MaskOrder);
3853
3854 /// \returns true if the scalars in VL are equal to this entry.
3855 bool isSame(ArrayRef<Value *> VL) const {
3856 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3857 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3858 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3859 return VL.size() == Mask.size() &&
3860 std::equal(VL.begin(), VL.end(), Mask.begin(),
3861 [Scalars](Value *V, int Idx) {
3862 return (isa<UndefValue>(V) &&
3863 Idx == PoisonMaskElem) ||
3864 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3865 });
3866 };
3867 if (!ReorderIndices.empty()) {
3868 // TODO: implement matching if the nodes are just reordered, still can
3869 // treat the vector as the same if the list of scalars matches VL
3870 // directly, without reordering.
3871 SmallVector<int> Mask;
3872 inversePermutation(ReorderIndices, Mask);
3873 if (VL.size() == Scalars.size())
3874 return IsSame(Scalars, Mask);
3875 if (VL.size() == ReuseShuffleIndices.size()) {
3876 ::addMask(Mask, ReuseShuffleIndices);
3877 return IsSame(Scalars, Mask);
3878 }
3879 return false;
3880 }
3881 return IsSame(Scalars, ReuseShuffleIndices);
3882 }
3883
3884 /// \returns true if current entry has same operands as \p TE.
3885 bool hasEqualOperands(const TreeEntry &TE) const {
3886 if (TE.getNumOperands() != getNumOperands())
3887 return false;
3888 SmallBitVector Used(getNumOperands());
3889 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3890 unsigned PrevCount = Used.count();
3891 for (unsigned K = 0; K < E; ++K) {
3892 if (Used.test(K))
3893 continue;
3894 if (getOperand(K) == TE.getOperand(I)) {
3895 Used.set(K);
3896 break;
3897 }
3898 }
3899 // Check if we actually found the matching operand.
3900 if (PrevCount == Used.count())
3901 return false;
3902 }
3903 return true;
3904 }
3905
3906 /// \return Final vectorization factor for the node. Defined by the total
3907 /// number of vectorized scalars, including those used several times in the
3908 /// entry and counted in the \a ReuseShuffleIndices, if any.
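/// For example, scalars {a, b} reused as {a, b, a, b} via ReuseShuffleIndices
/// {0, 1, 0, 1} give a vector factor of 4.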
3909 unsigned getVectorFactor() const {
3910 if (!ReuseShuffleIndices.empty())
3911 return ReuseShuffleIndices.size();
3912 return Scalars.size();
3913 };
3914
3915 /// Checks if the current node is a gather node.
3916 bool isGather() const { return State == NeedToGather; }
3917
3918 /// A vector of scalars.
3919 ValueList Scalars;
3920
3921 /// The Scalars are vectorized into this value. It is initialized to Null.
3922 WeakTrackingVH VectorizedValue = nullptr;
3923
3924 /// Do we need to gather this sequence or vectorize it
3925 /// (either with vector instruction or with scatter/gather
3926 /// intrinsics for store/load)?
3927 enum EntryState {
3928 Vectorize, ///< The node is regularly vectorized.
3929 ScatterVectorize, ///< Masked scatter/gather node.
3930 StridedVectorize, ///< Strided loads (and stores)
3931 CompressVectorize, ///< (Masked) load with compress.
3932 NeedToGather, ///< Gather/buildvector node.
3933 CombinedVectorize, ///< Vectorized node, combined with its user into more
3934 ///< complex node like select/cmp to minmax, mul/add to
3935 ///< fma, etc. Must be used for the following nodes in
3936 ///< the pattern, not the very first one.
3937 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3938 ///< independently and then combines back.
3939 };
3940 EntryState State;
3941
3942 /// List of combined opcodes supported by the vectorizer.
3943 enum CombinedOpcode {
3944 NotCombinedOp = -1,
3945 MinMax = Instruction::OtherOpsEnd + 1,
3946 FMulAdd,
3947 };
3948 CombinedOpcode CombinedOp = NotCombinedOp;
3949
3950 /// Does this sequence require some shuffling?
3951 SmallVector<int, 4> ReuseShuffleIndices;
3952
3953 /// Does this entry require reordering?
3954 SmallVector<unsigned, 4> ReorderIndices;
3955
3956 /// Points back to the VectorizableTree.
3957 ///
3958 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3959 /// to be a pointer and needs to be able to initialize the child iterator.
3960 /// Thus we need a reference back to the container to translate the indices
3961 /// to entries.
3962 VecTreeTy &Container;
3963
3964 /// The TreeEntry index containing the user of this entry.
3965 EdgeInfo UserTreeIndex;
3966
3967 /// The index of this treeEntry in VectorizableTree.
3968 unsigned Idx = 0;
3969
3970 /// For gather/buildvector/alt opcode nodes that are combined from
3971 /// other nodes as a series of insertvector instructions.
3972 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3973
3974 private:
3975 /// The operands of each instruction in each lane Operands[op_index][lane].
3976 /// Note: This helps avoid the replication of the code that performs the
3977 /// reordering of operands during buildTreeRec() and vectorizeTree().
3978 SmallVector<ValueList, 2> Operands;
3979
3980 /// Copyable elements of the entry node.
3981 SmallPtrSet<const Value *, 4> CopyableElements;
3982
3983 /// MainOp and AltOp are recorded inside. S should be obtained from
3984 /// newTreeEntry.
3985 InstructionsState S = InstructionsState::invalid();
3986
3987 /// Interleaving factor for interleaved loads Vectorize nodes.
3988 unsigned InterleaveFactor = 0;
3989
3990 /// True if the node does not require scheduling.
3991 bool DoesNotNeedToSchedule = false;
3992
3993 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3994 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3995 if (Operands.size() < OpIdx + 1)
3996 Operands.resize(OpIdx + 1);
3997 assert(Operands[OpIdx].empty() && "Already resized?");
3998 assert(OpVL.size() <= Scalars.size() &&
3999 "Number of operands is greater than the number of scalars.");
4000 Operands[OpIdx].resize(OpVL.size());
4001 copy(OpVL, Operands[OpIdx].begin());
4002 }
4003
4004 public:
4005 /// Returns interleave factor for interleave nodes.
4006 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4007 /// Sets interleaving factor for the interleaving nodes.
4008 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4009
4010 /// Marks the node as one that does not require scheduling.
4011 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4012 /// Returns true if the node is marked as one that does not require
4013 /// scheduling.
4014 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4015
4016 /// Set this bundle's operands from \p Operands.
4017 void setOperands(ArrayRef<ValueList> Operands) {
4018 for (unsigned I : seq<unsigned>(Operands.size()))
4019 setOperand(I, Operands[I]);
4020 }
4021
4022 /// Reorders operands of the node to the given mask \p Mask.
4023 void reorderOperands(ArrayRef<int> Mask) {
4024 for (ValueList &Operand : Operands)
4025 reorderScalars(Operand, Mask);
4026 }
4027
4028 /// \returns the \p OpIdx operand of this TreeEntry.
4029 ValueList &getOperand(unsigned OpIdx) {
4030 assert(OpIdx < Operands.size() && "Off bounds");
4031 return Operands[OpIdx];
4032 }
4033
4034 /// \returns the \p OpIdx operand of this TreeEntry.
4035 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4036 assert(OpIdx < Operands.size() && "Off bounds");
4037 return Operands[OpIdx];
4038 }
4039
4040 /// \returns the number of operands.
4041 unsigned getNumOperands() const { return Operands.size(); }
4042
4043 /// \return the single \p OpIdx operand.
4044 Value *getSingleOperand(unsigned OpIdx) const {
4045 assert(OpIdx < Operands.size() && "Off bounds");
4046 assert(!Operands[OpIdx].empty() && "No operand available");
4047 return Operands[OpIdx][0];
4048 }
4049
4050 /// Some of the instructions in the list have alternate opcodes.
4051 bool isAltShuffle() const { return S.isAltShuffle(); }
4052
4053 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4054 return S.getMatchingMainOpOrAltOp(I);
4055 }
4056
4057 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4058 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
4059 /// key is the main operation.
4060 Value *isOneOf(Value *Op) const {
4061 auto *I = dyn_cast<Instruction>(Op);
4062 if (I && getMatchingMainOpOrAltOp(I))
4063 return Op;
4064 return S.getMainOp();
4065 }
4066
4067 void setOperations(const InstructionsState &S) {
4068 assert(S && "InstructionsState is invalid.");
4069 this->S = S;
4070 }
4071
4072 Instruction *getMainOp() const { return S.getMainOp(); }
4073
4074 Instruction *getAltOp() const { return S.getAltOp(); }
4075
4076 /// The main/alternate opcodes for the list of instructions.
4077 unsigned getOpcode() const { return S.getOpcode(); }
4078
4079 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4080
4081 bool hasState() const { return S.valid(); }
4082
4083 /// Add \p V to the list of copyable elements.
4084 void addCopyableElement(Value *V) {
4085 assert(S.isCopyableElement(V) && "Not a copyable element.");
4086 CopyableElements.insert(V);
4087 }
4088
4089 /// Returns true if \p V is a copyable element.
4090 bool isCopyableElement(Value *V) const {
4091 return CopyableElements.contains(V);
4092 }
4093
4094 /// Returns true if any scalar in the list is a copyable element.
4095 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4096
4097 /// Returns the state of the operations.
4098 const InstructionsState &getOperations() const { return S; }
4099
4100 /// When ReuseShuffleIndices is empty it just returns the position of \p V
4101 /// within the vector of Scalars. Otherwise, try to remap it to its reuse index.
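/// For example, with Scalars {a, b}, ReorderIndices {1, 0} and no reused
/// scalars, looking up b yields lane ReorderIndices[1] == 0.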
4102 unsigned findLaneForValue(Value *V) const {
4103 unsigned FoundLane = getVectorFactor();
4104 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4105 std::advance(It, 1)) {
4106 if (*It != V)
4107 continue;
4108 FoundLane = std::distance(Scalars.begin(), It);
4109 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4110 if (!ReorderIndices.empty())
4111 FoundLane = ReorderIndices[FoundLane];
4112 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4113 if (ReuseShuffleIndices.empty())
4114 break;
4115 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4116 RIt != ReuseShuffleIndices.end()) {
4117 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4118 break;
4119 }
4120 }
4121 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4122 return FoundLane;
4123 }
4124
4125 /// Build a shuffle mask for graph entry which represents a merge of main
4126 /// and alternate operations.
4127 void
4128 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4129 SmallVectorImpl<int> &Mask,
4130 SmallVectorImpl<Value *> *OpScalars = nullptr,
4131 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4132
4133 /// Return true if this is a non-power-of-2 node.
4134 bool isNonPowOf2Vec() const {
4135 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4136 return IsNonPowerOf2;
4137 }
4138
4139 /// Return true if this is a node which tries to vectorize a number of
4140 /// elements forming whole vectors.
4141 bool
4142 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4143 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4144 TTI, getValueType(Scalars.front()), Scalars.size());
4145 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4146 "Reshuffling not supported with non-power-of-2 vectors yet.");
4147 return IsNonPowerOf2;
4148 }
4149
4150 Value *getOrdered(unsigned Idx) const {
4151 assert(isGather() && "Must be used only for buildvectors/gathers.");
4152 if (ReorderIndices.empty())
4153 return Scalars[Idx];
4154 SmallVector<int> Mask;
4155 inversePermutation(ReorderIndices, Mask);
4156 return Scalars[Mask[Idx]];
4157 }
4158
4159#ifndef NDEBUG
4160 /// Debug printer.
4161 LLVM_DUMP_METHOD void dump() const {
4162 dbgs() << Idx << ".\n";
4163 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4164 dbgs() << "Operand " << OpI << ":\n";
4165 for (const Value *V : Operands[OpI])
4166 dbgs().indent(2) << *V << "\n";
4167 }
4168 dbgs() << "Scalars: \n";
4169 for (Value *V : Scalars)
4170 dbgs().indent(2) << *V << "\n";
4171 dbgs() << "State: ";
4172 if (S && hasCopyableElements())
4173 dbgs() << "[[Copyable]] ";
4174 switch (State) {
4175 case Vectorize:
4176 if (InterleaveFactor > 0) {
4177 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4178 << "\n";
4179 } else {
4180 dbgs() << "Vectorize\n";
4181 }
4182 break;
4183 case ScatterVectorize:
4184 dbgs() << "ScatterVectorize\n";
4185 break;
4186 case StridedVectorize:
4187 dbgs() << "StridedVectorize\n";
4188 break;
4189 case CompressVectorize:
4190 dbgs() << "CompressVectorize\n";
4191 break;
4192 case NeedToGather:
4193 dbgs() << "NeedToGather\n";
4194 break;
4195 case CombinedVectorize:
4196 dbgs() << "CombinedVectorize\n";
4197 break;
4198 case SplitVectorize:
4199 dbgs() << "SplitVectorize\n";
4200 break;
4201 }
4202 if (S) {
4203 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4204 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4205 } else {
4206 dbgs() << "MainOp: NULL\n";
4207 dbgs() << "AltOp: NULL\n";
4208 }
4209 dbgs() << "VectorizedValue: ";
4210 if (VectorizedValue)
4211 dbgs() << *VectorizedValue << "\n";
4212 else
4213 dbgs() << "NULL\n";
4214 dbgs() << "ReuseShuffleIndices: ";
4215 if (ReuseShuffleIndices.empty())
4216 dbgs() << "Empty";
4217 else
4218 for (int ReuseIdx : ReuseShuffleIndices)
4219 dbgs() << ReuseIdx << ", ";
4220 dbgs() << "\n";
4221 dbgs() << "ReorderIndices: ";
4222 for (unsigned ReorderIdx : ReorderIndices)
4223 dbgs() << ReorderIdx << ", ";
4224 dbgs() << "\n";
4225 dbgs() << "UserTreeIndex: ";
4226 if (UserTreeIndex)
4227 dbgs() << UserTreeIndex;
4228 else
4229 dbgs() << "<invalid>";
4230 dbgs() << "\n";
4231 if (!CombinedEntriesWithIndices.empty()) {
4232 dbgs() << "Combined entries: ";
4233 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4234 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4235 });
4236 dbgs() << "\n";
4237 }
4238 }
4239#endif
4240 };
4241
4242#ifndef NDEBUG
4243 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4244 InstructionCost VecCost, InstructionCost ScalarCost,
4245 StringRef Banner) const {
4246 dbgs() << "SLP: " << Banner << ":\n";
4247 E->dump();
4248 dbgs() << "SLP: Costs:\n";
4249 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4250 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4251 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4252 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4253 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4254 }
4255#endif
4256
4257 /// Create a new gather TreeEntry
4258 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4259 const InstructionsState &S,
4260 const EdgeInfo &UserTreeIdx,
4261 ArrayRef<int> ReuseShuffleIndices = {}) {
4262 auto Invalid = ScheduleBundle::invalid();
4263 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4264 }
4265
4266 /// Create a new VectorizableTree entry.
4267 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4268 const InstructionsState &S,
4269 const EdgeInfo &UserTreeIdx,
4270 ArrayRef<int> ReuseShuffleIndices = {},
4271 ArrayRef<unsigned> ReorderIndices = {},
4272 unsigned InterleaveFactor = 0) {
4273 TreeEntry::EntryState EntryState =
4274 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4275 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4276 ReuseShuffleIndices, ReorderIndices);
4277 if (E && InterleaveFactor > 0)
4278 E->setInterleave(InterleaveFactor);
4279 return E;
4280 }
4281
4282 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4283 TreeEntry::EntryState EntryState,
4284 ScheduleBundle &Bundle, const InstructionsState &S,
4285 const EdgeInfo &UserTreeIdx,
4286 ArrayRef<int> ReuseShuffleIndices = {},
4287 ArrayRef<unsigned> ReorderIndices = {}) {
4288 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4289 EntryState == TreeEntry::SplitVectorize)) ||
4290 (Bundle && EntryState != TreeEntry::NeedToGather &&
4291 EntryState != TreeEntry::SplitVectorize)) &&
4292 "Need to vectorize gather entry?");
4293 // Gathered loads still gathered? Do not create entry, use the original one.
4294 if (GatheredLoadsEntriesFirst.has_value() &&
4295 EntryState == TreeEntry::NeedToGather && S &&
4296 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4297 !UserTreeIdx.UserTE)
4298 return nullptr;
4299 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4300 TreeEntry *Last = VectorizableTree.back().get();
4301 Last->Idx = VectorizableTree.size() - 1;
4302 Last->State = EntryState;
4303 if (UserTreeIdx.UserTE)
4304 OperandsToTreeEntry.try_emplace(
4305 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4306 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4307 // for non-power-of-two vectors.
4308 assert(
4309 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4310 ReuseShuffleIndices.empty()) &&
4311 "Reshuffling scalars not yet supported for nodes with padding");
4312 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4313 ReuseShuffleIndices.end());
4314 if (ReorderIndices.empty()) {
4315 Last->Scalars.assign(VL.begin(), VL.end());
4316 if (S)
4317 Last->setOperations(S);
4318 } else {
4319 // Reorder scalars and build final mask.
4320 Last->Scalars.assign(VL.size(), nullptr);
4321 transform(ReorderIndices, Last->Scalars.begin(),
4322 [VL](unsigned Idx) -> Value * {
4323 if (Idx >= VL.size())
4324 return UndefValue::get(VL.front()->getType());
4325 return VL[Idx];
4326 });
4327 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4328 if (S)
4329 Last->setOperations(S);
4330 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4331 }
4332 if (EntryState == TreeEntry::SplitVectorize) {
4333 assert(S && "Split nodes must have operations.");
4334 Last->setOperations(S);
4335 SmallPtrSet<Value *, 4> Processed;
4336 for (Value *V : VL) {
4337 auto *I = dyn_cast<Instruction>(V);
4338 if (!I)
4339 continue;
4340 auto It = ScalarsInSplitNodes.find(V);
4341 if (It == ScalarsInSplitNodes.end()) {
4342 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4343 (void)Processed.insert(V);
4344 } else if (Processed.insert(V).second) {
4345 assert(!is_contained(It->getSecond(), Last) &&
4346 "Value already associated with the node.");
4347 It->getSecond().push_back(Last);
4348 }
4349 }
4350 } else if (!Last->isGather()) {
4351 if (isa<PHINode>(S.getMainOp()) ||
4352 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4353 (!S.areInstructionsWithCopyableElements() &&
4354 doesNotNeedToSchedule(VL)) ||
4355 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4356 Last->setDoesNotNeedToSchedule();
4357 SmallPtrSet<Value *, 4> Processed;
4358 for (Value *V : VL) {
4359 if (isa<PoisonValue>(V))
4360 continue;
4361 if (S.isCopyableElement(V)) {
4362 Last->addCopyableElement(V);
4363 continue;
4364 }
4365 auto It = ScalarToTreeEntries.find(V);
4366 if (It == ScalarToTreeEntries.end()) {
4367 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4368 (void)Processed.insert(V);
4369 } else if (Processed.insert(V).second) {
4370 assert(!is_contained(It->getSecond(), Last) &&
4371 "Value already associated with the node.");
4372 It->getSecond().push_back(Last);
4373 }
4374 }
4375 // Update the scheduler bundle to point to this TreeEntry.
4376 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4377 "Bundle and VL out of sync");
4378 if (!Bundle.getBundle().empty()) {
4379#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4380 auto *BundleMember = Bundle.getBundle().begin();
4381 SmallPtrSet<Value *, 4> Processed;
4382 for (Value *V : VL) {
4383 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4384 continue;
4385 ++BundleMember;
4386 }
4387 assert(BundleMember == Bundle.getBundle().end() &&
4388 "Bundle and VL out of sync");
4389#endif
4390 Bundle.setTreeEntry(Last);
4391 }
4392 } else {
4393 // Build a map for gathered scalars to the nodes where they are used.
4394 bool AllConstsOrCasts = true;
4395 for (Value *V : VL) {
4396 if (S && S.areInstructionsWithCopyableElements() &&
4397 S.isCopyableElement(V))
4398 Last->addCopyableElement(V);
4399 if (!isConstant(V)) {
4400 auto *I = dyn_cast<CastInst>(V);
4401 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4402 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4403 !UserTreeIdx.UserTE->isGather())
4404 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4405 }
4406 }
4407 if (AllConstsOrCasts)
4408 CastMaxMinBWSizes =
4409 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4410 MustGather.insert_range(VL);
4411 }
4412
4413 if (UserTreeIdx.UserTE)
4414 Last->UserTreeIndex = UserTreeIdx;
4415 return Last;
4416 }
4417
4418 /// -- Vectorization State --
4419 /// Holds all of the tree entries.
4420 TreeEntry::VecTreeTy VectorizableTree;
4421
4422#ifndef NDEBUG
4423 /// Debug printer.
4424 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4425 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4426 VectorizableTree[Id]->dump();
4427 dbgs() << "\n";
4428 }
4429 }
4430#endif
4431
4432 /// Get list of vector entries, associated with the value \p V.
4433 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4434 assert(V && "V cannot be nullptr.");
4435 auto It = ScalarToTreeEntries.find(V);
4436 if (It == ScalarToTreeEntries.end())
4437 return {};
4438 return It->getSecond();
4439 }
4440
4441 /// Get list of split vector entries, associated with the value \p V.
4442 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4443 assert(V && "V cannot be nullptr.");
4444 auto It = ScalarsInSplitNodes.find(V);
4445 if (It == ScalarsInSplitNodes.end())
4446 return {};
4447 return It->getSecond();
4448 }
4449
4450 /// Returns first vector node for value \p V, matching values \p VL.
4451 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4452 bool SameVF = false) const {
4453 assert(V && "V cannot be nullptr.");
4454 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4455 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4456 return TE;
4457 return nullptr;
4458 }
4459
4460 /// Check that the operand node of the alternate node does not generate a
4461 /// buildvector sequence. If it does, it is probably not worth building an
4462 /// alternate shuffle when the number of buildvector operands plus the
4463 /// alternate instruction exceeds the number of buildvector instructions.
4464 /// \param S the instructions state of the analyzed values.
4465 /// \param VL list of the instructions with alternate opcodes.
4466 bool areAltOperandsProfitable(const InstructionsState &S,
4467 ArrayRef<Value *> VL) const;
4468
4469 /// Contains all the outputs of legality analysis for a list of values to
4470 /// vectorize.
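 /// A typical use (sketch):
 ///   ScalarsVectorizationLegality Legality =
 ///       getScalarsVectorizationLegality(VL, Depth, UserTreeIdx,
 ///                                       /*TryCopyableElementsVectorization=*/false);
 ///   if (!Legality.isLegal()) {
 ///     // Fall back to gathering; tryToFindDuplicates() and
 ///     // trySplitVectorize() indicate which fallbacks are worth trying.
 ///   }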
4471 class ScalarsVectorizationLegality {
4472 InstructionsState S;
4473 bool IsLegal;
4474 bool TryToFindDuplicates;
4475 bool TrySplitVectorize;
4476
4477 public:
4478 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4479 bool TryToFindDuplicates = true,
4480 bool TrySplitVectorize = false)
4481 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4482 TrySplitVectorize(TrySplitVectorize) {
4483 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4484 "Inconsistent state");
4485 }
4486 const InstructionsState &getInstructionsState() const { return S; };
4487 bool isLegal() const { return IsLegal; }
4488 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4489 bool trySplitVectorize() const { return TrySplitVectorize; }
4490 };
4491
4492 /// Checks if the specified list of the instructions/values can be vectorized
4493 /// in general.
4494 ScalarsVectorizationLegality
4495 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4496 const EdgeInfo &UserTreeIdx,
4497 bool TryCopyableElementsVectorization) const;
4498
4499 /// Checks if the specified list of the instructions/values can be vectorized
4500 /// and fills required data before actual scheduling of the instructions.
4501 TreeEntry::EntryState getScalarsVectorizationState(
4502 const InstructionsState &S, ArrayRef<Value *> VL,
4503 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4504 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4505
4506 /// Maps a specific scalar to its tree entry(ies).
4507 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4508
4509 /// Maps the operand index and entry to the corresponding tree entry.
4510 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4511 OperandsToTreeEntry;
4512
4513 /// Scalars, used in split vectorize nodes.
4514 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4515
4516 /// Maps a value to the proposed vectorizable size.
4517 SmallDenseMap<Value *, unsigned> InstrElementSize;
4518
4519 /// A list of scalars that we found that we need to keep as scalars.
4520 ValueSet MustGather;
4521
4522 /// A set of first non-schedulable values.
4523 ValueSet NonScheduledFirst;
4524
4525 /// A map between the vectorized entries and the last instructions in the
4526 /// bundles. The bundles are built in use order, not in the def order of the
4527 /// instructions. So, we cannot rely directly on the last instruction in the
 4528 /// bundle being the last instruction in program order during the
 4529 /// vectorization process, since the basic blocks are modified; the last
 4530 /// instructions need to be pre-gathered beforehand.
4531 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4532
 4533 /// List of gather nodes that depend on other gather/vector nodes and should
 4534 /// be emitted after the vector instruction emission process to correctly
 4535 /// handle the order of the vector instructions and shuffles.
4536 SetVector<const TreeEntry *> PostponedGathers;
4537
4538 using ValueToGatherNodesMap =
4539 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4540 ValueToGatherNodesMap ValueToGatherNodes;
4541
 4542 /// A list of the load entries (node indices) which can be vectorized using
 4543 /// a strided or masked gather approach, but which we first attempt to
 4544 /// represent as contiguous loads.
4545 SetVector<unsigned> LoadEntriesToVectorize;
4546
4547 /// true if graph nodes transforming mode is on.
4548 bool IsGraphTransformMode = false;
4549
4550 /// The index of the first gathered load entry in the VectorizeTree.
4551 std::optional<unsigned> GatheredLoadsEntriesFirst;
4552
4553 /// Maps compress entries to their mask data for the final codegen.
4554 SmallDenseMap<const TreeEntry *,
4555 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4556 CompressEntryToData;
4557
4558 /// This POD struct describes one external user in the vectorized tree.
4559 struct ExternalUser {
4560 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4561 : Scalar(S), User(U), E(E), Lane(L) {}
4562
4563 /// Which scalar in our function.
4564 Value *Scalar = nullptr;
4565
4566 /// Which user that uses the scalar.
4567 llvm::User *User = nullptr;
4568
4569 /// Vector node, the value is part of.
4570 const TreeEntry &E;
4571
4572 /// Which lane does the scalar belong to.
4573 unsigned Lane;
4574 };
4575 using UserList = SmallVector<ExternalUser, 16>;
4576
4577 /// Checks if two instructions may access the same memory.
4578 ///
4579 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4580 /// is invariant in the calling loop.
4581 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4582 Instruction *Inst2) {
4583 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4584 // First check if the result is already in the cache.
4585 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4586 auto Res = AliasCache.try_emplace(Key);
4587 if (!Res.second)
4588 return Res.first->second;
4589 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4590 // Store the result in the cache.
4591 Res.first->getSecond() = Aliased;
4592 return Aliased;
4593 }
4594
4595 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4596
4597 /// Cache for alias results.
4598 /// TODO: consider moving this to the AliasAnalysis itself.
4599 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4600
4601 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4602 // globally through SLP because we don't perform any action which
4603 // invalidates capture results.
4604 BatchAAResults BatchAA;
4605
4606 /// Temporary store for deleted instructions. Instructions will be deleted
4607 /// eventually when the BoUpSLP is destructed. The deferral is required to
4608 /// ensure that there are no incorrect collisions in the AliasCache, which
4609 /// can happen if a new instruction is allocated at the same address as a
4610 /// previously deleted instruction.
4611 DenseSet<Instruction *> DeletedInstructions;
4612
 4613 /// Set of the instructions already being analyzed for reductions.
4614 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4615
4616 /// Set of hashes for the list of reduction values already being analyzed.
4617 DenseSet<size_t> AnalyzedReductionVals;
4618
 4619 /// Values already analyzed for minimal bitwidth and found to be
4620 /// non-profitable.
4621 DenseSet<Value *> AnalyzedMinBWVals;
4622
 4623 /// A list of values that need to be extracted out of the tree.
4624 /// This list holds pairs of (Internal Scalar : External User). External User
4625 /// can be nullptr, it means that this Internal Scalar will be used later,
4626 /// after vectorization.
4627 UserList ExternalUses;
4628
 4629 /// A list of GEPs which can be replaced by scalar GEPs instead of
4630 /// extractelement instructions.
4631 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4632
 4633 /// A list of scalars to be extracted without a specific user because of too
 4634 /// many uses.
4635 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4636
4637 /// Values used only by @llvm.assume calls.
4638 SmallPtrSet<const Value *, 32> EphValues;
4639
4640 /// Holds all of the instructions that we gathered, shuffle instructions and
4641 /// extractelements.
4642 SetVector<Instruction *> GatherShuffleExtractSeq;
4643
4644 /// A list of blocks that we are going to CSE.
4645 DenseSet<BasicBlock *> CSEBlocks;
4646
 4647 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4648 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4649
 4650 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
 4651 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
 4652 /// single instruction, while ScheduleBundle represents a batch of
 4653 /// instructions that are going to be grouped together. ScheduleCopyableData
 4654 /// models an extra user for "copyable" instructions.
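 /// For example, for a bundle of two loads [%a = load, %b = load], each load
 /// gets its own ScheduleData, while a ScheduleBundle groups both loads so
 /// that they can be scheduled (and later vectorized) as a single unit.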
4655 class ScheduleEntity {
4656 friend class ScheduleBundle;
4657 friend class ScheduleData;
4658 friend class ScheduleCopyableData;
4659
4660 protected:
4661 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4662 Kind getKind() const { return K; }
4663 ScheduleEntity(Kind K) : K(K) {}
4664
4665 private:
4666 /// Used for getting a "good" final ordering of instructions.
4667 int SchedulingPriority = 0;
4668 /// True if this instruction (or bundle) is scheduled (or considered as
4669 /// scheduled in the dry-run).
4670 bool IsScheduled = false;
4671 /// The kind of the ScheduleEntity.
4672 const Kind K = Kind::ScheduleData;
4673
4674 public:
4675 ScheduleEntity() = delete;
4676 /// Gets/sets the scheduling priority.
4677 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4678 int getSchedulingPriority() const { return SchedulingPriority; }
4679 bool isReady() const {
4680 if (const auto *SD = dyn_cast<ScheduleData>(this))
4681 return SD->isReady();
4682 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4683 return CD->isReady();
4684 return cast<ScheduleBundle>(this)->isReady();
4685 }
4686 /// Returns true if the dependency information has been calculated.
 4687 /// Note that dependency validity can vary between instructions within
4688 /// a single bundle.
4689 bool hasValidDependencies() const {
4690 if (const auto *SD = dyn_cast<ScheduleData>(this))
4691 return SD->hasValidDependencies();
4692 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4693 return CD->hasValidDependencies();
4694 return cast<ScheduleBundle>(this)->hasValidDependencies();
4695 }
4696 /// Gets the number of unscheduled dependencies.
4697 int getUnscheduledDeps() const {
4698 if (const auto *SD = dyn_cast<ScheduleData>(this))
4699 return SD->getUnscheduledDeps();
4700 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4701 return CD->getUnscheduledDeps();
4702 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4703 }
4704 /// Increments the number of unscheduled dependencies.
4705 int incrementUnscheduledDeps(int Incr) {
4706 if (auto *SD = dyn_cast<ScheduleData>(this))
4707 return SD->incrementUnscheduledDeps(Incr);
4708 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4709 }
4710 /// Gets the number of dependencies.
4711 int getDependencies() const {
4712 if (const auto *SD = dyn_cast<ScheduleData>(this))
4713 return SD->getDependencies();
4714 return cast<ScheduleCopyableData>(this)->getDependencies();
4715 }
4716 /// Gets the instruction.
4717 Instruction *getInst() const {
4718 if (const auto *SD = dyn_cast<ScheduleData>(this))
4719 return SD->getInst();
4720 return cast<ScheduleCopyableData>(this)->getInst();
4721 }
4722
4723 /// Gets/sets if the bundle is scheduled.
4724 bool isScheduled() const { return IsScheduled; }
4725 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4726
4727 static bool classof(const ScheduleEntity *) { return true; }
4728
4729#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4730 void dump(raw_ostream &OS) const {
4731 if (const auto *SD = dyn_cast<ScheduleData>(this))
4732 return SD->dump(OS);
4733 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4734 return CD->dump(OS);
4735 return cast<ScheduleBundle>(this)->dump(OS);
4736 }
4737
4738 LLVM_DUMP_METHOD void dump() const {
4739 dump(dbgs());
4740 dbgs() << '\n';
4741 }
4742#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4743 };
4744
4745#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 4746 friend inline raw_ostream &operator<<(raw_ostream &OS,
 4747 const BoUpSLP::ScheduleEntity &SE) {
4748 SE.dump(OS);
4749 return OS;
4750 }
4751#endif
4752
4753 /// Contains all scheduling relevant data for an instruction.
4754 /// A ScheduleData either represents a single instruction or a member of an
4755 /// instruction bundle (= a group of instructions which is combined into a
4756 /// vector instruction).
4757 class ScheduleData final : public ScheduleEntity {
4758 public:
4759 // The initial value for the dependency counters. It means that the
4760 // dependencies are not calculated yet.
4761 enum { InvalidDeps = -1 };
4762
4763 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4764 static bool classof(const ScheduleEntity *Entity) {
4765 return Entity->getKind() == Kind::ScheduleData;
4766 }
4767
4768 void init(int BlockSchedulingRegionID, Instruction *I) {
4769 NextLoadStore = nullptr;
4770 IsScheduled = false;
4771 SchedulingRegionID = BlockSchedulingRegionID;
4772 clearDependencies();
4773 Inst = I;
4774 }
4775
4776 /// Verify basic self consistency properties
4777 void verify() {
4778 if (hasValidDependencies()) {
4779 assert(UnscheduledDeps <= Dependencies && "invariant");
4780 } else {
4781 assert(UnscheduledDeps == Dependencies && "invariant");
4782 }
4783
4784 if (IsScheduled) {
4785 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4786 "unexpected scheduled state");
4787 }
4788 }
4789
4790 /// Returns true if the dependency information has been calculated.
 4791 /// Note that dependency validity can vary between instructions within
4792 /// a single bundle.
4793 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4794
4795 /// Returns true if it is ready for scheduling, i.e. it has no more
4796 /// unscheduled depending instructions/bundles.
4797 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4798
4799 /// Modifies the number of unscheduled dependencies for this instruction,
4800 /// and returns the number of remaining dependencies for the containing
4801 /// bundle.
4802 int incrementUnscheduledDeps(int Incr) {
4803 assert(hasValidDependencies() &&
4804 "increment of unscheduled deps would be meaningless");
4805 UnscheduledDeps += Incr;
4806 return UnscheduledDeps;
4807 }
4808
4809 /// Sets the number of unscheduled dependencies to the number of
4810 /// dependencies.
4811 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4812
4813 /// Clears all dependency information.
4814 void clearDependencies() {
4815 clearDirectDependencies();
4816 MemoryDependencies.clear();
4817 ControlDependencies.clear();
4818 }
4819
 4820 /// Clears only the direct dependencies, keeping the control and memory
4821 /// dependencies.
4822 /// Required for copyable elements to correctly handle control/memory deps
 4823 /// and avoid extra recalculation of such deps.
4824 void clearDirectDependencies() {
4825 Dependencies = InvalidDeps;
4826 resetUnscheduledDeps();
4827 IsScheduled = false;
4828 }
4829
4830 /// Gets the number of unscheduled dependencies.
4831 int getUnscheduledDeps() const { return UnscheduledDeps; }
4832 /// Gets the number of dependencies.
4833 int getDependencies() const { return Dependencies; }
4834 /// Initializes the number of dependencies.
4835 void initDependencies() { Dependencies = 0; }
4836 /// Increments the number of dependencies.
4837 void incDependencies() { Dependencies++; }
4838
4839 /// Gets scheduling region ID.
4840 int getSchedulingRegionID() const { return SchedulingRegionID; }
4841
4842 /// Gets the instruction.
4843 Instruction *getInst() const { return Inst; }
4844
4845 /// Gets the list of memory dependencies.
4846 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4847 return MemoryDependencies;
4848 }
4849 /// Adds a memory dependency.
4850 void addMemoryDependency(ScheduleData *Dep) {
4851 MemoryDependencies.push_back(Dep);
4852 }
4853 /// Gets the list of control dependencies.
4854 ArrayRef<ScheduleData *> getControlDependencies() const {
4855 return ControlDependencies;
4856 }
4857 /// Adds a control dependency.
4858 void addControlDependency(ScheduleData *Dep) {
4859 ControlDependencies.push_back(Dep);
4860 }
4861 /// Gets/sets the next load/store instruction in the block.
4862 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4863 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4864
4865 void dump(raw_ostream &OS) const { OS << *Inst; }
4866
4867 LLVM_DUMP_METHOD void dump() const {
4868 dump(dbgs());
4869 dbgs() << '\n';
4870 }
4871
4872 private:
4873 Instruction *Inst = nullptr;
4874
 4875 /// Singly-linked list of all memory instructions (e.g. load, store, call)
4876 /// in the block - until the end of the scheduling region.
4877 ScheduleData *NextLoadStore = nullptr;
4878
4879 /// The dependent memory instructions.
4880 /// This list is derived on demand in calculateDependencies().
4881 SmallVector<ScheduleData *> MemoryDependencies;
4882
4883 /// List of instructions which this instruction could be control dependent
4884 /// on. Allowing such nodes to be scheduled below this one could introduce
4885 /// a runtime fault which didn't exist in the original program.
4886 /// ex: this is a load or udiv following a readonly call which inf loops
4887 SmallVector<ScheduleData *> ControlDependencies;
4888
4889 /// This ScheduleData is in the current scheduling region if this matches
4890 /// the current SchedulingRegionID of BlockScheduling.
4891 int SchedulingRegionID = 0;
4892
 4893 /// The number of dependencies. Consists of the number of users of the
4894 /// instruction plus the number of dependent memory instructions (if any).
4895 /// This value is calculated on demand.
4896 /// If InvalidDeps, the number of dependencies is not calculated yet.
4897 int Dependencies = InvalidDeps;
4898
4899 /// The number of dependencies minus the number of dependencies of scheduled
4900 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4901 /// for scheduling.
4902 /// Note that this is negative as long as Dependencies is not calculated.
4903 int UnscheduledDeps = InvalidDeps;
4904 };
4905
4906#ifndef NDEBUG
 4907 friend inline raw_ostream &operator<<(raw_ostream &OS,
 4908 const BoUpSLP::ScheduleData &SD) {
4909 SD.dump(OS);
4910 return OS;
4911 }
4912#endif
4913
4914 class ScheduleBundle final : public ScheduleEntity {
4915 /// The schedule data for the instructions in the bundle.
 4916 SmallVector<ScheduleEntity *> Bundle;
 4917 /// True if this bundle is valid.
4918 bool IsValid = true;
4919 /// The TreeEntry that this instruction corresponds to.
4920 TreeEntry *TE = nullptr;
4921 ScheduleBundle(bool IsValid)
4922 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4923
4924 public:
4925 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4926 static bool classof(const ScheduleEntity *Entity) {
4927 return Entity->getKind() == Kind::ScheduleBundle;
4928 }
4929
4930 /// Verify basic self consistency properties
4931 void verify() const {
4932 for (const ScheduleEntity *SD : Bundle) {
4933 if (SD->hasValidDependencies()) {
4934 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4935 "invariant");
4936 } else {
4937 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4938 "invariant");
4939 }
4940
4941 if (isScheduled()) {
4942 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4943 "unexpected scheduled state");
4944 }
4945 }
4946 }
4947
4948 /// Returns the number of unscheduled dependencies in the bundle.
4949 int unscheduledDepsInBundle() const {
4950 assert(*this && "bundle must not be empty");
4951 int Sum = 0;
4952 for (const ScheduleEntity *BundleMember : Bundle) {
4953 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4954 return ScheduleData::InvalidDeps;
4955 Sum += BundleMember->getUnscheduledDeps();
4956 }
4957 return Sum;
4958 }
4959
4960 /// Returns true if the dependency information has been calculated.
 4961 /// Note that dependency validity can vary between instructions within
4962 /// a single bundle.
4963 bool hasValidDependencies() const {
4964 return all_of(Bundle, [](const ScheduleEntity *SD) {
4965 return SD->hasValidDependencies();
4966 });
4967 }
4968
4969 /// Returns true if it is ready for scheduling, i.e. it has no more
4970 /// unscheduled depending instructions/bundles.
4971 bool isReady() const {
4972 assert(*this && "bundle must not be empty");
4973 return unscheduledDepsInBundle() == 0 && !isScheduled();
4974 }
4975
4976 /// Returns the bundle of scheduling data, associated with the current
4977 /// instruction.
4978 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
4979 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
4980 /// Adds an instruction to the bundle.
4981 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4982
4983 /// Gets/sets the associated tree entry.
4984 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4985 TreeEntry *getTreeEntry() const { return TE; }
4986
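 /// Creates an invalid sentinel bundle: it converts to false in a boolean
 /// context and dumps as "[]".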
4987 static ScheduleBundle invalid() { return {false}; }
4988
4989 operator bool() const { return IsValid; }
4990
4991#ifndef NDEBUG
4992 void dump(raw_ostream &OS) const {
4993 if (!*this) {
4994 OS << "[]";
4995 return;
4996 }
4997 OS << '[';
4998 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
 4999 if (isa<ScheduleCopyableData>(SD))
 5000 OS << "<Copyable>";
5001 OS << *SD->getInst();
5002 });
5003 OS << ']';
5004 }
5005
5006 LLVM_DUMP_METHOD void dump() const {
5007 dump(dbgs());
5008 dbgs() << '\n';
5009 }
5010#endif // NDEBUG
5011 };
5012
5013#ifndef NDEBUG
 5014 friend inline raw_ostream &operator<<(raw_ostream &OS,
 5015 const BoUpSLP::ScheduleBundle &Bundle) {
5016 Bundle.dump(OS);
5017 return OS;
5018 }
5019#endif
5020
5021 /// Contains all scheduling relevant data for the copyable instruction.
 5022 /// It models the virtual instructions that are supposed to replace the original
5023 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5024 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5025 /// instruction %virt = add %0, 0.
5026 class ScheduleCopyableData final : public ScheduleEntity {
5027 /// The source schedule data for the instruction.
5028 Instruction *Inst = nullptr;
5029 /// The edge information for the instruction.
5030 const EdgeInfo EI;
5031 /// This ScheduleData is in the current scheduling region if this matches
5032 /// the current SchedulingRegionID of BlockScheduling.
5033 int SchedulingRegionID = 0;
5034 /// Bundle, this data is part of.
5035 ScheduleBundle &Bundle;
5036
5037 public:
5038 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5039 const EdgeInfo &EI, ScheduleBundle &Bundle)
5040 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5041 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5042 static bool classof(const ScheduleEntity *Entity) {
5043 return Entity->getKind() == Kind::ScheduleCopyableData;
5044 }
5045
5046 /// Verify basic self consistency properties
5047 void verify() {
5048 if (hasValidDependencies()) {
5049 assert(UnscheduledDeps <= Dependencies && "invariant");
5050 } else {
5051 assert(UnscheduledDeps == Dependencies && "invariant");
5052 }
5053
5054 if (IsScheduled) {
5055 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5056 "unexpected scheduled state");
5057 }
5058 }
5059
5060 /// Returns true if the dependency information has been calculated.
 5061 /// Note that dependency validity can vary between instructions within
5062 /// a single bundle.
5063 bool hasValidDependencies() const {
5064 return Dependencies != ScheduleData::InvalidDeps;
5065 }
5066
5067 /// Returns true if it is ready for scheduling, i.e. it has no more
5068 /// unscheduled depending instructions/bundles.
5069 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5070
5071 /// Modifies the number of unscheduled dependencies for this instruction,
5072 /// and returns the number of remaining dependencies for the containing
5073 /// bundle.
5074 int incrementUnscheduledDeps(int Incr) {
5075 assert(hasValidDependencies() &&
5076 "increment of unscheduled deps would be meaningless");
5077 UnscheduledDeps += Incr;
5078 assert(UnscheduledDeps >= 0 && "invariant");
5079 return UnscheduledDeps;
5080 }
5081
5082 /// Sets the number of unscheduled dependencies to the number of
5083 /// dependencies.
5084 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5085
5086 /// Gets the number of unscheduled dependencies.
5087 int getUnscheduledDeps() const { return UnscheduledDeps; }
5088 /// Gets the number of dependencies.
5089 int getDependencies() const { return Dependencies; }
5090 /// Initializes the number of dependencies.
5091 void initDependencies() { Dependencies = 0; }
5092 /// Increments the number of dependencies.
5093 void incDependencies() { Dependencies++; }
5094
5095 /// Gets scheduling region ID.
5096 int getSchedulingRegionID() const { return SchedulingRegionID; }
5097
5098 /// Gets the instruction.
5099 Instruction *getInst() const { return Inst; }
5100
5101 /// Clears all dependency information.
5102 void clearDependencies() {
5103 Dependencies = ScheduleData::InvalidDeps;
5104 UnscheduledDeps = ScheduleData::InvalidDeps;
5105 IsScheduled = false;
5106 }
5107
5108 /// Gets the edge information.
5109 const EdgeInfo &getEdgeInfo() const { return EI; }
5110
5111 /// Gets the bundle.
5112 ScheduleBundle &getBundle() { return Bundle; }
5113 const ScheduleBundle &getBundle() const { return Bundle; }
5114
5115#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5116 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5117
5118 LLVM_DUMP_METHOD void dump() const {
5119 dump(dbgs());
5120 dbgs() << '\n';
5121 }
5122#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5123
5124 private:
 5125 /// The number of dependencies. These nodes always have only a single
 5126 /// dependency.
5127 int Dependencies = ScheduleData::InvalidDeps;
5128
5129 /// The number of dependencies minus the number of dependencies of scheduled
5130 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5131 /// for scheduling.
5132 /// Note that this is negative as long as Dependencies is not calculated.
5133 int UnscheduledDeps = ScheduleData::InvalidDeps;
5134 };
5135
5136#ifndef NDEBUG
5137 friend inline raw_ostream &
5138 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5139 SD.dump(OS);
5140 return OS;
5141 }
5142#endif
5143
5144 friend struct GraphTraits<BoUpSLP *>;
5145 friend struct DOTGraphTraits<BoUpSLP *>;
5146
5147 /// Contains all scheduling data for a basic block.
 5148 /// It does not schedule instructions that are not memory read/write
 5149 /// instructions and whose operands are either constants, or arguments, or
 5150 /// phis, or instructions from other blocks, or whose users are phis or are
 5151 /// in other blocks. The resulting vector instructions can be placed at the
 5152 /// beginning of the basic block without scheduling (if the operands do not
 5153 /// need to be scheduled) or at the end of the block (if the users are
 5154 /// outside of the block). This saves some compile time and memory used by
 5155 /// the compiler.
 5156 /// ScheduleData is assigned to each instruction in between the boundaries of
 5157 /// the tree entry, even for those which are not part of the graph. It is
 5158 /// required to correctly follow the dependencies between the instructions
 5159 /// and to schedule them correctly. ScheduleData is not allocated for the
 5160 /// instructions which do not require scheduling, like phis, nodes with
 5161 /// extractelements/insertelements only, or nodes whose instructions have
 5162 /// uses/operands outside of the block.
5163 struct BlockScheduling {
5164 BlockScheduling(BasicBlock *BB)
5165 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5166
5167 void clear() {
5168 ScheduledBundles.clear();
5169 ScheduledBundlesList.clear();
5170 ScheduleCopyableDataMap.clear();
5171 ScheduleCopyableDataMapByInst.clear();
5172 ScheduleCopyableDataMapByInstUser.clear();
5173 ScheduleCopyableDataMapByUsers.clear();
5174 ReadyInsts.clear();
5175 ScheduleStart = nullptr;
5176 ScheduleEnd = nullptr;
5177 FirstLoadStoreInRegion = nullptr;
5178 LastLoadStoreInRegion = nullptr;
5179 RegionHasStackSave = false;
5180
5181 // Reduce the maximum schedule region size by the size of the
5182 // previous scheduling run.
5183 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5184 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5185 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5186 ScheduleRegionSize = 0;
5187
5188 // Make a new scheduling region, i.e. all existing ScheduleData is not
5189 // in the new region yet.
5190 ++SchedulingRegionID;
5191 }
5192
5193 ScheduleData *getScheduleData(Instruction *I) {
5194 if (!I)
5195 return nullptr;
5196 if (BB != I->getParent())
5197 // Avoid lookup if can't possibly be in map.
5198 return nullptr;
5199 ScheduleData *SD = ScheduleDataMap.lookup(I);
5200 if (SD && isInSchedulingRegion(*SD))
5201 return SD;
5202 return nullptr;
5203 }
5204
5205 ScheduleData *getScheduleData(Value *V) {
5206 return getScheduleData(dyn_cast<Instruction>(V));
5207 }
5208
5209 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5210 /// operand number) and value.
5211 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5212 const Value *V) const {
5213 if (ScheduleCopyableDataMap.empty())
5214 return nullptr;
5215 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5216 if (It == ScheduleCopyableDataMap.end())
5217 return nullptr;
5218 ScheduleCopyableData *SD = It->getSecond().get();
5219 if (!isInSchedulingRegion(*SD))
5220 return nullptr;
5221 return SD;
5222 }
5223
5224 /// Returns the ScheduleCopyableData for the given user \p User, operand
5225 /// number and operand \p V.
 5226 SmallVector<ScheduleCopyableData *>
 5227 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5228 const Value *V) {
5229 if (ScheduleCopyableDataMapByInstUser.empty())
5230 return {};
5231 const auto It = ScheduleCopyableDataMapByInstUser.find(
5232 std::make_pair(std::make_pair(User, OperandIdx), V));
5233 if (It == ScheduleCopyableDataMapByInstUser.end())
5234 return {};
 5235 SmallVector<ScheduleCopyableData *> Res;
 5236 for (ScheduleCopyableData *SD : It->getSecond()) {
5237 if (isInSchedulingRegion(*SD))
5238 Res.push_back(SD);
5239 }
5240 return Res;
5241 }
5242
5243 /// Returns true if all operands of the given instruction \p User are
5244 /// replaced by copyable data.
5245 /// \param User The user instruction.
5246 /// \param Op The operand, which might be replaced by the copyable data.
5247 /// \param SLP The SLP tree.
5248 /// \param NumOps The number of operands used. If the instruction uses the
5249 /// same operand several times, check for the first use, then the second,
5250 /// etc.
5251 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5252 Instruction *Op, BoUpSLP &SLP,
5253 unsigned NumOps) const {
5254 assert(NumOps > 0 && "No operands");
5255 if (ScheduleCopyableDataMap.empty())
5256 return false;
5257 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5258 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
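 // Count, for each tree entry containing User, how many of its uses of Op
 // are modeled by copyable data; commutative (and cmp) users are tracked
 // separately because their operands may be reordered.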
5259 for (const Use &U : User->operands()) {
5260 if (U.get() != Op)
5261 continue;
5262 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5263 if (Entries.empty())
5264 return false;
5265 // Check all tree entries, if they have operands replaced by copyable
5266 // data.
5267 for (TreeEntry *TE : Entries) {
5268 // Check if the user is commutative.
 5269 // The commutatives are handled later, as their operands can be
5270 // reordered.
5271 // Same applies even for non-commutative cmps, because we can invert
5272 // their predicate potentially and, thus, reorder the operands.
5273 bool IsCommutativeUser =
5274 ::isCommutative(User) ||
5275 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5276 EdgeInfo EI(TE, U.getOperandNo());
5277 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5278 unsigned &OpCnt =
5279 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5280 if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
5281 return false;
5282 // Found copyable operand - continue.
5283 ++OpCnt;
5284 continue;
5285 }
5286 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5287 .first->getSecond();
5288 }
5289 }
5290 // Check the commutative/cmp entries.
5291 if (!PotentiallyReorderedEntriesCount.empty()) {
5292 for (auto &P : PotentiallyReorderedEntriesCount) {
5293 auto *It = find(P.first->Scalars, User);
5294 assert(It != P.first->Scalars.end() &&
5295 "User is not in the tree entry");
5296 int Lane = std::distance(P.first->Scalars.begin(), It);
5297 assert(Lane >= 0 && "Lane is not found");
5298 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5299 Lane = P.first->ReorderIndices[Lane];
5300 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5301 "Couldn't find extract lane");
5302 SmallVector<unsigned> OpIndices;
5303 for (unsigned OpIdx :
5305 P.first->getMainOp()))) {
5306 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5307 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5308 --P.getSecond();
5309 }
5310 }
5311 return all_of(PotentiallyReorderedEntriesCount,
5312 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5313 return P.second == NumOps - 1;
5314 });
5315 }
5316 return true;
5317 }
5318
 5319 SmallVector<ScheduleCopyableData *>
 5320 getScheduleCopyableData(const Instruction *I) const {
5321 if (ScheduleCopyableDataMapByInst.empty())
5322 return {};
5323 const auto It = ScheduleCopyableDataMapByInst.find(I);
5324 if (It == ScheduleCopyableDataMapByInst.end())
5325 return {};
 5326 SmallVector<ScheduleCopyableData *> Res;
 5327 for (ScheduleCopyableData *SD : It->getSecond()) {
5328 if (isInSchedulingRegion(*SD))
5329 Res.push_back(SD);
5330 }
5331 return Res;
5332 }
5333
 5334 SmallVector<ScheduleCopyableData *>
 5335 getScheduleCopyableDataUsers(const Instruction *User) const {
5336 if (ScheduleCopyableDataMapByUsers.empty())
5337 return {};
5338 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5339 if (It == ScheduleCopyableDataMapByUsers.end())
5340 return {};
 5341 SmallVector<ScheduleCopyableData *> Res;
 5342 for (ScheduleCopyableData *SD : It->getSecond()) {
5343 if (isInSchedulingRegion(*SD))
5344 Res.push_back(SD);
5345 }
5346 return Res;
5347 }
5348
5349 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5350 Instruction *I,
5351 int SchedulingRegionID,
5352 ScheduleBundle &Bundle) {
5353 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5354 ScheduleCopyableData *CD =
5355 ScheduleCopyableDataMap
5356 .try_emplace(std::make_pair(EI, I),
5357 std::make_unique<ScheduleCopyableData>(
5358 SchedulingRegionID, I, EI, Bundle))
5359 .first->getSecond()
5360 .get();
5361 ScheduleCopyableDataMapByInst[I].push_back(CD);
5362 if (EI.UserTE) {
5363 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5364 const auto *It = find(Op, I);
5365 assert(It != Op.end() && "Lane not set");
5366 SmallPtrSet<Instruction *, 4> Visited;
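 // Walk every occurrence of I in the user's operand list; for each one,
 // record the copyable data under the corresponding user-lane instruction.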
5367 do {
5368 int Lane = std::distance(Op.begin(), It);
5369 assert(Lane >= 0 && "Lane not set");
5370 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5371 !EI.UserTE->ReorderIndices.empty())
5372 Lane = EI.UserTE->ReorderIndices[Lane];
5373 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5374 "Couldn't find extract lane");
5375 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5376 if (!Visited.insert(In).second) {
5377 It = find(make_range(std::next(It), Op.end()), I);
5378 continue;
5379 }
5380 ScheduleCopyableDataMapByInstUser
5381 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5382 .first->getSecond()
5383 .push_back(CD);
5384 ScheduleCopyableDataMapByUsers.try_emplace(I)
5385 .first->getSecond()
5386 .insert(CD);
 5387 // Remove extra deps for users that become non-immediate users of the
 5388 // instruction. This may happen if a chain of the same copyable elements
5389 // appears in the tree.
5390 if (In == I) {
5391 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5392 if (ScheduleCopyableData *UserCD =
5393 getScheduleCopyableData(UserEI, In))
5394 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5395 }
5396 It = find(make_range(std::next(It), Op.end()), I);
5397 } while (It != Op.end());
5398 } else {
5399 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5400 CD);
5401 }
5402 return *CD;
5403 }
5404
5405 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5406 auto *I = dyn_cast<Instruction>(V);
5407 if (!I)
5408 return {};
5409 auto It = ScheduledBundles.find(I);
5410 if (It == ScheduledBundles.end())
5411 return {};
5412 return It->getSecond();
5413 }
5414
5415 /// Returns true if the entity is in the scheduling region.
5416 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5417 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5418 return Data->getSchedulingRegionID() == SchedulingRegionID;
5419 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5420 return CD->getSchedulingRegionID() == SchedulingRegionID;
5421 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5422 [&](const ScheduleEntity *BundleMember) {
5423 return isInSchedulingRegion(*BundleMember);
5424 });
5425 }
5426
5427 /// Marks an instruction as scheduled and puts all dependent ready
5428 /// instructions into the ready-list.
5429 template <typename ReadyListType>
5430 void schedule(const BoUpSLP &R, const InstructionsState &S,
5431 const EdgeInfo &EI, ScheduleEntity *Data,
5432 ReadyListType &ReadyList) {
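 // Mark Data as scheduled, then walk its def-use, memory and control
 // dependencies and move any entity that becomes ready into ReadyList.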
5433 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
 5434 ArrayRef<ScheduleBundle *> Bundles) {
 5435 // Handle the def-use chain dependencies.
5436
5437 // Decrement the unscheduled counter and insert to ready list if ready.
5438 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5439 if ((IsControl || Data->hasValidDependencies()) &&
5440 Data->incrementUnscheduledDeps(-1) == 0) {
5441 // There are no more unscheduled dependencies after
5442 // decrementing, so we can put the dependent instruction
5443 // into the ready list.
5444 SmallVector<ScheduleBundle *, 1> CopyableBundle;
 5445 ArrayRef<ScheduleBundle *> Bundles;
 5446 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5447 CopyableBundle.push_back(&CD->getBundle());
5448 Bundles = CopyableBundle;
5449 } else {
5450 Bundles = getScheduleBundles(Data->getInst());
5451 }
5452 if (!Bundles.empty()) {
5453 for (ScheduleBundle *Bundle : Bundles) {
5454 if (Bundle->unscheduledDepsInBundle() == 0) {
5455 assert(!Bundle->isScheduled() &&
5456 "already scheduled bundle gets ready");
5457 ReadyList.insert(Bundle);
 5458 LLVM_DEBUG(dbgs()
 5459 << "SLP: gets ready: " << *Bundle << "\n");
5460 }
5461 }
5462 return;
5463 }
5464 assert(!Data->isScheduled() &&
5465 "already scheduled bundle gets ready");
 5466 assert(!isa<ScheduleCopyableData>(Data) &&
 5467 "Expected non-copyable data");
5468 ReadyList.insert(Data);
5469 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5470 }
5471 };
5472
5473 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5474 Instruction *I) {
5475 if (!ScheduleCopyableDataMap.empty()) {
 5476 SmallVector<ScheduleCopyableData *> CopyableData =
 5477 getScheduleCopyableData(User, OpIdx, I);
5478 for (ScheduleCopyableData *CD : CopyableData)
5479 DecrUnsched(CD, /*IsControl=*/false);
5480 if (!CopyableData.empty())
5481 return;
5482 }
5483 if (ScheduleData *OpSD = getScheduleData(I))
5484 DecrUnsched(OpSD, /*IsControl=*/false);
5485 };
5486
5487 // If BundleMember is a vector bundle, its operands may have been
5488 // reordered during buildTree(). We therefore need to get its operands
5489 // through the TreeEntry.
5490 if (!Bundles.empty()) {
5491 auto *In = BundleMember->getInst();
5492 // Count uses of each instruction operand.
5493 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5494 unsigned TotalOpCount = 0;
5495 if (isa<ScheduleCopyableData>(BundleMember)) {
5496 // Copyable data is used only once (uses itself).
5497 TotalOpCount = OperandsUses[In] = 1;
5498 } else {
5499 for (const Use &U : In->operands()) {
5500 if (auto *I = dyn_cast<Instruction>(U.get())) {
5501 auto Res = OperandsUses.try_emplace(I, 0);
5502 ++Res.first->getSecond();
5503 ++TotalOpCount;
5504 }
5505 }
5506 }
5507 // Decrement the unscheduled counter and insert to ready list if
5508 // ready.
5509 auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
5510 unsigned OpIdx) {
5511 if (!ScheduleCopyableDataMap.empty()) {
5512 const EdgeInfo EI = {UserTE, OpIdx};
5513 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
5514 DecrUnsched(CD, /*IsControl=*/false);
5515 return;
5516 }
5517 }
5518 auto It = OperandsUses.find(I);
5519 assert(It != OperandsUses.end() && "Operand not found");
5520 if (It->second > 0) {
5521 --It->getSecond();
5522 assert(TotalOpCount > 0 && "No more operands to decrement");
5523 --TotalOpCount;
5524 if (ScheduleData *OpSD = getScheduleData(I))
5525 DecrUnsched(OpSD, /*IsControl=*/false);
5526 }
5527 };
5528
5529 for (ScheduleBundle *Bundle : Bundles) {
5530 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5531 break;
5532 // Need to search for the lane since the tree entry can be
5533 // reordered.
5534 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5535 find(Bundle->getTreeEntry()->Scalars, In));
5536 assert(Lane >= 0 && "Lane not set");
5537 if (isa<StoreInst>(In) &&
5538 !Bundle->getTreeEntry()->ReorderIndices.empty())
5539 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5540 assert(Lane < static_cast<int>(
5541 Bundle->getTreeEntry()->Scalars.size()) &&
5542 "Couldn't find extract lane");
5543
 5544 // Since the vectorization tree is built recursively, this
 5545 // assertion ensures that the tree entry has all operands set before
 5546 // reaching this code. A couple of exceptions known at the moment are
 5547 // extracts, where their second (immediate) operand is not added.
 5548 // Since immediates do not affect scheduler behavior, this is
 5549 // considered okay.
5550 assert(In &&
 5551 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
 5552 In->getNumOperands() ==
5553 Bundle->getTreeEntry()->getNumOperands() ||
5554 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5555 "Missed TreeEntry operands?");
5556
5557 for (unsigned OpIdx :
5558 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5559 if (auto *I = dyn_cast<Instruction>(
5560 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5561 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
5562 << "\n");
5563 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
5564 }
5565 }
5566 } else {
5567 // If BundleMember is a stand-alone instruction, no operand reordering
5568 // has taken place, so we directly access its operands.
5569 for (Use &U : BundleMember->getInst()->operands()) {
5570 if (auto *I = dyn_cast<Instruction>(U.get())) {
 5571 LLVM_DEBUG(dbgs()
 5572 << "SLP: check for readiness (def): " << *I << "\n");
5573 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5574 }
5575 }
5576 }
5577 // Handle the memory dependencies.
5578 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5579 if (!SD)
5580 return;
5581 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
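 // Visit each memory dependency at most once, even if it appears several
 // times in the dependency list.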
5582 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5583 if (!VisitedMemory.insert(MemoryDep).second)
5584 continue;
5585 // There are no more unscheduled dependencies after decrementing,
5586 // so we can put the dependent instruction into the ready list.
5587 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5588 << *MemoryDep << "\n");
5589 DecrUnsched(MemoryDep);
5590 }
5591 // Handle the control dependencies.
5592 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5593 for (ScheduleData *Dep : SD->getControlDependencies()) {
5594 if (!VisitedControl.insert(Dep).second)
5595 continue;
5596 // There are no more unscheduled dependencies after decrementing,
5597 // so we can put the dependent instruction into the ready list.
 5598 LLVM_DEBUG(dbgs()
 5599 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5600 DecrUnsched(Dep, /*IsControl=*/true);
5601 }
5602 };
5603 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5604 SD->setScheduled(/*Scheduled=*/true);
5605 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
 5606 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
 5607 SmallVector<ScheduleBundle *> Bundles;
 5608 Instruction *In = SD->getInst();
5609 if (R.isVectorized(In)) {
5610 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5611 for (TreeEntry *TE : Entries) {
5613 In->getNumOperands() != TE->getNumOperands())
5614 continue;
5615 auto &BundlePtr =
5616 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5617 BundlePtr->setTreeEntry(TE);
5618 BundlePtr->add(SD);
5619 Bundles.push_back(BundlePtr.get());
5620 }
5621 }
5622 ProcessBundleMember(SD, Bundles);
5623 } else {
5624 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5625 Bundle.setScheduled(/*Scheduled=*/true);
5626 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5627 auto AreAllBundlesScheduled =
5628 [&](const ScheduleEntity *SD,
5629 ArrayRef<ScheduleBundle *> SDBundles) {
 5630 if (isa<ScheduleCopyableData>(SD))
 5631 return true;
5632 return !SDBundles.empty() &&
5633 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5634 return SDBundle->isScheduled();
5635 });
5636 };
5637 for (ScheduleEntity *SD : Bundle.getBundle()) {
 5638 ArrayRef<ScheduleBundle *> SDBundles;
 5639 if (!isa<ScheduleCopyableData>(SD))
 5640 SDBundles = getScheduleBundles(SD->getInst());
5641 if (AreAllBundlesScheduled(SD, SDBundles)) {
5642 SD->setScheduled(/*Scheduled=*/true);
5643 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5644 : SDBundles);
5645 }
5646 }
5647 }
5648 }
5649
5650 /// Verify basic self consistency properties of the data structure.
5651 void verify() {
5652 if (!ScheduleStart)
5653 return;
5654
5655 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5656 ScheduleStart->comesBefore(ScheduleEnd) &&
5657 "Not a valid scheduling region?");
5658
5659 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5660 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5661 if (!Bundles.empty()) {
5662 for (ScheduleBundle *Bundle : Bundles) {
5663 assert(isInSchedulingRegion(*Bundle) &&
5664 "primary schedule data not in window?");
5665 Bundle->verify();
5666 }
5667 continue;
5668 }
5669 auto *SD = getScheduleData(I);
5670 if (!SD)
5671 continue;
5672 assert(isInSchedulingRegion(*SD) &&
5673 "primary schedule data not in window?");
5674 SD->verify();
5675 }
5676
5677 assert(all_of(ReadyInsts,
5678 [](const ScheduleEntity *Bundle) {
5679 return Bundle->isReady();
5680 }) &&
5681 "item in ready list not ready?");
5682 }
5683
5684 /// Put all instructions into the ReadyList which are ready for scheduling.
5685 template <typename ReadyListType>
5686 void initialFillReadyList(ReadyListType &ReadyList) {
5687 SmallPtrSet<ScheduleBundle *, 16> Visited;
5688 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5689 ScheduleData *SD = getScheduleData(I);
5690 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5691 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5692 !Bundles.empty()) {
5693 for (ScheduleBundle *Bundle : Bundles) {
5694 if (!Visited.insert(Bundle).second)
5695 continue;
5696 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5697 ReadyList.insert(Bundle);
5698 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5699 << *Bundle << "\n");
5700 }
5701 }
5702 continue;
5703 }
5704 ReadyList.insert(SD);
 5705 LLVM_DEBUG(dbgs()
 5706 << "SLP: initially in ready list: " << *SD << "\n");
5707 }
5708 }
5709 }
5710
5711 /// Build a bundle from the ScheduleData nodes corresponding to the
5712 /// scalar instruction for each lane.
5713 /// \param VL The list of scalar instructions.
5714 /// \param S The state of the instructions.
5715 /// \param EI The edge in the SLP graph or the user node/operand number.
5716 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5717 const InstructionsState &S, const EdgeInfo &EI);
5718
5719 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5720 /// cyclic dependencies. This is only a dry-run, no instructions are
5721 /// actually moved at this stage.
5722 /// \returns the scheduling bundle. The returned Optional value is not
5723 /// std::nullopt if \p VL is allowed to be scheduled.
5724 std::optional<ScheduleBundle *>
5725 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5726 const InstructionsState &S, const EdgeInfo &EI);
5727
5728 /// Allocates schedule data chunk.
5729 ScheduleData *allocateScheduleDataChunks();
5730
5731 /// Extends the scheduling region so that V is inside the region.
5732 /// \returns true if the region size is within the limit.
5733 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5734
5735 /// Initialize the ScheduleData structures for new instructions in the
5736 /// scheduling region.
5737 void initScheduleData(Instruction *FromI, Instruction *ToI,
5738 ScheduleData *PrevLoadStore,
5739 ScheduleData *NextLoadStore);
5740
5741 /// Updates the dependency information of a bundle and of all instructions/
5742 /// bundles which depend on the original bundle.
5743 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5744 BoUpSLP *SLP,
5745 ArrayRef<ScheduleData *> ControlDeps = {});
5746
 5747 /// Sets all instructions in the scheduling region to un-scheduled.
5748 void resetSchedule();
5749
5750 BasicBlock *BB;
5751
5752 /// Simple memory allocation for ScheduleData.
 5753 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 5754
5755 /// The size of a ScheduleData array in ScheduleDataChunks.
5756 int ChunkSize;
5757
5758 /// The allocator position in the current chunk, which is the last entry
5759 /// of ScheduleDataChunks.
5760 int ChunkPos;
5761
5762 /// Attaches ScheduleData to Instruction.
5763 /// Note that the mapping survives during all vectorization iterations, i.e.
5764 /// ScheduleData structures are recycled.
5765 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5766
5767 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5768 /// number) and the operand instruction, represented as copyable element.
5769 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5770 std::unique_ptr<ScheduleCopyableData>>
5771 ScheduleCopyableDataMap;
5772
5773 /// Represents mapping between instruction and all related
 5774 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5775 /// element). The SLP tree may contain several representations of the same
5776 /// instruction.
5777 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5778 ScheduleCopyableDataMapByInst;
5779
5780 /// Represents mapping between user value and operand number, the operand
5781 /// value and all related ScheduleCopyableData. The relation is 1:n, because
 5782 /// the same user may reference the same operand in different tree entries
 5783 /// and the operand may be modeled by different copyable data elements.
5784 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
 5785 SmallVector<ScheduleCopyableData *>>
 5786 ScheduleCopyableDataMapByInstUser;
5787
5788 /// Represents mapping between instruction and all related
5789 /// ScheduleCopyableData. It represents the mapping between the actual
5790 /// instruction and the last copyable data element in the chain. E.g., if
5791 /// the graph models the following instructions:
5792 /// %0 = non-add instruction ...
5793 /// ...
5794 /// %4 = add %3, 1
5795 /// %5 = add %4, 1
5796 /// %6 = insertelement poison, %0, 0
5797 /// %7 = insertelement %6, %5, 1
5798 /// And the graph is modeled as:
5799 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5800 /// -> [1, 0] -> [%1, 0]
5801 ///
5802 /// this map will map %0 only to the copyable element <1>, which is the last
5803 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5804 /// keep the map to <0>, not the %0.
5805 SmallDenseMap<const Instruction *,
5806 SmallSetVector<ScheduleCopyableData *, 4>>
5807 ScheduleCopyableDataMapByUsers;
5808
5809 /// Attaches ScheduleBundle to Instruction.
5810 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5811 ScheduledBundles;
5812 /// The list of ScheduleBundles.
5813 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5814
5815 /// The ready-list for scheduling (only used for the dry-run).
5816 SetVector<ScheduleEntity *> ReadyInsts;
5817
5818 /// The first instruction of the scheduling region.
5819 Instruction *ScheduleStart = nullptr;
5820
5821 /// The first instruction _after_ the scheduling region.
5822 Instruction *ScheduleEnd = nullptr;
5823
5824 /// The first memory accessing instruction in the scheduling region
5825 /// (can be null).
5826 ScheduleData *FirstLoadStoreInRegion = nullptr;
5827
5828 /// The last memory accessing instruction in the scheduling region
5829 /// (can be null).
5830 ScheduleData *LastLoadStoreInRegion = nullptr;
5831
5832 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5833 /// region? Used to optimize the dependence calculation for the
5834 /// common case where there isn't.
5835 bool RegionHasStackSave = false;
5836
5837 /// The current size of the scheduling region.
5838 int ScheduleRegionSize = 0;
5839
5840 /// The maximum size allowed for the scheduling region.
5841 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5842
5843 /// The ID of the scheduling region. For a new vectorization iteration this
5844 /// is incremented which "removes" all ScheduleData from the region.
5845 /// Make sure that the initial SchedulingRegionID is greater than the
5846 /// initial SchedulingRegionID in ScheduleData (which is 0).
5847 int SchedulingRegionID = 1;
5848 };
5849
5850 /// Attaches the BlockScheduling structures to basic blocks.
5851 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5852
5853 /// Performs the "real" scheduling. Done before vectorization is actually
5854 /// performed in a basic block.
5855 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5856
5857 /// List of users to ignore during scheduling and that don't need extracting.
5858 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5859
5860 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5861 /// sorted SmallVectors of unsigned.
5862 struct OrdersTypeDenseMapInfo {
5863 static OrdersType getEmptyKey() {
5864 OrdersType V;
5865 V.push_back(~1U);
5866 return V;
5867 }
5868
5869 static OrdersType getTombstoneKey() {
5870 OrdersType V;
5871 V.push_back(~2U);
5872 return V;
5873 }
5874
5875 static unsigned getHashValue(const OrdersType &V) {
5876 return static_cast<unsigned>(hash_combine_range(V));
5877 }
5878
5879 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5880 return LHS == RHS;
5881 }
5882 };
5883
5884 // Analysis and block reference.
5885 Function *F;
5886 ScalarEvolution *SE;
5887 TargetTransformInfo *TTI;
5888 TargetLibraryInfo *TLI;
5889 LoopInfo *LI;
5890 DominatorTree *DT;
5891 AssumptionCache *AC;
5892 DemandedBits *DB;
5893 const DataLayout *DL;
5894 OptimizationRemarkEmitter *ORE;
5895
5896 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5897 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5898
5899 /// Instruction builder to construct the vectorized tree.
5900 IRBuilder<TargetFolder> Builder;
5901
5902 /// A map of scalar integer values to the smallest bit width with which they
5903 /// can legally be represented. The values map to (width, signed) pairs,
5904 /// where "width" indicates the minimum bit width and "signed" is True if the
5905 /// value must be signed-extended, rather than zero-extended, back to its
5906 /// original width.
5907 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5908
5909 /// Final size of the reduced vector, if the current graph represents the
5910 /// input for the reduction and it was possible to narrow the size of the
5911 /// reduction.
5912 unsigned ReductionBitWidth = 0;
5913
5914 /// Canonical graph size before the transformations.
5915 unsigned BaseGraphSize = 1;
5916
5917 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5918 /// type sizes, used in the tree.
5919 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5920
 5921 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
5922 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
5923 DenseSet<unsigned> ExtraBitWidthNodes;
5924};
5925
5926} // end namespace slpvectorizer
5927
5928template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
 5929 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
 5930 using SecondInfo = DenseMapInfo<unsigned>;
 5931 static BoUpSLP::EdgeInfo getEmptyKey() {
 5932 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
5933 SecondInfo::getEmptyKey());
5934 }
5935
 5936 static BoUpSLP::EdgeInfo getTombstoneKey() {
 5937 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
5938 SecondInfo::getTombstoneKey());
5939 }
5940
5941 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
5942 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
5943 SecondInfo::getHashValue(Val.EdgeIdx));
5944 }
5945
5946 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
5947 const BoUpSLP::EdgeInfo &RHS) {
5948 return LHS == RHS;
5949 }
5950};
5951
5952template <> struct GraphTraits<BoUpSLP *> {
5953 using TreeEntry = BoUpSLP::TreeEntry;
5954
5955 /// NodeRef has to be a pointer per the GraphWriter.
 5956 using NodeRef = TreeEntry *;
 5957
5958 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
5959
5960 /// Add the VectorizableTree to the index iterator to be able to return
5961 /// TreeEntry pointers.
 5962 struct ChildIteratorType
 5963 : public iterator_adaptor_base<
5964 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
 5965 ContainerTy &VectorizableTree;
 5966
 5967 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
 5968 ContainerTy &VT)
 5969 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
 5970
5971 NodeRef operator*() { return I->UserTE; }
5972 };
5973
 5974 static NodeRef getEntryNode(BoUpSLP &R) {
 5975 return R.VectorizableTree[0].get();
5976 }
5977
 5978 static ChildIteratorType child_begin(NodeRef N) {
 5979 return {&N->UserTreeIndex, N->Container};
5980 }
5981
 5982 static ChildIteratorType child_end(NodeRef N) {
 5983 return {&N->UserTreeIndex + 1, N->Container};
5984 }
5985
5986 /// For the node iterator we just need to turn the TreeEntry iterator into a
5987 /// TreeEntry* iterator so that it dereferences to NodeRef.
5988 class nodes_iterator {
5989 using ItTy = ContainerTy::iterator;
5990 ItTy It;
5991
5992 public:
5993 nodes_iterator(const ItTy &It2) : It(It2) {}
5994 NodeRef operator*() { return It->get(); }
5995 nodes_iterator operator++() {
5996 ++It;
5997 return *this;
5998 }
5999 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6000 };
6001
6002 static nodes_iterator nodes_begin(BoUpSLP *R) {
6003 return nodes_iterator(R->VectorizableTree.begin());
6004 }
6005
6006 static nodes_iterator nodes_end(BoUpSLP *R) {
6007 return nodes_iterator(R->VectorizableTree.end());
6008 }
6009
6010 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6011};
6012
6013template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6014 using TreeEntry = BoUpSLP::TreeEntry;
6015
6016 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6017
6018 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6019 std::string Str;
6020 raw_string_ostream OS(Str);
6021 OS << Entry->Idx << ".\n";
6022 if (isSplat(Entry->Scalars))
6023 OS << "<splat> ";
6024 for (auto *V : Entry->Scalars) {
6025 OS << *V;
6026 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6027 return EU.Scalar == V;
6028 }))
6029 OS << " <extract>";
6030 OS << "\n";
6031 }
6032 return Str;
6033 }
6034
6035 static std::string getNodeAttributes(const TreeEntry *Entry,
6036 const BoUpSLP *) {
6037 if (Entry->isGather())
6038 return "color=red";
6039 if (Entry->State == TreeEntry::ScatterVectorize ||
6040 Entry->State == TreeEntry::StridedVectorize ||
6041 Entry->State == TreeEntry::CompressVectorize)
6042 return "color=blue";
6043 return "";
6044 }
6045};
6046
6047} // end namespace llvm
6048
6049 BoUpSLP::~BoUpSLP() {
6050 SmallVector<WeakTrackingVH> DeadInsts;
6051 for (auto *I : DeletedInstructions) {
6052 if (!I->getParent()) {
6053 // Temporarily insert instructions back so that they can be erased from
6054 // their parent and from memory later.
6055 if (isa<PHINode>(I))
6056 // Phi nodes must be the very first instructions in the block.
6057 I->insertBefore(F->getEntryBlock(),
6058 F->getEntryBlock().getFirstNonPHIIt());
6059 else
6060 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6061 continue;
6062 }
6063 for (Use &U : I->operands()) {
6064 auto *Op = dyn_cast<Instruction>(U.get());
6065 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6066 wouldInstructionBeTriviallyDead(Op, TLI))
6067 DeadInsts.emplace_back(Op);
6068 }
6069 I->dropAllReferences();
6070 }
6071 for (auto *I : DeletedInstructions) {
6072 assert(I->use_empty() &&
6073 "trying to erase instruction with users.");
6074 I->eraseFromParent();
6075 }
6076
6077 // Cleanup any dead scalar code feeding the vectorized instructions.
6078 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6079
6080#ifdef EXPENSIVE_CHECKS
6081 // If we could guarantee that this call is not extremely slow, we could
6082 // remove the ifdef limitation (see PR47712).
6083 assert(!verifyFunction(*F, &dbgs()));
6084#endif
6085}
6086
6087 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6088 /// contains the original mask for the scalars reused in the node. The procedure
6089 /// transforms this mask in accordance with the given \p Mask.
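/// For example, Reuses = {3, 2, 1, 0} combined with Mask = {1, 0, 3, 2}
/// yields Reuses = {2, 3, 0, 1}: each original element Reuses[I] is moved to
/// position Mask[I].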
6090 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6091 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6092 "Expected non-empty mask.");
6093 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6094 Prev.swap(Reuses);
6095 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6096 if (Mask[I] != PoisonMaskElem)
6097 Reuses[Mask[I]] = Prev[I];
6098}
6099
6100 /// Reorders the given \p Order according to the given \p Mask. \p Order is
6101 /// the original order of the scalars. The procedure transforms the provided
6102 /// order in accordance with the given \p Mask. If the resulting \p Order is
6103 /// just an identity order, \p Order is cleared.
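/// For example, an empty (identity) \p Order combined with Mask = {1, 0, 3, 2}
/// produces Order = {1, 0, 3, 2}, while a mask that undoes the existing order
/// leaves an identity and \p Order is cleared.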
6104 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6105 bool BottomOrder = false) {
6106 assert(!Mask.empty() && "Expected non-empty mask.");
6107 unsigned Sz = Mask.size();
6108 if (BottomOrder) {
6109 SmallVector<unsigned> PrevOrder;
6110 if (Order.empty()) {
6111 PrevOrder.resize(Sz);
6112 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6113 } else {
6114 PrevOrder.swap(Order);
6115 }
6116 Order.assign(Sz, Sz);
6117 for (unsigned I = 0; I < Sz; ++I)
6118 if (Mask[I] != PoisonMaskElem)
6119 Order[I] = PrevOrder[Mask[I]];
6120 if (all_of(enumerate(Order), [&](const auto &Data) {
6121 return Data.value() == Sz || Data.index() == Data.value();
6122 })) {
6123 Order.clear();
6124 return;
6125 }
6126 fixupOrderingIndices(Order);
6127 return;
6128 }
6129 SmallVector<int> MaskOrder;
6130 if (Order.empty()) {
6131 MaskOrder.resize(Sz);
6132 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6133 } else {
6134 inversePermutation(Order, MaskOrder);
6135 }
6136 reorderReuses(MaskOrder, Mask);
6137 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6138 Order.clear();
6139 return;
6140 }
6141 Order.assign(Sz, Sz);
6142 for (unsigned I = 0; I < Sz; ++I)
6143 if (MaskOrder[I] != PoisonMaskElem)
6144 Order[MaskOrder[I]] = I;
6145 fixupOrderingIndices(Order);
6146}
6147
6148std::optional<BoUpSLP::OrdersType>
6149BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6150 bool TopToBottom, bool IgnoreReorder) {
6151 assert(TE.isGather() && "Expected gather node only.");
6152 // Try to find subvector extract/insert patterns and reorder only such
6153 // patterns.
6154 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6155 Type *ScalarTy = GatheredScalars.front()->getType();
6156 size_t NumScalars = GatheredScalars.size();
6157 if (!isValidElementType(ScalarTy))
6158 return std::nullopt;
6159 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6160 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6161 SmallVector<int> ExtractMask;
6162 SmallVector<int> Mask;
6163 SmallVector<SmallVector<const TreeEntry *>> Entries;
6164 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6165 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6166 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6167 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6168 /*ForOrder=*/true);
6169 // No shuffled operands - ignore.
6170 if (GatherShuffles.empty() && ExtractShuffles.empty())
6171 return std::nullopt;
6172 OrdersType CurrentOrder(NumScalars, NumScalars);
6173 if (GatherShuffles.size() == 1 &&
6174 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6175 Entries.front().front()->isSame(TE.Scalars)) {
6176 // If the node is fully matched while the whole tree is being rotated
6177 // (top-to-bottom), there is no need to consider the matching order.
6178 if (TopToBottom)
6179 return std::nullopt;
6180 // No need to keep the order for the same user node.
6181 if (Entries.front().front()->UserTreeIndex.UserTE ==
6182 TE.UserTreeIndex.UserTE)
6183 return std::nullopt;
6184 // No need to keep the order for the matched root node, if it can be freely
6185 // reordered.
6186 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6187 return std::nullopt;
6188 // If only 2 elements are shuffled and the matching node has reversed reuses,
6189 // there is no need to count the order - both orders work equally well.
6190 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6191 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6192 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6193 [](const auto &P) {
6194 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6195 }))
6196 return std::nullopt;
6197
6198 // Perfect match in the graph, will reuse the previously vectorized
6199 // node. Cost is 0.
6200 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6201 return CurrentOrder;
6202 }
6203 auto IsSplatMask = [](ArrayRef<int> Mask) {
6204 int SingleElt = PoisonMaskElem;
6205 return all_of(Mask, [&](int I) {
6206 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6207 SingleElt = I;
6208 return I == PoisonMaskElem || I == SingleElt;
6209 });
6210 };
6211 // Exclusive broadcast mask - ignore.
6212 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6213 (Entries.size() != 1 ||
6214 Entries.front().front()->ReorderIndices.empty())) ||
6215 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6216 return std::nullopt;
6217 SmallBitVector ShuffledSubMasks(NumParts);
6218 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6219 ArrayRef<int> Mask, int PartSz, int NumParts,
6220 function_ref<unsigned(unsigned)> GetVF) {
6221 for (int I : seq<int>(0, NumParts)) {
6222 if (ShuffledSubMasks.test(I))
6223 continue;
6224 const int VF = GetVF(I);
6225 if (VF == 0)
6226 continue;
6227 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6228 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6229 // Shuffle of at least 2 vectors - ignore.
6230 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6231 llvm::fill(Slice, NumScalars);
6232 ShuffledSubMasks.set(I);
6233 continue;
6234 }
6235 // Try to include as many elements from the mask as possible.
6236 int FirstMin = INT_MAX;
6237 int SecondVecFound = false;
6238 for (int K : seq<int>(Limit)) {
6239 int Idx = Mask[I * PartSz + K];
6240 if (Idx == PoisonMaskElem) {
6241 Value *V = GatheredScalars[I * PartSz + K];
6242 if (isConstant(V) && !isa<PoisonValue>(V)) {
6243 SecondVecFound = true;
6244 break;
6245 }
6246 continue;
6247 }
6248 if (Idx < VF) {
6249 if (FirstMin > Idx)
6250 FirstMin = Idx;
6251 } else {
6252 SecondVecFound = true;
6253 break;
6254 }
6255 }
6256 FirstMin = (FirstMin / PartSz) * PartSz;
6257 // Shuffle of at least 2 vectors - ignore.
6258 if (SecondVecFound) {
6259 llvm::fill(Slice, NumScalars);
6260 ShuffledSubMasks.set(I);
6261 continue;
6262 }
6263 for (int K : seq<int>(Limit)) {
6264 int Idx = Mask[I * PartSz + K];
6265 if (Idx == PoisonMaskElem)
6266 continue;
6267 Idx -= FirstMin;
6268 if (Idx >= PartSz) {
6269 SecondVecFound = true;
6270 break;
6271 }
6272 if (CurrentOrder[I * PartSz + Idx] >
6273 static_cast<unsigned>(I * PartSz + K) &&
6274 CurrentOrder[I * PartSz + Idx] !=
6275 static_cast<unsigned>(I * PartSz + Idx))
6276 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6277 }
6278 // Shuffle of at least 2 vectors - ignore.
6279 if (SecondVecFound) {
6280 llvm::fill(Slice, NumScalars);
6281 ShuffledSubMasks.set(I);
6282 continue;
6283 }
6284 }
6285 };
6286 int PartSz = getPartNumElems(NumScalars, NumParts);
6287 if (!ExtractShuffles.empty())
6288 TransformMaskToOrder(
6289 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6290 if (!ExtractShuffles[I])
6291 return 0U;
6292 unsigned VF = 0;
6293 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6294 for (unsigned Idx : seq<unsigned>(Sz)) {
6295 int K = I * PartSz + Idx;
6296 if (ExtractMask[K] == PoisonMaskElem)
6297 continue;
6298 if (!TE.ReuseShuffleIndices.empty())
6299 K = TE.ReuseShuffleIndices[K];
6300 if (K == PoisonMaskElem)
6301 continue;
6302 if (!TE.ReorderIndices.empty())
6303 K = std::distance(TE.ReorderIndices.begin(),
6304 find(TE.ReorderIndices, K));
6305 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6306 if (!EI)
6307 continue;
6308 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6309 ->getElementCount()
6310 .getKnownMinValue());
6311 }
6312 return VF;
6313 });
6314 // Check special corner case - single shuffle of the same entry.
6315 if (GatherShuffles.size() == 1 && NumParts != 1) {
6316 if (ShuffledSubMasks.any())
6317 return std::nullopt;
6318 PartSz = NumScalars;
6319 NumParts = 1;
6320 }
6321 if (!Entries.empty())
6322 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6323 if (!GatherShuffles[I])
6324 return 0U;
6325 return std::max(Entries[I].front()->getVectorFactor(),
6326 Entries[I].back()->getVectorFactor());
6327 });
6328 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6329 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6330 return std::nullopt;
6331 return std::move(CurrentOrder);
6332}
6333
6334static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6335 const TargetLibraryInfo &TLI,
6336 bool CompareOpcodes = true) {
6337 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6338 getUnderlyingObject(Ptr2, RecursionMaxDepth))
6339 return false;
6340 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6341 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6342 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6343 (!GEP2 || GEP2->getNumOperands() == 2) &&
6344 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6345 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6346 !CompareOpcodes ||
6347 (GEP1 && GEP2 &&
6348 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6349}
6350
6351/// Calculates minimal alignment as a common alignment.
6352template <typename T>
6353 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6354 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6355 for (Value *V : VL)
6356 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6357 return CommonAlignment;
6358}
6359
6360/// Check if \p Order represents reverse order.
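/// For example, Order = {3, 2, 1, 0} is a reverse order; elements equal to
/// Order.size() are treated as "any position" and do not break the property.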
6361 static bool isReverseOrder(ArrayRef<unsigned> Order) {
6362 assert(!Order.empty() &&
6363 "Order is empty. Please check it before using isReverseOrder.");
6364 unsigned Sz = Order.size();
6365 return all_of(enumerate(Order), [&](const auto &Pair) {
6366 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6367 });
6368}
6369
6370 /// Checks if the provided list of pointers \p PointerOps represents the
6371 /// strided pointers for type ElemTy. If they are not, nullptr is returned.
6372 /// Otherwise, the SCEV of the stride value is returned.
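/// For example, four i32 loads from p, p + 4 * n, p + 8 * n and p + 12 * n
/// (with n known only at runtime) yield the SCEV of n as the stride, and
/// SortedIndices stays empty because the accesses are already in consecutive
/// order.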
6373static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6374 const DataLayout &DL, ScalarEvolution &SE,
6375 SmallVectorImpl<unsigned> &SortedIndices) {
6376 SmallVector<const SCEV *> SCEVs;
6377 const SCEV *PtrSCEVLowest = nullptr;
6378 const SCEV *PtrSCEVHighest = nullptr;
6379 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6380 // addresses).
6381 for (Value *Ptr : PointerOps) {
6382 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6383 if (!PtrSCEV)
6384 return nullptr;
6385 SCEVs.push_back(PtrSCEV);
6386 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6387 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6388 continue;
6389 }
6390 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6391 if (isa<SCEVCouldNotCompute>(Diff))
6392 return nullptr;
6393 if (Diff->isNonConstantNegative()) {
6394 PtrSCEVLowest = PtrSCEV;
6395 continue;
6396 }
6397 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6398 if (isa<SCEVCouldNotCompute>(Diff1))
6399 return nullptr;
6400 if (Diff1->isNonConstantNegative()) {
6401 PtrSCEVHighest = PtrSCEV;
6402 continue;
6403 }
6404 }
6405 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6406 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6407 if (isa<SCEVCouldNotCompute>(Dist))
6408 return nullptr;
6409 int Size = DL.getTypeStoreSize(ElemTy);
6410 auto TryGetStride = [&](const SCEV *Dist,
6411 const SCEV *Multiplier) -> const SCEV * {
6412 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6413 if (M->getOperand(0) == Multiplier)
6414 return M->getOperand(1);
6415 if (M->getOperand(1) == Multiplier)
6416 return M->getOperand(0);
6417 return nullptr;
6418 }
6419 if (Multiplier == Dist)
6420 return SE.getConstant(Dist->getType(), 1);
6421 return SE.getUDivExactExpr(Dist, Multiplier);
6422 };
6423 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6424 const SCEV *Stride = nullptr;
6425 if (Size != 1 || SCEVs.size() > 2) {
6426 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6427 Stride = TryGetStride(Dist, Sz);
6428 if (!Stride)
6429 return nullptr;
6430 }
6431 if (!Stride || isa<SCEVConstant>(Stride))
6432 return nullptr;
6433 // Iterate through all pointers and check if all distances are
6434 // unique multiples of Stride.
6435 using DistOrdPair = std::pair<int64_t, int>;
6436 auto Compare = llvm::less_first();
6437 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6438 int Cnt = 0;
6439 bool IsConsecutive = true;
6440 for (const SCEV *PtrSCEV : SCEVs) {
6441 unsigned Dist = 0;
6442 if (PtrSCEV != PtrSCEVLowest) {
6443 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6444 const SCEV *Coeff = TryGetStride(Diff, Stride);
6445 if (!Coeff)
6446 return nullptr;
6447 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6448 if (!SC || isa<SCEVCouldNotCompute>(SC))
6449 return nullptr;
6450 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6451 SE.getMulExpr(Stride, SC)))
6452 ->isZero())
6453 return nullptr;
6454 Dist = SC->getAPInt().getZExtValue();
6455 }
6456 // If the strides are not the same or repeated, we can't vectorize.
6457 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6458 return nullptr;
6459 auto Res = Offsets.emplace(Dist, Cnt);
6460 if (!Res.second)
6461 return nullptr;
6462 // Consecutive order if the inserted element is the last one.
6463 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6464 ++Cnt;
6465 }
6466 if (Offsets.size() != SCEVs.size())
6467 return nullptr;
6468 SortedIndices.clear();
6469 if (!IsConsecutive) {
6470 // Fill SortedIndices array only if it is non-consecutive.
6471 SortedIndices.resize(PointerOps.size());
6472 Cnt = 0;
6473 for (const std::pair<int64_t, int> &Pair : Offsets) {
6474 SortedIndices[Cnt] = Pair.second;
6475 ++Cnt;
6476 }
6477 }
6478 return Stride;
6479}
6480
6481static std::pair<InstructionCost, InstructionCost>
6482getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6483 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6484 Type *ScalarTy, VectorType *VecTy);
6485
6486/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6487 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6488/// subvector pattern.
6489 static InstructionCost
6490 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6491 VectorType *Tp, ArrayRef<int> Mask = {},
6492 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6493 int Index = 0, VectorType *SubTp = nullptr,
6494 ArrayRef<const Value *> Args = {}) {
6495 VectorType *DstTy = Tp;
6496 if (!Mask.empty())
6497 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6498
6499 if (Kind != TTI::SK_PermuteTwoSrc)
6500 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6501 Args);
6502 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6503 int NumSubElts;
6504 if (ShuffleVectorInst::isInsertSubvectorMask(
6505 Mask, NumSrcElts, NumSubElts, Index)) {
6506 if (Index + NumSubElts > NumSrcElts &&
6507 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6508 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6509 TTI::TCK_RecipThroughput, Index, Tp);
6510 }
6511 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6512 Args);
6513}
6514
6515/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6516/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6517/// instead of a scalar.
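/// For example, with ScalarTy == <2 x i32> and four demanded elements, the
/// overhead is modeled as four <2 x i32> subvector inserts/extracts at offsets
/// 0, 2, 4 and 6 of the wide vector rather than as scalar
/// insertelement/extractelement operations.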
6518 static InstructionCost
6519 getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6520 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6521 bool Extract, TTI::TargetCostKind CostKind,
6522 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6523 assert(!isa<ScalableVectorType>(Ty) &&
6524 "ScalableVectorType is not supported.");
6525 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6526 getNumElements(Ty) &&
6527 "Incorrect usage.");
6528 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6529 assert(SLPReVec && "Only supported by REVEC.");
6530 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6531 // of CreateInsertElement.
6532 unsigned ScalarTyNumElements = VecTy->getNumElements();
6533 InstructionCost Cost = 0;
6534 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6535 if (!DemandedElts[I])
6536 continue;
6537 if (Insert)
6538 Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
6539 I * ScalarTyNumElements, VecTy);
6540 if (Extract)
6541 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
6542 I * ScalarTyNumElements, VecTy);
6543 }
6543 }
6544 return Cost;
6545 }
6546 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6547 CostKind, ForPoisonSrc, VL);
6548}
6549
6550/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6551/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6552 static InstructionCost getVectorInstrCost(
6553 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6554 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6555 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6556 if (Opcode == Instruction::ExtractElement) {
6557 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6558 assert(SLPReVec && "Only supported by REVEC.");
6559 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6560 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
6561 cast<VectorType>(Val), {}, CostKind,
6562 Index * VecTy->getNumElements(), VecTy);
6563 }
6564 }
6565 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6566 ScalarUserAndIdx);
6567}
6568
6569/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6570/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6571 static InstructionCost getExtractWithExtendCost(
6572 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6573 VectorType *VecTy, unsigned Index,
6574 TTI::TargetCostKind CostKind) {
6575 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6576 assert(SLPReVec && "Only supported by REVEC.");
6577 auto *SubTp =
6578 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6579 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
6580 Index * ScalarTy->getNumElements(), SubTp) +
6581 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6582 CostKind);
6583 }
6584 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6585}
6586
6587 /// Creates a subvector insert. Generates the shuffle using \p Generator or,
6588 /// if no generator is provided, a default shuffle.
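/// For example, inserting a 2-element subvector \p V into a 4-element \p Vec
/// at index 2 produces the two-source shuffle mask {0, 1, 4, 5} once \p V has
/// been resized to the width of \p Vec.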
6589 static Value *createInsertVector(
6590 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6591 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6592 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6593 return Vec;
6594 const unsigned SubVecVF = getNumElements(V->getType());
6595 // Create shuffle, insertvector requires that index is multiple of
6596 // the subvector length.
6597 const unsigned VecVF = getNumElements(Vec->getType());
6598 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6599 if (isa<PoisonValue>(Vec)) {
6600 auto *Begin = std::next(Mask.begin(), Index);
6601 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6602 Vec = Builder.CreateShuffleVector(V, Mask);
6603 return Vec;
6604 }
6605 std::iota(Mask.begin(), Mask.end(), 0);
6606 std::iota(std::next(Mask.begin(), Index),
6607 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6608 if (Generator)
6609 return Generator(Vec, V, Mask);
6610 // 1. Resize V to the size of Vec.
6611 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6612 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6613 V = Builder.CreateShuffleVector(V, ResizeMask);
6614 // 2. Insert V into Vec.
6615 return Builder.CreateShuffleVector(Vec, V, Mask);
6616}
6617
6618 /// Generates a subvector extract using a default shuffle.
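/// For example, extracting 2 elements starting at index 2 from a wider vector
/// uses the shuffle mask {2, 3}.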
6619 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6620 unsigned SubVecVF, unsigned Index) {
6621 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6622 std::iota(Mask.begin(), Mask.end(), Index);
6623 return Builder.CreateShuffleVector(Vec, Mask);
6624}
6625
6626/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6627/// with \p Order.
6628 /// \return true if the mask represents a strided access, false otherwise.
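/// For example, pointers at element offsets {0, 2, 4, 6} from the first
/// pointer produce CompressMask = {0, 2, 4, 6} and are recognized as a strided
/// access with stride 2, while offsets {0, 1, 3, 6} produce
/// CompressMask = {0, 1, 3, 6} and are not strided.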
6629 static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6630 ArrayRef<unsigned> Order, Type *ScalarTy,
6631 const DataLayout &DL, ScalarEvolution &SE,
6632 SmallVectorImpl<int> &CompressMask) {
6633 const unsigned Sz = PointerOps.size();
6634 CompressMask.assign(Sz, PoisonMaskElem);
6635 // The first element is always set.
6636 CompressMask[0] = 0;
6637 // Check if the mask represents strided access.
6638 std::optional<unsigned> Stride = 0;
6639 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6640 for (unsigned I : seq<unsigned>(1, Sz)) {
6641 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6642 std::optional<int64_t> OptPos =
6643 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6644 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6645 return false;
6646 unsigned Pos = static_cast<unsigned>(*OptPos);
6647 CompressMask[I] = Pos;
6648 if (!Stride)
6649 continue;
6650 if (*Stride == 0) {
6651 *Stride = Pos;
6652 continue;
6653 }
6654 if (Pos != *Stride * I)
6655 Stride.reset();
6656 }
6657 return Stride.has_value();
6658}
6659
6660/// Checks if the \p VL can be transformed to a (masked)load + compress or
6661/// (masked) interleaved load.
6662 static bool isMaskedLoadCompress(
6663 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6664 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6665 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6666 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6667 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6668 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6669 VectorType *&LoadVecTy) {
6670 InterleaveFactor = 0;
6671 Type *ScalarTy = VL.front()->getType();
6672 const size_t Sz = VL.size();
6673 auto *VecTy = getWidenedType(ScalarTy, Sz);
6674 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6675 SmallVector<int> Mask;
6676 if (!Order.empty())
6677 inversePermutation(Order, Mask);
6678 // Check external uses.
6679 for (const auto [I, V] : enumerate(VL)) {
6680 if (AreAllUsersVectorized(V))
6681 continue;
6682 InstructionCost ExtractCost =
6683 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6684 Mask.empty() ? I : Mask[I]);
6685 InstructionCost ScalarCost =
6686 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6687 if (ExtractCost <= ScalarCost)
6688 return false;
6689 }
6690 Value *Ptr0;
6691 Value *PtrN;
6692 if (Order.empty()) {
6693 Ptr0 = PointerOps.front();
6694 PtrN = PointerOps.back();
6695 } else {
6696 Ptr0 = PointerOps[Order.front()];
6697 PtrN = PointerOps[Order.back()];
6698 }
6699 std::optional<int64_t> Diff =
6700 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6701 if (!Diff)
6702 return false;
6703 const size_t MaxRegSize =
6704 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6705 .getFixedValue();
6706 // Check for very large distances between elements.
6707 if (*Diff / Sz >= MaxRegSize / 8)
6708 return false;
6709 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6710 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6711 Align CommonAlignment = LI->getAlign();
6712 IsMasked = !isSafeToLoadUnconditionally(
6713 Ptr0, LoadVecTy, CommonAlignment, DL,
6714 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6715 &TLI);
6716 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6717 LI->getPointerAddressSpace()))
6718 return false;
6719 // TODO: perform the analysis of each scalar load for better
6720 // safe-load-unconditionally analysis.
6721 bool IsStrided =
6722 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6723 assert(CompressMask.size() >= 2 && "At least two elements are required");
6724 SmallVector<Value *> OrderedPointerOps(PointerOps);
6725 if (!Order.empty())
6726 reorderScalars(OrderedPointerOps, Mask);
6727 auto [ScalarGEPCost, VectorGEPCost] =
6728 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6729 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6730 // The cost of scalar loads.
6731 InstructionCost ScalarLoadsCost =
6732 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6733 [&](InstructionCost C, Value *V) {
6734 return C + TTI.getInstructionCost(cast<Instruction>(V),
6735 CostKind);
6736 }) +
6737 ScalarGEPCost;
6738 APInt DemandedElts = APInt::getAllOnes(Sz);
6739 InstructionCost GatherCost =
6740 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6741 /*Insert=*/true,
6742 /*Extract=*/false, CostKind) +
6743 ScalarLoadsCost;
6744 InstructionCost LoadCost = 0;
6745 if (IsMasked) {
6746 LoadCost =
6747 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6748 LI->getPointerAddressSpace(), CostKind);
6749 } else {
6750 LoadCost =
6751 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6752 LI->getPointerAddressSpace(), CostKind);
6753 }
6754 if (IsStrided && !IsMasked && Order.empty()) {
6755 // Check for potential segmented(interleaved) loads.
6756 VectorType *AlignedLoadVecTy = getWidenedType(
6757 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6758 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6759 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6760 &TLI))
6761 AlignedLoadVecTy = LoadVecTy;
6762 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6763 CommonAlignment,
6764 LI->getPointerAddressSpace())) {
6765 InstructionCost InterleavedCost =
6766 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6767 Instruction::Load, AlignedLoadVecTy,
6768 CompressMask[1], {}, CommonAlignment,
6769 LI->getPointerAddressSpace(), CostKind, IsMasked);
6770 if (InterleavedCost < GatherCost) {
6771 InterleaveFactor = CompressMask[1];
6772 LoadVecTy = AlignedLoadVecTy;
6773 return true;
6774 }
6775 }
6776 }
6777 InstructionCost CompressCost = ::getShuffleCost(
6778 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6779 if (!Order.empty()) {
6780 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6781 for (unsigned I : seq<unsigned>(Sz)) {
6782 NewMask[I] = CompressMask[Mask[I]];
6783 }
6784 CompressMask.swap(NewMask);
6785 }
6786 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6787 return TotalVecCost < GatherCost;
6788}
6789
6790/// Checks if the \p VL can be transformed to a (masked)load + compress or
6791/// (masked) interleaved load.
6792static bool
6793 isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6794 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6795 const DataLayout &DL, ScalarEvolution &SE,
6796 AssumptionCache &AC, const DominatorTree &DT,
6797 const TargetLibraryInfo &TLI,
6798 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6799 bool IsMasked;
6800 unsigned InterleaveFactor;
6801 SmallVector<int> CompressMask;
6802 VectorType *LoadVecTy;
6803 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6804 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6805 CompressMask, LoadVecTy);
6806}
6807
6808/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6809/// PointerOps:
6810/// 1. Target with strided load support is detected.
6811/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6812/// potential stride <= MaxProfitableLoadStride and the potential stride is
6813/// power-of-2 (to avoid perf regressions for the very small number of loads)
6814/// and max distance > number of loads, or potential stride is -1.
6815 /// 3. The loads are ordered, or the number of unordered loads is <=
6816 /// MaxProfitableUnorderedLoads, or the loads are in reversed order (this
6817 /// check avoids extra costs for very expensive shuffles).
6818 /// 4. Any pointer operand is an instruction with users outside of the
6819 /// current graph (for masked gathers, extra extractelement instructions
6820 /// might be required).
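/// For example, four reversed consecutive loads (Diff == -3) satisfy
/// condition 2 and produce a stride of -1.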
6821 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6822 ArrayRef<unsigned> Order,
6823 const TargetTransformInfo &TTI,
6824 const DataLayout &DL, ScalarEvolution &SE,
6825 const bool IsAnyPointerUsedOutGraph,
6826 const int64_t Diff,
6827 StridedPtrInfo &SPtrInfo) const {
6828 const size_t Sz = VL.size();
6829 const uint64_t AbsoluteDiff = std::abs(Diff);
6830 Type *ScalarTy = VL.front()->getType();
6831 auto *VecTy = getWidenedType(ScalarTy, Sz);
6832 if (IsAnyPointerUsedOutGraph ||
6833 (AbsoluteDiff > Sz &&
6834 (Sz > MinProfitableStridedLoads ||
6835 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6836 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6837 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6838 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6839 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6840 return false;
6841 Align Alignment =
6842 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
6843 ->getAlign();
6844 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6845 return false;
6846 Value *Ptr0;
6847 Value *PtrN;
6848 if (Order.empty()) {
6849 Ptr0 = PointerOps.front();
6850 PtrN = PointerOps.back();
6851 } else {
6852 Ptr0 = PointerOps[Order.front()];
6853 PtrN = PointerOps[Order.back()];
6854 }
6855 // Iterate through all pointers and check if all distances are
6856 // unique multiples of the stride.
6857 SmallSet<int64_t, 4> Dists;
6858 for (Value *Ptr : PointerOps) {
6859 int64_t Dist = 0;
6860 if (Ptr == PtrN)
6861 Dist = Diff;
6862 else if (Ptr != Ptr0)
6863 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6864 // If the strides are not the same or repeated, we can't
6865 // vectorize.
6866 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6867 break;
6868 }
6869 if (Dists.size() == Sz) {
6870 Type *StrideTy = DL.getIndexType(Ptr0->getType());
6871 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6872 SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
6873 return true;
6874 }
6875 }
6876 return false;
6877}
6878
6879 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
6880 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
6881 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
6882 unsigned *BestVF, bool TryRecursiveCheck) const {
6883 // Check that a vectorized load would load the same memory as a scalar
6884 // load. For example, we don't want to vectorize loads that are smaller
6885 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6886 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6887 // from such a struct, we read/write packed bits disagreeing with the
6888 // unvectorized version.
6889 if (BestVF)
6890 *BestVF = 0;
6891 if (areKnownNonVectorizableLoads(VL))
6892 return LoadsState::Gather;
6893 Type *ScalarTy = VL0->getType();
6894
6895 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6896 return LoadsState::Gather;
6897
6898 // Make sure all loads in the bundle are simple - we can't vectorize
6899 // atomic or volatile loads.
6900 PointerOps.clear();
6901 const size_t Sz = VL.size();
6902 PointerOps.resize(Sz);
6903 auto *POIter = PointerOps.begin();
6904 for (Value *V : VL) {
6905 auto *L = dyn_cast<LoadInst>(V);
6906 if (!L || !L->isSimple())
6907 return LoadsState::Gather;
6908 *POIter = L->getPointerOperand();
6909 ++POIter;
6910 }
6911
6912 Order.clear();
6913 // Check the order of pointer operands or that all pointers are the same.
6914 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6915
6916 auto *VecTy = getWidenedType(ScalarTy, Sz);
6917 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6918 if (!IsSorted) {
6919 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
6920 if (const SCEV *Stride =
6921 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
6922 Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
6923 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
6924 SPtrInfo.StrideSCEV = Stride;
6925 return LoadsState::StridedVectorize;
6926 }
6927 }
6928
6929 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6930 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6931 return LoadsState::Gather;
6932
6933 if (!all_of(PointerOps, [&](Value *P) {
6934 return arePointersCompatible(P, PointerOps.front(), *TLI);
6935 }))
6936 return LoadsState::Gather;
6937
6938 } else {
6939 Value *Ptr0;
6940 Value *PtrN;
6941 if (Order.empty()) {
6942 Ptr0 = PointerOps.front();
6943 PtrN = PointerOps.back();
6944 } else {
6945 Ptr0 = PointerOps[Order.front()];
6946 PtrN = PointerOps[Order.back()];
6947 }
6948 std::optional<int64_t> Diff =
6949 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6950 // Check that the sorted loads are consecutive.
6951 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6952 return LoadsState::Vectorize;
6953 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
6954 *TLI, [&](Value *V) {
6955 return areAllUsersVectorized(
6956 cast<Instruction>(V), UserIgnoreList);
6957 }))
6958 return LoadsState::CompressVectorize;
6959 // Simple check if not a strided access - clear order.
6960 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
6961 // Try to generate strided load node.
6962 auto IsAnyPointerUsedOutGraph =
6963 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
6964 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6965 return !isVectorized(U) && !MustGather.contains(U);
6966 });
6967 });
6968 if (IsPossibleStrided &&
6969 isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
6970 IsAnyPointerUsedOutGraph, *Diff, SPtrInfo))
6971 return LoadsState::StridedVectorize;
6972 }
6973 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6974 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6975 return LoadsState::Gather;
6976 // Compare the cost of loads + shuffles with the cost of strided/masked
6977 // gather loads. Returns true if the vectorized-loads-plus-shuffles
6978 // representation is better than just a gather.
6979 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6980 unsigned *BestVF,
6981 bool ProfitableGatherPointers) {
6982 if (BestVF)
6983 *BestVF = 0;
6984 // Compare masked gather cost and loads + insert subvector costs.
6985 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6986 auto [ScalarGEPCost, VectorGEPCost] =
6987 getGEPCosts(TTI, PointerOps, PointerOps.front(),
6988 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6989 // Estimate the cost of masked gather GEP. If not a splat, roughly
6990 // estimate as a buildvector, otherwise estimate as splat.
6991 APInt DemandedElts = APInt::getAllOnes(Sz);
6992 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6993 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
6994 if (static_cast<unsigned>(count_if(
6995 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6996 any_of(PointerOps, [&](Value *V) {
6997 return getUnderlyingObject(V) !=
6998 getUnderlyingObject(PointerOps.front());
6999 }))
7000 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7001 DemandedElts, /*Insert=*/true,
7002 /*Extract=*/false, CostKind);
7003 else
7004 VectorGEPCost +=
7005 getScalarizationOverhead(
7006 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7007 /*Insert=*/true, /*Extract=*/false, CostKind) +
7008 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7009 // The cost of scalar loads.
7010 InstructionCost ScalarLoadsCost =
7011 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7012 [&](InstructionCost C, Value *V) {
7013 return C + TTI.getInstructionCost(
7014 cast<Instruction>(V), CostKind);
7015 }) +
7016 ScalarGEPCost;
7017 // The cost of masked gather.
7018 InstructionCost MaskedGatherCost =
7019 TTI.getGatherScatterOpCost(
7020 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7021 /*VariableMask=*/false, CommonAlignment, CostKind) +
7022 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7023 InstructionCost GatherCost =
7024 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7025 /*Insert=*/true,
7026 /*Extract=*/false, CostKind) +
7027 ScalarLoadsCost;
7028 // The list of loads is small, or a partial check has already been performed -
7029 // directly compare the masked gather cost and the gather cost.
7030 constexpr unsigned ListLimit = 4;
7031 if (!TryRecursiveCheck || VL.size() < ListLimit)
7032 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7033
7034 // FIXME: The following code has not been updated for non-power-of-2
7035 // vectors (and not whole registers). The splitting logic here does not
7036 // cover the original vector if the vector factor is not a power of two.
7037 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7038 return false;
7039
7040 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7041 unsigned MinVF = getMinVF(2 * Sz);
7042 DemandedElts.clearAllBits();
7043 // Iterate through possible vectorization factors and check if vectorized +
7044 // shuffles is better than just gather.
7045 for (unsigned VF =
7046 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7047 VF >= MinVF;
7048 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7049 SmallVector<LoadsState> States;
7050 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7051 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7052 SmallVector<unsigned> Order;
7053 SmallVector<Value *> PointerOps;
7054 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7055 PointerOps, SPtrInfo, BestVF,
7056 /*TryRecursiveCheck=*/false);
7057 // Check that the sorted loads are consecutive.
7058 if (LS == LoadsState::Gather) {
7059 if (BestVF) {
7060 DemandedElts.setAllBits();
7061 break;
7062 }
7063 DemandedElts.setBits(Cnt, Cnt + VF);
7064 continue;
7065 }
7066 // If a reorder is needed - consider it as a high-cost masked gather for now.
7067 if ((LS == LoadsState::Vectorize ||
7068 LS == LoadsState::StridedVectorize ||
7069 LS == LoadsState::CompressVectorize) &&
7070 !Order.empty() && !isReverseOrder(Order))
7071 LS = LoadsState::ScatterVectorize;
7072 States.push_back(LS);
7073 }
7074 if (DemandedElts.isAllOnes())
7075 // All loads gathered - try smaller VF.
7076 continue;
7077 // Can be vectorized later as a series of loads/insertelements.
7078 InstructionCost VecLdCost = 0;
7079 if (!DemandedElts.isZero()) {
7080 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7081 /*Insert=*/true,
7082 /*Extract=*/false, CostKind) +
7083 ScalarGEPCost;
7084 for (unsigned Idx : seq<unsigned>(VL.size()))
7085 if (DemandedElts[Idx])
7086 VecLdCost +=
7087 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7088 }
7089 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7090 for (auto [I, LS] : enumerate(States)) {
7091 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7092 InstructionCost VectorGEPCost =
7093 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7094 ? 0
7095 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7096 LI0->getPointerOperand(),
7097 Instruction::GetElementPtr, CostKind, ScalarTy,
7098 SubVecTy)
7099 .second;
7100 if (LS == LoadsState::ScatterVectorize) {
7101 if (static_cast<unsigned>(
7102 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7103 PointerOps.size() - 1 ||
7104 any_of(PointerOps, [&](Value *V) {
7105 return getUnderlyingObject(V) !=
7106 getUnderlyingObject(PointerOps.front());
7107 }))
7108 VectorGEPCost += getScalarizationOverhead(
7109 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7110 /*Insert=*/true, /*Extract=*/false, CostKind);
7111 else
7112 VectorGEPCost +=
7113 getScalarizationOverhead(
7114 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7115 /*Insert=*/true, /*Extract=*/false, CostKind) +
7116 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7117 CostKind);
7118 }
7119 switch (LS) {
7120 case LoadsState::Vectorize:
7121 VecLdCost +=
7122 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7123 LI0->getPointerAddressSpace(), CostKind,
7124 TTI::OperandValueInfo()) +
7125 VectorGEPCost;
7126 break;
7127 case LoadsState::StridedVectorize:
7128 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7129 LI0->getPointerOperand(),
7130 /*VariableMask=*/false,
7131 CommonAlignment, CostKind) +
7132 VectorGEPCost;
7133 break;
7134 case LoadsState::CompressVectorize:
7135 VecLdCost += TTI.getMaskedMemoryOpCost(
7136 Instruction::Load, SubVecTy, CommonAlignment,
7137 LI0->getPointerAddressSpace(), CostKind) +
7138 VectorGEPCost +
7139 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7140 {}, CostKind);
7141 break;
7142 case LoadsState::ScatterVectorize:
7143 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7144 LI0->getPointerOperand(),
7145 /*VariableMask=*/false,
7146 CommonAlignment, CostKind) +
7147 VectorGEPCost;
7148 break;
7149 case LoadsState::Gather:
7150 // Gathers are already calculated - ignore.
7151 continue;
7152 }
7153 SmallVector<int> ShuffleMask(VL.size());
7154 for (int Idx : seq<int>(0, VL.size()))
7155 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7156 if (I > 0)
7157 VecLdCost +=
7158 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7159 CostKind, I * VF, SubVecTy);
7160 }
7161 // If masked gather cost is higher - better to vectorize, so
7162 // consider it as a gather node. It will be better estimated
7163 // later.
7164 if (MaskedGatherCost >= VecLdCost &&
7165 VecLdCost - GatherCost < -SLPCostThreshold) {
7166 if (BestVF)
7167 *BestVF = VF;
7168 return true;
7169 }
7170 }
7171 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7172 };
7173 // TODO: need to improve analysis of the pointers, if not all of them are
7174 // GEPs or have > 2 operands, we end up with a gather node, which just
7175 // increases the cost.
7176 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7177 bool ProfitableGatherPointers =
7178 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7179 return L->isLoopInvariant(V);
7180 })) <= Sz / 2;
7181 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7182 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7183 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7184 (GEP && GEP->getNumOperands() == 2 &&
7185 isa<Constant, Instruction>(GEP->getOperand(1)));
7186 })) {
7187 // Check if potential masked gather can be represented as series
7188 // of loads + insertsubvectors.
7189 // If masked gather cost is higher - better to vectorize, so
7190 // consider it as a gather node. It will be better estimated
7191 // later.
7192 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7193 ProfitableGatherPointers))
7194 return LoadsState::ScatterVectorize;
7195 }
7196
7197 return LoadsState::Gather;
7198}
7199
7200 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7201 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7202 const DataLayout &DL, ScalarEvolution &SE,
7203 SmallVectorImpl<unsigned> &SortedIndices) {
7204 assert(
7205 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7206 "Expected list of pointer operands.");
7207 // Map from bases to a vector of (Ptr, Offset, OrigIdx) entries; each Ptr is
7208 // inserted into the vector of its base, the vectors are sorted, and the
7209 // sorted indices are returned with related values next to one another.
7210 SmallMapVector<
7211 std::pair<BasicBlock *, Value *>,
7212 SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>, 8>
7213 Bases;
7214 Bases
7215 .try_emplace(std::make_pair(
7216 BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
7217 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7218
7219 SortedIndices.clear();
7220 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7221 auto Key = std::make_pair(BBs[Cnt + 1],
7222 getUnderlyingObject(Ptr, RecursionMaxDepth));
7223 bool Found = any_of(Bases.try_emplace(Key).first->second,
7224 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7225 std::optional<int64_t> Diff =
7226 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7227 ElemTy, Ptr, DL, SE,
7228 /*StrictCheck=*/true);
7229 if (!Diff)
7230 return false;
7231
7232 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7233 return true;
7234 });
7235
7236 if (!Found) {
7237 // If we haven't found enough to usefully cluster, return early.
7238 if (Bases.size() > VL.size() / 2 - 1)
7239 return false;
7240
7241 // Not found already - add a new Base
7242 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7243 }
7244 }
7245
7246 if (Bases.size() == VL.size())
7247 return false;
7248
7249 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7250 Bases.front().second.size() == VL.size()))
7251 return false;
7252
7253 // For each of the bases, sort the pointers by offset and check whether any
7254 // of the bases become consecutively allocated.
7255 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7256 SmallPtrSet<Value *, 13> FirstPointers;
7257 SmallPtrSet<Value *, 13> SecondPointers;
7258 Value *P1 = Ptr1;
7259 Value *P2 = Ptr2;
7260 unsigned Depth = 0;
7261 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7262 if (P1 == P2 || Depth > RecursionMaxDepth)
7263 return false;
7264 FirstPointers.insert(P1);
7265 SecondPointers.insert(P2);
7266 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7267 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7268 ++Depth;
7269 }
7270 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7271 "Unable to find matching root.");
7272 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7273 };
7274 for (auto &Base : Bases) {
7275 for (auto &Vec : Base.second) {
7276 if (Vec.size() > 1) {
7277 stable_sort(Vec, llvm::less_second());
7278 int64_t InitialOffset = std::get<1>(Vec[0]);
7279 bool AnyConsecutive =
7280 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7281 return std::get<1>(P.value()) ==
7282 int64_t(P.index()) + InitialOffset;
7283 });
7284 // Fill SortedIndices array only if it looks worthwhile to sort the
7285 // ptrs.
7286 if (!AnyConsecutive)
7287 return false;
7288 }
7289 }
7290 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7291 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7292 });
7293 }
7294
7295 for (auto &T : Bases)
7296 for (const auto &Vec : T.second)
7297 for (const auto &P : Vec)
7298 SortedIndices.push_back(std::get<2>(P));
7299
7300 assert(SortedIndices.size() == VL.size() &&
7301 "Expected SortedIndices to be the size of VL");
7302 return true;
7303}
7304
7305std::optional<BoUpSLP::OrdersType>
7306BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7307 assert(TE.isGather() && "Expected gather node only.");
7308 Type *ScalarTy = TE.Scalars[0]->getType();
7309
7310 SmallVector<Value *> Ptrs;
7311 Ptrs.reserve(TE.Scalars.size());
7312 SmallVector<BasicBlock *> BBs;
7313 BBs.reserve(TE.Scalars.size());
7314 for (Value *V : TE.Scalars) {
7315 auto *L = dyn_cast<LoadInst>(V);
7316 if (!L || !L->isSimple())
7317 return std::nullopt;
7318 Ptrs.push_back(L->getPointerOperand());
7319 BBs.push_back(L->getParent());
7320 }
7321
7322 BoUpSLP::OrdersType Order;
7323 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7324 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7325 return std::move(Order);
7326 return std::nullopt;
7327}
7328
7329/// Check if two insertelement instructions are from the same buildvector.
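/// For example, in the chain
/// %v1 = insertelement <4 x i32> poison, i32 %a, i32 0
/// %v2 = insertelement <4 x i32> %v1, i32 %b, i32 1
/// %v1 and %v2 belong to the same buildvector sequence.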
7330 static bool areTwoInsertFromSameBuildVector(
7331 InsertElementInst *VU, InsertElementInst *V,
7332 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7333 // Instructions must be from the same basic blocks.
7334 if (VU->getParent() != V->getParent())
7335 return false;
7336 // Checks if 2 insertelements are from the same buildvector.
7337 if (VU->getType() != V->getType())
7338 return false;
7339 // Multiple used inserts are separate nodes.
7340 if (!VU->hasOneUse() && !V->hasOneUse())
7341 return false;
7342 auto *IE1 = VU;
7343 auto *IE2 = V;
7344 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7345 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7346 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7347 return false;
7348 // Go through the vector operand of insertelement instructions trying to find
7349 // either VU as the original vector for IE2 or V as the original vector for
7350 // IE1.
7351 SmallBitVector ReusedIdx(
7352 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7353 bool IsReusedIdx = false;
7354 do {
7355 if (IE2 == VU && !IE1)
7356 return VU->hasOneUse();
7357 if (IE1 == V && !IE2)
7358 return V->hasOneUse();
7359 if (IE1 && IE1 != V) {
7360 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7361 IsReusedIdx |= ReusedIdx.test(Idx1);
7362 ReusedIdx.set(Idx1);
7363 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7364 IE1 = nullptr;
7365 else
7366 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7367 }
7368 if (IE2 && IE2 != VU) {
7369 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7370 IsReusedIdx |= ReusedIdx.test(Idx2);
7371 ReusedIdx.set(Idx2);
7372 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7373 IE2 = nullptr;
7374 else
7375 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7376 }
7377 } while (!IsReusedIdx && (IE1 || IE2));
7378 return false;
7379}
7380
7381/// Checks if the specified instruction \p I is an alternate operation for
7382/// the given \p MainOp and \p AltOp instructions.
7383static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7384 Instruction *AltOp,
7385 const TargetLibraryInfo &TLI);
7386
7387std::optional<BoUpSLP::OrdersType>
7388BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7389 bool IgnoreReorder) {
7390 // No need to reorder if we need to shuffle reuses - the node still has to be
7391 // shuffled anyway.
7392 if (!TE.ReuseShuffleIndices.empty()) {
7393 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7394 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7395 "Reshuffling scalars not yet supported for nodes with padding");
7396
7397 if (isSplat(TE.Scalars))
7398 return std::nullopt;
7399 // Check if reuse shuffle indices can be improved by reordering.
7400 // For this, check that reuse mask is "clustered", i.e. each scalar values
7401 // is used once in each submask of size <number_of_scalars>.
7402 // Example: 4 scalar values.
7403 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7404 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7405 // element 3 is used twice in the second submask.
7406 unsigned Sz = TE.Scalars.size();
7407 if (TE.isGather()) {
7408 if (std::optional<OrdersType> CurrentOrder =
7409 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7410 SmallVector<int> Mask;
7411 fixupOrderingIndices(*CurrentOrder);
7412 inversePermutation(*CurrentOrder, Mask);
7413 ::addMask(Mask, TE.ReuseShuffleIndices);
7414 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7415 unsigned Sz = TE.Scalars.size();
7416 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7417 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7418 if (Idx != PoisonMaskElem)
7419 Res[Idx + K * Sz] = I + K * Sz;
7420 }
7421 return std::move(Res);
7422 }
7423 }
7424 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7425 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7426 2 * TE.getVectorFactor())) == 1)
7427 return std::nullopt;
7428 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7429 return std::nullopt;
7430 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7431 Sz)) {
7432 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7433 if (TE.ReorderIndices.empty())
7434 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7435 else
7436 inversePermutation(TE.ReorderIndices, ReorderMask);
7437 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7438 unsigned VF = ReorderMask.size();
7439 OrdersType ResOrder(VF, VF);
7440 unsigned NumParts = divideCeil(VF, Sz);
7441 SmallBitVector UsedVals(NumParts);
7442 for (unsigned I = 0; I < VF; I += Sz) {
7443 int Val = PoisonMaskElem;
7444 unsigned UndefCnt = 0;
7445 unsigned Limit = std::min(Sz, VF - I);
7446 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7447 [&](int Idx) {
7448 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7449 Val = Idx;
7450 if (Idx == PoisonMaskElem)
7451 ++UndefCnt;
7452 return Idx != PoisonMaskElem && Idx != Val;
7453 }) ||
7454 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7455 UndefCnt > Sz / 2)
7456 return std::nullopt;
7457 UsedVals.set(Val);
7458 for (unsigned K = 0; K < NumParts; ++K) {
7459 unsigned Idx = Val + Sz * K;
7460 if (Idx < VF && I + K < VF)
7461 ResOrder[Idx] = I + K;
7462 }
7463 }
7464 return std::move(ResOrder);
7465 }
7466 unsigned VF = TE.getVectorFactor();
7467 // Try to build the correct order for extractelement instructions.
7468 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7469 TE.ReuseShuffleIndices.end());
7470 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7471 all_of(TE.Scalars, [Sz](Value *V) {
7472 if (isa<PoisonValue>(V))
7473 return true;
7474 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7475 return Idx && *Idx < Sz;
7476 })) {
7477 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7478 "by BinaryOperator and CastInst.");
7479 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7480 if (TE.ReorderIndices.empty())
7481 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7482 else
7483 inversePermutation(TE.ReorderIndices, ReorderMask);
7484 for (unsigned I = 0; I < VF; ++I) {
7485 int &Idx = ReusedMask[I];
7486 if (Idx == PoisonMaskElem)
7487 continue;
7488 Value *V = TE.Scalars[ReorderMask[Idx]];
7489 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7490 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7491 }
7492 }
7493 // Build an order of the VF size; the reuses shuffles need to be reordered,
7494 // as they are always of VF size.
7495 OrdersType ResOrder(VF);
7496 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7497 auto *It = ResOrder.begin();
7498 for (unsigned K = 0; K < VF; K += Sz) {
7499 OrdersType CurrentOrder(TE.ReorderIndices);
7500 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7501 if (SubMask.front() == PoisonMaskElem)
7502 std::iota(SubMask.begin(), SubMask.end(), 0);
7503 reorderOrder(CurrentOrder, SubMask);
7504 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7505 std::advance(It, Sz);
7506 }
7507 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7508 return Data.index() == Data.value();
7509 }))
7510 return std::nullopt; // No need to reorder.
7511 return std::move(ResOrder);
7512 }
7513 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7514 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7515 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7516 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7517 return std::nullopt;
7518 if (TE.State == TreeEntry::SplitVectorize ||
7519 ((TE.State == TreeEntry::Vectorize ||
7520 TE.State == TreeEntry::StridedVectorize ||
7521 TE.State == TreeEntry::CompressVectorize) &&
7522 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7523 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7524 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7525 "Alternate instructions are only supported by "
7526 "BinaryOperator and CastInst.");
7527 return TE.ReorderIndices;
7528 }
7529 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7530 TE.isAltShuffle()) {
7531 assert(TE.ReuseShuffleIndices.empty() &&
7532 "ReuseShuffleIndices should be "
7533 "empty for alternate instructions.");
7534 SmallVector<int> Mask;
7535 TE.buildAltOpShuffleMask(
7536 [&](Instruction *I) {
7537 assert(TE.getMatchingMainOpOrAltOp(I) &&
7538 "Unexpected main/alternate opcode");
7539 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7540 },
7541 Mask);
7542 const int VF = TE.getVectorFactor();
7543 OrdersType ResOrder(VF, VF);
7544 for (unsigned I : seq<unsigned>(VF)) {
7545 if (Mask[I] == PoisonMaskElem)
7546 continue;
7547 ResOrder[Mask[I] % VF] = I;
7548 }
7549 return std::move(ResOrder);
7550 }
7551 if (!TE.ReorderIndices.empty())
7552 return TE.ReorderIndices;
7553 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7554 if (!TE.ReorderIndices.empty())
7555 return TE.ReorderIndices;
7556
7557 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7558 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7559 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7560 continue;
7561 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7562 if (!II)
7563 continue;
7564 Instruction *BVHead = nullptr;
7565 BasicBlock *BB = II->getParent();
7566 while (II && II->hasOneUse() && II->getParent() == BB) {
7567 BVHead = II;
7568 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7569 }
7570 I = BVHead;
7571 }
7572
7573 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7574 assert(BB1 != BB2 && "Expected different basic blocks.");
7575 if (!DT->isReachableFromEntry(BB1))
7576 return false;
7577 if (!DT->isReachableFromEntry(BB2))
7578 return true;
7579 auto *NodeA = DT->getNode(BB1);
7580 auto *NodeB = DT->getNode(BB2);
7581 assert(NodeA && "Should only process reachable instructions");
7582 assert(NodeB && "Should only process reachable instructions");
7583 assert((NodeA == NodeB) ==
7584 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7585 "Different nodes should have different DFS numbers");
7586 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7587 };
7588 auto PHICompare = [&](unsigned I1, unsigned I2) {
7589 Value *V1 = TE.Scalars[I1];
7590 Value *V2 = TE.Scalars[I2];
7591 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7592 return false;
7593 if (isa<PoisonValue>(V1))
7594 return true;
7595 if (isa<PoisonValue>(V2))
7596 return false;
7597 if (V1->getNumUses() < V2->getNumUses())
7598 return true;
7599 if (V1->getNumUses() > V2->getNumUses())
7600 return false;
7601 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7602 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7603 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7604 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7605 FirstUserOfPhi2->getParent());
7606 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7607 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7608 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7609 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7610 if (IE1 && !IE2)
7611 return true;
7612 if (!IE1 && IE2)
7613 return false;
7614 if (IE1 && IE2) {
7615 if (UserBVHead[I1] && !UserBVHead[I2])
7616 return true;
7617 if (!UserBVHead[I1])
7618 return false;
7619 if (UserBVHead[I1] == UserBVHead[I2])
7620 return getElementIndex(IE1) < getElementIndex(IE2);
7621 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7622 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7623 UserBVHead[I2]->getParent());
7624 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7625 }
7626 if (EE1 && !EE2)
7627 return true;
7628 if (!EE1 && EE2)
7629 return false;
7630 if (EE1 && EE2) {
7631 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7632 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7633 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7634 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7635 if (!Inst2 && !P2)
7636 return Inst1 || P1;
7637 if (EE1->getOperand(0) == EE2->getOperand(0))
7638 return getElementIndex(EE1) < getElementIndex(EE2);
7639 if (!Inst1 && Inst2)
7640 return false;
7641 if (Inst1 && Inst2) {
7642 if (Inst1->getParent() != Inst2->getParent())
7643 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7644 return Inst1->comesBefore(Inst2);
7645 }
7646 if (!P1 && P2)
7647 return false;
7648 assert(P1 && P2 &&
7649 "Expected either instructions or arguments vector operands.");
7650 return P1->getArgNo() < P2->getArgNo();
7651 }
7652 return false;
7653 };
7654 OrdersType Phis(TE.Scalars.size());
7655 std::iota(Phis.begin(), Phis.end(), 0);
7656 stable_sort(Phis, PHICompare);
7657 if (isIdentityOrder(Phis))
7658 return std::nullopt; // No need to reorder.
7659 return std::move(Phis);
7660 }
7661 if (TE.isGather() &&
7662 (!TE.hasState() || !TE.isAltShuffle() ||
7663 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7664 allSameType(TE.Scalars)) {
7665 // TODO: add analysis of other gather nodes with extractelement
7666 // instructions and other values/instructions, not only undefs.
7667 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7669 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7670 all_of(TE.Scalars, [](Value *V) {
7671 auto *EE = dyn_cast<ExtractElementInst>(V);
7672 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7673 })) {
7674 // Check that gather of extractelements can be represented as
7675 // just a shuffle of a single vector.
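  // E.g., a gather of {extractelement %v, 1; extractelement %v, 0} may be
  // emitted as a single shufflevector of %v with mask <1, 0> instead of
  // separate extracts and inserts.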
7676 OrdersType CurrentOrder;
7677 bool Reuse =
7678 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7679 if (Reuse || !CurrentOrder.empty())
7680 return std::move(CurrentOrder);
7681 }
7682 // If the gather node is <undef, v, .., poison> and
7683 // insertelement poison, v, 0 [+ permute]
7684 // is cheaper than
7685 // insertelement poison, v, n - try to reorder.
7686 // If rotating the whole graph, exclude the permute cost, as the whole graph
7687 // might be transformed.
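  // E.g., for scalars <poison, poison, v, poison> the reorder would move v to
  // lane 0, so the comparison is between (insertelement at index 0 + permute)
  // and a single insertelement at index 2, using the target costs.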
7688 int Sz = TE.Scalars.size();
7689 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7690 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7691 const auto *It = find_if_not(TE.Scalars, isConstant);
7692 if (It == TE.Scalars.begin())
7693 return OrdersType();
7694 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7695 if (It != TE.Scalars.end()) {
7696 OrdersType Order(Sz, Sz);
7697 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7698 Order[Idx] = 0;
7699 fixupOrderingIndices(Order);
7700 SmallVector<int> Mask;
7701 inversePermutation(Order, Mask);
7702 InstructionCost PermuteCost =
7703 TopToBottom
7704 ? 0
7705 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7706 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7707 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7708 PoisonValue::get(Ty), *It);
7709 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7710 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7711 PoisonValue::get(Ty), *It);
7712 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7713 OrdersType Order(Sz, Sz);
7714 Order[Idx] = 0;
7715 return std::move(Order);
7716 }
7717 }
7718 }
7719 if (isSplat(TE.Scalars))
7720 return std::nullopt;
7721 if (TE.Scalars.size() >= 3)
7722 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7723 return Order;
7724 // Check if we can include the order of vectorized loads. For masked gathers do
7725 // extra analysis later, so include such nodes into a special list.
7726 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7727 SmallVector<Value *> PointerOps;
7728 StridedPtrInfo SPtrInfo;
7729 OrdersType CurrentOrder;
7730 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7731 CurrentOrder, PointerOps, SPtrInfo);
7734 return std::move(CurrentOrder);
7735 }
7736 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7737 // has been audited for correctness with non-power-of-two vectors.
7738 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7739 if (std::optional<OrdersType> CurrentOrder =
7740 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7741 return CurrentOrder;
7742 }
7743 return std::nullopt;
7744}
7745
7746/// Checks if the given mask is a "clustered" mask with the same clusters of
7747/// size \p Sz, which are not identity submasks.
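/// E.g., for Sz == 4 the mask <1,0,3,2, 1,0,3,2> is such a mask, while
/// <0,1,2,3, 0,1,2,3> (identity clusters) and <1,0,3,2, 2,3,0,1> (different
/// clusters) are not.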
7748static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7749 unsigned Sz) {
7750 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7751 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7752 return false;
7753 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7754 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7755 if (Cluster != FirstCluster)
7756 return false;
7757 }
7758 return true;
7759}
7760
7761void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7762 // Reorder reuses mask.
7763 reorderReuses(TE.ReuseShuffleIndices, Mask);
7764 const unsigned Sz = TE.Scalars.size();
7765 // For vectorized nodes and non-clustered reuses, no need to do anything else.
7766 if (!TE.isGather() ||
7768 Sz) ||
7769 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7770 return;
7771 SmallVector<int> NewMask;
7772 inversePermutation(TE.ReorderIndices, NewMask);
7773 addMask(NewMask, TE.ReuseShuffleIndices);
7774 // Clear reorder since it is going to be applied to the new mask.
7775 TE.ReorderIndices.clear();
7776 // Try to improve gathered nodes with clustered reuses, if possible.
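  // E.g., for Sz == 2 a reuses mask {1,0,1,0} becomes {0,1,0,1} once the two
  // scalars themselves are swapped.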
7777 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7778 SmallVector<unsigned> NewOrder(Slice);
7779 inversePermutation(NewOrder, NewMask);
7780 reorderScalars(TE.Scalars, NewMask);
7781 // Fill the reuses mask with the identity submasks.
7782 for (auto *It = TE.ReuseShuffleIndices.begin(),
7783 *End = TE.ReuseShuffleIndices.end();
7784 It != End; std::advance(It, Sz))
7785 std::iota(It, std::next(It, Sz), 0);
7786}
7787
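/// Fills the unset (== Sz) positions of \p Order with not-yet-used indices
/// taken from \p SecondaryOrder (or from the identity when it is empty).
/// E.g., for Sz == 4, <2,Sz,0,Sz> combined with secondary <2,1,0,3> becomes
/// <2,1,0,3>.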
7788static void combineOrders(MutableArrayRef<unsigned> Order,
7789 ArrayRef<unsigned> SecondaryOrder) {
7790 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7791 "Expected same size of orders");
7792 size_t Sz = Order.size();
7793 SmallBitVector UsedIndices(Sz);
7794 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7795 if (Order[Idx] != Sz)
7796 UsedIndices.set(Order[Idx]);
7797 }
7798 if (SecondaryOrder.empty()) {
7799 for (unsigned Idx : seq<unsigned>(0, Sz))
7800 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7801 Order[Idx] = Idx;
7802 } else {
7803 for (unsigned Idx : seq<unsigned>(0, Sz))
7804 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7805 !UsedIndices.test(SecondaryOrder[Idx]))
7806 Order[Idx] = SecondaryOrder[Idx];
7807 }
7808}
7809
7812 return false;
7813
7814 constexpr unsigned TinyVF = 2;
7815 constexpr unsigned TinyTree = 10;
7816 constexpr unsigned PhiOpsLimit = 12;
7817 constexpr unsigned GatherLoadsLimit = 2;
7818 if (VectorizableTree.size() <= TinyTree)
7819 return true;
7820 if (VectorizableTree.front()->hasState() &&
7821 !VectorizableTree.front()->isGather() &&
7822 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7823 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7824 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7825 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7826 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7827 VectorizableTree.front()->ReorderIndices.empty()) {
7828 // Check if the tree has only a single store and a single (unordered) load
7829 // node, while the other nodes are phis or geps/binops combined with phis,
7830 // and/or a single gather load node.
7831 if (VectorizableTree.front()->hasState() &&
7832 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7833 VectorizableTree.front()->Scalars.size() == TinyVF &&
7834 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7835 return false;
7836 // A single node which requires reordering - skip.
7837 if (VectorizableTree.front()->hasState() &&
7838 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7839 VectorizableTree.front()->ReorderIndices.empty()) {
7840 const unsigned ReorderedSplitsCnt =
7841 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7842 return TE->State == TreeEntry::SplitVectorize &&
7843 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7844 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7845 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7846 });
7847 if (ReorderedSplitsCnt <= 1 &&
7848 static_cast<unsigned>(count_if(
7849 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7850 return ((!TE->isGather() &&
7851 (TE->ReorderIndices.empty() ||
7852 (TE->UserTreeIndex.UserTE &&
7853 TE->UserTreeIndex.UserTE->State ==
7854 TreeEntry::Vectorize &&
7855 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7856 .empty()))) ||
7857 (TE->isGather() && TE->ReorderIndices.empty() &&
7858 (!TE->hasState() || TE->isAltShuffle() ||
7859 TE->getOpcode() == Instruction::Load ||
7860 TE->getOpcode() == Instruction::ZExt ||
7861 TE->getOpcode() == Instruction::SExt))) &&
7862 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7863 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7864 return !isConstant(V) && isVectorized(V);
7865 }));
7866 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7867 return false;
7868 }
7869 bool HasPhis = false;
7870 bool HasLoad = true;
7871 unsigned GatherLoads = 0;
7872 for (const std::unique_ptr<TreeEntry> &TE :
7873 ArrayRef(VectorizableTree).drop_front()) {
7874 if (TE->State == TreeEntry::SplitVectorize)
7875 continue;
7876 if (!TE->hasState()) {
7877 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7879 continue;
7880 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7882 continue;
7883 return true;
7884 }
7885 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7886 if (!TE->isGather()) {
7887 HasLoad = false;
7888 continue;
7889 }
7890 if (HasLoad)
7891 return true;
7892 ++GatherLoads;
7893 if (GatherLoads >= GatherLoadsLimit)
7894 return true;
7895 }
7896 if (TE->getOpcode() == Instruction::GetElementPtr ||
7897 Instruction::isBinaryOp(TE->getOpcode()))
7898 continue;
7899 if (TE->getOpcode() != Instruction::PHI &&
7900 (!TE->hasCopyableElements() ||
7901 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
7902 TE->Scalars.size() / 2))
7903 return true;
7904 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7905 TE->getNumOperands() > PhiOpsLimit)
7906 return false;
7907 HasPhis = true;
7908 }
7909 return !HasPhis;
7910 }
7911 return true;
7912}
7913
7914void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7915 ArrayRef<int> MaskOrder) {
7916 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7917 SmallVector<int> NewMask(getVectorFactor());
7918 SmallVector<int> NewMaskOrder(getVectorFactor());
7919 std::iota(NewMask.begin(), NewMask.end(), 0);
7920 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7921 if (Idx == 0) {
7922 copy(Mask, NewMask.begin());
7923 copy(MaskOrder, NewMaskOrder.begin());
7924 } else {
7925 assert(Idx == 1 && "Expected either 0 or 1 index.");
7926 unsigned Offset = CombinedEntriesWithIndices.back().second;
7927 for (unsigned I : seq<unsigned>(Mask.size())) {
7928 NewMask[I + Offset] = Mask[I] + Offset;
7929 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7930 }
7931 }
7932 reorderScalars(Scalars, NewMask);
7933 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
7934 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
7935 ReorderIndices.clear();
7936}
7937
7938void BoUpSLP::reorderTopToBottom() {
7939 // Maps VF to the graph nodes.
7941 // ExtractElement gather nodes which can be vectorized and need to handle
7942 // their ordering.
7944
7945 // Phi nodes can have preferred ordering based on their result users
7947
7948 // AltShuffles can also have a preferred ordering that leads to fewer
7949 // instructions, e.g., the addsub instruction in x86.
7950 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7951
7952 // Maps a TreeEntry to the reorder indices of external users.
7954 ExternalUserReorderMap;
7955 // Find all reorderable nodes with the given VF.
7956 // Currently these are vectorized stores, loads, extracts + some gathering of
7957 // extracts.
7958 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7959 const std::unique_ptr<TreeEntry> &TE) {
7960 // Look for external users that will probably be vectorized.
7961 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7962 findExternalStoreUsersReorderIndices(TE.get());
7963 if (!ExternalUserReorderIndices.empty()) {
7964 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7965 ExternalUserReorderMap.try_emplace(TE.get(),
7966 std::move(ExternalUserReorderIndices));
7967 }
7968
7969 // Patterns like [fadd,fsub] can be combined into a single instruction in
7970 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7971 // to take into account their order when looking for the most used order.
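    // E.g., a <4 x float> node with lanes [fsub, fadd, fsub, fadd] may match a
    // single addsub-style instruction on x86; imposing a different lane order
    // here could break that match.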
7972 if (TE->hasState() && TE->isAltShuffle() &&
7973 TE->State != TreeEntry::SplitVectorize) {
7974 Type *ScalarTy = TE->Scalars[0]->getType();
7975 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
7976 unsigned Opcode0 = TE->getOpcode();
7977 unsigned Opcode1 = TE->getAltOpcode();
7978 SmallBitVector OpcodeMask(
7979 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
7980 // If this pattern is supported by the target then we consider the order.
7981 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7982 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7983 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
7984 }
7985 // TODO: Check the reverse order too.
7986 }
7987
7988 bool IgnoreReorder =
7989 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7990 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7991 VectorizableTree.front()->getOpcode() == Instruction::Store);
7992 if (std::optional<OrdersType> CurrentOrder =
7993 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
7994 // Do not include ordering for nodes used in the alt opcode vectorization;
7995 // it is better to reorder them during the bottom-to-top stage. If we
7996 // followed the order here, it would cause reordering of the whole graph,
7997 // though actually it is profitable just to reorder the subgraph that starts
7998 // from the alternate opcode vectorization node. Such nodes already end up
7999 // with a shuffle instruction and it is enough to change this shuffle rather
8000 // than rotate the scalars for the whole graph.
8001 unsigned Cnt = 0;
8002 const TreeEntry *UserTE = TE.get();
8003 while (UserTE && Cnt < RecursionMaxDepth) {
8004 if (!UserTE->UserTreeIndex)
8005 break;
8006 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8007 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8008 UserTE->UserTreeIndex.UserTE->Idx != 0)
8009 return;
8010 UserTE = UserTE->UserTreeIndex.UserTE;
8011 ++Cnt;
8012 }
8013 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8014 if (!(TE->State == TreeEntry::Vectorize ||
8015 TE->State == TreeEntry::StridedVectorize ||
8016 TE->State == TreeEntry::SplitVectorize ||
8017 TE->State == TreeEntry::CompressVectorize) ||
8018 !TE->ReuseShuffleIndices.empty())
8019 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8020 if (TE->State == TreeEntry::Vectorize &&
8021 TE->getOpcode() == Instruction::PHI)
8022 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8023 }
8024 });
8025
8026 // Reorder the graph nodes according to their vectorization factor.
8027 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8028 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8029 auto It = VFToOrderedEntries.find(VF);
8030 if (It == VFToOrderedEntries.end())
8031 continue;
8032 // Try to find the most profitable order. We just are looking for the most
8033 // used order and reorder scalar elements in the nodes according to this
8034 // mostly used order.
8035 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8036 // Delete VF entry upon exit.
8037 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8038
8039 // All operands are reordered and used only in this node - propagate the
8040 // most used order to the user node.
8043 OrdersUses;
8044 for (const TreeEntry *OpTE : OrderedEntries) {
8045 // No need to reorder these nodes; still need to extend and use a shuffle,
8046 // just need to merge the reordering shuffle and the reuse shuffle.
8047 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8048 OpTE->State != TreeEntry::SplitVectorize)
8049 continue;
8050 // Count number of orders uses.
8051 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8052 &PhisToOrders]() -> const OrdersType & {
8053 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8054 auto It = GathersToOrders.find(OpTE);
8055 if (It != GathersToOrders.end())
8056 return It->second;
8057 }
8058 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8059 auto It = AltShufflesToOrders.find(OpTE);
8060 if (It != AltShufflesToOrders.end())
8061 return It->second;
8062 }
8063 if (OpTE->State == TreeEntry::Vectorize &&
8064 OpTE->getOpcode() == Instruction::PHI) {
8065 auto It = PhisToOrders.find(OpTE);
8066 if (It != PhisToOrders.end())
8067 return It->second;
8068 }
8069 return OpTE->ReorderIndices;
8070 }();
8071 // First consider the order of the external scalar users.
8072 auto It = ExternalUserReorderMap.find(OpTE);
8073 if (It != ExternalUserReorderMap.end()) {
8074 const auto &ExternalUserReorderIndices = It->second;
8075 // If the OpTE vector factor != number of scalars - use natural order,
8076 // it is an attempt to reorder node with reused scalars but with
8077 // external uses.
8078 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8079 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8080 ExternalUserReorderIndices.size();
8081 } else {
8082 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8083 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8084 }
8085 // No other useful reorder data in this entry.
8086 if (Order.empty())
8087 continue;
8088 }
8089 // Stores actually store the mask, not the order, need to invert.
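      // E.g., a store node holding the mask {1,2,0} is inverted to the order
      // {2,0,1}; poison elements are mapped to E and fixed up afterwards.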
8090 if (OpTE->State == TreeEntry::Vectorize &&
8091 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8092 assert(!OpTE->isAltShuffle() &&
8093 "Alternate instructions are only supported by BinaryOperator "
8094 "and CastInst.");
8095 SmallVector<int> Mask;
8096 inversePermutation(Order, Mask);
8097 unsigned E = Order.size();
8098 OrdersType CurrentOrder(E, E);
8099 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8100 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8101 });
8102 fixupOrderingIndices(CurrentOrder);
8103 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8104 } else {
8105 ++OrdersUses.try_emplace(Order, 0).first->second;
8106 }
8107 }
8108 if (OrdersUses.empty())
8109 continue;
8110 // Choose the most used order.
8111 unsigned IdentityCnt = 0;
8112 unsigned FilledIdentityCnt = 0;
8113 OrdersType IdentityOrder(VF, VF);
8114 for (auto &Pair : OrdersUses) {
8115 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8116 if (!Pair.first.empty())
8117 FilledIdentityCnt += Pair.second;
8118 IdentityCnt += Pair.second;
8119 combineOrders(IdentityOrder, Pair.first);
8120 }
8121 }
8122 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8123 unsigned Cnt = IdentityCnt;
8124 for (auto &Pair : OrdersUses) {
8125 // Prefer identity order. But, if filled identity found (non-empty order)
8126 // with same number of uses, as the new candidate order, we can choose
8127 // this candidate order.
8128 if (Cnt < Pair.second ||
8129 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8130 Cnt == Pair.second && !BestOrder.empty() &&
8131 isIdentityOrder(BestOrder))) {
8132 combineOrders(Pair.first, BestOrder);
8133 BestOrder = Pair.first;
8134 Cnt = Pair.second;
8135 } else {
8136 combineOrders(BestOrder, Pair.first);
8137 }
8138 }
8139 // Set order of the user node.
8140 if (isIdentityOrder(BestOrder))
8141 continue;
8142 fixupOrderingIndices(BestOrder);
8143 SmallVector<int> Mask;
8144 inversePermutation(BestOrder, Mask);
8145 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8146 unsigned E = BestOrder.size();
8147 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8148 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8149 });
8150 // Do an actual reordering, if profitable.
8151 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8152 // Just do the reordering for the nodes with the given VF.
8153 if (TE->Scalars.size() != VF) {
8154 if (TE->ReuseShuffleIndices.size() == VF) {
8155 assert(TE->State != TreeEntry::SplitVectorize &&
8156 "Split vectorized not expected.");
8157 // Need to reorder the reuses masks of the operands with smaller VF to
8158 // be able to find the match between the graph nodes and scalar
8159 // operands of the given node during vectorization/cost estimation.
8160 assert(
8161 (!TE->UserTreeIndex ||
8162 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8163 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8164 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8165 "All users must be of VF size.");
8166 if (SLPReVec) {
8167 assert(SLPReVec && "Only supported by REVEC.");
8168 // ShuffleVectorInst does not do reorderOperands (and it should not
8169 // because ShuffleVectorInst supports only a limited set of
8170 // patterns). Only do reorderNodeWithReuses if the user is not
8171 // ShuffleVectorInst.
8172 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8173 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8174 continue;
8175 }
8176 // Update ordering of the operands with the smaller VF than the given
8177 // one.
8178 reorderNodeWithReuses(*TE, Mask);
8179 // Update orders in user split vectorize nodes.
8180 if (TE->UserTreeIndex &&
8181 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8182 TE->UserTreeIndex.UserTE->reorderSplitNode(
8183 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8184 }
8185 continue;
8186 }
8187 if ((TE->State == TreeEntry::SplitVectorize &&
8188 TE->ReuseShuffleIndices.empty()) ||
8189 ((TE->State == TreeEntry::Vectorize ||
8190 TE->State == TreeEntry::StridedVectorize ||
8191 TE->State == TreeEntry::CompressVectorize) &&
8193 InsertElementInst>(TE->getMainOp()) ||
8194 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8195 assert(
8196 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8197 TE->ReuseShuffleIndices.empty())) &&
8198 "Alternate instructions are only supported by BinaryOperator "
8199 "and CastInst.");
8200 // Build correct orders for extract{element,value}, loads,
8201 // stores and alternate (split) nodes.
8202 reorderOrder(TE->ReorderIndices, Mask);
8203 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8204 TE->reorderOperands(Mask);
8205 } else {
8206 // Reorder the node and its operands.
8207 TE->reorderOperands(Mask);
8208 assert(TE->ReorderIndices.empty() &&
8209 "Expected empty reorder sequence.");
8210 reorderScalars(TE->Scalars, Mask);
8211 }
8212 if (!TE->ReuseShuffleIndices.empty()) {
8213 // Apply reversed order to keep the original ordering of the reused
8214 // elements to avoid extra reorder indices shuffling.
8215 OrdersType CurrentOrder;
8216 reorderOrder(CurrentOrder, MaskOrder);
8217 SmallVector<int> NewReuses;
8218 inversePermutation(CurrentOrder, NewReuses);
8219 addMask(NewReuses, TE->ReuseShuffleIndices);
8220 TE->ReuseShuffleIndices.swap(NewReuses);
8221 } else if (TE->UserTreeIndex &&
8222 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8223 // Update orders in user split vectorize nodes.
8224 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8225 Mask, MaskOrder);
8226 }
8227 }
8228}
8229
8230void BoUpSLP::buildReorderableOperands(
8231 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8232 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8233 SmallVectorImpl<TreeEntry *> &GatherOps) {
8234 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8235 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8236 return OpData.first == I &&
8237 (OpData.second->State == TreeEntry::Vectorize ||
8238 OpData.second->State == TreeEntry::StridedVectorize ||
8239 OpData.second->State == TreeEntry::CompressVectorize ||
8240 OpData.second->State == TreeEntry::SplitVectorize);
8241 }))
8242 continue;
8243 // Do not request operands if they do not exist.
8244 if (UserTE->hasState()) {
8245 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8246 UserTE->getOpcode() == Instruction::ExtractValue)
8247 continue;
8248 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8249 continue;
8250 if (UserTE->getOpcode() == Instruction::Store &&
8251 UserTE->State == TreeEntry::Vectorize && I == 1)
8252 continue;
8253 if (UserTE->getOpcode() == Instruction::Load &&
8254 (UserTE->State == TreeEntry::Vectorize ||
8255 UserTE->State == TreeEntry::StridedVectorize ||
8256 UserTE->State == TreeEntry::CompressVectorize))
8257 continue;
8258 }
8259 TreeEntry *TE = getOperandEntry(UserTE, I);
8260 assert(TE && "Expected operand entry.");
8261 if (!TE->isGather()) {
8262 // Add the node to the list of the ordered nodes with the identity
8263 // order.
8264 Edges.emplace_back(I, TE);
8265 // Add ScatterVectorize nodes to the list of operands, where just
8266 // reordering of the scalars is required. Similar to the gathers, so
8267 // simply add to the list of gathered ops.
8268 // If there are reused scalars, process this node as a regular vectorize
8269 // node, just reorder reuses mask.
8270 if (TE->State == TreeEntry::ScatterVectorize &&
8271 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8272 GatherOps.push_back(TE);
8273 continue;
8274 }
8275 if (ReorderableGathers.contains(TE))
8276 GatherOps.push_back(TE);
8277 }
8278}
8279
8280void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8281 struct TreeEntryCompare {
8282 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8283 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8284 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8285 return LHS->Idx < RHS->Idx;
8286 }
8287 };
8289 DenseSet<const TreeEntry *> GathersToOrders;
8290 // Find all reorderable leaf nodes with the given VF.
8291 // Currently these are vectorized loads, extracts without alternate operands +
8292 // some gathering of extracts.
8294 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8295 if (TE->State != TreeEntry::Vectorize &&
8296 TE->State != TreeEntry::StridedVectorize &&
8297 TE->State != TreeEntry::CompressVectorize &&
8298 TE->State != TreeEntry::SplitVectorize)
8299 NonVectorized.insert(TE.get());
8300 if (std::optional<OrdersType> CurrentOrder =
8301 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8302 Queue.push(TE.get());
8303 if (!(TE->State == TreeEntry::Vectorize ||
8304 TE->State == TreeEntry::StridedVectorize ||
8305 TE->State == TreeEntry::CompressVectorize ||
8306 TE->State == TreeEntry::SplitVectorize) ||
8307 !TE->ReuseShuffleIndices.empty())
8308 GathersToOrders.insert(TE.get());
8309 }
8310 }
8311
8312 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8313 // I.e., if the node has operands that are reordered, try to keep at least
8314 // one operand in the natural order and reorder the others + reorder the
8315 // user node itself.
8316 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8317 while (!Queue.empty()) {
8318 // 1. Filter out only reordered nodes.
8319 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8320 TreeEntry *TE = Queue.top();
8321 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8322 Queue.pop();
8323 SmallVector<TreeEntry *> OrderedOps(1, TE);
8324 while (!Queue.empty()) {
8325 TE = Queue.top();
8326 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8327 break;
8328 Queue.pop();
8329 OrderedOps.push_back(TE);
8330 }
8331 for (TreeEntry *TE : OrderedOps) {
8332 if (!(TE->State == TreeEntry::Vectorize ||
8333 TE->State == TreeEntry::StridedVectorize ||
8334 TE->State == TreeEntry::CompressVectorize ||
8335 TE->State == TreeEntry::SplitVectorize ||
8336 (TE->isGather() && GathersToOrders.contains(TE))) ||
8337 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8338 !Visited.insert(TE).second)
8339 continue;
8340 // Build a map between user nodes and their operand orders to speed up the
8341 // search. The graph currently does not provide this dependency directly.
8342 Users.first = TE->UserTreeIndex.UserTE;
8343 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8344 }
8345 if (Users.first) {
8346 auto &Data = Users;
8347 if (Data.first->State == TreeEntry::SplitVectorize) {
8348 assert(
8349 Data.second.size() <= 2 &&
8350 "Expected not greater than 2 operands for split vectorize node.");
8351 if (any_of(Data.second,
8352 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8353 continue;
8354 // Update orders in user split vectorize nodes.
8355 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8356 "Expected exactly 2 entries.");
8357 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8358 TreeEntry &OpTE = *VectorizableTree[P.first];
8359 OrdersType Order = OpTE.ReorderIndices;
8360 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8361 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8362 continue;
8363 const auto BestOrder =
8364 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8365 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8366 continue;
8367 Order = *BestOrder;
8368 }
8369 fixupOrderingIndices(Order);
8370 SmallVector<int> Mask;
8371 inversePermutation(Order, Mask);
8372 const unsigned E = Order.size();
8373 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8374 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8375 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8376 });
8377 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8378 // Clear ordering of the operand.
8379 if (!OpTE.ReorderIndices.empty()) {
8380 OpTE.ReorderIndices.clear();
8381 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8382 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8383 } else {
8384 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8385 reorderScalars(OpTE.Scalars, Mask);
8386 }
8387 }
8388 if (Data.first->ReuseShuffleIndices.empty() &&
8389 !Data.first->ReorderIndices.empty()) {
8390 // Insert user node to the list to try to sink reordering deeper in
8391 // the graph.
8392 Queue.push(Data.first);
8393 }
8394 continue;
8395 }
8396 // Check that operands are used only in the User node.
8397 SmallVector<TreeEntry *> GatherOps;
8398 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8399 GatherOps);
8400 // All operands are reordered and used only in this node - propagate the
8401 // most used order to the user node.
8404 OrdersUses;
8405 // Do the analysis for each tree entry only once, otherwise the order of
8406 // the same node may be considered several times, though it might not be
8407 // profitable.
8410 for (const auto &Op : Data.second) {
8411 TreeEntry *OpTE = Op.second;
8412 if (!VisitedOps.insert(OpTE).second)
8413 continue;
8414 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8415 continue;
8416 const auto Order = [&]() -> const OrdersType {
8417 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8418 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8419 IgnoreReorder)
8420 .value_or(OrdersType(1));
8421 return OpTE->ReorderIndices;
8422 }();
8423 // The order is partially ordered, skip it in favor of fully non-ordered
8424 // orders.
8425 if (Order.size() == 1)
8426 continue;
8427
8428 // Check that the reordering does not increase the number of shuffles, i.e.
8429 // same-values nodes have the same parents, or their parents have the same parents.
8430 if (!Order.empty() && !isIdentityOrder(Order)) {
8431 Value *Root = OpTE->hasState()
8432 ? OpTE->getMainOp()
8433 : *find_if_not(OpTE->Scalars, isConstant);
8434 auto GetSameNodesUsers = [&](Value *Root) {
8436 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8437 if (TE != OpTE && TE->UserTreeIndex &&
8438 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8439 TE->Scalars.size() == OpTE->Scalars.size() &&
8440 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8441 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8442 Res.insert(TE->UserTreeIndex.UserTE);
8443 }
8444 for (const TreeEntry *TE : getTreeEntries(Root)) {
8445 if (TE != OpTE && TE->UserTreeIndex &&
8446 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8447 TE->Scalars.size() == OpTE->Scalars.size() &&
8448 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8449 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8450 Res.insert(TE->UserTreeIndex.UserTE);
8451 }
8452 return Res.takeVector();
8453 };
8454 auto GetNumOperands = [](const TreeEntry *TE) {
8455 if (TE->State == TreeEntry::SplitVectorize)
8456 return TE->getNumOperands();
8457 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8458 return CI->arg_size();
8459 return TE->getNumOperands();
8460 };
8461 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8462 const TreeEntry *TE) {
8464 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8466 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8469 continue;
8470 const TreeEntry *Op = getOperandEntry(TE, Idx);
8471 if (Op->isGather() && Op->hasState()) {
8472 const TreeEntry *VecOp =
8473 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8474 if (VecOp)
8475 Op = VecOp;
8476 }
8477 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8478 return false;
8479 }
8480 return true;
8481 };
8482 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8483 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8484 if (!RevisitedOps.insert(UTE).second)
8485 return false;
8486 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8487 !UTE->ReuseShuffleIndices.empty() ||
8488 (UTE->UserTreeIndex &&
8489 UTE->UserTreeIndex.UserTE == Data.first) ||
8490 (Data.first->UserTreeIndex &&
8491 Data.first->UserTreeIndex.UserTE == UTE) ||
8492 (IgnoreReorder && UTE->UserTreeIndex &&
8493 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8494 NodeShouldBeReorderedWithOperands(UTE);
8495 }))
8496 continue;
8497 for (TreeEntry *UTE : Users) {
8499 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8501 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8504 continue;
8505 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8506 Visited.erase(Op);
8507 Queue.push(const_cast<TreeEntry *>(Op));
8508 }
8509 }
8510 }
8511 unsigned NumOps = count_if(
8512 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8513 return P.second == OpTE;
8514 });
8515 // Stores actually store the mask, not the order, need to invert.
8516 if (OpTE->State == TreeEntry::Vectorize &&
8517 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8518 assert(!OpTE->isAltShuffle() &&
8519 "Alternate instructions are only supported by BinaryOperator "
8520 "and CastInst.");
8521 SmallVector<int> Mask;
8522 inversePermutation(Order, Mask);
8523 unsigned E = Order.size();
8524 OrdersType CurrentOrder(E, E);
8525 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8526 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8527 });
8528 fixupOrderingIndices(CurrentOrder);
8529 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8530 } else {
8531 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8532 }
8533 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8534 const auto AllowsReordering = [&](const TreeEntry *TE) {
8535 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8536 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8537 (IgnoreReorder && TE->Idx == 0))
8538 return true;
8539 if (TE->isGather()) {
8540 if (GathersToOrders.contains(TE))
8541 return !getReorderingData(*TE, /*TopToBottom=*/false,
8542 IgnoreReorder)
8543 .value_or(OrdersType(1))
8544 .empty();
8545 return true;
8546 }
8547 return false;
8548 };
8549 if (OpTE->UserTreeIndex) {
8550 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8551 if (!VisitedUsers.insert(UserTE).second)
8552 continue;
8553 // May reorder user node if it requires reordering, has reused
8554 // scalars, is an alternate op vectorize node or its op nodes require
8555 // reordering.
8556 if (AllowsReordering(UserTE))
8557 continue;
8558 // Check if users allow reordering.
8559 // Currently look up just 1 level of operands to avoid an increase in
8560 // compile time.
8561 // Profitable to reorder if definitely more operands allow
8562 // reordering rather than those with natural order.
8564 if (static_cast<unsigned>(count_if(
8565 Ops, [UserTE, &AllowsReordering](
8566 const std::pair<unsigned, TreeEntry *> &Op) {
8567 return AllowsReordering(Op.second) &&
8568 Op.second->UserTreeIndex.UserTE == UserTE;
8569 })) <= Ops.size() / 2)
8570 ++Res.first->second;
8571 }
8572 }
8573 if (OrdersUses.empty()) {
8574 Visited.insert_range(llvm::make_second_range(Data.second));
8575 continue;
8576 }
8577 // Choose the most used order.
8578 unsigned IdentityCnt = 0;
8579 unsigned VF = Data.second.front().second->getVectorFactor();
8580 OrdersType IdentityOrder(VF, VF);
8581 for (auto &Pair : OrdersUses) {
8582 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8583 IdentityCnt += Pair.second;
8584 combineOrders(IdentityOrder, Pair.first);
8585 }
8586 }
8587 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8588 unsigned Cnt = IdentityCnt;
8589 for (auto &Pair : OrdersUses) {
8590 // Prefer identity order. But, if filled identity found (non-empty
8591 // order) with same number of uses, as the new candidate order, we can
8592 // choose this candidate order.
8593 if (Cnt < Pair.second) {
8594 combineOrders(Pair.first, BestOrder);
8595 BestOrder = Pair.first;
8596 Cnt = Pair.second;
8597 } else {
8598 combineOrders(BestOrder, Pair.first);
8599 }
8600 }
8601 // Set order of the user node.
8602 if (isIdentityOrder(BestOrder)) {
8603 Visited.insert_range(llvm::make_second_range(Data.second));
8604 continue;
8605 }
8606 fixupOrderingIndices(BestOrder);
8607 // Erase operands from OrderedEntries list and adjust their orders.
8608 VisitedOps.clear();
8609 SmallVector<int> Mask;
8610 inversePermutation(BestOrder, Mask);
8611 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8612 unsigned E = BestOrder.size();
8613 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8614 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8615 });
8616 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8617 TreeEntry *TE = Op.second;
8618 if (!VisitedOps.insert(TE).second)
8619 continue;
8620 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8621 reorderNodeWithReuses(*TE, Mask);
8622 continue;
8623 }
8624 // Gathers are processed separately.
8625 if (TE->State != TreeEntry::Vectorize &&
8626 TE->State != TreeEntry::StridedVectorize &&
8627 TE->State != TreeEntry::CompressVectorize &&
8628 TE->State != TreeEntry::SplitVectorize &&
8629 (TE->State != TreeEntry::ScatterVectorize ||
8630 TE->ReorderIndices.empty()))
8631 continue;
8632 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8633 TE->ReorderIndices.empty()) &&
8634 "Non-matching sizes of user/operand entries.");
8635 reorderOrder(TE->ReorderIndices, Mask);
8636 if (IgnoreReorder && TE == VectorizableTree.front().get())
8637 IgnoreReorder = false;
8638 }
8639 // For gathers just need to reorder its scalars.
8640 for (TreeEntry *Gather : GatherOps) {
8641 assert(Gather->ReorderIndices.empty() &&
8642 "Unexpected reordering of gathers.");
8643 if (!Gather->ReuseShuffleIndices.empty()) {
8644 // Just reorder reuses indices.
8645 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8646 continue;
8647 }
8648 reorderScalars(Gather->Scalars, Mask);
8649 Visited.insert(Gather);
8650 }
8651 // Reorder operands of the user node and set the ordering for the user
8652 // node itself.
8653 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8654 return TE.isAltShuffle() &&
8655 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8656 TE.ReorderIndices.empty());
8657 };
8658 if (Data.first->State != TreeEntry::Vectorize ||
8660 Data.first->getMainOp()) ||
8661 IsNotProfitableAltCodeNode(*Data.first))
8662 Data.first->reorderOperands(Mask);
8663 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8664 IsNotProfitableAltCodeNode(*Data.first) ||
8665 Data.first->State == TreeEntry::StridedVectorize ||
8666 Data.first->State == TreeEntry::CompressVectorize) {
8667 reorderScalars(Data.first->Scalars, Mask);
8668 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8669 /*BottomOrder=*/true);
8670 if (Data.first->ReuseShuffleIndices.empty() &&
8671 !Data.first->ReorderIndices.empty() &&
8672 !IsNotProfitableAltCodeNode(*Data.first)) {
8673 // Insert user node to the list to try to sink reordering deeper in
8674 // the graph.
8675 Queue.push(Data.first);
8676 }
8677 } else {
8678 reorderOrder(Data.first->ReorderIndices, Mask);
8679 }
8680 }
8681 }
8682 // If the reordering is unnecessary, just remove the reorder.
8683 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8684 VectorizableTree.front()->ReuseShuffleIndices.empty())
8685 VectorizableTree.front()->ReorderIndices.clear();
8686}
8687
8688Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8689 if (Entry.hasState() &&
8690 (Entry.getOpcode() == Instruction::Store ||
8691 Entry.getOpcode() == Instruction::Load) &&
8692 Entry.State == TreeEntry::StridedVectorize &&
8693 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8694 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8695 return dyn_cast<Instruction>(Entry.Scalars.front());
8696}
8697
8698void BoUpSLP::buildExternalUses(
8699 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8700 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8701 DenseMap<Value *, unsigned> ScalarToExtUses;
8702 SmallPtrSet<Value *, 4> ExternalUsers;
8703 // Collect the values that we need to extract from the tree.
8704 for (auto &TEPtr : VectorizableTree) {
8705 TreeEntry *Entry = TEPtr.get();
8706
8707 // No need to handle users of gathered values.
8708 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8709 continue;
8710
8711 // For each lane:
8712 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8713 Value *Scalar = Entry->Scalars[Lane];
8714 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8715 continue;
8716
8717 // All uses must be replaced already? No need to do it again.
8718 auto It = ScalarToExtUses.find(Scalar);
8719 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8720 continue;
8721
8722 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8723 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8724 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8725 << " from " << *Scalar << " for many users.\n");
8726 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8727 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8728 ExternalUsesWithNonUsers.insert(Scalar);
8729 continue;
8730 }
8731
8732 // Check if the scalar is externally used as an extra arg.
8733 const auto ExtI = ExternallyUsedValues.find(Scalar);
8734 if (ExtI != ExternallyUsedValues.end()) {
8735 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8736 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8737 << FoundLane << " from " << *Scalar << ".\n");
8738 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8739 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8740 continue;
8741 }
8742 for (User *U : Scalar->users()) {
8743 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8744
8745 Instruction *UserInst = dyn_cast<Instruction>(U);
8746 if (!UserInst || isDeleted(UserInst))
8747 continue;
8748
8749 // Ignore users in the user ignore list.
8750 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8751 continue;
8752
8753 // Skip in-tree scalars that become vectors
8754 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8755 !UseEntries.empty()) {
8756 // Some in-tree scalars will remain as scalar in vectorized
8757 // instructions. If that is the case, the one in FoundLane will
8758 // be used.
8759 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8760 isa<LoadInst, StoreInst>(UserInst)) ||
8761 isa<CallInst>(UserInst)) ||
8762 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8763 return UseEntry->State == TreeEntry::ScatterVectorize ||
8765 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8766 TTI);
8767 })) {
8768 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8769 << ".\n");
8770 assert(none_of(UseEntries,
8771 [](TreeEntry *UseEntry) {
8772 return UseEntry->isGather();
8773 }) &&
8774 "Bad state");
8775 continue;
8776 }
8777 U = nullptr;
8778 if (It != ScalarToExtUses.end()) {
8779 ExternalUses[It->second].User = nullptr;
8780 break;
8781 }
8782 }
8783
8784 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8785 U = nullptr;
8786 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8787 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8788 << " from lane " << FoundLane << " from " << *Scalar
8789 << ".\n");
8790 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8791 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8792 ExternalUsesWithNonUsers.insert(Scalar);
8793 if (!U)
8794 break;
8795 }
8796 }
8797 }
8798}
8799
8801BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8804 PtrToStoresMap;
8805 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8806 Value *V = TE->Scalars[Lane];
8807 // Don't iterate over the users of constant data.
8808 if (!isa<Instruction>(V))
8809 continue;
8810 // To save compilation time we don't visit if we have too many users.
8811 if (V->hasNUsesOrMore(UsesLimit))
8812 break;
8813
8814 // Collect stores per pointer object.
8815 for (User *U : V->users()) {
8816 auto *SI = dyn_cast<StoreInst>(U);
8817 // Test whether we can handle the store. V might be a global, which could
8818 // be used in a different function.
8819 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8820 !isValidElementType(SI->getValueOperand()->getType()))
8821 continue;
8822 // Skip entry if already vectorized.
8823 if (isVectorized(U))
8824 continue;
8825
8826 Value *Ptr =
8827 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8828 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8829 SI->getValueOperand()->getType(), Ptr}];
8830 // For now just keep one store per pointer object per lane.
8831 // TODO: Extend this to support multiple stores per pointer per lane
8832 if (StoresVec.size() > Lane)
8833 continue;
8834 if (!StoresVec.empty()) {
8835 std::optional<int64_t> Diff = getPointersDiff(
8836 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8837 SI->getValueOperand()->getType(),
8838 StoresVec.front()->getPointerOperand(), *DL, *SE,
8839 /*StrictCheck=*/true);
8840 // We failed to compare the pointers so just abandon this store.
8841 if (!Diff)
8842 continue;
8843 }
8844 StoresVec.push_back(SI);
8845 }
8846 }
8847 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8848 unsigned I = 0;
8849 for (auto &P : PtrToStoresMap) {
8850 Res[I].swap(P.second);
8851 ++I;
8852 }
8853 return Res;
8854}
8855
8856bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8857 OrdersType &ReorderIndices) const {
8858 // We check whether the stores in StoresVec can form a vector by sorting them
8859 // and checking whether they are consecutive.
8860
8861 // To avoid calling getPointersDiff() while sorting we create a vector of
8862 // pairs {store, offset from first} and sort this instead.
8864 StoreInst *S0 = StoresVec[0];
8865 StoreOffsetVec.emplace_back(0, 0);
8866 Type *S0Ty = S0->getValueOperand()->getType();
8867 Value *S0Ptr = S0->getPointerOperand();
8868 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8869 StoreInst *SI = StoresVec[Idx];
8870 std::optional<int64_t> Diff =
8871 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8872 SI->getPointerOperand(), *DL, *SE,
8873 /*StrictCheck=*/true);
8874 StoreOffsetVec.emplace_back(*Diff, Idx);
8875 }
8876
8877 // Check if the stores are consecutive by checking if their difference is 1.
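  // E.g., sorted offsets {0,1,2,3} are consecutive, while {0,2,3,4} are not.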
8878 if (StoreOffsetVec.size() != StoresVec.size())
8879 return false;
8880 sort(StoreOffsetVec, llvm::less_first());
8881 unsigned Idx = 0;
8882 int64_t PrevDist = 0;
8883 for (const auto &P : StoreOffsetVec) {
8884 if (Idx > 0 && P.first != PrevDist + 1)
8885 return false;
8886 PrevDist = P.first;
8887 ++Idx;
8888 }
8889
8890 // Calculate the shuffle indices according to their offset against the sorted
8891 // StoreOffsetVec.
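  // E.g., offset/index pairs {(1,0), (0,1), (2,2)} sort to
  // [(0,1), (1,0), (2,2)] and yield ReorderIndices = {1,0,2}.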
8892 ReorderIndices.assign(StoresVec.size(), 0);
8893 bool IsIdentity = true;
8894 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8895 ReorderIndices[P.second] = I;
8896 IsIdentity &= P.second == I;
8897 }
8898 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8899 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8900 // same convention here.
8901 if (IsIdentity)
8902 ReorderIndices.clear();
8903
8904 return true;
8905}
8906
8907#ifndef NDEBUG
8908LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
8909 for (unsigned Idx : Order)
8910 dbgs() << Idx << ", ";
8911 dbgs() << "\n";
8912}
8913#endif
8914
8916BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8917 unsigned NumLanes = TE->Scalars.size();
8918
8919 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8920
8921 // Holds the reorder indices for each candidate store vector that is a user of
8922 // the current TreeEntry.
8923 SmallVector<OrdersType, 1> ExternalReorderIndices;
8924
8925 // Now inspect the stores collected per pointer and look for vectorization
8926 // candidates. For each candidate calculate the reorder index vector and push
8927 // it into `ExternalReorderIndices`
8928 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8929 // If we have fewer than NumLanes stores, then we can't form a vector.
8930 if (StoresVec.size() != NumLanes)
8931 continue;
8932
8933 // If the stores are not consecutive then abandon this StoresVec.
8934 OrdersType ReorderIndices;
8935 if (!canFormVector(StoresVec, ReorderIndices))
8936 continue;
8937
8938 // We now know that the scalars in StoresVec can form a vector instruction,
8939 // so set the reorder indices.
8940 ExternalReorderIndices.push_back(ReorderIndices);
8941 }
8942 return ExternalReorderIndices;
8943}
8944
8945void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8946 const SmallDenseSet<Value *> &UserIgnoreLst) {
8947 deleteTree();
8948 UserIgnoreList = &UserIgnoreLst;
8949 if (!allSameType(Roots))
8950 return;
8951 buildTreeRec(Roots, 0, EdgeInfo());
8952}
8953
8954void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8955 deleteTree();
8956 if (!allSameType(Roots))
8957 return;
8958 buildTreeRec(Roots, 0, EdgeInfo());
8959}
8960
8961/// Tries to find a subvector of loads and builds a new vector of only loads if
8962/// it can be profitable.
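/// E.g., loads at offsets {0,1} from a base pointer and loads at offsets {2,3}
/// from the same base (same parent block and type) may be merged into one
/// gathered-loads cluster {0,1,2,3} when the pointer differences are
/// computable.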
8964 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8966 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8967 bool AddNew = true) {
8968 if (VL.empty())
8969 return;
8970 Type *ScalarTy = getValueType(VL.front());
8971 if (!isValidElementType(ScalarTy))
8972 return;
8974 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8975 for (Value *V : VL) {
8976 auto *LI = dyn_cast<LoadInst>(V);
8977 if (!LI)
8978 continue;
8979 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
8980 continue;
8981 bool IsFound = false;
8982 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
8983 assert(LI->getParent() == Data.front().first->getParent() &&
8984 LI->getType() == Data.front().first->getType() &&
8985 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8986 getUnderlyingObject(Data.front().first->getPointerOperand(),
8988 "Expected loads with the same type, same parent and same "
8989 "underlying pointer.");
8990 std::optional<int64_t> Dist = getPointersDiff(
8991 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
8992 Data.front().first->getPointerOperand(), DL, SE,
8993 /*StrictCheck=*/true);
8994 if (!Dist)
8995 continue;
8996 auto It = Map.find(*Dist);
8997 if (It != Map.end() && It->second != LI)
8998 continue;
8999 if (It == Map.end()) {
9000 Data.emplace_back(LI, *Dist);
9001 Map.try_emplace(*Dist, LI);
9002 }
9003 IsFound = true;
9004 break;
9005 }
9006 if (!IsFound) {
9007 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9008 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9009 }
9010 }
9011 auto FindMatchingLoads =
9014 &GatheredLoads,
9015 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9016 int64_t &Offset, unsigned &Start) {
9017 if (Loads.empty())
9018 return GatheredLoads.end();
9019 LoadInst *LI = Loads.front().first;
9020 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9021 if (Idx < Start)
9022 continue;
9023 ToAdd.clear();
9024 if (LI->getParent() != Data.front().first->getParent() ||
9025 LI->getType() != Data.front().first->getType())
9026 continue;
9027 std::optional<int64_t> Dist =
9028 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9029 Data.front().first->getType(),
9030 Data.front().first->getPointerOperand(), DL, SE,
9031 /*StrictCheck=*/true);
9032 if (!Dist)
9033 continue;
9034 SmallSet<int64_t, 4> DataDists;
9035 SmallPtrSet<LoadInst *, 4> DataLoads;
9036 for (std::pair<LoadInst *, int64_t> P : Data) {
9037 DataDists.insert(P.second);
9038 DataLoads.insert(P.first);
9039 }
9040 // Found matching gathered loads - check if all loads are unique or
9041 // can be effectively vectorized.
9042 unsigned NumUniques = 0;
9043 for (auto [Cnt, Pair] : enumerate(Loads)) {
9044 bool Used = DataLoads.contains(Pair.first);
9045 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9046 ++NumUniques;
9047 ToAdd.insert(Cnt);
9048 } else if (Used) {
9049 Repeated.insert(Cnt);
9050 }
9051 }
9052 if (NumUniques > 0 &&
9053 (Loads.size() == NumUniques ||
9054 (Loads.size() - NumUniques >= 2 &&
9055 Loads.size() - NumUniques >= Loads.size() / 2 &&
9056 (has_single_bit(Data.size() + NumUniques) ||
9057 bit_ceil(Data.size()) <
9058 bit_ceil(Data.size() + NumUniques))))) {
9059 Offset = *Dist;
9060 Start = Idx + 1;
9061 return std::next(GatheredLoads.begin(), Idx);
9062 }
9063 }
9064 ToAdd.clear();
9065 return GatheredLoads.end();
9066 };
9067 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9068 unsigned Start = 0;
9069 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9070 int64_t Offset = 0;
9071 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9072 Offset, Start);
9073 while (It != GatheredLoads.end()) {
9074 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9075 for (unsigned Idx : LocalToAdd)
9076 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9077 ToAdd.insert_range(LocalToAdd);
9078 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9079 Start);
9080 }
9081 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9082 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9083 })) {
9084 auto AddNewLoads =
9085 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9086 for (unsigned Idx : seq<unsigned>(Data.size())) {
9087 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9088 continue;
9089 Loads.push_back(Data[Idx]);
9090 }
9091 };
9092 if (!AddNew) {
9093 LoadInst *LI = Data.front().first;
9094 It = find_if(
9095 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9096 return PD.front().first->getParent() == LI->getParent() &&
9097 PD.front().first->getType() == LI->getType();
9098 });
9099 while (It != GatheredLoads.end()) {
9100 AddNewLoads(*It);
9101 It = std::find_if(
9102 std::next(It), GatheredLoads.end(),
9103 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9104 return PD.front().first->getParent() == LI->getParent() &&
9105 PD.front().first->getType() == LI->getType();
9106 });
9107 }
9108 }
9109 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9110 AddNewLoads(GatheredLoads.emplace_back());
9111 }
9112 }
9113}
9114
9115void BoUpSLP::tryToVectorizeGatheredLoads(
9116 const SmallMapVector<
9117 std::tuple<BasicBlock *, Value *, Type *>,
9118 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9119 &GatheredLoads) {
9120 GatheredLoadsEntriesFirst = VectorizableTree.size();
9121
9122 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9123 LoadEntriesToVectorize.size());
9124 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9125 Set.insert_range(VectorizableTree[Idx]->Scalars);
9126
9127 // Sort loads by distance, in descending order.
9128 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9129 const std::pair<LoadInst *, int64_t> &L2) {
9130 return L1.second > L2.second;
9131 };
9132
9133 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9134 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9135 Loads.size());
9136 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9137 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9138 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9139 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9140 };
9141
9142 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9143 BoUpSLP::ValueSet &VectorizedLoads,
9144 SmallVectorImpl<LoadInst *> &NonVectorized,
9145 bool Final, unsigned MaxVF) {
9146 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9147 unsigned StartIdx = 0;
9148 SmallVector<int> CandidateVFs;
9149 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9150 CandidateVFs.push_back(MaxVF);
9151 for (int NumElts = getFloorFullVectorNumberOfElements(
9152 *TTI, Loads.front()->getType(), MaxVF);
9153 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9154 *TTI, Loads.front()->getType(), NumElts - 1)) {
9155 CandidateVFs.push_back(NumElts);
9156 if (VectorizeNonPowerOf2 && NumElts > 2)
9157 CandidateVFs.push_back(NumElts - 1);
9158 }
9159
9160 if (Final && CandidateVFs.empty())
9161 return Results;
9162
9163 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9164 for (unsigned NumElts : CandidateVFs) {
9165 if (Final && NumElts > BestVF)
9166 continue;
9167 SmallVector<unsigned> MaskedGatherVectorized;
9168 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9169 ++Cnt) {
9170 ArrayRef<LoadInst *> Slice =
9171 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9172 if (VectorizedLoads.count(Slice.front()) ||
9173 VectorizedLoads.count(Slice.back()) ||
9174 areKnownNonVectorizableLoads(Slice))
9175 continue;
9176 // Check if it is profitable to try vectorizing gathered loads. It is
9177 // profitable if we have more than 3 consecutive loads or if we have
9178 // fewer but all users are vectorized or deleted.
9179 bool AllowToVectorize = false;
9180 // Check if it is profitable to vectorize 2-elements loads.
9181 if (NumElts == 2) {
9182 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9183 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9184 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9185 for (LoadInst *LI : Slice) {
9186 // If single use/user - allow to vectorize.
9187 if (LI->hasOneUse())
9188 continue;
9189 // 1. Check if number of uses equals number of users.
9190 // 2. All users are deleted.
9191 // 3. The load broadcasts are not allowed or the load is not
9192 // broadcasted.
9193 if (static_cast<unsigned int>(std::distance(
9194 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9195 return false;
9196 if (!IsLegalBroadcastLoad)
9197 continue;
9198 if (LI->hasNUsesOrMore(UsesLimit))
9199 return false;
9200 for (User *U : LI->users()) {
9201 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9202 continue;
9203 for (const TreeEntry *UTE : getTreeEntries(U)) {
9204 for (int I : seq<int>(UTE->getNumOperands())) {
9205 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9206 return V == LI || isa<PoisonValue>(V);
9207 }))
9208 // Found legal broadcast - do not vectorize.
9209 return false;
9210 }
9211 }
9212 }
9213 }
9214 return true;
9215 };
9216 AllowToVectorize = CheckIfAllowed(Slice);
9217 } else {
9218 AllowToVectorize =
9219 (NumElts >= 3 ||
9220 any_of(ValueToGatherNodes.at(Slice.front()),
9221 [=](const TreeEntry *TE) {
9222 return TE->Scalars.size() == 2 &&
9223 ((TE->Scalars.front() == Slice.front() &&
9224 TE->Scalars.back() == Slice.back()) ||
9225 (TE->Scalars.front() == Slice.back() &&
9226 TE->Scalars.back() == Slice.front()));
9227 })) &&
9228 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9229 Slice.size());
9230 }
9231 if (AllowToVectorize) {
9232 SmallVector<Value *> PointerOps;
9233 OrdersType CurrentOrder;
9234 // Try to build vector load.
9235 ArrayRef<Value *> Values(
9236 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9237 StridedPtrInfo SPtrInfo;
9238 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9239 PointerOps, SPtrInfo, &BestVF);
9240 if (LS != LoadsState::Gather ||
9241 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9242 if (LS == LoadsState::ScatterVectorize) {
9243 if (MaskedGatherVectorized.empty() ||
9244 Cnt >= MaskedGatherVectorized.back() + NumElts)
9245 MaskedGatherVectorized.push_back(Cnt);
9246 continue;
9247 }
9248 if (LS != LoadsState::Gather) {
9249 Results.emplace_back(Values, LS);
9250 VectorizedLoads.insert_range(Slice);
9251 // If we vectorized initial block, no need to try to vectorize it
9252 // again.
9253 if (Cnt == StartIdx)
9254 StartIdx += NumElts;
9255 }
9256 // Check if the whole array was vectorized already - exit.
9257 if (StartIdx >= Loads.size())
9258 break;
9259 // Erase last masked gather candidate, if another candidate within
9260 // the range is found to be better.
9261 if (!MaskedGatherVectorized.empty() &&
9262 Cnt < MaskedGatherVectorized.back() + NumElts)
9263 MaskedGatherVectorized.pop_back();
9264 Cnt += NumElts - 1;
9265 continue;
9266 }
9267 }
9268 if (!AllowToVectorize || BestVF == 0)
9269 registerNonVectorizableLoads(Slice);
9270 }
9271 // Mark masked gather candidates as vectorized, if any.
9272 for (unsigned Cnt : MaskedGatherVectorized) {
9273 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9274 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9275 ArrayRef<Value *> Values(
9276 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9277 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9278 VectorizedLoads.insert_range(Slice);
9279 // If we vectorized initial block, no need to try to vectorize it again.
9280 if (Cnt == StartIdx)
9281 StartIdx += NumElts;
9282 }
9283 }
9284 for (LoadInst *LI : Loads) {
9285 if (!VectorizedLoads.contains(LI))
9286 NonVectorized.push_back(LI);
9287 }
9288 return Results;
9289 };
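 // Process each cluster of (load, distance) pairs: sort the loads by
 // distance, split them into runs of consecutive distances, try to vectorize
 // each run via GetVectorizedRanges, and, in some cases, retry the original
 // order and keep whichever attempt leaves fewer loads unvectorized.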
9290 auto ProcessGatheredLoads =
9291 [&, &TTI = *TTI](
9292 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9293 bool Final = false) {
9294 SmallVector<LoadInst *> NonVectorized;
9295 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9296 GatheredLoads) {
9297 if (LoadsDists.size() <= 1) {
9298 NonVectorized.push_back(LoadsDists.back().first);
9299 continue;
9300 }
9301 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9302 LoadsDists);
9303 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9304 stable_sort(LocalLoadsDists, LoadSorter);
9305 SmallVector<LoadInst *> Loads;
9306 unsigned MaxConsecutiveDistance = 0;
9307 unsigned CurrentConsecutiveDist = 1;
9308 int64_t LastDist = LocalLoadsDists.front().second;
9309 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9310 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9311 if (isVectorized(L.first))
9312 continue;
9313 assert(LastDist >= L.second &&
9314 "Expected first distance always not less than second");
9315 if (static_cast<uint64_t>(LastDist - L.second) ==
9316 CurrentConsecutiveDist) {
9317 ++CurrentConsecutiveDist;
9318 MaxConsecutiveDistance =
9319 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9320 Loads.push_back(L.first);
9321 continue;
9322 }
9323 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9324 !Loads.empty())
9325 Loads.pop_back();
9326 CurrentConsecutiveDist = 1;
9327 LastDist = L.second;
9328 Loads.push_back(L.first);
9329 }
9330 if (Loads.size() <= 1)
9331 continue;
9332 if (AllowMaskedGather)
9333 MaxConsecutiveDistance = Loads.size();
9334 else if (MaxConsecutiveDistance < 2)
9335 continue;
9336 BoUpSLP::ValueSet VectorizedLoads;
9337 SmallVector<LoadInst *> SortedNonVectorized;
9338 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9339 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9340 Final, MaxConsecutiveDistance);
9341 if (!Results.empty() && !SortedNonVectorized.empty() &&
9342 OriginalLoads.size() == Loads.size() &&
9343 MaxConsecutiveDistance == Loads.size() &&
9344 all_of(Results,
9345 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9346 return P.second == LoadsState::ScatterVectorize;
9347 })) {
9348 VectorizedLoads.clear();
9349 SmallVector<LoadInst *> UnsortedNonVectorized;
9350 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9351 UnsortedResults =
9352 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9353 UnsortedNonVectorized, Final,
9354 OriginalLoads.size());
9355 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9356 SortedNonVectorized.swap(UnsortedNonVectorized);
9357 Results.swap(UnsortedResults);
9358 }
9359 }
9360 for (auto [Slice, _] : Results) {
9361 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9362 << Slice.size() << ")\n");
9363 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9364 for (Value *L : Slice)
9365 if (!isVectorized(L))
9366 SortedNonVectorized.push_back(cast<LoadInst>(L));
9367 continue;
9368 }
9369
9370 // Select the maximum VF as the maximum of the user gather node sizes and
9371 // the distance between scalar loads in these nodes.
9372 unsigned MaxVF = Slice.size();
9373 unsigned UserMaxVF = 0;
9374 unsigned InterleaveFactor = 0;
9375 if (MaxVF == 2) {
9376 UserMaxVF = MaxVF;
9377 } else {
9378 // Find the distance between segments of the interleaved loads.
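 // E.g. a slice of loads a[0..7] consumed by two gather nodes that each take
 // every other element looks like an interleaved access with factor 2.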
9379 std::optional<unsigned> InterleavedLoadsDistance = 0;
9380 unsigned Order = 0;
9381 std::optional<unsigned> CommonVF = 0;
9382 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9383 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9384 for (auto [Idx, V] : enumerate(Slice)) {
9385 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9386 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9387 unsigned Pos =
9388 EntryToPosition.try_emplace(E, Idx).first->second;
9389 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9390 if (CommonVF) {
9391 if (*CommonVF == 0) {
9392 CommonVF = E->Scalars.size();
9393 continue;
9394 }
9395 if (*CommonVF != E->Scalars.size())
9396 CommonVF.reset();
9397 }
9398 // Check if the load is the part of the interleaved load.
9399 if (Pos != Idx && InterleavedLoadsDistance) {
9400 if (!DeinterleavedNodes.contains(E) &&
9401 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9402 if (isa<Constant>(V))
9403 return false;
9404 if (isVectorized(V))
9405 return true;
9406 const auto &Nodes = ValueToGatherNodes.at(V);
9407 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9408 !is_contained(Slice, V);
9409 })) {
9410 InterleavedLoadsDistance.reset();
9411 continue;
9412 }
9413 DeinterleavedNodes.insert(E);
9414 if (*InterleavedLoadsDistance == 0) {
9415 InterleavedLoadsDistance = Idx - Pos;
9416 continue;
9417 }
9418 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9419 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9420 InterleavedLoadsDistance.reset();
9421 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9422 }
9423 }
9424 }
9425 DeinterleavedNodes.clear();
9426 // Check if the large load represents an interleaved load operation.
9427 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9428 CommonVF.value_or(0) != 0) {
9429 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9430 unsigned VF = *CommonVF;
9431 OrdersType Order;
9432 SmallVector<Value *> PointerOps;
9433 StridedPtrInfo SPtrInfo;
9434 // Segmented load detected - vectorize at maximum vector factor.
9435 if (InterleaveFactor <= Slice.size() &&
9436 TTI.isLegalInterleavedAccessType(
9437 getWidenedType(Slice.front()->getType(), VF),
9438 InterleaveFactor,
9439 cast<LoadInst>(Slice.front())->getAlign(),
9440 cast<LoadInst>(Slice.front())
9441 ->getPointerAddressSpace()) &&
9442 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9443 SPtrInfo) == LoadsState::Vectorize) {
9444 UserMaxVF = InterleaveFactor * VF;
9445 } else {
9446 InterleaveFactor = 0;
9447 }
9448 }
9449 // Cannot represent the loads as consecutive vectorizable nodes -
9450 // just exit.
9451 unsigned ConsecutiveNodesSize = 0;
9452 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9453 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9454 [&, Slice = Slice](const auto &P) {
9455 const auto *It = find_if(Slice, [&](Value *V) {
9456 return std::get<1>(P).contains(V);
9457 });
9458 if (It == Slice.end())
9459 return false;
9460 const TreeEntry &TE =
9461 *VectorizableTree[std::get<0>(P)];
9462 ArrayRef<Value *> VL = TE.Scalars;
9463 OrdersType Order;
9464 SmallVector<Value *> PointerOps;
9465 StridedPtrInfo SPtrInfo;
9466 LoadsState State = canVectorizeLoads(
9467 VL, VL.front(), Order, PointerOps, SPtrInfo);
9468 if (State == LoadsState::ScatterVectorize ||
9469 State == LoadsState::StridedVectorize)
9470 return false;
9471 ConsecutiveNodesSize += VL.size();
9472 size_t Start = std::distance(Slice.begin(), It);
9473 size_t Sz = Slice.size() - Start;
9474 return Sz < VL.size() ||
9475 Slice.slice(Start, VL.size()) != VL;
9476 }))
9477 continue;
9478 // Try to build long masked gather loads.
9479 UserMaxVF = bit_ceil(UserMaxVF);
9480 if (InterleaveFactor == 0 &&
9481 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9482 [&, Slice = Slice](unsigned Idx) {
9483 OrdersType Order;
9484 SmallVector<Value *> PointerOps;
9485 StridedPtrInfo SPtrInfo;
9486 return canVectorizeLoads(
9487 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9488 Slice[Idx * UserMaxVF], Order, PointerOps,
9489 SPtrInfo) == LoadsState::ScatterVectorize;
9490 }))
9491 UserMaxVF = MaxVF;
9492 if (Slice.size() != ConsecutiveNodesSize)
9493 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9494 }
9495 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9496 bool IsVectorized = true;
9497 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9498 ArrayRef<Value *> SubSlice =
9499 Slice.slice(I, std::min(VF, E - I));
9500 if (isVectorized(SubSlice.front()))
9501 continue;
9502 // Check if the subslice is a to-be-vectorized entry that is not
9503 // equal to this entry.
9504 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9505 [&](const auto &P) {
9506 return !SubSlice.equals(
9507 VectorizableTree[std::get<0>(P)]
9508 ->Scalars) &&
9509 set_is_subset(SubSlice, std::get<1>(P));
9510 }))
9511 continue;
9512 unsigned Sz = VectorizableTree.size();
9513 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9514 if (Sz == VectorizableTree.size()) {
9515 IsVectorized = false;
9516 // Try non-interleaved vectorization with smaller vector
9517 // factor.
9518 if (InterleaveFactor > 0) {
9519 VF = 2 * (MaxVF / InterleaveFactor);
9520 InterleaveFactor = 0;
9521 }
9522 continue;
9523 }
9524 }
9525 if (IsVectorized)
9526 break;
9527 }
9528 }
9529 NonVectorized.append(SortedNonVectorized);
9530 }
9531 return NonVectorized;
9532 };
9533 for (const auto &GLs : GatheredLoads) {
9534 const auto &Ref = GLs.second;
9535 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9536 if (!Ref.empty() && !NonVectorized.empty() &&
9537 std::accumulate(
9538 Ref.begin(), Ref.end(), 0u,
9539 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9540 -> unsigned { return S + LoadsDists.size(); }) !=
9541 NonVectorized.size() &&
9542 IsMaskedGatherSupported(NonVectorized)) {
9543 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9544 FinalGatheredLoads;
9545 for (LoadInst *LI : NonVectorized) {
9546 // Reinsert non-vectorized loads into other lists of loads with the
9547 // same base pointer.
9548 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9549 FinalGatheredLoads,
9550 /*AddNew=*/false);
9551 }
9552 // Final attempt to vectorize non-vectorized loads.
9553 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9554 }
9555 }
9556 // Try to vectorize postponed load entries, previously marked as gathered.
9557 for (unsigned Idx : LoadEntriesToVectorize) {
9558 const TreeEntry &E = *VectorizableTree[Idx];
9559 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9560 // Avoid reordering, if possible.
9561 if (!E.ReorderIndices.empty()) {
9562 // Build a mask out of the reorder indices and reorder scalars per this
9563 // mask.
9564 SmallVector<int> ReorderMask;
9565 inversePermutation(E.ReorderIndices, ReorderMask);
9566 reorderScalars(GatheredScalars, ReorderMask);
9567 }
9568 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9569 }
9570 // If no new entries were created, there are no gathered-load entries that
9571 // must be handled.
9572 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9573 VectorizableTree.size())
9574 GatheredLoadsEntriesFirst.reset();
9575}
9576
9577/// Generates a key/subkey pair for the given value to provide effective sorting
9578/// of the values and better detection of vectorizable value sequences. The
9579/// keys/subkeys can be used for better sorting of the values themselves (keys)
9580/// and within value subgroups (subkeys).
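/// For example, simple loads of the same type share a key, while their subkeys
/// (produced by \p LoadsSubkeyGenerator) further group loads that are likely to
/// be vectorizable together.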
9581static std::pair<size_t, size_t> generateKeySubkey(
9582 Value *V, const TargetLibraryInfo *TLI,
9583 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9584 bool AllowAlternate) {
9585 hash_code Key = hash_value(V->getValueID() + 2);
9586 hash_code SubKey = hash_value(0);
9587 // Sort the loads by the distance between the pointers.
9588 if (auto *LI = dyn_cast<LoadInst>(V)) {
9589 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9590 if (LI->isSimple())
9591 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9592 else
9593 Key = SubKey = hash_value(LI);
9594 } else if (isVectorLikeInstWithConstOps(V)) {
9595 // Sort extracts by the vector operands.
9596 if (isa<ExtractElementInst, UndefValue>(V))
9597 Key = hash_value(Value::UndefValueVal + 1);
9598 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9599 if (!isUndefVector(EI->getVectorOperand()).all() &&
9600 !isa<UndefValue>(EI->getIndexOperand()))
9601 SubKey = hash_value(EI->getVectorOperand());
9602 }
9603 } else if (auto *I = dyn_cast<Instruction>(V)) {
9604 // Sort other instructions just by the opcodes except for CMPInst.
9605 // For CMP also sort by the predicate kind.
9606 if (isa<BinaryOperator, CastInst>(I) &&
9607 isValidForAlternation(I->getOpcode())) {
9608 if (AllowAlternate)
9609 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9610 else
9611 Key = hash_combine(hash_value(I->getOpcode()), Key);
9612 SubKey = hash_combine(
9613 hash_value(I->getOpcode()), hash_value(I->getType()),
9614 hash_value(isa<BinaryOperator>(I)
9615 ? I->getType()
9616 : cast<CastInst>(I)->getOperand(0)->getType()));
9617 // For casts, look through the only operand to improve compile time.
9618 if (isa<CastInst>(I)) {
9619 std::pair<size_t, size_t> OpVals =
9620 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9621 /*AllowAlternate=*/true);
9622 Key = hash_combine(OpVals.first, Key);
9623 SubKey = hash_combine(OpVals.first, SubKey);
9624 }
9625 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9626 CmpInst::Predicate Pred = CI->getPredicate();
9627 if (CI->isCommutative())
9628 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9629 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9630 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9631 hash_value(SwapPred),
9632 hash_value(CI->getOperand(0)->getType()));
9633 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9634 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9635 if (isTriviallyVectorizable(ID)) {
9636 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9637 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9638 SubKey = hash_combine(hash_value(I->getOpcode()),
9639 hash_value(Call->getCalledFunction()));
9640 } else {
9642 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9643 }
9644 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9645 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9646 hash_value(Op.Tag), SubKey);
9647 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9648 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9649 SubKey = hash_value(Gep->getPointerOperand());
9650 else
9651 SubKey = hash_value(Gep);
9652 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9653 !isa<ConstantInt>(I->getOperand(1))) {
9654 // Do not try to vectorize instructions with potentially high cost.
9655 SubKey = hash_value(I);
9656 } else {
9657 SubKey = hash_value(I->getOpcode());
9658 }
9659 Key = hash_combine(hash_value(I->getParent()), Key);
9660 }
9661 return std::make_pair(Key, SubKey);
9662}
9663
9664/// Checks if the specified instruction \p I is a main operation for the given
9665/// \p MainOp and \p AltOp instructions.
9666static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9667 Instruction *AltOp, const TargetLibraryInfo &TLI);
9668
9669bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9670 ArrayRef<Value *> VL) const {
9671 Type *ScalarTy = S.getMainOp()->getType();
9672 unsigned Opcode0 = S.getOpcode();
9673 unsigned Opcode1 = S.getAltOpcode();
9674 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9675 // If this pattern is supported by the target then consider it profitable.
9676 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9677 Opcode1, OpcodeMask))
9678 return true;
9679 SmallVector<ValueList> Operands;
9680 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9681 Operands.emplace_back();
9682 // Prepare the operand vector.
9683 for (Value *V : VL) {
9684 if (isa<PoisonValue>(V)) {
9685 Operands.back().push_back(
9686 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9687 continue;
9688 }
9689 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9690 }
9691 }
9692 if (Operands.size() == 2) {
9693 // Try to find the best operand candidates.
9694 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9695 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9696 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9697 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9698 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9699 std::optional<int> Res = findBestRootPair(Candidates);
9700 switch (Res.value_or(0)) {
9701 case 0:
9702 break;
9703 case 1:
9704 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9705 break;
9706 case 2:
9707 std::swap(Operands[0][I], Operands[1][I]);
9708 break;
9709 default:
9710 llvm_unreachable("Unexpected index.");
9711 }
9712 }
9713 }
9714 DenseSet<unsigned> UniqueOpcodes;
9715 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9716 unsigned NonInstCnt = 0;
9717 // Estimate the number of instructions required for the vectorized node
9718 // and for the buildvector node.
9719 unsigned UndefCnt = 0;
9720 // Count the number of extra shuffles, required for vector nodes.
9721 unsigned ExtraShuffleInsts = 0;
9722 // Check that operands do not contain the same values and create either a
9723 // perfect diamond match or a shuffled match.
9724 if (Operands.size() == 2) {
9725 // Do not count same operands twice.
9726 if (Operands.front() == Operands.back()) {
9727 Operands.erase(Operands.begin());
9728 } else if (!allConstant(Operands.front()) &&
9729 all_of(Operands.front(), [&](Value *V) {
9730 return is_contained(Operands.back(), V);
9731 })) {
9732 Operands.erase(Operands.begin());
9733 ++ExtraShuffleInsts;
9734 }
9735 }
9736 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9737 // Vectorize the node if:
9738 // 1. At least a single operand is constant or splat.
9739 // 2. Operands have many loop invariants (the instructions are not loop
9740 // invariants).
9741 // 3. At least a single unique operand is supposed to be vectorized.
9742 return none_of(Operands,
9743 [&](ArrayRef<Value *> Op) {
9744 if (allConstant(Op) ||
9745 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9746 getSameOpcode(Op, *TLI)))
9747 return false;
9748 DenseMap<Value *, unsigned> Uniques;
9749 for (Value *V : Op) {
9750 if (isa<Constant, ExtractElementInst>(V) ||
9751 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9752 if (isa<UndefValue>(V))
9753 ++UndefCnt;
9754 continue;
9755 }
9756 auto Res = Uniques.try_emplace(V, 0);
9757 // Found first duplicate - need to add shuffle.
9758 if (!Res.second && Res.first->second == 1)
9759 ++ExtraShuffleInsts;
9760 ++Res.first->getSecond();
9761 if (auto *I = dyn_cast<Instruction>(V))
9762 UniqueOpcodes.insert(I->getOpcode());
9763 else if (Res.second)
9764 ++NonInstCnt;
9765 }
9766 return none_of(Uniques, [&](const auto &P) {
9767 return P.first->hasNUsesOrMore(P.second + 1) &&
9768 none_of(P.first->users(), [&](User *U) {
9769 return isVectorized(U) || Uniques.contains(U);
9770 });
9771 });
9772 }) ||
9773 // Do not vectorize the node if the estimated number of vector instructions
9774 // is more than the estimated number of buildvector instructions. Number of
9775 // vector operands is number of vector instructions + number of vector
9776 // instructions for operands (buildvectors). Number of buildvector
9777 // instructions is just number_of_operands * number_of_scalars.
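 // E.g. for a 4-scalar node whose main operation has 2 operands, the
 // buildvector estimate is 2 * 4 = 8 scalar instructions.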
9778 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9779 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9780 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9781}
9782
9783/// Builds the argument types vector for the given call instruction with the
9784/// given \p ID for the specified vector factor.
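/// Scalar arguments of the intrinsic keep their scalar type; the remaining
/// arguments are widened to the given vector factor, using the minimal
/// bitwidth \p MinBW for narrowed integer arguments when it is non-zero.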
9785static SmallVector<Type *>
9786buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9787 const unsigned VF, unsigned MinBW,
9788 const TargetTransformInfo *TTI) {
9789 SmallVector<Type *> ArgTys;
9790 for (auto [Idx, Arg] : enumerate(CI->args())) {
9791 if (ID != Intrinsic::not_intrinsic) {
9792 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9793 ArgTys.push_back(Arg->getType());
9794 continue;
9795 }
9796 if (MinBW > 0) {
9797 ArgTys.push_back(
9798 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9799 continue;
9800 }
9801 }
9802 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9803 }
9804 return ArgTys;
9805}
9806
9807/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9808/// function (if possible) calls. Returns invalid cost for the corresponding
9809/// calls, if they cannot be vectorized/will be scalarized.
9810static std::pair<InstructionCost, InstructionCost>
9811getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9812 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9813 ArrayRef<Type *> ArgTys) {
9814 auto Shape = VFShape::get(CI->getFunctionType(),
9815 ElementCount::getFixed(VecTy->getNumElements()),
9816 false /*HasGlobalPred*/);
9817 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9818 auto LibCost = InstructionCost::getInvalid();
9819 if (!CI->isNoBuiltin() && VecFunc) {
9820 // Calculate the cost of the vector library call.
9821 // If the corresponding vector call is cheaper, return its cost.
9822 LibCost =
9823 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9824 }
9825 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9826
9827 // Calculate the cost of the vector intrinsic call.
9828 FastMathFlags FMF;
9829 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9830 FMF = FPCI->getFastMathFlags();
9831 const InstructionCost ScalarLimit = 10000;
9832 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9833 LibCost.isValid() ? LibCost : ScalarLimit);
9834 auto IntrinsicCost =
9835 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9836 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9837 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9838 IntrinsicCost = InstructionCost::getInvalid();
9839
9840 return {IntrinsicCost, LibCost};
9841}
9842
9843BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9844 const InstructionsState &S, ArrayRef<Value *> VL,
9845 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9846 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9847 assert(S.getMainOp() &&
9848 "Expected instructions with same/alternate opcodes only.");
9849
9850 unsigned ShuffleOrOp =
9851 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9852 Instruction *VL0 = S.getMainOp();
9853 switch (ShuffleOrOp) {
9854 case Instruction::PHI: {
9855 // Too many operands - gather, most probably won't be vectorized.
9856 if (VL0->getNumOperands() > MaxPHINumOperands)
9857 return TreeEntry::NeedToGather;
9858 // Check for terminator values (e.g. invoke).
9859 for (Value *V : VL) {
9860 auto *PHI = dyn_cast<PHINode>(V);
9861 if (!PHI)
9862 continue;
9863 for (Value *Incoming : PHI->incoming_values()) {
9864 Instruction *Term = dyn_cast<Instruction>(Incoming);
9865 if (Term && Term->isTerminator()) {
9866 LLVM_DEBUG(dbgs()
9867 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9868 return TreeEntry::NeedToGather;
9869 }
9870 }
9871 }
9872
9873 return TreeEntry::Vectorize;
9874 }
9875 case Instruction::ExtractElement:
9876 if (any_of(VL, [&](Value *V) {
9877 auto *EI = dyn_cast<ExtractElementInst>(V);
9878 if (!EI)
9879 return true;
9880 return isVectorized(EI->getOperand(0));
9881 }))
9882 return TreeEntry::NeedToGather;
9883 [[fallthrough]];
9884 case Instruction::ExtractValue: {
9885 bool Reuse = canReuseExtract(VL, CurrentOrder);
9886 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9887 // non-full registers).
9888 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9889 return TreeEntry::NeedToGather;
9890 if (Reuse || !CurrentOrder.empty())
9891 return TreeEntry::Vectorize;
9892 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9893 return TreeEntry::NeedToGather;
9894 }
9895 case Instruction::InsertElement: {
9896 // Check that we have a buildvector and not a shuffle of 2 or more
9897 // different vectors.
9898 ValueSet SourceVectors;
9899 for (Value *V : VL) {
9900 if (isa<PoisonValue>(V)) {
9901 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
9902 return TreeEntry::NeedToGather;
9903 }
9904 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
9905 assert(getElementIndex(V) != std::nullopt &&
9906 "Non-constant or undef index?");
9907 }
9908
9909 if (count_if(VL, [&SourceVectors](Value *V) {
9910 return !SourceVectors.contains(V);
9911 }) >= 2) {
9912 // Found 2nd source vector - cancel.
9913 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9914 "different source vectors.\n");
9915 return TreeEntry::NeedToGather;
9916 }
9917
9918 if (any_of(VL, [&SourceVectors](Value *V) {
9919 // The last InsertElement can have multiple uses.
9920 return SourceVectors.contains(V) && !V->hasOneUse();
9921 })) {
9922 assert(SLPReVec && "Only supported by REVEC.");
9923 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9924 "multiple uses.\n");
9925 return TreeEntry::NeedToGather;
9926 }
9927
9928 return TreeEntry::Vectorize;
9929 }
9930 case Instruction::Load: {
9931 // Check that a vectorized load would load the same memory as a scalar
9932 // load. For example, we don't want to vectorize loads that are smaller
9933 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9934 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9935 // from such a struct, we read/write packed bits disagreeing with the
9936 // unvectorized version.
9937 auto IsGatheredNode = [&]() {
9938 if (!GatheredLoadsEntriesFirst)
9939 return false;
9940 return all_of(VL, [&](Value *V) {
9941 if (isa<PoisonValue>(V))
9942 return true;
9943 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
9944 return TE->Idx >= *GatheredLoadsEntriesFirst;
9945 });
9946 });
9947 };
9948 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
9949 case LoadsState::Vectorize:
9950 return TreeEntry::Vectorize;
9951 case LoadsState::CompressVectorize:
9952 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9953 // Delay slow vectorized nodes for better vectorization attempts.
9954 LoadEntriesToVectorize.insert(VectorizableTree.size());
9955 return TreeEntry::NeedToGather;
9956 }
9957 return IsGatheredNode() ? TreeEntry::NeedToGather
9958 : TreeEntry::CompressVectorize;
9959 case LoadsState::ScatterVectorize:
9960 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9961 // Delay slow vectorized nodes for better vectorization attempts.
9962 LoadEntriesToVectorize.insert(VectorizableTree.size());
9963 return TreeEntry::NeedToGather;
9964 }
9965 return IsGatheredNode() ? TreeEntry::NeedToGather
9966 : TreeEntry::ScatterVectorize;
9967 case LoadsState::StridedVectorize:
9968 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9969 // Delay slow vectorized nodes for better vectorization attempts.
9970 LoadEntriesToVectorize.insert(VectorizableTree.size());
9971 return TreeEntry::NeedToGather;
9972 }
9973 return IsGatheredNode() ? TreeEntry::NeedToGather
9974 : TreeEntry::StridedVectorize;
9975 case LoadsState::Gather:
9976#ifndef NDEBUG
9977 Type *ScalarTy = VL0->getType();
9978 if (DL->getTypeSizeInBits(ScalarTy) !=
9979 DL->getTypeAllocSizeInBits(ScalarTy))
9980 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9981 else if (any_of(VL, [](Value *V) {
9982 auto *LI = dyn_cast<LoadInst>(V);
9983 return !LI || !LI->isSimple();
9984 }))
9985 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9986 else
9987 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9988#endif // NDEBUG
9989 registerNonVectorizableLoads(VL);
9990 return TreeEntry::NeedToGather;
9991 }
9992 llvm_unreachable("Unexpected state of loads");
9993 }
9994 case Instruction::ZExt:
9995 case Instruction::SExt:
9996 case Instruction::FPToUI:
9997 case Instruction::FPToSI:
9998 case Instruction::FPExt:
9999 case Instruction::PtrToInt:
10000 case Instruction::IntToPtr:
10001 case Instruction::SIToFP:
10002 case Instruction::UIToFP:
10003 case Instruction::Trunc:
10004 case Instruction::FPTrunc:
10005 case Instruction::BitCast: {
10006 Type *SrcTy = VL0->getOperand(0)->getType();
10007 for (Value *V : VL) {
10008 if (isa<PoisonValue>(V))
10009 continue;
10010 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10011 if (Ty != SrcTy || !isValidElementType(Ty)) {
10012 LLVM_DEBUG(
10013 dbgs() << "SLP: Gathering casts with different src types.\n");
10014 return TreeEntry::NeedToGather;
10015 }
10016 }
10017 return TreeEntry::Vectorize;
10018 }
10019 case Instruction::ICmp:
10020 case Instruction::FCmp: {
10021 // Check that all of the compares have the same predicate.
10022 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10023 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10024 Type *ComparedTy = VL0->getOperand(0)->getType();
10025 for (Value *V : VL) {
10026 if (isa<PoisonValue>(V))
10027 continue;
10028 auto *Cmp = cast<CmpInst>(V);
10029 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10030 Cmp->getOperand(0)->getType() != ComparedTy) {
10031 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10032 return TreeEntry::NeedToGather;
10033 }
10034 }
10035 return TreeEntry::Vectorize;
10036 }
10037 case Instruction::Select:
10038 case Instruction::FNeg:
10039 case Instruction::Add:
10040 case Instruction::FAdd:
10041 case Instruction::Sub:
10042 case Instruction::FSub:
10043 case Instruction::Mul:
10044 case Instruction::FMul:
10045 case Instruction::UDiv:
10046 case Instruction::SDiv:
10047 case Instruction::FDiv:
10048 case Instruction::URem:
10049 case Instruction::SRem:
10050 case Instruction::FRem:
10051 case Instruction::Shl:
10052 case Instruction::LShr:
10053 case Instruction::AShr:
10054 case Instruction::And:
10055 case Instruction::Or:
10056 case Instruction::Xor:
10057 case Instruction::Freeze:
10058 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10059 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10060 auto *I = dyn_cast<Instruction>(V);
10061 return I && I->isBinaryOp() && !I->isFast();
10062 }))
10063 return TreeEntry::NeedToGather;
10064 return TreeEntry::Vectorize;
10065 case Instruction::GetElementPtr: {
10066 // We don't combine GEPs with complicated (nested) indexing.
10067 for (Value *V : VL) {
10068 auto *I = dyn_cast<GetElementPtrInst>(V);
10069 if (!I)
10070 continue;
10071 if (I->getNumOperands() != 2) {
10072 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10073 return TreeEntry::NeedToGather;
10074 }
10075 }
10076
10077 // We can't combine several GEPs into one vector if they operate on
10078 // different types.
10079 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10080 for (Value *V : VL) {
10081 auto *GEP = dyn_cast<GEPOperator>(V);
10082 if (!GEP)
10083 continue;
10084 Type *CurTy = GEP->getSourceElementType();
10085 if (Ty0 != CurTy) {
10086 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10087 return TreeEntry::NeedToGather;
10088 }
10089 }
10090
10091 // We don't combine GEPs with non-constant indexes.
10092 Type *Ty1 = VL0->getOperand(1)->getType();
10093 for (Value *V : VL) {
10094 auto *I = dyn_cast<GetElementPtrInst>(V);
10095 if (!I)
10096 continue;
10097 auto *Op = I->getOperand(1);
10098 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10099 (Op->getType() != Ty1 &&
10100 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10101 Op->getType()->getScalarSizeInBits() >
10102 DL->getIndexSizeInBits(
10103 V->getType()->getPointerAddressSpace())))) {
10104 LLVM_DEBUG(
10105 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10106 return TreeEntry::NeedToGather;
10107 }
10108 }
10109
10110 return TreeEntry::Vectorize;
10111 }
10112 case Instruction::Store: {
10113 // Check if the stores are consecutive or if we need to swizzle them.
10114 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10115 // Avoid types that are padded when being allocated as scalars, while
10116 // being packed together in a vector (such as i1).
10117 if (DL->getTypeSizeInBits(ScalarTy) !=
10118 DL->getTypeAllocSizeInBits(ScalarTy)) {
10119 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10120 return TreeEntry::NeedToGather;
10121 }
10122 // Make sure all stores in the bundle are simple - we can't vectorize
10123 // atomic or volatile stores.
10124 for (Value *V : VL) {
10125 auto *SI = cast<StoreInst>(V);
10126 if (!SI->isSimple()) {
10127 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10128 return TreeEntry::NeedToGather;
10129 }
10130 PointerOps.push_back(SI->getPointerOperand());
10131 }
10132
10133 // Check the order of pointer operands.
10134 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10135 Value *Ptr0;
10136 Value *PtrN;
10137 if (CurrentOrder.empty()) {
10138 Ptr0 = PointerOps.front();
10139 PtrN = PointerOps.back();
10140 } else {
10141 Ptr0 = PointerOps[CurrentOrder.front()];
10142 PtrN = PointerOps[CurrentOrder.back()];
10143 }
10144 std::optional<int64_t> Dist =
10145 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10146 // Check that the sorted pointer operands are consecutive.
10147 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10148 return TreeEntry::Vectorize;
10149 }
10150
10151 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10152 return TreeEntry::NeedToGather;
10153 }
10154 case Instruction::Call: {
10155 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10156 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10157 auto *I = dyn_cast<Instruction>(V);
10158 return I && !I->isFast();
10159 }))
10160 return TreeEntry::NeedToGather;
10161 // Check if the calls are all to the same vectorizable intrinsic or
10162 // library function.
10163 CallInst *CI = cast<CallInst>(VL0);
10164 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10165
10166 VFShape Shape = VFShape::get(
10167 CI->getFunctionType(),
10168 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10169 false /*HasGlobalPred*/);
10170 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10171
10172 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10173 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10174 return TreeEntry::NeedToGather;
10175 }
10176 Function *F = CI->getCalledFunction();
10177 unsigned NumArgs = CI->arg_size();
10178 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10179 for (unsigned J = 0; J != NumArgs; ++J)
10180 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10181 ScalarArgs[J] = CI->getArgOperand(J);
10182 for (Value *V : VL) {
10183 CallInst *CI2 = dyn_cast<CallInst>(V);
10184 if (!CI2 || CI2->getCalledFunction() != F ||
10185 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10186 (VecFunc &&
10187 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10188 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10189 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10190 << "\n");
10191 return TreeEntry::NeedToGather;
10192 }
10193 // Some intrinsics have scalar arguments, and these should be the same
10194 // in order for them to be vectorized.
10195 for (unsigned J = 0; J != NumArgs; ++J) {
10196 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10197 Value *A1J = CI2->getArgOperand(J);
10198 if (ScalarArgs[J] != A1J) {
10200 << "SLP: mismatched arguments in call:" << *CI
10201 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10202 return TreeEntry::NeedToGather;
10203 }
10204 }
10205 }
10206 // Verify that the bundle operands are identical between the two calls.
10207 if (CI->hasOperandBundles() &&
10208 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10209 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10210 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10211 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10212 << "!=" << *V << '\n');
10213 return TreeEntry::NeedToGather;
10214 }
10215 }
10216 SmallVector<Type *> ArgTys =
10217 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10218 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10219 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10220 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10221 return TreeEntry::NeedToGather;
10222
10223 return TreeEntry::Vectorize;
10224 }
10225 case Instruction::ShuffleVector: {
10226 if (!S.isAltShuffle()) {
10227 // REVEC can support non alternate shuffle.
10228 if (SLPReVec)
10229 return TreeEntry::Vectorize;
10230 // If this is not an alternate sequence of opcode like add-sub
10231 // then do not vectorize this instruction.
10232 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10233 return TreeEntry::NeedToGather;
10234 }
10235 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10236 LLVM_DEBUG(
10237 dbgs()
10238 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10239 "the whole alt sequence is not profitable.\n");
10240 return TreeEntry::NeedToGather;
10241 }
10242
10243 return TreeEntry::Vectorize;
10244 }
10245 default:
10246 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10247 return TreeEntry::NeedToGather;
10248 }
10249}
10250
10251namespace {
10252/// Allows correct handling of the operands of phi nodes, based on the \p Main
10253/// PHINode order of incoming basic blocks/values.
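/// For example, when the phis list the same incoming blocks in different
/// orders, operands are collected per incoming block of \p Main rather than
/// per incoming-value index.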
10254class PHIHandler {
10255 DominatorTree &DT;
10256 PHINode *Main = nullptr;
10257 ArrayRef<Value *> Phis;
10258 SmallVector<SmallVector<Value *>> Operands;
10259
10260public:
10261 PHIHandler() = delete;
10262 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10263 : DT(DT), Main(Main), Phis(Phis),
10264 Operands(Main->getNumIncomingValues(),
10265 SmallVector<Value *>(Phis.size(), nullptr)) {}
10266 void buildOperands() {
10267 constexpr unsigned FastLimit = 4;
10268 if (Main->getNumIncomingValues() <= FastLimit) {
10269 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10270 BasicBlock *InBB = Main->getIncomingBlock(I);
10271 if (!DT.isReachableFromEntry(InBB)) {
10272 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10273 continue;
10274 }
10275 // Prepare the operand vector.
10276 for (auto [Idx, V] : enumerate(Phis)) {
10277 auto *P = dyn_cast<PHINode>(V);
10278 if (!P) {
10280 "Expected isa instruction or poison value.");
10281 Operands[I][Idx] = V;
10282 continue;
10283 }
10284 if (P->getIncomingBlock(I) == InBB)
10285 Operands[I][Idx] = P->getIncomingValue(I);
10286 else
10287 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10288 }
10289 }
10290 return;
10291 }
10292 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10293 Blocks;
10294 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10295 BasicBlock *InBB = Main->getIncomingBlock(I);
10296 if (!DT.isReachableFromEntry(InBB)) {
10297 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10298 continue;
10299 }
10300 Blocks.try_emplace(InBB).first->second.push_back(I);
10301 }
10302 for (auto [Idx, V] : enumerate(Phis)) {
10303 if (isa<PoisonValue>(V)) {
10304 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10305 Operands[I][Idx] = V;
10306 continue;
10307 }
10308 auto *P = cast<PHINode>(V);
10309 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10310 BasicBlock *InBB = P->getIncomingBlock(I);
10311 if (InBB == Main->getIncomingBlock(I)) {
10312 if (isa_and_present<PoisonValue>(Operands[I][Idx]))
10313 continue;
10314 Operands[I][Idx] = P->getIncomingValue(I);
10315 continue;
10316 }
10317 auto *It = Blocks.find(InBB);
10318 if (It == Blocks.end())
10319 continue;
10320 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10321 }
10322 }
10323 for (const auto &P : Blocks) {
10324 ArrayRef<unsigned> IncomingValues = P.second;
10325 if (IncomingValues.size() <= 1)
10326 continue;
10327 unsigned BasicI = IncomingValues.consume_front();
10328 for (unsigned I : IncomingValues) {
10329 assert(all_of(enumerate(Operands[I]),
10330 [&](const auto &Data) {
10331 return !Data.value() ||
10332 Data.value() == Operands[BasicI][Data.index()];
10333 }) &&
10334 "Expected empty operands list.");
10335 Operands[I] = Operands[BasicI];
10336 }
10337 }
10338 }
10339 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10340};
10341} // namespace
10342
10343/// Returns main/alternate instructions for the given \p VL. Unlike
10344/// getSameOpcode, this supports non-compatible instructions for better
10345/// SplitVectorize node support.
10346/// \returns the first main/alt instructions if \p VL contains only poisons and
10347/// instructions with exactly 2 opcodes; a pair of nullptrs otherwise.
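/// For example, {add, poison, sub, add} within one block yields the pair
/// (add, sub), while a list with three distinct opcodes yields nullptrs.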
10348static std::pair<Instruction *, Instruction *>
10349getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10350 Instruction *MainOp = nullptr;
10351 Instruction *AltOp = nullptr;
10352 for (Value *V : VL) {
10353 if (isa<PoisonValue>(V))
10354 continue;
10355 auto *I = dyn_cast<Instruction>(V);
10356 if (!I)
10357 return {};
10358 if (!MainOp) {
10359 MainOp = I;
10360 continue;
10361 }
10362 if (MainOp->getOpcode() == I->getOpcode()) {
10363 if (I->getParent() != MainOp->getParent())
10364 return {};
10365 continue;
10366 }
10367 if (!AltOp) {
10368 AltOp = I;
10369 continue;
10370 }
10371 if (AltOp->getOpcode() == I->getOpcode()) {
10372 if (I->getParent() != AltOp->getParent())
10373 return {};
10374 continue;
10375 }
10376 return {};
10377 }
10378 if (!AltOp)
10379 return {};
10380 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10381 "Expected different main and alt instructions.");
10382 return std::make_pair(MainOp, AltOp);
10383}
10384
10385/// Checks that every instruction appears once in the list and, if not, packs
10386/// them, building the \p ReuseShuffleIndices mask and mutating \p VL. The list of
10387/// unique scalars is extended by poison values to the whole register size.
10388///
10389/// \returns false if \p VL could not be uniquified, in which case \p VL is
10390/// unchanged and \p ReuseShuffleIndices is empty.
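/// For example, VL = {a, b, a, c} becomes {a, b, c} with
/// \p ReuseShuffleIndices = {0, 1, 0, 2}, padded with poisons up to the full
/// register size when needed.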
10391static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10392 SmallVectorImpl<int> &ReuseShuffleIndices,
10393 const TargetTransformInfo &TTI,
10394 const TargetLibraryInfo &TLI,
10395 const InstructionsState &S,
10396 const BoUpSLP::EdgeInfo &UserTreeIdx,
10397 bool TryPad = false) {
10398 // Check that every instruction appears once in this bundle.
10399 SmallVector<Value *> UniqueValues;
10400 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10401 for (Value *V : VL) {
10402 if (isConstant(V)) {
10403 // Constants are always considered distinct, even if the same constant
10404 // appears multiple times in VL.
10405 ReuseShuffleIndices.emplace_back(
10406 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10407 UniqueValues.emplace_back(V);
10408 continue;
10409 }
10410 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10411 ReuseShuffleIndices.emplace_back(Res.first->second);
10412 if (Res.second)
10413 UniqueValues.emplace_back(V);
10414 }
10415
10416 // Easy case: VL has unique values and a "natural" size
10417 size_t NumUniqueScalarValues = UniqueValues.size();
10418 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10419 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10420 if (NumUniqueScalarValues == VL.size() &&
10421 (VectorizeNonPowerOf2 || IsFullVectors)) {
10422 ReuseShuffleIndices.clear();
10423 return true;
10424 }
10425
10426 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10427 if ((UserTreeIdx.UserTE &&
10428 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10429 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10430 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10431 "for nodes with padding.\n");
10432 ReuseShuffleIndices.clear();
10433 return false;
10434 }
10435
10436 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10437 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10438 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10439 return isa<UndefValue>(V) || !isConstant(V);
10440 }))) {
10441 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10442 S.getMainOp()->isSafeToRemove() &&
10443 (S.areInstructionsWithCopyableElements() ||
10444 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10445 // Find the number of elements which form full vectors.
10446 unsigned PWSz = getFullVectorNumberOfElements(
10447 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10448 PWSz = std::min<unsigned>(PWSz, VL.size());
10449 if (PWSz == VL.size()) {
10450 // We ended up with the same size after removing duplicates and
10451 // upgrading the resulting vector size to a "nice size". Just keep
10452 // the initial VL then.
10453 ReuseShuffleIndices.clear();
10454 } else {
10455 // Pad unique values with poison to grow the vector to a "nice" size
10456 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10457 UniqueValues.end());
10458 PaddedUniqueValues.append(
10459 PWSz - UniqueValues.size(),
10460 PoisonValue::get(UniqueValues.front()->getType()));
10461 // Check that the operations extended with poisons/copyables are still
10462 // valid for vectorization (div/rem are not allowed).
10463 if (!S.areInstructionsWithCopyableElements() &&
10464 !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
10465 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10466 ReuseShuffleIndices.clear();
10467 return false;
10468 }
10469 VL = std::move(PaddedUniqueValues);
10470 }
10471 return true;
10472 }
10473 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10474 ReuseShuffleIndices.clear();
10475 return false;
10476 }
10477 VL = std::move(UniqueValues);
10478 return true;
10479}
10480
10481bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10482 const InstructionsState &LocalState,
10483 SmallVectorImpl<Value *> &Op1,
10484 SmallVectorImpl<Value *> &Op2,
10485 OrdersType &ReorderIndices) const {
10486 constexpr unsigned SmallNodeSize = 4;
10487 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10488 !SplitAlternateInstructions)
10489 return false;
10490
10491 // Check if this is a duplicate of another split entry.
10492 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10493 << ".\n");
10494 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10495 if (E->isSame(VL)) {
10496 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10497 << *LocalState.getMainOp() << ".\n");
10498 return false;
10499 }
10500 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10501 if (all_of(VL, [&](Value *V) {
10502 return isa<PoisonValue>(V) || Values.contains(V);
10503 })) {
10504 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10505 return false;
10506 }
10507 }
10508
10509 ReorderIndices.assign(VL.size(), VL.size());
10510 SmallBitVector Op1Indices(VL.size());
10511 for (auto [Idx, V] : enumerate(VL)) {
10512 auto *I = dyn_cast<Instruction>(V);
10513 if (!I) {
10514 Op1.push_back(V);
10515 Op1Indices.set(Idx);
10516 continue;
10517 }
10518 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10519 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10520 *TLI)) ||
10521 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10522 !isAlternateInstruction(I, LocalState.getMainOp(),
10523 LocalState.getAltOp(), *TLI))) {
10524 Op1.push_back(V);
10525 Op1Indices.set(Idx);
10526 continue;
10527 }
10528 Op2.push_back(V);
10529 }
10530 Type *ScalarTy = getValueType(VL.front());
10531 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10532 unsigned Opcode0 = LocalState.getOpcode();
10533 unsigned Opcode1 = LocalState.getAltOpcode();
10534 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10535 // Enable the split node only if the nodes do not form a legal alternate
10536 // instruction (like X86 addsub).
10537 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10538 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10539 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10540 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10541 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10542 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10543 return false;
10544 // Enable the split node only if all nodes are power-of-2/full registers.
10545 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10546 for (unsigned Idx : seq<unsigned>(VL.size())) {
10547 if (Op1Indices.test(Idx)) {
10548 ReorderIndices[Op1Cnt] = Idx;
10549 ++Op1Cnt;
10550 } else {
10551 ReorderIndices[Op2Cnt] = Idx;
10552 ++Op2Cnt;
10553 }
10554 }
10555 if (isIdentityOrder(ReorderIndices))
10556 ReorderIndices.clear();
10557 SmallVector<int> Mask;
10558 if (!ReorderIndices.empty())
10559 inversePermutation(ReorderIndices, Mask);
10560 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10561 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10562 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10563 // Check for non-profitable single-register ops, which are better
10564 // represented as alternate ops.
10565 if (NumParts >= VL.size())
10566 return false;
10567 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10568 InstructionCost InsertCost = ::getShuffleCost(
10569 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10570 FixedVectorType *SubVecTy =
10571 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10572 InstructionCost NewShuffleCost =
10573 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10574 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10575 (Mask.empty() || InsertCost >= NewShuffleCost))
10576 return false;
10577 if ((LocalState.getMainOp()->isBinaryOp() &&
10578 LocalState.getAltOp()->isBinaryOp() &&
10579 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10580 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10581 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10582 (LocalState.getMainOp()->isUnaryOp() &&
10583 LocalState.getAltOp()->isUnaryOp())) {
10584 InstructionCost OriginalVecOpsCost =
10585 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10586 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10587 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10588 for (unsigned Idx : seq<unsigned>(VL.size())) {
10589 if (isa<PoisonValue>(VL[Idx]))
10590 continue;
10591 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10592 }
10593 InstructionCost OriginalCost =
10594 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10595 VecTy, OriginalMask, Kind);
10596 InstructionCost NewVecOpsCost =
10597 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10598 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10599 InstructionCost NewCost =
10600 NewVecOpsCost + InsertCost +
10601 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10602 VectorizableTree.front()->getOpcode() == Instruction::Store
10603 ? NewShuffleCost
10604 : 0);
10605 // If not profitable to split - exit.
10606 if (NewCost >= OriginalCost)
10607 return false;
10608 }
10609 return true;
10610}
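// Illustrative sketch of the cost comparison above (simplified; the real
// numbers come from the TTI cost model). For an alternate bundle such as
// {add, add, sub, sub}, the usual alternate lowering is two full-width vector
// ops plus one two-source permute:
//   %a = add <4 x i32> %x, %y
//   %s = sub <4 x i32> %x, %y
//   %r = shufflevector %a, %s, <0, 1, 6, 7>     ; SK_PermuteTwoSrc
// whereas the split-node lowering runs each opcode on its own half-width
// vector and concatenates the results:
//   %a = add <2 x i32> %x0, %y0
//   %s = sub <2 x i32> %x1, %y1
//   ; concatenation modeled as SK_InsertSubvector, plus a reorder shuffle if
//   ; the two groups are interleaved in the original lane order
// For binary, cast, and unary bundles the node is split only when the second
// form is strictly cheaper.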
10611
10612namespace {
10613/// Class accepts incoming list of values, checks if it is able to model
10614/// "copyable" values as compatible operations, and generates the list of values
10615 /// for scheduling and the list of operands for the new nodes.
10616class InstructionsCompatibilityAnalysis {
10617 DominatorTree &DT;
10618 const DataLayout &DL;
10619 const TargetTransformInfo &TTI;
10620 const TargetLibraryInfo &TLI;
10621 unsigned MainOpcode = 0;
10622 Instruction *MainOp = nullptr;
10623
10624 /// Checks if the opcode is supported as the main opcode for copyable
10625 /// elements.
10626 static bool isSupportedOpcode(const unsigned Opcode) {
10627 return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10628 }
10629
10630 /// Identifies the best candidate value, which represents the main opcode
10631 /// operation.
10632 /// Currently the best candidate is an instruction with a supported opcode whose
10633 /// parent block has the highest DFS incoming number (the block that dominates the others).
10634 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10635 BasicBlock *Parent = nullptr;
10636 // Checks if the instruction has supported opcode.
10637 auto IsSupportedInstruction = [&](Instruction *I) {
10638 return I && isSupportedOpcode(I->getOpcode()) &&
10639 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10640 };
10641 // Exclude instructions that are operands of other candidates immediately to
10642 // improve compile time; they will be unable to be scheduled anyway.
10643 SmallDenseSet<Value *, 8> Operands;
10644 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10645 for (Value *V : VL) {
10646 auto *I = dyn_cast<Instruction>(V);
10647 if (!I)
10648 continue;
10649 if (!DT.isReachableFromEntry(I->getParent()))
10650 continue;
10651 if (Candidates.empty()) {
10652 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10653 Parent = I->getParent();
10654 Operands.insert(I->op_begin(), I->op_end());
10655 continue;
10656 }
10657 if (Parent == I->getParent()) {
10658 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10659 Operands.insert(I->op_begin(), I->op_end());
10660 continue;
10661 }
10662 auto *NodeA = DT.getNode(Parent);
10663 auto *NodeB = DT.getNode(I->getParent());
10664 assert(NodeA && "Should only process reachable instructions");
10665 assert(NodeB && "Should only process reachable instructions");
10666 assert((NodeA == NodeB) ==
10667 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10668 "Different nodes should have different DFS numbers");
10669 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10670 Candidates.clear();
10671 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10672 Parent = I->getParent();
10673 Operands.clear();
10674 Operands.insert(I->op_begin(), I->op_end());
10675 }
10676 }
10677 unsigned BestOpcodeNum = 0;
10678 MainOp = nullptr;
10679 for (const auto &P : Candidates) {
10680 if (P.second.size() < BestOpcodeNum)
10681 continue;
10682 for (Instruction *I : P.second) {
10683 if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10684 MainOp = I;
10685 BestOpcodeNum = P.second.size();
10686 break;
10687 }
10688 }
10689 }
10690 if (MainOp) {
10691 // Do not match, if any copyable is a terminator from the same block as
10692 // the main operation.
10693 if (any_of(VL, [&](Value *V) {
10694 auto *I = dyn_cast<Instruction>(V);
10695 return I && I->getParent() == MainOp->getParent() &&
10696 I->isTerminator();
10697 })) {
10698 MainOp = nullptr;
10699 return;
10700 }
10701 MainOpcode = MainOp->getOpcode();
10702 }
10703 }
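// Summary of the heuristic above: candidate instructions are grouped by opcode
// within the dominating (highest DFS-in number) reachable block seen in VL;
// the main op is then taken from the largest group whose opcode is supported,
// skipping instructions that are themselves operands of other candidates.
// If any value in VL is a terminator of that block, matching is abandoned,
// presumably because such a lane cannot be modeled as a copyable element
// there.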
10704
10705 /// Returns the idempotent value for the \p MainOp with the detected \p
10706 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10707 /// the operand itself, since V or V == V.
10708 Value *selectBestIdempotentValue() const {
10709 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10710 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10711 !MainOp->isCommutative());
10712 }
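// Example: with MainOpcode == Add a copyable value V is modeled with operands
// (V, 0), i.e. as "V + 0"; with MainOpcode == LShr it becomes "V >> 0". Zero
// is a right identity for LShr, which is why right-hand-side-only identities
// are allowed for non-commutative main ops. Either form still produces V, so
// the lane keeps computing its original scalar value.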
10713
10714 /// Returns the value and operands for \p V, depending on whether it is an original
10715 /// instruction, whose actual operands should be returned, or a
10716 /// copyable element, which should be represented as an idempotent instruction.
10717 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10718 if (isa<PoisonValue>(V))
10719 return {V, V};
10720 if (!S.isCopyableElement(V))
10721 return convertTo(cast<Instruction>(V), S).second;
10722 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10723 return {V, selectBestIdempotentValue()};
10724 }
10725
10726 /// Builds operands for the original instructions.
10727 void
10728 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10729 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10730
10731 unsigned ShuffleOrOp =
10732 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10733 Instruction *VL0 = S.getMainOp();
10734
10735 switch (ShuffleOrOp) {
10736 case Instruction::PHI: {
10737 auto *PH = cast<PHINode>(VL0);
10738
10739 // Keeps the reordered operands to avoid code duplication.
10740 PHIHandler Handler(DT, PH, VL);
10741 Handler.buildOperands();
10742 Operands.assign(PH->getNumOperands(), {});
10743 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10744 Operands[I].assign(Handler.getOperands(I).begin(),
10745 Handler.getOperands(I).end());
10746 return;
10747 }
10748 case Instruction::ExtractValue:
10749 case Instruction::ExtractElement:
10750 // This is a special case, as it does not gather, but at the same time
10751 // we are not extending buildTreeRec() towards the operands.
10752 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10753 return;
10754 case Instruction::InsertElement:
10755 Operands.assign(2, {VL.size(), nullptr});
10756 for (auto [Idx, V] : enumerate(VL)) {
10757 auto *IE = cast<InsertElementInst>(V);
10758 for (auto [OpIdx, Ops] : enumerate(Operands))
10759 Ops[Idx] = IE->getOperand(OpIdx);
10760 }
10761 return;
10762 case Instruction::Load:
10763 Operands.assign(
10764 1, {VL.size(),
10765 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10766 for (auto [V, Op] : zip(VL, Operands.back())) {
10767 auto *LI = dyn_cast<LoadInst>(V);
10768 if (!LI)
10769 continue;
10770 Op = LI->getPointerOperand();
10771 }
10772 return;
10773 case Instruction::ZExt:
10774 case Instruction::SExt:
10775 case Instruction::FPToUI:
10776 case Instruction::FPToSI:
10777 case Instruction::FPExt:
10778 case Instruction::PtrToInt:
10779 case Instruction::IntToPtr:
10780 case Instruction::SIToFP:
10781 case Instruction::UIToFP:
10782 case Instruction::Trunc:
10783 case Instruction::FPTrunc:
10784 case Instruction::BitCast:
10785 case Instruction::ICmp:
10786 case Instruction::FCmp:
10787 case Instruction::Select:
10788 case Instruction::FNeg:
10789 case Instruction::Add:
10790 case Instruction::FAdd:
10791 case Instruction::Sub:
10792 case Instruction::FSub:
10793 case Instruction::Mul:
10794 case Instruction::FMul:
10795 case Instruction::UDiv:
10796 case Instruction::SDiv:
10797 case Instruction::FDiv:
10798 case Instruction::URem:
10799 case Instruction::SRem:
10800 case Instruction::FRem:
10801 case Instruction::Shl:
10802 case Instruction::LShr:
10803 case Instruction::AShr:
10804 case Instruction::And:
10805 case Instruction::Or:
10806 case Instruction::Xor:
10807 case Instruction::Freeze:
10808 case Instruction::Store:
10809 case Instruction::ShuffleVector:
10810 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10811 for (auto [Idx, V] : enumerate(VL)) {
10812 auto *I = dyn_cast<Instruction>(V);
10813 if (!I) {
10814 for (auto [OpIdx, Ops] : enumerate(Operands))
10815 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10816 continue;
10817 }
10818 auto [Op, ConvertedOps] = convertTo(I, S);
10819 for (auto [OpIdx, Ops] : enumerate(Operands))
10820 Ops[Idx] = ConvertedOps[OpIdx];
10821 }
10822 return;
10823 case Instruction::GetElementPtr: {
10824 Operands.assign(2, {VL.size(), nullptr});
10825 // Need to cast all indices to the same type before vectorization to
10826 // avoid crash.
10827 // Required to be able to find correct matches between different gather
10828 // nodes and reuse the vectorized values rather than trying to gather them
10829 // again.
10830 const unsigned IndexIdx = 1;
10831 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10832 Type *Ty =
10833 all_of(VL,
10834 [&](Value *V) {
10835 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10836 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10837 })
10838 ? VL0Ty
10839 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10840 ->getPointerOperandType()
10841 ->getScalarType());
10842 for (auto [Idx, V] : enumerate(VL)) {
10843 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10844 if (!GEP) {
10845 Operands[0][Idx] = V;
10846 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10847 continue;
10848 }
10849 Operands[0][Idx] = GEP->getPointerOperand();
10850 auto *Op = GEP->getOperand(IndexIdx);
10851 auto *CI = dyn_cast<ConstantInt>(Op);
10852 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10853 CI, Ty, CI->getValue().isSignBitSet(), DL)
10854 : Op;
10855 }
10856 return;
10857 }
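// Example for the GEP case above: for VL = { gep %p, i32 1 ; gep %q, i64 2 ;
// %r } the index types disagree, so Ty falls back to the data layout's pointer
// index type, and the operands become roughly
//   Operands[0] = { %p, %q, %r }    ; pointer operands
//   Operands[1] = { 1, 2, 0 }       ; constant indices folded to type Ty
// A non-GEP pointer contributes itself with a zero index so its lane stays
// well-formed.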
10858 case Instruction::Call: {
10859 auto *CI = cast<CallInst>(VL0);
10861 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10863 continue;
10864 auto &Ops = Operands.emplace_back();
10865 for (Value *V : VL) {
10866 auto *I = dyn_cast<Instruction>(V);
10867 Ops.push_back(I ? I->getOperand(Idx)
10868 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10869 }
10870 }
10871 return;
10872 }
10873 default:
10874 break;
10875 }
10876 llvm_unreachable("Unexpected vectorization of the instructions.");
10877 }
10878
10879public:
10880 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10881 const TargetTransformInfo &TTI,
10882 const TargetLibraryInfo &TLI)
10883 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10884
10885 InstructionsState
10886 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
10887 bool TryCopyableElementsVectorization,
10888 bool WithProfitabilityCheck = false,
10889 bool SkipSameCodeCheck = false) {
10890 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
10891 ? InstructionsState::invalid()
10892 : getSameOpcode(VL, TLI);
10893 if (S)
10894 return S;
10895 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
10896 return S;
10897 findAndSetMainInstruction(VL, R);
10898 if (!MainOp)
10899 return InstructionsState::invalid();
10900 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
10901 if (!WithProfitabilityCheck)
10902 return S;
10903 // Check if it is profitable to vectorize the instruction.
10904 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
10905 auto BuildCandidates =
10906 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
10907 Value *V2) {
10908 if (V1 != V2 && isa<PHINode>(V1))
10909 return;
10910 auto *I1 = dyn_cast<Instruction>(V1);
10911 auto *I2 = dyn_cast<Instruction>(V2);
10912 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
10913 I1->getParent() != I2->getParent())
10914 return;
10915 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
10916 };
10917 if (VL.size() == 2) {
10918 // Check if the operands allow better vectorization.
10919 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
10920 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
10921 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
10922 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
10923 R.findBestRootPair(Candidates1) &&
10924 R.findBestRootPair(Candidates2);
10925 if (!Res && isCommutative(MainOp)) {
10926 Candidates1.clear();
10927 Candidates2.clear();
10928 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
10929 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
10930 Res = !Candidates1.empty() && !Candidates2.empty() &&
10931 R.findBestRootPair(Candidates1) &&
10932 R.findBestRootPair(Candidates2);
10933 }
10934 if (!Res)
10935 return InstructionsState::invalid();
10936 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10937 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10938 InstructionCost VectorCost;
10939 FixedVectorType *VecTy =
10940 getWidenedType(S.getMainOp()->getType(), VL.size());
10941 switch (MainOpcode) {
10942 case Instruction::Add:
10943 case Instruction::LShr:
10944 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10945 break;
10946 default:
10947 llvm_unreachable("Unexpected instruction.");
10948 }
10949 if (VectorCost > ScalarCost)
10950 return InstructionsState::invalid();
10951 return S;
10952 }
10953 assert(Operands.size() == 2 && "Unexpected number of operands!");
10954 unsigned CopyableNum =
10955 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
10956 if (CopyableNum < VL.size() / 2)
10957 return S;
10958 // Too many phi copyables - exit.
10959 const unsigned Limit = VL.size() / 24;
10960 if ((CopyableNum >= VL.size() - Limit ||
10961 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
10962 CopyableNum >= MaxPHINumOperands) &&
10963 all_of(VL, [&](Value *V) {
10964 return isa<PHINode>(V) || !S.isCopyableElement(V);
10965 }))
10966 return InstructionsState::invalid();
10967 // Check profitability if number of copyables > VL.size() / 2.
10968 // 1. Reorder operands for better matching.
10969 if (isCommutative(MainOp)) {
10970 for (auto &Ops : Operands) {
10971 // Make instructions the first operands.
10972 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
10973 std::swap(Ops.front(), Ops.back());
10974 continue;
10975 }
10976 // Make constants the second operands.
10977 if (isa<Constant>(Ops.front())) {
10978 std::swap(Ops.front(), Ops.back());
10979 continue;
10980 }
10981 }
10982 }
10983 // 2. Check, if operands can be vectorized.
10984 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
10985 return InstructionsState::invalid();
10986 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
10987 if (allConstant(Ops) || isSplat(Ops))
10988 return true;
10989 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
10990 // one is different.
10991 constexpr unsigned Limit = 4;
10992 if (Operands.front().size() >= Limit) {
10993 SmallDenseMap<const Value *, unsigned> Counters;
10994 for (Value *V : Ops) {
10995 if (isa<UndefValue>(V))
10996 continue;
10997 ++Counters[V];
10998 }
10999 if (Counters.size() == 2 &&
11000 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11001 return C.second == 1;
11002 }))
11003 return true;
11004 }
11005 // First operand not a constant or splat? Last attempt - check for
11006 // potential vectorization.
11007 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11008 InstructionsState OpS = Analysis.buildInstructionsState(
11009 Ops, R, /*TryCopyableElementsVectorization=*/true);
11010 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11011 return false;
11012 unsigned CopyableNum =
11013 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11014 return CopyableNum <= VL.size() / 2;
11015 };
11016 if (!CheckOperand(Operands.front()))
11017 return InstructionsState::invalid();
11018
11019 return S;
11020 }
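// In short: a bundle with no single common opcode may still be vectorized if a
// main instruction can be found and every non-matching lane can be modeled as
// a copyable (idempotent) operation. The profitability checks above then
// reject the cases unlikely to pay off: two-element bundles whose operand
// pairs do not look like good vectorization roots or whose vector cost exceeds
// the scalar cost, bundles where nearly every lane is a copyable PHI, bundles
// whose second operand list contains more than one instruction, and bundles
// whose first operand list is neither constant, splat, "almost" splat, nor
// itself vectorizable.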
11021
11022 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11023 ArrayRef<Value *> VL) {
11024 assert(S && "Invalid state!");
11025 SmallVector<BoUpSLP::ValueList> Operands;
11026 if (S.areInstructionsWithCopyableElements()) {
11027 MainOp = S.getMainOp();
11028 MainOpcode = S.getOpcode();
11029 Operands.assign(MainOp->getNumOperands(),
11030 BoUpSLP::ValueList(VL.size(), nullptr));
11031 for (auto [Idx, V] : enumerate(VL)) {
11032 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11033 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11034 Operands[OperandIdx][Idx] = Operand;
11035 }
11036 } else {
11037 buildOriginalOperands(S, VL, Operands);
11038 }
11039 return Operands;
11040 }
11041};
11042} // namespace
11043
11044BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11045 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11046 bool TryCopyableElementsVectorization) const {
11047 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11048
11049 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11050 InstructionsState S = Analysis.buildInstructionsState(
11051 VL, *this, TryCopyableElementsVectorization,
11052 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11053
11054 // Don't go into catchswitch blocks, which can happen with PHIs.
11055 // Such blocks can only have PHIs and the catchswitch. There is no
11056 // place to insert a shuffle if we need to, so just avoid that issue.
11057 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11058 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11059 // Do not try to pack to avoid extra instructions here.
11060 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11061 /*TryToFindDuplicates=*/false);
11062 }
11063
11064 // Check if this is a duplicate of another entry.
11065 if (S) {
11066 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11067 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11068 if (E->isSame(VL)) {
11069 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11070 << ".\n");
11071 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11072 }
11073 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11074 if (all_of(VL, [&](Value *V) {
11075 return isa<PoisonValue>(V) || Values.contains(V) ||
11076 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11077 LI->getLoopFor(S.getMainOp()->getParent()) &&
11078 isVectorized(V));
11079 })) {
11080 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11081 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11082 }
11083 }
11084 }
11085
11086 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11087 // a load), in which case peek through to include it in the tree, without
11088 // ballooning over-budget.
11089 if (Depth >= RecursionMaxDepth &&
11090 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11091 (match(S.getMainOp(), m_Load(m_Value())) ||
11092 all_of(VL, [&S](const Value *I) {
11093 return match(I,
11095 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11096 })))) {
11097 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11098 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11099 }
11100
11101 // Don't handle scalable vectors
11102 if (S && S.getOpcode() == Instruction::ExtractElement &&
11104 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11105 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11106 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11107 }
11108
11109 // Don't handle vectors.
11110 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11111 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11112 // Do not try to pack to avoid extra instructions here.
11113 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11114 /*TryToFindDuplicates=*/false);
11115 }
11116
11117 // If all of the operands are identical or constant we have a simple solution.
11118 // If we deal with insert/extract instructions, they all must have constant
11119 // indices, otherwise we should gather them, not try to vectorize.
11120 // If alternate op node with 2 elements with gathered operands - do not
11121 // vectorize.
11122 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11123 if (!S || !S.isAltShuffle() || VL.size() > 2)
11124 return false;
11125 if (VectorizableTree.size() < MinTreeSize)
11126 return false;
11127 if (Depth >= RecursionMaxDepth - 1)
11128 return true;
11129 // Check if all operands are extracts, part of vector node or can build a
11130 // regular vectorize node.
11131 SmallVector<unsigned, 8> InstsCount;
11132 for (Value *V : VL) {
11133 auto *I = cast<Instruction>(V);
11134 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11135 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11136 }));
11137 }
11138 bool IsCommutative =
11139 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11140 if ((IsCommutative &&
11141 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11142 (!IsCommutative &&
11143 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11144 return true;
11145 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11147 auto *I1 = cast<Instruction>(VL.front());
11148 auto *I2 = cast<Instruction>(VL.back());
11149 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11150 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11151 I2->getOperand(Op));
11152 if (static_cast<unsigned>(count_if(
11153 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11155 })) >= S.getMainOp()->getNumOperands() / 2)
11156 return false;
11157 if (S.getMainOp()->getNumOperands() > 2)
11158 return true;
11159 if (IsCommutative) {
11160 // Check permuted operands.
11161 Candidates.clear();
11162 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11163 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11164 I2->getOperand((Op + 1) % E));
11165 if (any_of(
11166 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11168 }))
11169 return false;
11170 }
11171 return true;
11172 };
11173 SmallVector<unsigned> SortedIndices;
11174 BasicBlock *BB = nullptr;
11175 bool IsScatterVectorizeUserTE =
11176 UserTreeIdx.UserTE &&
11177 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11178 bool AreAllSameBlock = S.valid();
11179 bool AreScatterAllGEPSameBlock =
11180 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11181 VL.size() > 2 &&
11182 all_of(VL,
11183 [&BB](Value *V) {
11184 auto *I = dyn_cast<GetElementPtrInst>(V);
11185 if (!I)
11186 return doesNotNeedToBeScheduled(V);
11187 if (!BB)
11188 BB = I->getParent();
11189 return BB == I->getParent() && I->getNumOperands() == 2;
11190 }) &&
11191 BB &&
11192 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11193 SortedIndices));
11194 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11195 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11196 (S &&
11198 S.getMainOp()) &&
11200 NotProfitableForVectorization(VL)) {
11201 if (!S) {
11202 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11203 "C,S,B,O, small shuffle. \n";
11204 dbgs() << "[";
11205 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11206 dbgs() << "]\n");
11207 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11208 /*TryToFindDuplicates=*/true,
11209 /*TrySplitVectorize=*/true);
11210 }
11211 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11212 dbgs() << "[";
11213 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11214 dbgs() << "]\n");
11215 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11216 }
11217
11218 // Don't vectorize ephemeral values.
11219 if (S && !EphValues.empty()) {
11220 for (Value *V : VL) {
11221 if (EphValues.count(V)) {
11222 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11223 << ") is ephemeral.\n");
11224 // Do not try to pack to avoid extra instructions here.
11225 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11226 /*TryToFindDuplicates=*/false);
11227 }
11228 }
11229 }
11230
11231 // We now know that this is a vector of instructions of the same type from
11232 // the same block.
11233
11234 // Check that none of the instructions in the bundle are already in the tree
11235 // and the node may be not profitable for the vectorization as the small
11236 // alternate node.
11237 if (S && S.isAltShuffle()) {
11238 auto GetNumVectorizedExtracted = [&]() {
11239 APInt Extracted = APInt::getZero(VL.size());
11240 APInt Vectorized = APInt::getAllOnes(VL.size());
11241 for (auto [Idx, V] : enumerate(VL)) {
11242 auto *I = dyn_cast<Instruction>(V);
11243 if (!I || doesNotNeedToBeScheduled(I) ||
11244 all_of(I->operands(), [&](const Use &U) {
11245 return isa<ExtractElementInst>(U.get());
11246 }))
11247 continue;
11248 if (isVectorized(I))
11249 Vectorized.clearBit(Idx);
11250 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11251 Extracted.setBit(Idx);
11252 }
11253 return std::make_pair(Vectorized, Extracted);
11254 };
11255 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11256 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11257 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11258 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11259 // Rough cost estimation, if the vector code (+ potential extracts) is
11260 // more profitable than the scalar + buildvector.
11261 Type *ScalarTy = VL.front()->getType();
11262 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11263 InstructionCost VectorizeCostEstimate =
11264 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11265 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11266 /*Insert=*/false, /*Extract=*/true, Kind);
11267 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11268 *TTI, ScalarTy, VecTy, Vectorized,
11269 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11270 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11271 }
11272 if (PreferScalarize) {
11273 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11274 "node is not profitable.\n");
11275 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11276 }
11277 }
11278
11279 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11280 if (UserIgnoreList && !UserIgnoreList->empty()) {
11281 for (Value *V : VL) {
11282 if (UserIgnoreList->contains(V)) {
11283 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11284 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11285 }
11286 }
11287 }
11288
11289 // Special processing for sorted pointers for ScatterVectorize node with
11290 // constant indices only.
11291 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11292 assert(VL.front()->getType()->isPointerTy() &&
11294 "Expected pointers only.");
11295 // Reset S to make it GetElementPtr kind of node.
11296 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11297 assert(It != VL.end() && "Expected at least one GEP.");
11298 S = getSameOpcode(*It, *TLI);
11299 }
11300
11301 // Check that all of the users of the scalars that we want to vectorize are
11302 // schedulable.
11303 Instruction *VL0 = S.getMainOp();
11304 BB = VL0->getParent();
11305
11306 if (S &&
11308 !DT->isReachableFromEntry(BB))) {
11309 // Don't go into unreachable blocks. They may contain instructions with
11310 // dependency cycles which confuse the final scheduling.
11311 // Do not vectorize EH and non-returning blocks, not profitable in most
11312 // cases.
11313 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11314 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11315 }
11316 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11317}
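// Taken together, this routine filters bundles before any tree entry is built:
// duplicates of existing entries, catchswitch/unreachable blocks, scalable or
// already-vector element types, ephemeral and reduction scalars, over-deep
// recursion, and small alternate bundles whose rough scalarize-vs-vectorize
// estimate favors scalar code are all rejected here; only the surviving
// bundles reach buildTreeRec's per-opcode handling below.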
11318
11319void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11320 const EdgeInfo &UserTreeIdx,
11321 unsigned InterleaveFactor) {
11322 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11323
11324 SmallVector<int> ReuseShuffleIndices;
11325 SmallVector<Value *> VL(VLRef);
11326
11327 // Tries to build split node.
11328 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11329 SmallVector<Value *> Op1, Op2;
11330 OrdersType ReorderIndices;
11331 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11332 return false;
11333
11334 auto Invalid = ScheduleBundle::invalid();
11335 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11336 UserTreeIdx, {}, ReorderIndices);
11337 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11338 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11339 InstructionsState S = getSameOpcode(Op, *TLI);
11340 if (S && (isa<LoadInst>(S.getMainOp()) ||
11341 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11342 // Build a gather node for loads; they will be gathered later.
11343 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11344 Idx == 0 ? 0 : Op1.size());
11345 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11346 } else {
11347 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11348 Idx == 0 ? 0 : Op1.size());
11349 buildTreeRec(Op, Depth, {TE, Idx});
11350 }
11351 };
11352 AddNode(Op1, 0);
11353 AddNode(Op2, 1);
11354 return true;
11355 };
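// The split node produced above becomes a TreeEntry::SplitVectorize entry: the
// two halves (Op1/Op2) are recorded in CombinedEntriesWithIndices together
// with their insertion offsets (0 and Op1.size()), and each half is either
// gathered (loads, or bundles already matching an existing entry) or
// recursively built as its own subtree.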
11356
11357 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11358 bool AreConsts = false;
11359 for (Value *V : VL) {
11360 if (isa<PoisonValue>(V))
11361 continue;
11362 if (isa<Constant>(V)) {
11363 AreConsts = true;
11364 continue;
11365 }
11366 if (!isa<PHINode>(V))
11367 return false;
11368 }
11369 return AreConsts;
11370 };
11371 if (AreOnlyConstsWithPHIs(VL)) {
11372 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11373 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11374 return;
11375 }
11376
11377 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11378 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11379 InstructionsState S = Legality.getInstructionsState();
11380 if (!Legality.isLegal()) {
11381 if (Legality.trySplitVectorize()) {
11382 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11383 // Last chance to try to vectorize alternate node.
11384 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11385 return;
11386 }
11387 if (!S)
11388 Legality = getScalarsVectorizationLegality(
11389 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11390 if (!Legality.isLegal()) {
11391 if (Legality.tryToFindDuplicates())
11392 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11393 UserTreeIdx);
11394
11395 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11396 return;
11397 }
11398 S = Legality.getInstructionsState();
11399 }
11400
11401 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11402 if (S.isAltShuffle() && TrySplitNode(S))
11403 return;
11404
11405 // Check that every instruction appears once in this bundle.
11406 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11407 /*TryPad=*/true)) {
11408 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11409 return;
11410 }
11411
11412 // Perform specific checks for each particular instruction kind.
11413 bool IsScatterVectorizeUserTE =
11414 UserTreeIdx.UserTE &&
11415 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11416 OrdersType CurrentOrder;
11417 SmallVector<Value *> PointerOps;
11418 StridedPtrInfo SPtrInfo;
11419 TreeEntry::EntryState State = getScalarsVectorizationState(
11420 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11421 if (State == TreeEntry::NeedToGather) {
11422 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11423 return;
11424 }
11425
11426 Instruction *VL0 = S.getMainOp();
11427 BasicBlock *BB = VL0->getParent();
11428 auto &BSRef = BlocksSchedules[BB];
11429 if (!BSRef)
11430 BSRef = std::make_unique<BlockScheduling>(BB);
11431
11432 BlockScheduling &BS = *BSRef;
11433
11434 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11435 std::optional<ScheduleBundle *> BundlePtr =
11436 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11437#ifdef EXPENSIVE_CHECKS
11438 // Make sure we didn't break any internal invariants
11439 BS.verify();
11440#endif
11441 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11442 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11443 // Last chance to try to vectorize alternate node.
11444 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11445 return;
11446 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11447 NonScheduledFirst.insert(VL.front());
11448 if (S.getOpcode() == Instruction::Load &&
11449 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11451 return;
11452 }
11453 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11454 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11455 ScheduleBundle Empty;
11456 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11457 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11458
11459 unsigned ShuffleOrOp =
11460 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11461 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11462 // Postpone PHI nodes creation
11463 SmallVector<unsigned> PHIOps;
11464 for (unsigned I : seq<unsigned>(Operands.size())) {
11466 if (Op.empty())
11467 continue;
11468 InstructionsState S = getSameOpcode(Op, *TLI);
11469 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11470 buildTreeRec(Op, Depth + 1, {TE, I});
11471 else
11472 PHIOps.push_back(I);
11473 }
11474 for (unsigned I : PHIOps)
11475 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11476 };
11477 switch (ShuffleOrOp) {
11478 case Instruction::PHI: {
11479 TreeEntry *TE =
11480 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11481 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11482 TE->dump());
11483
11484 TE->setOperands(Operands);
11485 CreateOperandNodes(TE, Operands);
11486 return;
11487 }
11488 case Instruction::ExtractValue:
11489 case Instruction::ExtractElement: {
11490 if (CurrentOrder.empty()) {
11491 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11492 } else {
11493 LLVM_DEBUG({
11494 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11495 "with order";
11496 for (unsigned Idx : CurrentOrder)
11497 dbgs() << " " << Idx;
11498 dbgs() << "\n";
11499 });
11500 fixupOrderingIndices(CurrentOrder);
11501 }
11502 // Insert new order with initial value 0, if it does not exist,
11503 // otherwise return the iterator to the existing one.
11504 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11505 ReuseShuffleIndices, CurrentOrder);
11506 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11507 "(ExtractValueInst/ExtractElementInst).\n";
11508 TE->dump());
11509 // This is a special case, as it does not gather, but at the same time
11510 // we are not extending buildTreeRec() towards the operands.
11511 TE->setOperands(Operands);
11512 return;
11513 }
11514 case Instruction::InsertElement: {
11515 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11516
11517 auto OrdCompare = [](const std::pair<int, int> &P1,
11518 const std::pair<int, int> &P2) {
11519 return P1.first > P2.first;
11520 };
11521 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11522 decltype(OrdCompare)>
11523 Indices(OrdCompare);
11524 for (int I = 0, E = VL.size(); I < E; ++I) {
11525 unsigned Idx = *getElementIndex(VL[I]);
11526 Indices.emplace(Idx, I);
11527 }
11528 OrdersType CurrentOrder(VL.size(), VL.size());
11529 bool IsIdentity = true;
11530 for (int I = 0, E = VL.size(); I < E; ++I) {
11531 CurrentOrder[Indices.top().second] = I;
11532 IsIdentity &= Indices.top().second == I;
11533 Indices.pop();
11534 }
11535 if (IsIdentity)
11536 CurrentOrder.clear();
11537 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11538 {}, CurrentOrder);
11539 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11540 TE->dump());
11541
11542 TE->setOperands(Operands);
11543 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11544 return;
11545 }
11546 case Instruction::Load: {
11547 // Check that a vectorized load would load the same memory as a scalar
11548 // load. For example, we don't want to vectorize loads that are smaller
11549 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11550 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11551 // from such a struct, we read/write packed bits disagreeing with the
11552 // unvectorized version.
11553 TreeEntry *TE = nullptr;
11554 fixupOrderingIndices(CurrentOrder);
11555 switch (State) {
11556 case TreeEntry::Vectorize:
11557 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11558 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11559 if (CurrentOrder.empty())
11560 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11561 TE->dump());
11562 else
11564 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11565 TE->dump());
11566 break;
11567 case TreeEntry::CompressVectorize:
11568 // Vectorizing non-consecutive loads with (masked)load + compress.
11569 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11570 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11571 LLVM_DEBUG(
11572 dbgs()
11573 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11574 TE->dump());
11575 break;
11576 case TreeEntry::StridedVectorize:
11577 // Vectorizing constant-strided, non-consecutive loads as strided loads.
11578 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11579 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11580 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11581 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11582 TE->dump());
11583 break;
11584 case TreeEntry::ScatterVectorize:
11585 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11586 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11587 UserTreeIdx, ReuseShuffleIndices);
11588 LLVM_DEBUG(
11589 dbgs()
11590 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11591 TE->dump());
11592 break;
11593 case TreeEntry::CombinedVectorize:
11594 case TreeEntry::SplitVectorize:
11595 case TreeEntry::NeedToGather:
11596 llvm_unreachable("Unexpected loads state.");
11597 }
11598 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11599 assert(Operands.size() == 1 && "Expected a single operand only");
11600 SmallVector<int> Mask;
11601 inversePermutation(CurrentOrder, Mask);
11602 reorderScalars(Operands.front(), Mask);
11603 }
11604 TE->setOperands(Operands);
11605 if (State == TreeEntry::ScatterVectorize)
11606 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11607 return;
11608 }
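// The four load strategies above, roughly from cheapest to most expensive:
//   Vectorize         - consecutive pointers, one wide load (a non-empty
//                       CurrentOrder means the lanes are loaded jumbled and
//                       reordered afterwards);
//   CompressVectorize - nearly consecutive pointers, a (masked) wide load
//                       followed by a compress-style shuffle;
//   StridedVectorize  - constant-strided pointers, lowered as a strided load
//                       (SPtrInfo carries the stride information);
//   ScatterVectorize  - arbitrary pointers, lowered via masked gather, with
//                       the pointer operand built as a child node of its own.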
11609 case Instruction::ZExt:
11610 case Instruction::SExt:
11611 case Instruction::FPToUI:
11612 case Instruction::FPToSI:
11613 case Instruction::FPExt:
11614 case Instruction::PtrToInt:
11615 case Instruction::IntToPtr:
11616 case Instruction::SIToFP:
11617 case Instruction::UIToFP:
11618 case Instruction::Trunc:
11619 case Instruction::FPTrunc:
11620 case Instruction::BitCast: {
11621 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11622 std::make_pair(std::numeric_limits<unsigned>::min(),
11623 std::numeric_limits<unsigned>::max()));
11624 if (ShuffleOrOp == Instruction::ZExt ||
11625 ShuffleOrOp == Instruction::SExt) {
11626 CastMaxMinBWSizes = std::make_pair(
11627 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11628 PrevMaxBW),
11629 std::min<unsigned>(
11630 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11631 PrevMinBW));
11632 } else if (ShuffleOrOp == Instruction::Trunc) {
11633 CastMaxMinBWSizes = std::make_pair(
11634 std::max<unsigned>(
11635 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11636 PrevMaxBW),
11637 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11638 PrevMinBW));
11639 }
11640 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11641 ReuseShuffleIndices);
11642 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11643 TE->dump());
11644
11645 TE->setOperands(Operands);
11646 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11647 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11648 if (ShuffleOrOp == Instruction::Trunc) {
11649 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11650 } else if (ShuffleOrOp == Instruction::SIToFP ||
11651 ShuffleOrOp == Instruction::UIToFP) {
11652 unsigned NumSignBits =
11653 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11654 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11655 APInt Mask = DB->getDemandedBits(OpI);
11656 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11657 }
11658 if (NumSignBits * 2 >=
11659 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11660 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11661 }
11662 return;
11663 }
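// The bookkeeping above appears to feed the later bit-width minimization:
// CastMaxMinBWSizes records the widest and narrowest integer widths seen on
// either side of zext/sext/trunc bundles, and ExtraBitWidthNodes marks operand
// entries (trunc inputs, or sitofp/uitofp inputs that are provably narrow by
// the sign-bit analysis) that are worth re-examining with a smaller element
// type.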
11664 case Instruction::ICmp:
11665 case Instruction::FCmp: {
11666 // Check that all of the compares have the same predicate.
11667 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11668 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11669 ReuseShuffleIndices);
11670 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11671 TE->dump());
11672
11673 VLOperands Ops(VL, Operands, S, *this);
11674 if (cast<CmpInst>(VL0)->isCommutative()) {
11675 // Commutative predicate - collect + sort operands of the instructions
11676 // so that each side is more likely to have the same opcode.
11678 "Commutative Predicate mismatch");
11679 Ops.reorder();
11680 Operands.front() = Ops.getVL(0);
11681 Operands.back() = Ops.getVL(1);
11682 } else {
11683 // Collect operands - commute if it uses the swapped predicate.
11684 for (auto [Idx, V] : enumerate(VL)) {
11685 if (isa<PoisonValue>(V))
11686 continue;
11687 auto *Cmp = cast<CmpInst>(V);
11688 if (Cmp->getPredicate() != P0)
11689 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11690 }
11691 }
11692 TE->setOperands(Operands);
11693 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11694 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11695 if (ShuffleOrOp == Instruction::ICmp) {
11696 unsigned NumSignBits0 =
11697 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11698 if (NumSignBits0 * 2 >=
11699 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11700 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11701 unsigned NumSignBits1 =
11702 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11703 if (NumSignBits1 * 2 >=
11704 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11705 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11706 }
11707 return;
11708 }
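// Operand-commutation example for the non-commutative branch above: with
// P0 == sgt, a lane "icmp slt %b, %a" carries the swapped predicate, so its
// operands are exchanged to (%a, %b); after the swap every lane compares its
// operands in the same order and the bundle can use the single vector
// predicate sgt.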
11709 case Instruction::Select:
11710 case Instruction::FNeg:
11711 case Instruction::Add:
11712 case Instruction::FAdd:
11713 case Instruction::Sub:
11714 case Instruction::FSub:
11715 case Instruction::Mul:
11716 case Instruction::FMul:
11717 case Instruction::UDiv:
11718 case Instruction::SDiv:
11719 case Instruction::FDiv:
11720 case Instruction::URem:
11721 case Instruction::SRem:
11722 case Instruction::FRem:
11723 case Instruction::Shl:
11724 case Instruction::LShr:
11725 case Instruction::AShr:
11726 case Instruction::And:
11727 case Instruction::Or:
11728 case Instruction::Xor:
11729 case Instruction::Freeze: {
11730 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11731 ReuseShuffleIndices);
11732 LLVM_DEBUG(
11733 dbgs() << "SLP: added a new TreeEntry "
11734 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11735 TE->dump());
11736
11737 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11738 VLOperands Ops(VL, Operands, S, *this);
11739 Ops.reorder();
11740 Operands[0] = Ops.getVL(0);
11741 Operands[1] = Ops.getVL(1);
11742 }
11743 TE->setOperands(Operands);
11744 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11745 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11746 return;
11747 }
11748 case Instruction::GetElementPtr: {
11749 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11750 ReuseShuffleIndices);
11751 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11752 TE->dump());
11753 TE->setOperands(Operands);
11754
11755 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11756 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11757 return;
11758 }
11759 case Instruction::Store: {
11760 bool Consecutive = CurrentOrder.empty();
11761 if (!Consecutive)
11762 fixupOrderingIndices(CurrentOrder);
11763 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11764 ReuseShuffleIndices, CurrentOrder);
11765 if (Consecutive)
11766 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11767 TE->dump());
11768 else
11769 LLVM_DEBUG(
11770 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11771 TE->dump());
11772 TE->setOperands(Operands);
11773 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11774 return;
11775 }
11776 case Instruction::Call: {
11777 // Check if the calls are all to the same vectorizable intrinsic or
11778 // library function.
11779 CallInst *CI = cast<CallInst>(VL0);
11781
11782 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11783 ReuseShuffleIndices);
11784 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11785 TE->dump());
11786 if (isCommutative(VL0)) {
11787 VLOperands Ops(VL, Operands, S, *this);
11788 Ops.reorder();
11789 Operands[0] = Ops.getVL(0);
11790 Operands[1] = Ops.getVL(1);
11791 }
11792 TE->setOperands(Operands);
11793 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11794 // For scalar operands there is no need to create an entry since there is
11795 // nothing to vectorize.
11797 continue;
11798 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11799 }
11800 return;
11801 }
11802 case Instruction::ShuffleVector: {
11803 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11804 ReuseShuffleIndices);
11805 if (S.isAltShuffle()) {
11806 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11807 TE->dump());
11808 } else {
11809 assert(SLPReVec && "Only supported by REVEC.");
11810 LLVM_DEBUG(
11811 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11812 TE->dump());
11813 }
11814
11815 // Reorder operands if reordering would enable vectorization.
11816 auto *CI = dyn_cast<CmpInst>(VL0);
11817 if (CI && any_of(VL, [](Value *V) {
11818 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11819 })) {
11820 auto *MainCI = cast<CmpInst>(S.getMainOp());
11821 auto *AltCI = cast<CmpInst>(S.getAltOp());
11822 CmpInst::Predicate MainP = MainCI->getPredicate();
11823 CmpInst::Predicate AltP = AltCI->getPredicate();
11824 assert(MainP != AltP &&
11825 "Expected different main/alternate predicates.");
11826 // Collect operands - commute if it uses the swapped predicate or
11827 // alternate operation.
11828 for (auto [Idx, V] : enumerate(VL)) {
11829 if (isa<PoisonValue>(V))
11830 continue;
11831 auto *Cmp = cast<CmpInst>(V);
11832
11833 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11834 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11835 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11836 } else {
11837 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11838 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11839 }
11840 }
11841 TE->setOperands(Operands);
11842 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11843 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11844 return;
11845 }
11846
11847 if (isa<BinaryOperator>(VL0) || CI) {
11848 VLOperands Ops(VL, Operands, S, *this);
11849 Ops.reorder();
11850 Operands[0] = Ops.getVL(0);
11851 Operands[1] = Ops.getVL(1);
11852 }
11853 TE->setOperands(Operands);
11854 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11855 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11856 return;
11857 }
11858 default:
11859 break;
11860 }
11861 llvm_unreachable("Unexpected vectorization of the instructions.");
11862}
11863
11865 unsigned N = 1;
11866 Type *EltTy = T;
11867
11869 if (EltTy->isEmptyTy())
11870 return 0;
11871 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11872 // Check that struct is homogeneous.
11873 for (const auto *Ty : ST->elements())
11874 if (Ty != *ST->element_begin())
11875 return 0;
11876 N *= ST->getNumElements();
11877 EltTy = *ST->element_begin();
11878 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11879 N *= AT->getNumElements();
11880 EltTy = AT->getElementType();
11881 } else {
11882 auto *VT = cast<FixedVectorType>(EltTy);
11883 N *= VT->getNumElements();
11884 EltTy = VT->getElementType();
11885 }
11886 }
11887
11888 if (!isValidElementType(EltTy))
11889 return 0;
11890 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
11891 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
11892 VTSize != DL->getTypeStoreSizeInBits(T))
11893 return 0;
11894 return N;
11895}
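// Example: for T = { [4 x i32] } the element type unwraps to i32 with N = 4,
// so the aggregate may be treated as <4 x i32>, provided the widened type has
// the same store size as T and fits within the vector register size limits.
// A struct with mixed element types, e.g. { i32, i64 }, fails the homogeneity
// check and returns 0.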
11896
11897bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
11898 SmallVectorImpl<unsigned> &CurrentOrder,
11899 bool ResizeAllowed) const {
11901 assert(It != VL.end() && "Expected at least one extract instruction.");
11902 auto *E0 = cast<Instruction>(*It);
11903 assert(
11905 "Invalid opcode");
11906 // Check if all of the extracts come from the same vector and from the
11907 // correct offset.
11908 Value *Vec = E0->getOperand(0);
11909
11910 CurrentOrder.clear();
11911
11912 // We have to extract from a vector/aggregate with the same number of elements.
11913 unsigned NElts;
11914 if (E0->getOpcode() == Instruction::ExtractValue) {
11915 NElts = canMapToVector(Vec->getType());
11916 if (!NElts)
11917 return false;
11918 // Check if load can be rewritten as load of vector.
11919 LoadInst *LI = dyn_cast<LoadInst>(Vec);
11920 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
11921 return false;
11922 } else {
11923 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
11924 }
11925
11926 unsigned E = VL.size();
11927 if (!ResizeAllowed && NElts != E)
11928 return false;
11930 unsigned MinIdx = NElts, MaxIdx = 0;
11931 for (auto [I, V] : enumerate(VL)) {
11932 auto *Inst = dyn_cast<Instruction>(V);
11933 if (!Inst)
11934 continue;
11935 if (Inst->getOperand(0) != Vec)
11936 return false;
11937 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
11938 if (isa<UndefValue>(EE->getIndexOperand()))
11939 continue;
11940 std::optional<unsigned> Idx = getExtractIndex(Inst);
11941 if (!Idx)
11942 return false;
11943 const unsigned ExtIdx = *Idx;
11944 if (ExtIdx >= NElts)
11945 continue;
11946 Indices[I] = ExtIdx;
11947 if (MinIdx > ExtIdx)
11948 MinIdx = ExtIdx;
11949 if (MaxIdx < ExtIdx)
11950 MaxIdx = ExtIdx;
11951 }
11952 if (MaxIdx - MinIdx + 1 > E)
11953 return false;
11954 if (MaxIdx + 1 <= E)
11955 MinIdx = 0;
11956
11957 // Check that all of the indices extract from the correct offset.
11958 bool ShouldKeepOrder = true;
11959 // Assign to all items the initial value E so we can check if the extract
11960 // instruction index was used already.
11961 // Also, later we can check that all the indices are used and we have a
11962 // consecutive access in the extract instructions, by checking that no
11963 // element of CurrentOrder still has value E.
11964 CurrentOrder.assign(E, E);
11965 for (unsigned I = 0; I < E; ++I) {
11966 if (Indices[I] == PoisonMaskElem)
11967 continue;
11968 const unsigned ExtIdx = Indices[I] - MinIdx;
11969 if (CurrentOrder[ExtIdx] != E) {
11970 CurrentOrder.clear();
11971 return false;
11972 }
11973 ShouldKeepOrder &= ExtIdx == I;
11974 CurrentOrder[ExtIdx] = I;
11975 }
11976 if (ShouldKeepOrder)
11977 CurrentOrder.clear();
11978
11979 return ShouldKeepOrder;
11980}
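// Example: VL = { extractelement %v, 2 ; extractelement %v, 3 ;
//                 extractelement %v, 0 ; extractelement %v, 1 } from a 4-wide
// %v yields CurrentOrder = {2, 3, 0, 1} and returns false (a reshuffle is
// still required), while extracts whose indices already match their lane
// positions clear the order and return true, meaning the source vector can be
// reused directly.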
11981
11982bool BoUpSLP::areAllUsersVectorized(
11983 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
11984 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
11985 all_of(I->users(), [this](User *U) {
11986 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
11987 (isa<ExtractElementInst>(U) && MustGather.contains(U));
11988 });
11989}
11990
11991void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
11992 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
11993 SmallVectorImpl<Value *> *OpScalars,
11994 SmallVectorImpl<Value *> *AltScalars) const {
11995 unsigned Sz = Scalars.size();
11996 Mask.assign(Sz, PoisonMaskElem);
11997 SmallVector<int> OrderMask;
11998 if (!ReorderIndices.empty())
11999 inversePermutation(ReorderIndices, OrderMask);
12000 for (unsigned I = 0; I < Sz; ++I) {
12001 unsigned Idx = I;
12002 if (!ReorderIndices.empty())
12003 Idx = OrderMask[I];
12004 if (isa<PoisonValue>(Scalars[Idx]))
12005 continue;
12006 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12007 if (IsAltOp(OpInst)) {
12008 Mask[I] = Sz + Idx;
12009 if (AltScalars)
12010 AltScalars->push_back(OpInst);
12011 } else {
12012 Mask[I] = Idx;
12013 if (OpScalars)
12014 OpScalars->push_back(OpInst);
12015 }
12016 }
12017 if (!ReuseShuffleIndices.empty()) {
12018 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12019 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12020 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12021 });
12022 Mask.swap(NewMask);
12023 }
12024}
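// Example: for Scalars = {add, sub, add, sub} with IsAltOp selecting the subs,
// the resulting Mask is {0, 5, 2, 7}: lanes taken from the "main" vector keep
// their index, lanes taken from the alternate vector get Sz (here 4) added,
// matching the operand-selection convention of a two-source shufflevector.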
12025
12027 Instruction *AltOp,
12028 const TargetLibraryInfo &TLI) {
12029 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12030}
12031
12033 Instruction *AltOp,
12034 const TargetLibraryInfo &TLI) {
12035 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12036 auto *AltCI = cast<CmpInst>(AltOp);
12037 CmpInst::Predicate MainP = MainCI->getPredicate();
12038 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12039 assert(MainP != AltP && "Expected different main/alternate predicates.");
12040 auto *CI = cast<CmpInst>(I);
12041 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12042 return false;
12043 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12044 return true;
12045 CmpInst::Predicate P = CI->getPredicate();
12047
12048 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12049 "CmpInst expected to match either main or alternate predicate or "
12050 "their swap.");
12051 return MainP != P && MainP != SwappedP;
12052 }
12053 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12054}
12055
12056TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12057 assert(!Ops.empty());
12058 const auto *Op0 = Ops.front();
12059
12060 const bool IsConstant = all_of(Ops, [](Value *V) {
12061 // TODO: We should allow undef elements here
12062 return isConstant(V) && !isa<UndefValue>(V);
12063 });
12064 const bool IsUniform = all_of(Ops, [=](Value *V) {
12065 // TODO: We should allow undef elements here
12066 return V == Op0;
12067 });
12068 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12069 // TODO: We should allow undef elements here
12070 if (auto *CI = dyn_cast<ConstantInt>(V))
12071 return CI->getValue().isPowerOf2();
12072 return false;
12073 });
12074 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12075 // TODO: We should allow undef elements here
12076 if (auto *CI = dyn_cast<ConstantInt>(V))
12077 return CI->getValue().isNegatedPowerOf2();
12078 return false;
12079 });
12080
12081 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12082 if (IsConstant && IsUniform)
12083 VK = TTI::OK_UniformConstantValue;
12084 else if (IsConstant)
12085 VK = TTI::OK_NonUniformConstantValue;
12086 else if (IsUniform)
12087 VK = TTI::OK_UniformValue;
12088
12089 TTI::OperandValueProperties VP = TTI::OP_None;
12090 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12091 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12092
12093 return {VK, VP};
12094}
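// Illustrative sketch (commentary only, assuming the listed operand bundles):
// rough examples of how the classification above maps operands to TTI info:
// \code
//   {4, 4, 4, 4}     -> {OK_UniformConstantValue,    OP_PowerOf2}
//   {1, 2, 4, 8}     -> {OK_NonUniformConstantValue, OP_PowerOf2}
//   {%x, %x, %x, %x} -> {OK_UniformValue,            OP_None}
//   {%x, 3, %y, %z}  -> {OK_AnyValue,                OP_None}
// \endcode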
12095
12096namespace {
12097/// The base class for shuffle instruction emission and shuffle cost estimation.
12098class BaseShuffleAnalysis {
12099protected:
12100 Type *ScalarTy = nullptr;
12101
12102 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12103
12104 /// V is expected to be a vectorized value.
12105 /// When REVEC is disabled, there is no difference between VF and
12106 /// VNumElements.
12107 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12108 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12109 /// of 8.
12110 unsigned getVF(Value *V) const {
12111 assert(V && "V cannot be nullptr");
12112 assert(isa<FixedVectorType>(V->getType()) &&
12113 "V does not have FixedVectorType");
12114 assert(ScalarTy && "ScalarTy cannot be nullptr");
12115 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12116 unsigned VNumElements =
12117 cast<FixedVectorType>(V->getType())->getNumElements();
12118 assert(VNumElements > ScalarTyNumElements &&
12119 "the number of elements of V is not large enough");
12120 assert(VNumElements % ScalarTyNumElements == 0 &&
12121 "the number of elements of V is not a vectorized value");
12122 return VNumElements / ScalarTyNumElements;
12123 }
12124
12125 /// Checks if the mask is an identity mask.
12126 /// \param IsStrict if true, the function returns false if the mask size
12127 /// does not match the vector size.
12128 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12129 bool IsStrict) {
12130 int Limit = Mask.size();
12131 int VF = VecTy->getNumElements();
12132 int Index = -1;
12133 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12134 return true;
12135 if (!IsStrict) {
12136 // Consider extract subvector starting from index 0.
12137 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12138 Index == 0)
12139 return true;
12140 // All VF-size submasks are identity (e.g.
12141 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12142 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12143 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12144 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12145 ShuffleVectorInst::isIdentityMask(Slice, VF);
12146 }))
12147 return true;
12148 }
12149 return false;
12150 }
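// Illustrative examples (commentary only), with VF == 4:
// \code
//   <0, 1, 2, 3>              -> identity, accepted for strict and non-strict
//   <0, 1>                    -> extract-subvector at index 0, accepted only
//                                when IsStrict is false
//   <0, 1, 2, 3, poison x 4>  -> every 4-wide slice is identity or all-poison,
//                                accepted only when IsStrict is false
// \endcode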
12151
12152 /// Tries to combine 2 different masks into a single one.
12153 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12154 /// change the size of the vector, \p LocalVF is the original size of the
12155 /// shuffled vector.
12156 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12157 ArrayRef<int> ExtMask) {
12158 unsigned VF = Mask.size();
12159 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12160 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12161 if (ExtMask[I] == PoisonMaskElem)
12162 continue;
12163 int MaskedIdx = Mask[ExtMask[I] % VF];
12164 NewMask[I] =
12165 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12166 }
12167 Mask.swap(NewMask);
12168 }
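// Illustrative worked example (commentary only): with LocalVF == VF == 4,
// Mask == <2, 3, 0, 1> (mask of the previously emitted shuffle) and
// ExtMask == <1, 0, 3, 2> (mask to be applied on top of it), each lane I gets
// Mask[ExtMask[I] % VF], so the combined single mask is <3, 2, 1, 0> and the
// two shuffles can be folded into one.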
12169
12170 /// Looks through shuffles trying to reduce final number of shuffles in the
12171 /// code. The function looks through the previously emitted shuffle
12172 /// instructions and properly marks indices in the mask as undef.
12173 /// For example, given the code
12174 /// \code
12175 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12176 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12177 /// \endcode
12178 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12179 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12180 /// <0, 1, 2, 3> for the shuffle.
12181 /// If 2 operands are of different size, the smallest one will be resized and
12182 /// the mask recalculated properly.
12183 /// For example, given the code
12184 /// \code
12185 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12186 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12187 /// \endcode
12188 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12189 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12190 /// <0, 1, 2, 3> for the shuffle.
12191 /// So, it tries to transform permutations to simple vector merge, if
12192 /// possible.
12193 /// \param V The input vector which must be shuffled using the given \p Mask.
12194 /// If the better candidate is found, \p V is set to this best candidate
12195 /// vector.
12196 /// \param Mask The input mask for the shuffle. If the best candidate is found
12197 /// during looking-through-shuffles attempt, it is updated accordingly.
12198 /// \param SinglePermute true if the shuffle operation is originally a
12199 /// single-value-permutation. In this case the look-through-shuffles procedure
12200 /// may look for resizing shuffles as the best candidates.
12201 /// \return true if the shuffle results in the non-resizing identity shuffle
12202 /// (and thus can be ignored), false - otherwise.
12203 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12204 bool SinglePermute) {
12205 Value *Op = V;
12206 ShuffleVectorInst *IdentityOp = nullptr;
12207 SmallVector<int> IdentityMask;
12208 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12209 // Exit if not a fixed vector type or changing size shuffle.
12210 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12211 if (!SVTy)
12212 break;
12213 // Remember the identity or broadcast mask, if it is not a resizing
12214 // shuffle. If no better candidates are found, this Op and Mask will be
12215 // used in the final shuffle.
12216 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12217 if (!IdentityOp || !SinglePermute ||
12218 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12219 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12220 IdentityMask.size()))) {
12221 IdentityOp = SV;
12222 // Store the current mask in IdentityMask so that we do not lose this
12223 // info later if IdentityOp is selected as the best candidate for the
12224 // permutation.
12225 IdentityMask.assign(Mask);
12226 }
12227 }
12228 // Remember the broadcast mask. If no better candidates are found, this Op
12229 // and Mask will be used in the final shuffle.
12230 // Zero splat can be used as identity too, since it might be used with
12231 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12232 // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
12233 // is expensive, and the analysis finds out that the source vector is just
12234 // a broadcast, this original mask can be transformed to the identity mask
12235 // <0, 1, 2, 3>.
12236 // \code
12237 // %0 = shuffle %v, poison, zeroinitializer
12238 // %res = shuffle %0, poison, <3, 1, 2, 0>
12239 // \endcode
12240 // may be transformed to
12241 // \code
12242 // %0 = shuffle %v, poison, zeroinitializer
12243 // %res = shuffle %0, poison, <0, 1, 2, 3>
12244 // \endcode
12245 if (SV->isZeroEltSplat()) {
12246 IdentityOp = SV;
12247 IdentityMask.assign(Mask);
12248 }
12249 int LocalVF = Mask.size();
12250 if (auto *SVOpTy =
12251 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12252 LocalVF = SVOpTy->getNumElements();
12253 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12254 for (auto [Idx, I] : enumerate(Mask)) {
12255 if (I == PoisonMaskElem ||
12256 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12257 continue;
12258 ExtMask[Idx] = SV->getMaskValue(I);
12259 }
12260 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12261 SV->getOperand(0),
12262 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12263 .all();
12264 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12265 SV->getOperand(1),
12266 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12267 .all();
12268 if (!IsOp1Undef && !IsOp2Undef) {
12269 // Update mask and mark undef elems.
12270 for (int &I : Mask) {
12271 if (I == PoisonMaskElem)
12272 continue;
12273 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12274 PoisonMaskElem)
12275 I = PoisonMaskElem;
12276 }
12277 break;
12278 }
12279 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12280 combineMasks(LocalVF, ShuffleMask, Mask);
12281 Mask.swap(ShuffleMask);
12282 if (IsOp2Undef)
12283 Op = SV->getOperand(0);
12284 else
12285 Op = SV->getOperand(1);
12286 }
12287 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12288 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12289 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12290 if (IdentityOp) {
12291 V = IdentityOp;
12292 assert(Mask.size() == IdentityMask.size() &&
12293 "Expected masks of same sizes.");
12294 // Clear known poison elements.
12295 for (auto [I, Idx] : enumerate(Mask))
12296 if (Idx == PoisonMaskElem)
12297 IdentityMask[I] = PoisonMaskElem;
12298 Mask.swap(IdentityMask);
12299 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12300 return SinglePermute &&
12301 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12302 /*IsStrict=*/true) ||
12303 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12304 Shuffle->isZeroEltSplat() &&
12305 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12306 all_of(enumerate(Mask), [&](const auto &P) {
12307 return P.value() == PoisonMaskElem ||
12308 Shuffle->getShuffleMask()[P.index()] == 0;
12309 })));
12310 }
12311 V = Op;
12312 return false;
12313 }
12314 V = Op;
12315 return true;
12316 }
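// Illustrative usage sketch (hypothetical caller; Builder and use() are
// assumed helpers, not part of this file):
// \code
//   Value *V = PreviouslyEmittedShuffle;
//   SmallVector<int> Mask = {1, 0, 3, 2};
//   if (peekThroughShuffles(V, Mask, /*SinglePermute=*/true))
//     use(V);                                    // V already has this layout.
//   else
//     use(Builder.CreateShuffleVector(V, Mask)); // Emit the reduced shuffle.
// \endcode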
12317
12318 /// Smart shuffle instruction emission, walks through shuffles trees and
12319 /// tries to find the best matching vector for the actual shuffle
12320 /// instruction.
12321 template <typename T, typename ShuffleBuilderTy>
12322 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12323 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12324 assert(V1 && "Expected at least one vector value.");
12325 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12326 SmallVector<int> NewMask(Mask);
12327 if (ScalarTyNumElements != 1) {
12328 assert(SLPReVec && "FixedVectorType is not expected.");
12329 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12330 Mask = NewMask;
12331 }
12332 if (V2)
12333 Builder.resizeToMatch(V1, V2);
12334 int VF = Mask.size();
12335 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12336 VF = FTy->getNumElements();
12337 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12338 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12339 .all()) {
12340 // Peek through shuffles.
12341 Value *Op1 = V1;
12342 Value *Op2 = V2;
12343 int VF =
12344 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12345 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12346 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12347 for (int I = 0, E = Mask.size(); I < E; ++I) {
12348 if (Mask[I] < VF)
12349 CombinedMask1[I] = Mask[I];
12350 else
12351 CombinedMask2[I] = Mask[I] - VF;
12352 }
12353 Value *PrevOp1;
12354 Value *PrevOp2;
12355 do {
12356 PrevOp1 = Op1;
12357 PrevOp2 = Op2;
12358 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12359 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12360 // Check if we have 2 resizing shuffles - need to peek through operands
12361 // again.
12362 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12363 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12364 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12365 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12366 if (I == PoisonMaskElem)
12367 continue;
12368 ExtMask1[Idx] = SV1->getMaskValue(I);
12369 }
12370 SmallBitVector UseMask1 = buildUseMask(
12371 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12372 ->getNumElements(),
12373 ExtMask1, UseMask::SecondArg);
12374 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12375 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12376 if (I == PoisonMaskElem)
12377 continue;
12378 ExtMask2[Idx] = SV2->getMaskValue(I);
12379 }
12380 SmallBitVector UseMask2 = buildUseMask(
12381 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12382 ->getNumElements(),
12383 ExtMask2, UseMask::SecondArg);
12384 if (SV1->getOperand(0)->getType() ==
12385 SV2->getOperand(0)->getType() &&
12386 SV1->getOperand(0)->getType() != SV1->getType() &&
12387 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12388 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12389 Op1 = SV1->getOperand(0);
12390 Op2 = SV2->getOperand(0);
12391 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12392 int LocalVF = ShuffleMask1.size();
12393 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12394 LocalVF = FTy->getNumElements();
12395 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12396 CombinedMask1.swap(ShuffleMask1);
12397 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12398 LocalVF = ShuffleMask2.size();
12399 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12400 LocalVF = FTy->getNumElements();
12401 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12402 CombinedMask2.swap(ShuffleMask2);
12403 }
12404 }
12405 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12406 Builder.resizeToMatch(Op1, Op2);
12407 VF = std::max(cast<VectorType>(Op1->getType())
12408 ->getElementCount()
12409 .getKnownMinValue(),
12410 cast<VectorType>(Op2->getType())
12411 ->getElementCount()
12412 .getKnownMinValue());
12413 for (int I = 0, E = Mask.size(); I < E; ++I) {
12414 if (CombinedMask2[I] != PoisonMaskElem) {
12415 assert(CombinedMask1[I] == PoisonMaskElem &&
12416 "Expected undefined mask element");
12417 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12418 }
12419 }
12420 if (Op1 == Op2 &&
12421 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12422 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12423 isa<ShuffleVectorInst>(Op1) &&
12424 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12425 ArrayRef(CombinedMask1))))
12426 return Builder.createIdentity(Op1);
12427 return Builder.createShuffleVector(
12428 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12429 CombinedMask1);
12430 }
12431 if (isa<PoisonValue>(V1))
12432 return Builder.createPoison(
12433 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12434 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12435 assert(V1 && "Expected non-null value after looking through shuffles.");
12436
12437 if (!IsIdentity)
12438 return Builder.createShuffleVector(V1, NewMask);
12439 return Builder.createIdentity(V1);
12440 }
12441
12442 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12443 /// shuffle emission.
12444 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12445 ArrayRef<int> Mask) {
12446 for (unsigned I : seq<unsigned>(CommonMask.size()))
12447 if (Mask[I] != PoisonMaskElem)
12448 CommonMask[I] = I;
12449 }
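// Illustrative example (commentary only): if a shuffle was just emitted for
// Mask == <2, poison, 0, poison>, the produced lanes become pass-through in
// the accumulated mask, i.e. CommonMask[0] = 0 and CommonMask[2] = 2, while
// lanes 1 and 3 keep their previous values.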
12450};
12451} // namespace
12452
12453 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
12454static std::pair<InstructionCost, InstructionCost>
12455getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12456 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12457 Type *ScalarTy, VectorType *VecTy) {
12458 InstructionCost ScalarCost = 0;
12459 InstructionCost VecCost = 0;
12460 // Here we differentiate two cases: (1) when Ptrs represent a regular
12461 // vectorization tree node (as they are pointer arguments of scattered
12462 // loads) or (2) when Ptrs are the arguments of loads or stores being
12463 // vectorized as plain wide unit-stride load/store since all the
12464 // loads/stores are known to be from/to adjacent locations.
12465 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12466 // Case 2: estimate costs for pointer related costs when vectorizing to
12467 // a wide load/store.
12468 // Scalar cost is estimated as a set of pointers with known relationship
12469 // between them.
12470 // For vector code we will use BasePtr as argument for the wide load/store
12471 // but we also need to account all the instructions which are going to
12472 // stay in vectorized code due to uses outside of these scalar
12473 // loads/stores.
12474 ScalarCost = TTI.getPointersChainCost(
12475 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12476 CostKind);
12477
12478 SmallVector<const Value *> PtrsRetainedInVecCode;
12479 for (Value *V : Ptrs) {
12480 if (V == BasePtr) {
12481 PtrsRetainedInVecCode.push_back(V);
12482 continue;
12483 }
12484 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12485 // For simplicity, assume Ptr stays in vectorized code if it's not a
12486 // GEP instruction. We don't care since its cost is considered free.
12487 // TODO: We should check for any uses outside of vectorizable tree
12488 // rather than just single use.
12489 if (!Ptr || !Ptr->hasOneUse())
12490 PtrsRetainedInVecCode.push_back(V);
12491 }
12492
12493 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12494 // If all pointers stay in vectorized code then we don't have
12495 // any savings on that.
12496 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12497 }
12498 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12499 TTI::PointersChainInfo::getKnownStride(),
12500 VecTy, CostKind);
12501 } else {
12502 // Case 1: Ptrs are the arguments of loads that we are going to transform
12503 // into masked gather load intrinsic.
12504 // All the scalar GEPs will be removed as a result of vectorization.
12505 // For any external uses of some lanes, extract element instructions will
12506 // be generated (whose cost is estimated separately).
12507 TTI::PointersChainInfo PtrsInfo =
12508 all_of(Ptrs,
12509 [](const Value *V) {
12510 const auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12511 return Ptr && !Ptr->hasAllConstantIndices();
12512 })
12513 ? TTI::PointersChainInfo::getUnknownStride()
12514 : TTI::PointersChainInfo::getKnownStride();
12515
12516 ScalarCost =
12517 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12518 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12519 if (!BaseGEP) {
12520 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12521 if (It != Ptrs.end())
12522 BaseGEP = cast<GEPOperator>(*It);
12523 }
12524 if (BaseGEP) {
12525 SmallVector<const Value *> Indices(BaseGEP->indices());
12526 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12527 BaseGEP->getPointerOperand(), Indices, VecTy,
12528 CostKind);
12529 }
12530 }
12531
12532 return std::make_pair(ScalarCost, VecCost);
12533}
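// Illustrative summary (commentary only): for four consecutive loads addressed
// through %base + 0..3, the scalar side is costed as a unit-stride pointer
// chain over all four GEPs, while the vector side only keeps %base (plus any
// GEP with uses outside the scalar loads), so the returned pair is typically
// (non-trivial scalar cost, TCC_Free or a smaller vector cost); for the
// masked-gather case only the vector GEP of the base pointer is costed via
// getGEPCost.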
12534
12535void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12536 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12537 "Expected gather node without reordering.");
12538 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12539 SmallSet<size_t, 2> LoadKeyUsed;
12540
12541 // Do not reorder nodes if the node is small (just 2 elements), all-constant,
12542 // or all instructions already have the same opcode.
12543 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12544 all_of(TE.Scalars, isConstant))
12545 return;
12546
12547 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12548 return VectorizableTree[Idx]->isSame(TE.Scalars);
12549 }))
12550 return;
12551
12552 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12553 Key = hash_combine(hash_value(LI->getParent()), Key);
12554 Value *Ptr =
12555 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12556 if (LoadKeyUsed.contains(Key)) {
12557 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12558 if (LIt != LoadsMap.end()) {
12559 for (LoadInst *RLI : LIt->second) {
12560 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12561 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12562 /*StrictCheck=*/true))
12563 return hash_value(RLI->getPointerOperand());
12564 }
12565 for (LoadInst *RLI : LIt->second) {
12566 if (arePointersCompatible(RLI->getPointerOperand(),
12567 LI->getPointerOperand(), *TLI)) {
12568 hash_code SubKey = hash_value(RLI->getPointerOperand());
12569 return SubKey;
12570 }
12571 }
12572 if (LIt->second.size() > 2) {
12573 hash_code SubKey =
12574 hash_value(LIt->second.back()->getPointerOperand());
12575 return SubKey;
12576 }
12577 }
12578 }
12579 LoadKeyUsed.insert(Key);
12580 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12581 return hash_value(LI->getPointerOperand());
12582 };
12583 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12584 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12585 bool IsOrdered = true;
12586 unsigned NumInstructions = 0;
12587 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12588 // nodes.
12589 for (auto [I, V] : enumerate(TE.Scalars)) {
12590 size_t Key = 1, Idx = 1;
12591 if (auto *Inst = dyn_cast<Instruction>(V);
12593 !isDeleted(Inst) && !isVectorized(V)) {
12594 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12595 /*AllowAlternate=*/false);
12596 ++NumInstructions;
12597 }
12598 auto &Container = SortedValues[Key];
12599 if (IsOrdered && !KeyToIndex.contains(V) &&
12602 ((Container.contains(Idx) &&
12603 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12604 (!Container.empty() && !Container.contains(Idx) &&
12605 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12606 IsOrdered = false;
12607 auto &KTI = KeyToIndex[V];
12608 if (KTI.empty())
12609 Container[Idx].push_back(V);
12610 KTI.push_back(I);
12611 }
12612 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12613 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12614 if (!IsOrdered && NumInstructions > 1) {
12615 unsigned Cnt = 0;
12616 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12617 for (const auto &D : SortedValues) {
12618 for (const auto &P : D.second) {
12619 unsigned Sz = 0;
12620 for (Value *V : P.second) {
12621 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12622 for (auto [K, Idx] : enumerate(Indices)) {
12623 TE.ReorderIndices[Cnt + K] = Idx;
12624 TE.Scalars[Cnt + K] = V;
12625 }
12626 Sz += Indices.size();
12627 Cnt += Indices.size();
12628 }
12629 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12630 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12631 *TTI, TE.Scalars.front()->getType(), Sz);
12632 SubVectors.emplace_back(Cnt - Sz, SubVF);
12633 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12634 DemandedElts.clearBit(I);
12635 } else if (!P.second.empty() && isConstant(P.second.front())) {
12636 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12637 DemandedElts.clearBit(I);
12638 }
12639 }
12640 }
12641 }
12642 // Reuses always require shuffles, so consider it as profitable.
12643 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12644 return;
12645 // Do simple cost estimation.
12646 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12647 InstructionCost Cost = 0;
12648 auto *ScalarTy = TE.Scalars.front()->getType();
12649 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12650 for (auto [Idx, Sz] : SubVectors) {
12651 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12652 Idx, getWidenedType(ScalarTy, Sz));
12653 }
12654 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12655 /*Insert=*/true,
12656 /*Extract=*/false, CostKind);
12657 int Sz = TE.Scalars.size();
12658 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12659 TE.ReorderIndices.end());
12660 for (unsigned I : seq<unsigned>(Sz)) {
12661 Value *V = TE.getOrdered(I);
12662 if (isa<PoisonValue>(V)) {
12663 ReorderMask[I] = PoisonMaskElem;
12664 } else if (isConstant(V) || DemandedElts[I]) {
12665 ReorderMask[I] = I + TE.ReorderIndices.size();
12666 }
12667 }
12668 Cost += ::getShuffleCost(*TTI,
12669 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12670 ? TTI::SK_PermuteTwoSrc
12671 : TTI::SK_PermuteSingleSrc,
12672 VecTy, ReorderMask);
12673 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12674 ReorderMask.assign(Sz, PoisonMaskElem);
12675 for (unsigned I : seq<unsigned>(Sz)) {
12676 Value *V = TE.getOrdered(I);
12677 if (isConstant(V)) {
12678 DemandedElts.clearBit(I);
12679 if (!isa<PoisonValue>(V))
12680 ReorderMask[I] = I;
12681 } else {
12682 ReorderMask[I] = I + Sz;
12683 }
12684 }
12685 InstructionCost BVCost =
12686 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12687 /*Insert=*/true, /*Extract=*/false, CostKind);
12688 if (!DemandedElts.isAllOnes())
12689 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12690 if (Cost >= BVCost) {
12691 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12692 reorderScalars(TE.Scalars, Mask);
12693 TE.ReorderIndices.clear();
12694 }
12695}
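// Illustrative example (commentary only): a gather node [%l0, %c0, %l1, %c1]
// with loads %l0/%l1 from the same base and constants %c0/%c1 may be
// re-clustered to [%l0, %l1, %c0, %c1] with ReorderIndices = {0, 2, 1, 3},
// exposing a 2-wide load subvector; the reordering is kept only if the
// insert-subvector plus permute cost computed above does not exceed the plain
// build-vector cost.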
12696
12697 /// Check if we can convert a fadd/fsub sequence to FMA.
12698 /// \returns The cost of the FMA if conversion is possible, an invalid cost otherwise.
12699static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12700 const InstructionsState &S,
12701 DominatorTree &DT, const DataLayout &DL,
12702 TargetTransformInfo &TTI,
12703 const TargetLibraryInfo &TLI) {
12704 assert(all_of(VL,
12705 [](Value *V) {
12706 return V->getType()->getScalarType()->isFloatingPointTy();
12707 }) &&
12708 "Can only convert to FMA for floating point types");
12709 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12710
12711 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12712 FastMathFlags FMF;
12713 FMF.set();
12714 for (Value *V : VL) {
12715 auto *I = dyn_cast<Instruction>(V);
12716 if (!I)
12717 continue;
12718 if (S.isCopyableElement(I))
12719 continue;
12720 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12721 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12722 continue;
12723 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12724 FMF &= FPCI->getFastMathFlags();
12725 }
12726 return FMF.allowContract();
12727 };
12728 if (!CheckForContractable(VL))
12729 return InstructionCost::getInvalid();
12730 // fmul also should be contractable.
12731 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12732 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12733
12734 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12735 if (!OpS.valid())
12736 return InstructionCost::getInvalid();
12737
12738 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12739 return InstructionCost::getInvalid();
12740 if (!CheckForContractable(Operands.front()))
12741 return InstructionCost::getInvalid();
12742 // Compare the costs.
12743 InstructionCost FMulPlusFAddCost = 0;
12744 InstructionCost FMACost = 0;
12745 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12746 FastMathFlags FMF;
12747 FMF.set();
12748 for (Value *V : VL) {
12749 auto *I = dyn_cast<Instruction>(V);
12750 if (!I)
12751 continue;
12752 if (!S.isCopyableElement(I))
12753 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12754 FMF &= FPCI->getFastMathFlags();
12755 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12756 }
12757 unsigned NumOps = 0;
12758 for (auto [V, Op] : zip(VL, Operands.front())) {
12759 if (S.isCopyableElement(V))
12760 continue;
12761 auto *I = dyn_cast<Instruction>(Op);
12762 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12763 if (auto *OpI = dyn_cast<Instruction>(V))
12764 FMACost += TTI.getInstructionCost(OpI, CostKind);
12765 if (I)
12766 FMACost += TTI.getInstructionCost(I, CostKind);
12767 continue;
12768 }
12769 ++NumOps;
12770 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12771 FMF &= FPCI->getFastMathFlags();
12772 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12773 }
12774 Type *Ty = VL.front()->getType();
12775 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12776 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12777 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12778}
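// Illustrative example (commentary only): for scalars of the form
// \code
//   %m = fmul contract float %a, %b
//   %s = fadd contract float %m, %c
// \endcode
// the helper compares cost(fmul) + cost(fadd) against the cost of
// llvm.fmuladd(%a, %b, %c) per lane and returns the FMA cost only when it is
// strictly cheaper; a lane whose fmul has extra users keeps both scalar costs
// on the FMA side as well, which usually disables the conversion for it.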
12779
12780void BoUpSLP::transformNodes() {
12781 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12782 BaseGraphSize = VectorizableTree.size();
12783 // Turn graph transforming mode on and off, when done.
12784 class GraphTransformModeRAAI {
12785 bool &SavedIsGraphTransformMode;
12786
12787 public:
12788 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12789 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12790 IsGraphTransformMode = true;
12791 }
12792 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12793 } TransformContext(IsGraphTransformMode);
12794 // Operands are profitable if they are:
12795 // 1. At least one constant
12796 // or
12797 // 2. Splats
12798 // or
12799 // 3. Results in good vectorization opportunity, i.e. may generate vector
12800 // nodes and reduce cost of the graph.
12801 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12802 const InstructionsState &S) {
12803 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12804 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12805 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12806 I2->getOperand(Op));
12807 return all_of(
12808 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12809 return all_of(Cand,
12810 [](const std::pair<Value *, Value *> &P) {
12811 return isa<Constant>(P.first) ||
12812 isa<Constant>(P.second) || P.first == P.second;
12813 }) ||
12814 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12815 });
12816 };
12817
12818 // Try to reorder gather nodes for better vectorization opportunities.
12819 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12820 TreeEntry &E = *VectorizableTree[Idx];
12821 if (E.isGather())
12822 reorderGatherNode(E);
12823 }
12824
12825 // Better to use the full gathered-loads analysis, if there are only 2
12826 // gathered load nodes, each having fewer than 16 elements.
12827 constexpr unsigned VFLimit = 16;
12828 bool ForceLoadGather =
12829 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12830 return TE->isGather() && TE->hasState() &&
12831 TE->getOpcode() == Instruction::Load &&
12832 TE->getVectorFactor() < VFLimit;
12833 }) == 2;
12834
12835 // Checks if the scalars are used in another node.
12836 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12837 function_ref<bool(Value *)> CheckContainer) {
12838 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12839 if (isa<PoisonValue>(V))
12840 return true;
12841 auto *I = dyn_cast<Instruction>(V);
12842 if (!I)
12843 return false;
12844 return is_contained(TE->Scalars, I) || CheckContainer(I);
12845 });
12846 };
12847 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12848 if (E.hasState()) {
12849 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12850 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12851 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12852 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12853 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12854 return is_contained(TEs, TE);
12855 });
12856 });
12857 }))
12858 return true;
12859 ;
12860 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12861 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12862 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12863 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12864 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12865 return is_contained(TEs, TE);
12866 });
12867 });
12868 }))
12869 return true;
12870 } else {
12871 // Check if the gather node is a full copy of a split node.
12872 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12873 if (It != E.Scalars.end()) {
12874 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12875 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12876 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12877 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12878 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12879 return is_contained(TEs, TE);
12880 });
12881 });
12882 }))
12883 return true;
12884 }
12885 }
12886 return false;
12887 };
12888 // The tree may grow here, so iterate over nodes, built before.
12889 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12890 TreeEntry &E = *VectorizableTree[Idx];
12891 if (E.isGather()) {
12892 ArrayRef<Value *> VL = E.Scalars;
12893 const unsigned Sz = getVectorElementSize(VL.front());
12894 unsigned MinVF = getMinVF(2 * Sz);
12895 // Do not try partial vectorization for small nodes (<= 2), nodes with the
12896 // same opcode and same parent block or all constants.
12897 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
12898 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
12899 // We use allSameOpcode instead of isAltShuffle because we don't
12900 // want to use interchangeable instructions here.
12901 !allSameOpcode(VL) || !allSameBlock(VL)) ||
12902 allConstant(VL) || isSplat(VL))
12903 continue;
12904 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
12905 continue;
12906 // Check if the node is a copy of other vector nodes.
12907 if (CheckForSameVectorNodes(E))
12908 continue;
12909 // Try to find vectorizable sequences and transform them into a series of
12910 // insertvector instructions.
12911 unsigned StartIdx = 0;
12912 unsigned End = VL.size();
12913 for (unsigned VF = getFloorFullVectorNumberOfElements(
12914 *TTI, VL.front()->getType(), VL.size() - 1);
12915 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
12916 *TTI, VL.front()->getType(), VF - 1)) {
12917 if (StartIdx + VF > End)
12918 continue;
12919 SmallVector<std::pair<unsigned, unsigned>> Slices;
12920 bool AllStrided = true;
12921 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
12922 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
12923 // If any instruction is vectorized already - do not try again.
12924 // Reuse the existing node, if it fully matches the slice.
12925 if (isVectorized(Slice.front()) &&
12926 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
12927 continue;
12928 // Constant already handled effectively - skip.
12929 if (allConstant(Slice))
12930 continue;
12931 // Do not try to vectorize small splats (less than vector register and
12932 // only with the single non-undef element).
12933 bool IsSplat = isSplat(Slice);
12934 bool IsTwoRegisterSplat = true;
12935 if (IsSplat && VF == 2) {
12936 unsigned NumRegs2VF = ::getNumberOfParts(
12937 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
12938 IsTwoRegisterSplat = NumRegs2VF == 2;
12939 }
12940 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
12941 count(Slice, Slice.front()) ==
12942 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
12943 : 1)) {
12944 if (IsSplat)
12945 continue;
12946 InstructionsState S = getSameOpcode(Slice, *TLI);
12947 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
12948 (S.getOpcode() == Instruction::Load &&
12949 areKnownNonVectorizableLoads(Slice)) ||
12950 (S.getOpcode() != Instruction::Load &&
12951 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
12952 continue;
12953 if (VF == 2) {
12954 // Try to vectorize reduced values or if all users are vectorized.
12955 // For expensive instructions extra extracts might be profitable.
12956 if ((!UserIgnoreList || E.Idx != 0) &&
12957 TTI->getInstructionCost(S.getMainOp(), CostKind) <
12958 TTI::TCC_Expensive &&
12959 !all_of(Slice, [&](Value *V) {
12960 if (isa<PoisonValue>(V))
12961 return true;
12962 return areAllUsersVectorized(cast<Instruction>(V),
12963 UserIgnoreList);
12964 }))
12965 continue;
12966 if (S.getOpcode() == Instruction::Load) {
12967 OrdersType Order;
12968 SmallVector<Value *> PointerOps;
12969 StridedPtrInfo SPtrInfo;
12970 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
12971 PointerOps, SPtrInfo);
12972 AllStrided &= Res == LoadsState::StridedVectorize ||
12974 Res == LoadsState::Gather;
12975 // Do not vectorize gathers.
12976 if (Res == LoadsState::ScatterVectorize ||
12977 Res == LoadsState::Gather) {
12978 if (Res == LoadsState::Gather) {
12979 registerNonVectorizableLoads(Slice);
12980 // If reductions and the scalars from the root node are
12981 // analyzed - mark as non-vectorizable reduction.
12982 if (UserIgnoreList && E.Idx == 0)
12983 analyzedReductionVals(Slice);
12984 }
12985 continue;
12986 }
12987 } else if (S.getOpcode() == Instruction::ExtractElement ||
12988 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
12989 TTI::TCC_Expensive &&
12990 !CheckOperandsProfitability(
12991 S.getMainOp(),
12994 S))) {
12995 // Do not vectorize extractelements (handled effectively
12996 // already). Do not vectorize non-profitable instructions (with
12997 // low cost and non-vectorizable operands).
12998 continue;
12999 }
13000 }
13001 }
13002 Slices.emplace_back(Cnt, Slice.size());
13003 }
13004 // Do not try to vectorize if all slices are strided or gathered with
13005 // vector factor 2 and there are more than 2 slices. Better to handle
13006 // them in the gathered-loads analysis; it may result in better vectorization.
13007 if (VF == 2 && AllStrided && Slices.size() > 2)
13008 continue;
13009 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13010 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13011 if (StartIdx == Cnt)
13012 StartIdx = Cnt + Sz;
13013 if (End == Cnt + Sz)
13014 End = Cnt;
13015 };
13016 for (auto [Cnt, Sz] : Slices) {
13017 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13018 const TreeEntry *SameTE = nullptr;
13019 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13020 It != Slice.end()) {
13021 // If any instruction is vectorized already - do not try again.
13022 SameTE = getSameValuesTreeEntry(*It, Slice);
13023 }
13024 unsigned PrevSize = VectorizableTree.size();
13025 [[maybe_unused]] unsigned PrevEntriesSize =
13026 LoadEntriesToVectorize.size();
13027 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13028 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13029 VectorizableTree[PrevSize]->isGather() &&
13030 VectorizableTree[PrevSize]->hasState() &&
13031 VectorizableTree[PrevSize]->getOpcode() !=
13032 Instruction::ExtractElement &&
13033 !isSplat(Slice)) {
13034 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13035 analyzedReductionVals(Slice);
13036 VectorizableTree.pop_back();
13037 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13038 "LoadEntriesToVectorize expected to remain the same");
13039 continue;
13040 }
13041 AddCombinedNode(PrevSize, Cnt, Sz);
13042 }
13043 }
13044 // Restore ordering, if no extra vectorization happened.
13045 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13046 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13047 reorderScalars(E.Scalars, Mask);
13048 E.ReorderIndices.clear();
13049 }
13050 }
13051 if (!E.hasState())
13052 continue;
13053 switch (E.getOpcode()) {
13054 case Instruction::Load: {
13055 // No need to reorder masked gather loads, just reorder the scalar
13056 // operands.
13057 if (E.State != TreeEntry::Vectorize)
13058 break;
13059 Type *ScalarTy = E.getMainOp()->getType();
13060 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13061 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13062 // Check if profitable to represent consecutive load + reverse as strided
13063 // load with stride -1.
13064 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13065 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13066 SmallVector<int> Mask;
13067 inversePermutation(E.ReorderIndices, Mask);
13068 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13069 InstructionCost OriginalVecCost =
13070 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13071 BaseLI->getPointerAddressSpace(), CostKind,
13072 TTI::OperandValueInfo()) +
13073 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13074 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13075 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13076 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13077 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13078 // Strided load is more profitable than consecutive load + reverse -
13079 // transform the node to strided load.
13080 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13081 ->getPointerOperand()
13082 ->getType());
13083 StridedPtrInfo SPtrInfo;
13084 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13085 SPtrInfo.Ty = VecTy;
13086 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13087 E.State = TreeEntry::StridedVectorize;
13088 }
13089 }
13090 break;
13091 }
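// Illustrative note (commentary only): the transformation above re-models a
// consecutive load whose lanes are consumed in reverse order, e.g.
// \code
//   %v = load <4 x i32>, ptr %p
//   %r = shufflevector <4 x i32> %v, <4 x i32> poison,
//                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// \endcode
// as a single strided load so the separate reverse shuffle disappears, but
// only when getStridedMemoryOpCost reports it as cheaper (or the
// ForceStridedLoads override is set).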
13092 case Instruction::Store: {
13093 Type *ScalarTy =
13094 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13095 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13096 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13097 // Check if profitable to represent consecutive store + reverse as strided
13098 // store with stride -1.
13099 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13100 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13101 SmallVector<int> Mask;
13102 inversePermutation(E.ReorderIndices, Mask);
13103 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13104 InstructionCost OriginalVecCost =
13105 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13106 BaseSI->getPointerAddressSpace(), CostKind,
13107 TTI::OperandValueInfo()) +
13108 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13109 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13110 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13111 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13112 if (StridedCost < OriginalVecCost)
13113 // Strided store is more profitable than reverse + consecutive store -
13114 // transform the node to strided store.
13115 E.State = TreeEntry::StridedVectorize;
13116 } else if (!E.ReorderIndices.empty()) {
13117 // Check for interleaved stores.
13118 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13119 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13120 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13121 if (Mask.size() < 4)
13122 return 0u;
13123 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13124 if (ShuffleVectorInst::isInterleaveMask(
13125 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13126 TTI.isLegalInterleavedAccessType(
13127 VecTy, Factor, BaseSI->getAlign(),
13128 BaseSI->getPointerAddressSpace()))
13129 return Factor;
13130 }
13131
13132 return 0u;
13133 };
13134 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13135 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13136 if (InterleaveFactor != 0)
13137 E.setInterleave(InterleaveFactor);
13138 }
13139 break;
13140 }
13141 case Instruction::Select: {
13142 if (E.State != TreeEntry::Vectorize)
13143 break;
13144 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13145 if (MinMaxID == Intrinsic::not_intrinsic)
13146 break;
13147 // This node is a minmax node.
13148 E.CombinedOp = TreeEntry::MinMax;
13149 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13150 if (SelectOnly && CondEntry->UserTreeIndex &&
13151 CondEntry->State == TreeEntry::Vectorize) {
13152 // The condition node is part of the combined minmax node.
13153 CondEntry->State = TreeEntry::CombinedVectorize;
13154 }
13155 break;
13156 }
13157 case Instruction::FSub:
13158 case Instruction::FAdd: {
13159 // Check if possible to convert (a*b)+c to fma.
13160 if (E.State != TreeEntry::Vectorize ||
13161 !E.getOperations().isAddSubLikeOp())
13162 break;
13163 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13164 .isValid())
13165 break;
13166 // This node is a fmuladd node.
13167 E.CombinedOp = TreeEntry::FMulAdd;
13168 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13169 if (FMulEntry->UserTreeIndex &&
13170 FMulEntry->State == TreeEntry::Vectorize) {
13171 // The FMul node is part of the combined fmuladd node.
13172 FMulEntry->State = TreeEntry::CombinedVectorize;
13173 }
13174 break;
13175 }
13176 default:
13177 break;
13178 }
13179 }
13180
13181 if (LoadEntriesToVectorize.empty()) {
13182 // Single load node - exit.
13183 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13184 VectorizableTree.front()->getOpcode() == Instruction::Load)
13185 return;
13186 // Small graph with small VF - exit.
13187 constexpr unsigned SmallTree = 3;
13188 constexpr unsigned SmallVF = 2;
13189 if ((VectorizableTree.size() <= SmallTree &&
13190 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13191 (VectorizableTree.size() <= 2 && UserIgnoreList))
13192 return;
13193
13194 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13195 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13196 getCanonicalGraphSize() <= SmallTree &&
13197 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13198 [](const std::unique_ptr<TreeEntry> &TE) {
13199 return TE->isGather() && TE->hasState() &&
13200 TE->getOpcode() == Instruction::Load &&
13201 !allSameBlock(TE->Scalars);
13202 }) == 1)
13203 return;
13204 }
13205
13206 // A list of loads to be gathered during the vectorization process. We can
13207 // try to vectorize them at the end, if profitable.
13208 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13209 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13210 GatheredLoads;
13211
13212 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13213 TreeEntry &E = *TE;
13214 if (E.isGather() &&
13215 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13216 (!E.hasState() && any_of(E.Scalars,
13217 [&](Value *V) {
13218 return isa<LoadInst>(V) &&
13219 !isVectorized(V) &&
13220 !isDeleted(cast<Instruction>(V));
13221 }))) &&
13222 !isSplat(E.Scalars)) {
13223 for (Value *V : E.Scalars) {
13224 auto *LI = dyn_cast<LoadInst>(V);
13225 if (!LI)
13226 continue;
13227 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13228 continue;
13229 gatherPossiblyVectorizableLoads(
13230 *this, V, *DL, *SE, *TTI,
13231 GatheredLoads[std::make_tuple(
13232 LI->getParent(),
13233 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13234 LI->getType())]);
13235 }
13236 }
13237 }
13238 // Try to vectorize gathered loads if this is not just a gather of loads.
13239 if (!GatheredLoads.empty())
13240 tryToVectorizeGatheredLoads(GatheredLoads);
13241}
13242
13243/// Merges shuffle masks and emits final shuffle instruction, if required. It
13244/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
13245/// when the actual shuffle instruction is generated only if this is actually
13246/// required. Otherwise, the shuffle instruction emission is delayed till the
13247/// end of the process, to reduce the number of emitted instructions and further
13248/// analysis/transformations.
13249class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13250 bool IsFinalized = false;
13251 SmallVector<int> CommonMask;
13252 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13253 const TargetTransformInfo &TTI;
13254 InstructionCost Cost = 0;
13255 SmallDenseSet<Value *> VectorizedVals;
13256 BoUpSLP &R;
13257 SmallPtrSetImpl<Value *> &CheckedExtracts;
13258 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13259 /// While set, we are still trying to estimate the cost for the same nodes and
13260 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13261 /// May help to better estimate the cost if the same nodes must be permuted, and
13262 /// allows moving most of the long shuffle cost estimation to TTI.
13263 bool SameNodesEstimated = true;
13264
13265 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13266 if (Ty->getScalarType()->isPointerTy()) {
13267 Constant *Res = ConstantExpr::getIntToPtr(
13268 ConstantInt::getAllOnesValue(
13269 IntegerType::get(Ty->getContext(),
13270 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13271 Ty->getScalarType());
13272 if (auto *VTy = dyn_cast<VectorType>(Ty))
13273 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13274 return Res;
13275 }
13276 return Constant::getAllOnesValue(Ty);
13277 }
13278
13279 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13280 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13281 return TTI::TCC_Free;
13282 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13283 InstructionCost GatherCost = 0;
13284 SmallVector<Value *> Gathers(VL);
13285 if (!Root && isSplat(VL)) {
13286 // Found the broadcasting of the single scalar, calculate the cost as
13287 // the broadcast.
13288 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13289 assert(It != VL.end() && "Expected at least one non-undef value.");
13290 // Add broadcast for non-identity shuffle only.
13291 bool NeedShuffle =
13292 count(VL, *It) > 1 &&
13293 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13294 if (!NeedShuffle) {
13295 if (isa<FixedVectorType>(ScalarTy)) {
13296 assert(SLPReVec && "FixedVectorType is not expected.");
13297 return TTI.getShuffleCost(
13298 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13299 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13300 cast<FixedVectorType>(ScalarTy));
13301 }
13302 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13303 CostKind, std::distance(VL.begin(), It),
13304 PoisonValue::get(VecTy), *It);
13305 }
13306
13307 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13308 transform(VL, ShuffleMask.begin(), [](Value *V) {
13309 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13310 });
13311 InstructionCost InsertCost =
13312 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13313 PoisonValue::get(VecTy), *It);
13314 return InsertCost + ::getShuffleCost(TTI,
13315 TTI::SK_Broadcast,
13316 VecTy, ShuffleMask, CostKind,
13317 /*Index=*/0, /*SubTp=*/nullptr,
13318 /*Args=*/*It);
13319 }
13320 return GatherCost +
13321 (all_of(Gathers, IsaPred<UndefValue>)
13322 ? TTI::TCC_Free
13323 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13324 ScalarTy));
13325 };
13326
13327 /// Compute the cost of creating a vector containing the extracted values from
13328 /// \p VL.
13329 InstructionCost
13330 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13331 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13332 unsigned NumParts) {
13333 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13334 unsigned NumElts =
13335 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13336 auto *EE = dyn_cast<ExtractElementInst>(V);
13337 if (!EE)
13338 return Sz;
13339 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13340 if (!VecTy)
13341 return Sz;
13342 return std::max(Sz, VecTy->getNumElements());
13343 });
13344 // FIXME: this must be moved to TTI for better estimation.
13345 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13346 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13347 SmallVectorImpl<unsigned> &Indices,
13348 SmallVectorImpl<unsigned> &SubVecSizes)
13349 -> std::optional<TTI::ShuffleKind> {
13350 if (NumElts <= EltsPerVector)
13351 return std::nullopt;
13352 int OffsetReg0 =
13353 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13354 [](int S, int I) {
13355 if (I == PoisonMaskElem)
13356 return S;
13357 return std::min(S, I);
13358 }),
13359 EltsPerVector);
13360 int OffsetReg1 = OffsetReg0;
13361 DenseSet<int> RegIndices;
13362 // Check if we are trying to permute the same single or two input vectors.
13363 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13364 int FirstRegId = -1;
13365 Indices.assign(1, OffsetReg0);
13366 for (auto [Pos, I] : enumerate(Mask)) {
13367 if (I == PoisonMaskElem)
13368 continue;
13369 int Idx = I - OffsetReg0;
13370 int RegId =
13371 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13372 if (FirstRegId < 0)
13373 FirstRegId = RegId;
13374 RegIndices.insert(RegId);
13375 if (RegIndices.size() > 2)
13376 return std::nullopt;
13377 if (RegIndices.size() == 2) {
13378 ShuffleKind = TTI::SK_PermuteTwoSrc;
13379 if (Indices.size() == 1) {
13380 OffsetReg1 = alignDown(
13381 std::accumulate(
13382 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13383 [&](int S, int I) {
13384 if (I == PoisonMaskElem)
13385 return S;
13386 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13387 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13388 if (RegId == FirstRegId)
13389 return S;
13390 return std::min(S, I);
13391 }),
13392 EltsPerVector);
13393 unsigned Index = OffsetReg1 % NumElts;
13394 Indices.push_back(Index);
13395 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13396 }
13397 Idx = I - OffsetReg1;
13398 }
13399 I = (Idx % NumElts) % EltsPerVector +
13400 (RegId == FirstRegId ? 0 : EltsPerVector);
13401 }
13402 return ShuffleKind;
13403 };
13404 InstructionCost Cost = 0;
13405
13406 // Process extracts in blocks of EltsPerVector to check if the source vector
13407 // operand can be re-used directly. If not, add the cost of creating a
13408 // shuffle to extract the values into a vector register.
13409 for (unsigned Part : seq<unsigned>(NumParts)) {
13410 if (!ShuffleKinds[Part])
13411 continue;
13412 ArrayRef<int> MaskSlice = Mask.slice(
13413 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13414 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13415 copy(MaskSlice, SubMask.begin());
13416 SmallVector<unsigned, 2> Indices;
13417 SmallVector<unsigned, 2> SubVecSizes;
13418 std::optional<TTI::ShuffleKind> RegShuffleKind =
13419 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13420 if (!RegShuffleKind) {
13421 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13422 !ShuffleVectorInst::isIdentityMask(
13423 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13424 Cost +=
13425 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13426 getWidenedType(ScalarTy, NumElts), MaskSlice);
13427 continue;
13428 }
13429 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13430 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13431 Cost +=
13432 ::getShuffleCost(TTI, *RegShuffleKind,
13433 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13434 }
13435 const unsigned BaseVF = getFullVectorNumberOfElements(
13436 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13437 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13438 assert((Idx + SubVecSize) <= BaseVF &&
13439 "SK_ExtractSubvector index out of range");
13440 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13441 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13442 Idx, getWidenedType(ScalarTy, SubVecSize));
13443 }
13444 // Second attempt to check if just a permute is estimated as cheaper than a
13445 // subvector extract.
13446 SubMask.assign(NumElts, PoisonMaskElem);
13447 copy(MaskSlice, SubMask.begin());
13448 InstructionCost OriginalCost = ::getShuffleCost(
13449 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13450 if (OriginalCost < Cost)
13451 Cost = OriginalCost;
13452 }
13453 return Cost;
13454 }
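// Illustrative example (commentary only): for 8 extractelements coming from
// two 4-wide source vectors with NumParts == 2, the mask is processed in two
// 4-element slices; a slice that reads one source register with an identity
// pattern adds no cost, a slice mixing two registers is charged as a
// two-source per-register permute plus the extract-subvector costs, and the
// final comparison above falls back to the single wide permute if that turns
// out cheaper.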
13455 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13456 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13457 /// elements.
13458 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13459 ArrayRef<int> Mask, unsigned Part,
13460 unsigned SliceSize) {
13461 if (SameNodesEstimated) {
13462 // Delay the cost estimation if the same nodes are reshuffling.
13463 // If we already requested the cost of reshuffling of E1 and E2 before, no
13464 // need to estimate another cost with the sub-Mask, instead include this
13465 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13466 // estimation.
13467 if ((InVectors.size() == 2 &&
13468 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13469 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13470 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13471 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13472 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13473 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13474 "Expected all poisoned elements.");
13475 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13476 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13477 return;
13478 }
13479 // Found non-matching nodes - need to estimate the cost for the matched
13480 // and transform mask.
13481 Cost += createShuffle(InVectors.front(),
13482 InVectors.size() == 1 ? nullptr : InVectors.back(),
13483 CommonMask);
13484 transformMaskAfterShuffle(CommonMask, CommonMask);
13485 } else if (InVectors.size() == 2) {
13486 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13487 transformMaskAfterShuffle(CommonMask, CommonMask);
13488 }
13489 SameNodesEstimated = false;
13490 if (!E2 && InVectors.size() == 1) {
13491 unsigned VF = E1.getVectorFactor();
13492 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13493 VF = std::max(VF, getVF(V1));
13494 } else {
13495 const auto *E = cast<const TreeEntry *>(InVectors.front());
13496 VF = std::max(VF, E->getVectorFactor());
13497 }
13498 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13499 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13500 CommonMask[Idx] = Mask[Idx] + VF;
13501 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13502 transformMaskAfterShuffle(CommonMask, CommonMask);
13503 } else {
13504 auto P = InVectors.front();
13505 Cost += createShuffle(&E1, E2, Mask);
13506 unsigned VF = Mask.size();
13507 if (Value *V1 = dyn_cast<Value *>(P)) {
13508 VF = std::max(VF,
13509 getNumElements(V1->getType()));
13510 } else {
13511 const auto *E = cast<const TreeEntry *>(P);
13512 VF = std::max(VF, E->getVectorFactor());
13513 }
13514 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13515 if (Mask[Idx] != PoisonMaskElem)
13516 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13517 Cost += createShuffle(P, InVectors.front(), CommonMask);
13518 transformMaskAfterShuffle(CommonMask, CommonMask);
13519 }
13520 }
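  // Example of the mask merging above: with a 4-wide CommonMask {0, 1, poison,
  // poison} and a newly added node whose Mask is {poison, poison, 0, 1}, lanes
  // taken from the new source are biased by VF = 4, giving CommonMask
  // {0, 1, 4, 5} for the pending two-source shuffle.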
13521
13522 class ShuffleCostBuilder {
13523 const TargetTransformInfo &TTI;
13524
13525 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13526 int Index = -1;
13527 return Mask.empty() ||
13528 (VF == Mask.size() &&
13529 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13530 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13531 Index == 0);
13532 }
13533
13534 public:
13535 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13536 ~ShuffleCostBuilder() = default;
13537 InstructionCost createShuffleVector(Value *V1, Value *,
13538 ArrayRef<int> Mask) const {
13539 // Empty mask or identity mask are free.
13540 unsigned VF =
13541 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13542 if (isEmptyOrIdentity(Mask, VF))
13543 return TTI::TCC_Free;
13544 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13545 cast<VectorType>(V1->getType()), Mask);
13546 }
13547 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13548 // Empty mask or identity mask are free.
13549 unsigned VF =
13550 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13551 if (isEmptyOrIdentity(Mask, VF))
13552 return TTI::TCC_Free;
13553 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13554 cast<VectorType>(V1->getType()), Mask);
13555 }
13556 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13557 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13558 return TTI::TCC_Free;
13559 }
13560 void resizeToMatch(Value *&, Value *&) const {}
13561 };
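  // ShuffleCostBuilder is the cost-model counterpart of the IR-emitting shuffle
  // builder: plugged into BaseShuffleAnalysis::createShuffle, it returns TTI
  // shuffle costs instead of creating instructions and treats empty or identity
  // masks (including an extract of the leading subvector) as free.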
13562
13563 /// Smart shuffle cost estimation: walks through the shuffle trees and
13564 /// tries to find the best matching vectors for the actual shuffle
13565 /// instruction being costed.
13566 InstructionCost
13567 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13568 const PointerUnion<Value *, const TreeEntry *> &P2,
13569 ArrayRef<int> Mask) {
13570 ShuffleCostBuilder Builder(TTI);
13571 SmallVector<int> CommonMask(Mask);
13572 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13573 unsigned CommonVF = Mask.size();
13574 InstructionCost ExtraCost = 0;
13575 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13576 unsigned VF) -> InstructionCost {
13577 if (E.isGather() && allConstant(E.Scalars))
13578 return TTI::TCC_Free;
13579 Type *EScalarTy = E.Scalars.front()->getType();
13580 bool IsSigned = true;
13581 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13582 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13583 IsSigned = It->second.second;
13584 }
13585 if (EScalarTy != ScalarTy) {
13586 unsigned CastOpcode = Instruction::Trunc;
13587 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13588 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13589 if (DstSz > SrcSz)
13590 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13591 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13592 getWidenedType(EScalarTy, VF),
13593 TTI::CastContextHint::None, CostKind);
13594 }
13595 return TTI::TCC_Free;
13596 };
13597 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13598 if (isa<Constant>(V))
13599 return TTI::TCC_Free;
13600 auto *VecTy = cast<VectorType>(V->getType());
13601 Type *EScalarTy = VecTy->getElementType();
13602 if (EScalarTy != ScalarTy) {
13603 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13604 unsigned CastOpcode = Instruction::Trunc;
13605 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13606 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13607 if (DstSz > SrcSz)
13608 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13609 return TTI.getCastInstrCost(
13610 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13611 VecTy, TTI::CastContextHint::None, CostKind);
13612 }
13613 return TTI::TCC_Free;
13614 };
13615 if (!V1 && !V2 && !P2.isNull()) {
13616 // Shuffle 2 entry nodes.
13617 const TreeEntry *E = cast<const TreeEntry *>(P1);
13618 unsigned VF = E->getVectorFactor();
13619 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13620 CommonVF = std::max(VF, E2->getVectorFactor());
13621 assert(all_of(Mask,
13622 [=](int Idx) {
13623 return Idx < 2 * static_cast<int>(CommonVF);
13624 }) &&
13625 "All elements in mask must be less than 2 * CommonVF.");
13626 if (E->Scalars.size() == E2->Scalars.size()) {
13627 SmallVector<int> EMask = E->getCommonMask();
13628 SmallVector<int> E2Mask = E2->getCommonMask();
13629 if (!EMask.empty() || !E2Mask.empty()) {
13630 for (int &Idx : CommonMask) {
13631 if (Idx == PoisonMaskElem)
13632 continue;
13633 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13634 Idx = EMask[Idx];
13635 else if (Idx >= static_cast<int>(CommonVF))
13636 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13637 E->Scalars.size();
13638 }
13639 }
13640 CommonVF = E->Scalars.size();
13641 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13642 GetNodeMinBWAffectedCost(*E2, CommonVF);
13643 } else {
13644 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13645 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13646 }
13647 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13648 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13649 } else if (!V1 && P2.isNull()) {
13650 // Shuffle single entry node.
13651 const TreeEntry *E = cast<const TreeEntry *>(P1);
13652 unsigned VF = E->getVectorFactor();
13653 CommonVF = VF;
13654 assert(
13655 all_of(Mask,
13656 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13657 "All elements in mask must be less than CommonVF.");
13658 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13659 SmallVector<int> EMask = E->getCommonMask();
13660 assert(!EMask.empty() && "Expected non-empty common mask.");
13661 for (int &Idx : CommonMask) {
13662 if (Idx != PoisonMaskElem)
13663 Idx = EMask[Idx];
13664 }
13665 CommonVF = E->Scalars.size();
13666 } else if (unsigned Factor = E->getInterleaveFactor();
13667 Factor > 0 && E->Scalars.size() != Mask.size() &&
13668 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13669 Factor)) {
13670 // Deinterleaved nodes are free.
13671 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13672 }
13673 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13674 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13675 // Not identity/broadcast? Try to see if the original vector is better.
13676 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13677 CommonVF == CommonMask.size() &&
13678 any_of(enumerate(CommonMask),
13679 [](const auto &&P) {
13680 return P.value() != PoisonMaskElem &&
13681 static_cast<unsigned>(P.value()) != P.index();
13682 }) &&
13683 any_of(CommonMask,
13684 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13685 SmallVector<int> ReorderMask;
13686 inversePermutation(E->ReorderIndices, ReorderMask);
13687 ::addMask(CommonMask, ReorderMask);
13688 }
13689 } else if (V1 && P2.isNull()) {
13690 // Shuffle single vector.
13691 ExtraCost += GetValueMinBWAffectedCost(V1);
13692 CommonVF = getVF(V1);
13693 assert(
13694 all_of(Mask,
13695 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13696 "All elements in mask must be less than CommonVF.");
13697 } else if (V1 && !V2) {
13698 // Shuffle vector and tree node.
13699 unsigned VF = getVF(V1);
13700 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13701 CommonVF = std::max(VF, E2->getVectorFactor());
13702 assert(all_of(Mask,
13703 [=](int Idx) {
13704 return Idx < 2 * static_cast<int>(CommonVF);
13705 }) &&
13706 "All elements in mask must be less than 2 * CommonVF.");
13707 if (E2->Scalars.size() == VF && VF != CommonVF) {
13708 SmallVector<int> E2Mask = E2->getCommonMask();
13709 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13710 for (int &Idx : CommonMask) {
13711 if (Idx == PoisonMaskElem)
13712 continue;
13713 if (Idx >= static_cast<int>(CommonVF))
13714 Idx = E2Mask[Idx - CommonVF] + VF;
13715 }
13716 CommonVF = VF;
13717 }
13718 ExtraCost += GetValueMinBWAffectedCost(V1);
13719 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13720 ExtraCost += GetNodeMinBWAffectedCost(
13721 *E2, std::min(CommonVF, E2->getVectorFactor()));
13722 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13723 } else if (!V1 && V2) {
13724 // Shuffle vector and tree node.
13725 unsigned VF = getVF(V2);
13726 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13727 CommonVF = std::max(VF, E1->getVectorFactor());
13728 assert(all_of(Mask,
13729 [=](int Idx) {
13730 return Idx < 2 * static_cast<int>(CommonVF);
13731 }) &&
13732 "All elements in mask must be less than 2 * CommonVF.");
13733 if (E1->Scalars.size() == VF && VF != CommonVF) {
13734 SmallVector<int> E1Mask = E1->getCommonMask();
13735 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13736 for (int &Idx : CommonMask) {
13737 if (Idx == PoisonMaskElem)
13738 continue;
13739 if (Idx >= static_cast<int>(CommonVF))
13740 Idx = E1Mask[Idx - CommonVF] + VF;
13741 else
13742 Idx = E1Mask[Idx];
13743 }
13744 CommonVF = VF;
13745 }
13746 ExtraCost += GetNodeMinBWAffectedCost(
13747 *E1, std::min(CommonVF, E1->getVectorFactor()));
13748 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13749 ExtraCost += GetValueMinBWAffectedCost(V2);
13750 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13751 } else {
13752 assert(V1 && V2 && "Expected both vectors.");
13753 unsigned VF = getVF(V1);
13754 CommonVF = std::max(VF, getVF(V2));
13755 assert(all_of(Mask,
13756 [=](int Idx) {
13757 return Idx < 2 * static_cast<int>(CommonVF);
13758 }) &&
13759 "All elements in mask must be less than 2 * CommonVF.");
13760 ExtraCost +=
13761 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13762 if (V1->getType() != V2->getType()) {
13763 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13764 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13765 } else {
13766 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13767 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13768 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13769 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13770 }
13771 }
13772 InVectors.front() =
13773 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13774 if (InVectors.size() == 2)
13775 InVectors.pop_back();
13776 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13777 V1, V2, CommonMask, Builder, ScalarTy);
13778 }
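  // The value returned above is the extra cast cost implied by MinBWs plus the
  // cost of the final one- or two-source permutation computed by the generic
  // BaseShuffleAnalysis::createShuffle driver with ShuffleCostBuilder.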
13779
13780public:
13781 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13782 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13783 SmallPtrSetImpl<Value *> &CheckedExtracts)
13784 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13785 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13786 CheckedExtracts(CheckedExtracts) {}
13787 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13788 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13789 unsigned NumParts, bool &UseVecBaseAsInput) {
13790 UseVecBaseAsInput = false;
13791 if (Mask.empty())
13792 return nullptr;
13793 Value *VecBase = nullptr;
13794 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13795 if (!E->ReorderIndices.empty()) {
13796 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13797 E->ReorderIndices.end());
13798 reorderScalars(VL, ReorderMask);
13799 }
13800 // Check if the extracts can be considered reused, i.e. the same
13801 // extractelements were already vectorized in an earlier node.
13802 bool PrevNodeFound = any_of(
13803 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13804 [&](const std::unique_ptr<TreeEntry> &TE) {
13805 return ((TE->hasState() && !TE->isAltShuffle() &&
13806 TE->getOpcode() == Instruction::ExtractElement) ||
13807 TE->isGather()) &&
13808 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13809 return VL.size() > Data.index() &&
13810 (Mask[Data.index()] == PoisonMaskElem ||
13811 isa<UndefValue>(VL[Data.index()]) ||
13812 Data.value() == VL[Data.index()]);
13813 });
13814 });
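      // PrevNodeFound is true when an earlier tree entry already covers the same
      // extractelements; in that case this gather is treated as a reuse and its
      // shuffle cost is not added again below.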
13815 SmallPtrSet<Value *, 4> UniqueBases;
13816 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13817 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13818 for (unsigned Part : seq<unsigned>(NumParts)) {
13819 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13820 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13821 for (auto [I, V] :
13822 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13823 // Ignore non-extractelement scalars.
13824 if (isa<UndefValue>(V) ||
13825 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13826 continue;
13827 // If all users of the instruction are going to be vectorized and the
13828 // instruction itself is not going to be vectorized, consider this
13829 // instruction dead and remove its cost from the final cost of the
13830 // vectorized tree.
13831 // Also, avoid adjusting the cost for extractelements with multiple uses
13832 // in different graph entries.
13833 auto *EE = cast<ExtractElementInst>(V);
13834 VecBase = EE->getVectorOperand();
13835 UniqueBases.insert(VecBase);
13836 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13837 if (!CheckedExtracts.insert(V).second ||
13838 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13839 any_of(EE->users(),
13840 [&](User *U) {
13841 return isa<GetElementPtrInst>(U) &&
13842 !R.areAllUsersVectorized(cast<Instruction>(U),
13843 &VectorizedVals);
13844 }) ||
13845 (!VEs.empty() && !is_contained(VEs, E)))
13846 continue;
13847 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13848 if (!EEIdx)
13849 continue;
13850 unsigned Idx = *EEIdx;
13851 // Take credit for instruction that will become dead.
13852 if (EE->hasOneUse() || !PrevNodeFound) {
13853 Instruction *Ext = EE->user_back();
13854 if (isa<SExtInst, ZExtInst>(Ext) &&
13855 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13856 // Use getExtractWithExtendCost() to calculate the cost of
13857 // extractelement/ext pair.
13858 Cost -= TTI.getExtractWithExtendCost(
13859 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13860 Idx, CostKind);
13861 // Add back the cost of s|zext which is subtracted separately.
13862 Cost += TTI.getCastInstrCost(
13863 Ext->getOpcode(), Ext->getType(), EE->getType(),
13864 TTI::CastContextHint::None, CostKind);
13865 continue;
13866 }
13867 }
13868 APInt &DemandedElts =
13869 VectorOpsToExtracts
13870 .try_emplace(VecBase,
13871 APInt::getZero(getNumElements(VecBase->getType())))
13872 .first->getSecond();
13873 DemandedElts.setBit(Idx);
13874 }
13875 }
13876 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13877 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13878 DemandedElts, /*Insert=*/false,
13879 /*Extract=*/true, CostKind);
13880 // Check that the gather of extractelements can be represented as just a
13881 // shuffle of the one or two vectors the scalars are extracted from.
13882 // We have found a bunch of extractelement instructions that must be gathered
13883 // into a vector and can be represented as a permutation of elements of a
13884 // single input vector or of two input vectors.
13885 // Skip this if the same extractelements were already vectorized (reused).
13886 if (!PrevNodeFound)
13887 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
13888 InVectors.assign(1, E);
13889 CommonMask.assign(Mask.begin(), Mask.end());
13890 transformMaskAfterShuffle(CommonMask, CommonMask);
13891 SameNodesEstimated = false;
13892 if (NumParts != 1 && UniqueBases.size() != 1) {
13893 UseVecBaseAsInput = true;
13894 VecBase =
13895 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13896 }
13897 return VecBase;
13898 }
13899 /// Checks if the specified entry \p E needs to be delayed because of its
13900 /// dependency nodes.
13901 std::optional<InstructionCost>
13902 needToDelay(const TreeEntry *,
13903 ArrayRef<SmallVector<const TreeEntry *>>) const {
13904 // No need to delay the cost estimation during analysis.
13905 return std::nullopt;
13906 }
13907 /// Reset the builder to handle perfect diamond match.
13908 void resetForSameNode() {
13909 IsFinalized = false;
13910 CommonMask.clear();
13911 InVectors.clear();
13912 Cost = 0;
13913 VectorizedVals.clear();
13914 SameNodesEstimated = true;
13915 }
13916 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
13917 if (&E1 == &E2) {
13918 assert(all_of(Mask,
13919 [&](int Idx) {
13920 return Idx < static_cast<int>(E1.getVectorFactor());
13921 }) &&
13922 "Expected single vector shuffle mask.");
13923 add(E1, Mask);
13924 return;
13925 }
13926 if (InVectors.empty()) {
13927 CommonMask.assign(Mask.begin(), Mask.end());
13928 InVectors.assign({&E1, &E2});
13929 return;
13930 }
13931 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13932 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13933 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13934 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13935 const auto *It =
13936 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13937 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13938 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
13939 }
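  // Both add() overloads split the mask into register-sized slices and pass the
  // index of the slice holding the first defined element to
  // estimateNodesPermuteCost; the rest of the mask is merged through CommonMask.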
13940 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
13941 if (InVectors.empty()) {
13942 CommonMask.assign(Mask.begin(), Mask.end());
13943 InVectors.assign(1, &E1);
13944 return;
13945 }
13946 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13947 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13948 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13949 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13950 const auto *It =
13951 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13952 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13953 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
13954 if (!SameNodesEstimated && InVectors.size() == 1)
13955 InVectors.emplace_back(&E1);
13956 }
13957 /// Adds 2 input vectors and the mask for their shuffling.
13958 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
13959 // This can only be reached when shuffling two vectors of extractelements,
13960 // which was already handled in adjustExtracts.
13961 assert(InVectors.size() == 1 &&
13962 all_of(enumerate(CommonMask),
13963 [&](auto P) {
13964 if (P.value() == PoisonMaskElem)
13965 return Mask[P.index()] == PoisonMaskElem;
13966 auto *EI = cast<ExtractElementInst>(
13967 cast<const TreeEntry *>(InVectors.front())
13968 ->getOrdered(P.index()));
13969 return EI->getVectorOperand() == V1 ||
13970 EI->getVectorOperand() == V2;
13971 }) &&
13972 "Expected extractelement vectors.");
13973 }
13974 /// Adds one more input vector and the mask for the shuffling.
13975 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
13976 if (InVectors.empty()) {
13977 assert(CommonMask.empty() && !ForExtracts &&
13978 "Expected empty input mask/vectors.");
13979 CommonMask.assign(Mask.begin(), Mask.end());
13980 InVectors.assign(1, V1);
13981 return;
13982 }
13983 if (ForExtracts) {
13984 // No need to add vectors here, already handled them in adjustExtracts.
13985 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
13986 !CommonMask.empty() &&
13987 all_of(enumerate(CommonMask),
13988 [&](auto P) {
13989 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
13990 ->getOrdered(P.index());
13991 if (P.value() == PoisonMaskElem)
13992 return P.value() == Mask[P.index()] ||
13993 isa<UndefValue>(Scalar);
13994 if (isa<Constant>(V1))
13995 return true;
13996 auto *EI = cast<ExtractElementInst>(Scalar);
13997 return EI->getVectorOperand() == V1;
13998 }) &&
13999 "Expected only tree entry for extractelement vectors.");
14000 return;
14001 }
14002 assert(!InVectors.empty() && !CommonMask.empty() &&
14003 "Expected only tree entries from extracts/reused buildvectors.");
14004 unsigned VF = getVF(V1);
14005 if (InVectors.size() == 2) {
14006 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14007 transformMaskAfterShuffle(CommonMask, CommonMask);
14008 VF = std::max<unsigned>(VF, CommonMask.size());
14009 } else if (const auto *InTE =
14010 InVectors.front().dyn_cast<const TreeEntry *>()) {
14011 VF = std::max(VF, InTE->getVectorFactor());
14012 } else {
14013 VF = std::max(
14014 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14015 ->getNumElements());
14016 }
14017 InVectors.push_back(V1);
14018 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14019 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14020 CommonMask[Idx] = Mask[Idx] + VF;
14021 }
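  // For cost modeling, gather() below does not emit IR: it accumulates the
  // build-vector cost and returns a constant placeholder of the right vector
  // type so the surrounding shuffle-cost machinery can keep tracking widths.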
14022 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14023 Value *Root = nullptr) {
14024 Cost += getBuildVectorCost(VL, Root);
14025 if (!Root) {
14026 // FIXME: Need to find a way to avoid use of getNullValue here.
14028 unsigned VF = VL.size();
14029 if (MaskVF != 0)
14030 VF = std::min(VF, MaskVF);
14031 Type *VLScalarTy = VL.front()->getType();
14032 for (Value *V : VL.take_front(VF)) {
14033 Type *ScalarTy = VLScalarTy->getScalarType();
14034 if (isa<PoisonValue>(V)) {
14035 Vals.push_back(PoisonValue::get(ScalarTy));
14036 continue;
14037 }
14038 if (isa<UndefValue>(V)) {
14039 Vals.push_back(UndefValue::get(ScalarTy));
14040 continue;
14041 }
14042 Vals.push_back(Constant::getNullValue(ScalarTy));
14043 }
14044 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14045 assert(SLPReVec && "FixedVectorType is not expected.");
14046 // When REVEC is enabled, we need to expand vector types into scalar
14047 // types.
14048 Vals = replicateMask(Vals, VecTy->getNumElements());
14049 }
14050 return ConstantVector::get(Vals);
14051 }
14052 return ConstantVector::getSplat(
14053 ElementCount::getFixed(
14054 cast<FixedVectorType>(Root->getType())->getNumElements()),
14055 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14056 }
14058 /// Finalize emission of the shuffles.
14060 ArrayRef<int> ExtMask,
14061 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14062 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14065 Action = {}) {
14066 IsFinalized = true;
14067 if (Action) {
14068 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14069 if (InVectors.size() == 2)
14070 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14071 else
14072 Cost += createShuffle(Vec, nullptr, CommonMask);
14073 transformMaskAfterShuffle(CommonMask, CommonMask);
14074 assert(VF > 0 &&
14075 "Expected vector length for the final value before action.");
14076 Value *V = cast<Value *>(Vec);
14077 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14078 Cost += createShuffle(V1, V2, Mask);
14079 return V1;
14080 });
14081 InVectors.front() = V;
14082 }
14083 if (!SubVectors.empty()) {
14084 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14085 if (InVectors.size() == 2)
14086 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14087 else
14088 Cost += createShuffle(Vec, nullptr, CommonMask);
14089 transformMaskAfterShuffle(CommonMask, CommonMask);
14090 // Add subvectors permutation cost.
14091 if (!SubVectorsMask.empty()) {
14092 assert(SubVectorsMask.size() <= CommonMask.size() &&
14093 "Expected same size of masks for subvectors and common mask.");
14094 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14095 copy(SubVectorsMask, SVMask.begin());
14096 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14097 if (I2 != PoisonMaskElem) {
14098 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14099 I1 = I2 + CommonMask.size();
14100 }
14101 }
14102 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14103 getWidenedType(ScalarTy, CommonMask.size()),
14104 SVMask, CostKind);
14105 }
14106 for (auto [E, Idx] : SubVectors) {
14107 Type *EScalarTy = E->Scalars.front()->getType();
14108 bool IsSigned = true;
14109 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14110 EScalarTy =
14111 IntegerType::get(EScalarTy->getContext(), It->second.first);
14112 IsSigned = It->second.second;
14113 }
14114 if (ScalarTy != EScalarTy) {
14115 unsigned CastOpcode = Instruction::Trunc;
14116 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14117 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14118 if (DstSz > SrcSz)
14119 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14120 Cost += TTI.getCastInstrCost(
14121 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14122 getWidenedType(EScalarTy, E->getVectorFactor()),
14123 TTI::CastContextHint::None, CostKind);
14124 }
14125 Cost += ::getShuffleCost(
14126 TTI, TTI::SK_InsertSubvector,
14127 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14128 getWidenedType(ScalarTy, E->getVectorFactor()));
14129 if (!CommonMask.empty()) {
14130 std::iota(std::next(CommonMask.begin(), Idx),
14131 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14132 Idx);
14133 }
14134 }
14135 }
14136
14137 if (!ExtMask.empty()) {
14138 if (CommonMask.empty()) {
14139 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14140 } else {
14141 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14142 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14143 if (ExtMask[I] == PoisonMaskElem)
14144 continue;
14145 NewMask[I] = CommonMask[ExtMask[I]];
14146 }
14147 CommonMask.swap(NewMask);
14148 }
14149 }
14150 if (CommonMask.empty()) {
14151 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14152 return Cost;
14153 }
14154 return Cost +
14155 createShuffle(InVectors.front(),
14156 InVectors.size() == 2 ? InVectors.back() : nullptr,
14157 CommonMask);
14158 }
14159
14160 ~ShuffleCostEstimator() {
14161 assert((IsFinalized || CommonMask.empty()) &&
14162 "Shuffle construction must be finalized.");
14163 }
14164};
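// The estimator above replays the same add()/adjustExtracts()/finalize()
// protocol as the IR-emitting shuffle builder used during codegen, but every
// step contributes a TTI cost instead of creating instructions.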
14165
14166const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14167 unsigned Idx) const {
14168 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14169 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14170 return Op;
14171}
14172
14173TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14174 if (TE.State == TreeEntry::ScatterVectorize ||
14175 TE.State == TreeEntry::StridedVectorize)
14176 return TTI::CastContextHint::GatherScatter;
14177 if (TE.State == TreeEntry::CompressVectorize)
14178 return TTI::CastContextHint::Masked;
14179 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14180 !TE.isAltShuffle()) {
14181 if (TE.ReorderIndices.empty())
14182 return TTI::CastContextHint::Normal;
14183 SmallVector<int> Mask;
14184 inversePermutation(TE.ReorderIndices, Mask);
14185 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14186 return TTI::CastContextHint::Reversed;
14187 }
14188 return TTI::CastContextHint::None;
14189}
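// Example: a vectorized load whose ReorderIndices invert to a reverse mask is
// costed with CastContextHint::Reversed, which lets targets price a cast fed by
// a reversed load more precisely.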
14190
14191 InstructionCost
14192 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14193 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14194 ArrayRef<Value *> VL = E->Scalars;
14195
14196 Type *ScalarTy = getValueType(VL[0]);
14197 if (!isValidElementType(ScalarTy))
14198 return InstructionCost::getInvalid();
14200
14201 // If we have computed a smaller type for the expression, update VecTy so
14202 // that the costs will be accurate.
14203 auto It = MinBWs.find(E);
14204 Type *OrigScalarTy = ScalarTy;
14205 if (It != MinBWs.end()) {
14206 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14207 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14208 if (VecTy)
14209 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14210 }
14211 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14212 unsigned EntryVF = E->getVectorFactor();
14213 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14214
14215 if (E->isGather()) {
14216 if (allConstant(VL))
14217 return 0;
14218 if (isa<InsertElementInst>(VL[0]))
14219 return InstructionCost::getInvalid();
14220 if (isa<CmpInst>(VL.front()))
14221 ScalarTy = VL.front()->getType();
14222 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14223 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14224 }
14225 if (E->State == TreeEntry::SplitVectorize) {
14226 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14227 "Expected exactly 2 combined entries.");
14228 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14229 InstructionCost VectorCost = 0;
14230 if (E->ReorderIndices.empty()) {
14231 VectorCost = ::getShuffleCost(
14232 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14233 E->CombinedEntriesWithIndices.back().second,
14234 getWidenedType(
14235 ScalarTy,
14236 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14237 ->getVectorFactor()));
14238 } else {
14239 unsigned CommonVF =
14240 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14241 ->getVectorFactor(),
14242 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14243 ->getVectorFactor());
14244 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14245 getWidenedType(ScalarTy, CommonVF),
14246 E->getSplitMask(), CostKind);
14247 }
14248 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14249 return VectorCost;
14250 }
14251 InstructionCost CommonCost = 0;
14252 SmallVector<int> Mask;
14253 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14254 (E->State != TreeEntry::StridedVectorize ||
14255 !isReverseOrder(E->ReorderIndices))) {
14256 SmallVector<int> NewMask;
14257 if (E->getOpcode() == Instruction::Store) {
14258 // For stores, the reorder indices already form the mask, no inversion needed.
14259 NewMask.resize(E->ReorderIndices.size());
14260 copy(E->ReorderIndices, NewMask.begin());
14261 } else {
14262 inversePermutation(E->ReorderIndices, NewMask);
14263 }
14264 ::addMask(Mask, NewMask);
14265 }
14266 if (!E->ReuseShuffleIndices.empty())
14267 ::addMask(Mask, E->ReuseShuffleIndices);
14268 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14269 CommonCost =
14270 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
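  // CommonCost now holds the price of reordering/reusing the vectorized value
  // into the shape its users expect; the per-opcode handlers below add it back
  // into their vector cost so the comparison with the scalar code stays fair.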
14271 assert((E->State == TreeEntry::Vectorize ||
14272 E->State == TreeEntry::ScatterVectorize ||
14273 E->State == TreeEntry::StridedVectorize ||
14274 E->State == TreeEntry::CompressVectorize) &&
14275 "Unhandled state");
14276 assert(E->getOpcode() &&
14277 ((allSameType(VL) && allSameBlock(VL)) ||
14278 (E->getOpcode() == Instruction::GetElementPtr &&
14279 E->getMainOp()->getType()->isPointerTy()) ||
14280 E->hasCopyableElements()) &&
14281 "Invalid VL");
14282 Instruction *VL0 = E->getMainOp();
14283 unsigned ShuffleOrOp =
14284 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14285 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14286 ShuffleOrOp = E->CombinedOp;
14287 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14288 const unsigned Sz = UniqueValues.size();
14289 SmallBitVector UsedScalars(Sz, false);
14290 for (unsigned I = 0; I < Sz; ++I) {
14291 if (isa<Instruction>(UniqueValues[I]) &&
14292 !E->isCopyableElement(UniqueValues[I]) &&
14293 getTreeEntries(UniqueValues[I]).front() == E)
14294 continue;
14295 UsedScalars.set(I);
14296 }
14297 auto GetCastContextHint = [&](Value *V) {
14298 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14299 return getCastContextHint(*OpTEs.front());
14300 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14301 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14302 !SrcState.isAltShuffle())
14303 return TTI::CastContextHint::GatherScatter;
14304 return TTI::CastContextHint::None;
14305 };
14306 auto GetCostDiff =
14307 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14308 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14309 // Calculate the cost of this instruction.
14310 InstructionCost ScalarCost = 0;
14311 if (isa<CastInst, CallInst>(VL0)) {
14312 // For some instructions there is no need to calculate the cost for each
14313 // particular instance; we can use the cost of a single
14314 // instruction multiplied by the total number of scalar instructions.
14315 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14316 } else {
14317 for (unsigned I = 0; I < Sz; ++I) {
14318 if (UsedScalars.test(I))
14319 continue;
14320 ScalarCost += ScalarEltCost(I);
14321 }
14322 }
14323
14324 InstructionCost VecCost = VectorCost(CommonCost);
14325 // Check if the current node must be resized to match its user when the
14326 // parent node was not resized itself.
14327 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14328 E->Idx != 0 &&
14329 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14330 const EdgeInfo &EI = E->UserTreeIndex;
14331 if (!EI.UserTE->hasState() ||
14332 EI.UserTE->getOpcode() != Instruction::Select ||
14333 EI.EdgeIdx != 0) {
14334 auto UserBWIt = MinBWs.find(EI.UserTE);
14335 Type *UserScalarTy =
14336 (EI.UserTE->isGather() ||
14337 EI.UserTE->State == TreeEntry::SplitVectorize)
14338 ? EI.UserTE->Scalars.front()->getType()
14339 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14340 if (UserBWIt != MinBWs.end())
14341 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14342 UserBWIt->second.first);
14343 if (ScalarTy != UserScalarTy) {
14344 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14345 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14346 unsigned VecOpcode;
14347 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14348 if (BWSz > SrcBWSz)
14349 VecOpcode = Instruction::Trunc;
14350 else
14351 VecOpcode =
14352 It->second.second ? Instruction::SExt : Instruction::ZExt;
14353 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14354 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14355 CostKind);
14356 }
14357 }
14358 }
14359 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14360 ScalarCost, "Calculated costs for Tree"));
14361 return VecCost - ScalarCost;
14362 };
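  // GetCostDiff returns (vector cost - scalar cost), so a negative result means
  // the vectorized form of this node is cheaper than keeping the scalars.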
14363 // Calculate the cost difference from vectorizing a set of GEPs.
14364 // A negative value means vectorizing is profitable.
14365 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14366 assert((E->State == TreeEntry::Vectorize ||
14367 E->State == TreeEntry::StridedVectorize ||
14368 E->State == TreeEntry::CompressVectorize) &&
14369 "Entry state expected to be Vectorize, StridedVectorize or "
14370 "MaskedLoadCompressVectorize here.");
14371 InstructionCost ScalarCost = 0;
14372 InstructionCost VecCost = 0;
14373 std::tie(ScalarCost, VecCost) = getGEPCosts(
14374 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14375 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14376 "Calculated GEPs cost for Tree"));
14377
14378 return VecCost - ScalarCost;
14379 };
14380
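  // GetMinMaxCost prices a cmp+select pair as a single min/max intrinsic;
  // pointer types are canonicalized to same-width integers because the integer
  // min/max intrinsics do not take pointer operands.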
14381 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14382 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14383 if (MinMaxID == Intrinsic::not_intrinsic)
14384 return InstructionCost::getInvalid();
14385 Type *CanonicalType = Ty;
14386 if (CanonicalType->isPtrOrPtrVectorTy())
14387 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14388 CanonicalType->getContext(),
14389 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14390
14391 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14392 {CanonicalType, CanonicalType});
14393 InstructionCost IntrinsicCost =
14394 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14395 // If the selects are the only users of the compares, the compares will be
14396 // dead and we can subtract their cost.
14397 if (VI && SelectOnly) {
14398 assert((!Ty->isVectorTy() || SLPReVec) &&
14399 "Expected only for scalar type.");
14400 auto *CI = cast<CmpInst>(VI->getOperand(0));
14401 IntrinsicCost -= TTI->getCmpSelInstrCost(
14402 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14403 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14404 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14405 }
14406 return IntrinsicCost;
14407 };
14408 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14409 Instruction *VI) {
14410 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14411 return Cost;
14412 };
14413 switch (ShuffleOrOp) {
14414 case Instruction::PHI: {
14415 // Count reused scalars.
14416 InstructionCost ScalarCost = 0;
14417 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14418 for (Value *V : UniqueValues) {
14419 auto *PHI = dyn_cast<PHINode>(V);
14420 if (!PHI)
14421 continue;
14422
14423 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14424 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14425 Value *Op = PHI->getIncomingValue(I);
14426 Operands[I] = Op;
14427 }
14428 if (const TreeEntry *OpTE =
14429 getSameValuesTreeEntry(Operands.front(), Operands))
14430 if (CountedOps.insert(OpTE).second &&
14431 !OpTE->ReuseShuffleIndices.empty())
14432 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14433 OpTE->Scalars.size());
14434 }
14435
14436 return CommonCost - ScalarCost;
14437 }
14438 case Instruction::ExtractValue:
14439 case Instruction::ExtractElement: {
14440 APInt DemandedElts;
14441 VectorType *SrcVecTy = nullptr;
14442 auto GetScalarCost = [&](unsigned Idx) {
14443 if (isa<PoisonValue>(UniqueValues[Idx]))
14444 return TTI::TCC_Free;
14445
14446 auto *I = cast<Instruction>(UniqueValues[Idx]);
14447 if (!SrcVecTy) {
14448 if (ShuffleOrOp == Instruction::ExtractElement) {
14449 auto *EE = cast<ExtractElementInst>(I);
14450 SrcVecTy = EE->getVectorOperandType();
14451 } else {
14452 auto *EV = cast<ExtractValueInst>(I);
14453 Type *AggregateTy = EV->getAggregateOperand()->getType();
14454 unsigned NumElts;
14455 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14456 NumElts = ATy->getNumElements();
14457 else
14458 NumElts = AggregateTy->getStructNumElements();
14459 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14460 }
14461 }
14462 if (I->hasOneUse()) {
14463 Instruction *Ext = I->user_back();
14464 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14465 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14466 // Use getExtractWithExtendCost() to calculate the cost of
14467 // extractelement/ext pair.
14468 InstructionCost Cost = TTI->getExtractWithExtendCost(
14469 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14470 CostKind);
14471 // Subtract the cost of the s|zext, which is accounted for separately.
14472 Cost -= TTI->getCastInstrCost(
14473 Ext->getOpcode(), Ext->getType(), I->getType(),
14474 TTI::CastContextHint::None, CostKind);
14475 return Cost;
14476 }
14477 }
14478 if (DemandedElts.isZero())
14479 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14480 DemandedElts.setBit(*getExtractIndex(I));
14482 };
14483 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14484 return CommonCost - (DemandedElts.isZero()
14485 ? TTI::TCC_Free
14486 : TTI.getScalarizationOverhead(
14487 SrcVecTy, DemandedElts, /*Insert=*/false,
14488 /*Extract=*/true, CostKind));
14489 };
14490 return GetCostDiff(GetScalarCost, GetVectorCost);
14491 }
14492 case Instruction::InsertElement: {
14493 assert(E->ReuseShuffleIndices.empty() &&
14494 "Unique insertelements only are expected.");
14495 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14496 unsigned const NumElts = SrcVecTy->getNumElements();
14497 unsigned const NumScalars = VL.size();
14498
14499 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14500
14501 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14502 unsigned OffsetBeg = *getElementIndex(VL.front());
14503 unsigned OffsetEnd = OffsetBeg;
14504 InsertMask[OffsetBeg] = 0;
14505 for (auto [I, V] : enumerate(VL.drop_front())) {
14506 unsigned Idx = *getElementIndex(V);
14507 if (OffsetBeg > Idx)
14508 OffsetBeg = Idx;
14509 else if (OffsetEnd < Idx)
14510 OffsetEnd = Idx;
14511 InsertMask[Idx] = I + 1;
14512 }
14513 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14514 if (NumOfParts > 0 && NumOfParts < NumElts)
14515 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14516 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14517 VecScalarsSz;
14518 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14519 unsigned InsertVecSz = std::min<unsigned>(
14520 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14521 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14522 bool IsWholeSubvector =
14523 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14524 // Check if we can safely insert a subvector. If it is not possible, just
14525 // generate a whole-sized vector and shuffle the source vector and the new
14526 // subvector.
14527 if (OffsetBeg + InsertVecSz > VecSz) {
14528 // Align OffsetBeg to generate correct mask.
14529 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14530 InsertVecSz = VecSz;
14531 }
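      // Example: inserting 4 scalars at positions 2..5 of an 8-element
      // destination with a single register part gives OffsetBeg = 2,
      // OffsetEnd = 5, VecScalarsSz = 8, VecSz = 8, Offset = 0 and
      // InsertVecSz = 4, so a 4-wide subvector is built and, if the original
      // vector is live, inserted at index 2 via SK_InsertSubvector below.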
14532
14533 APInt DemandedElts = APInt::getZero(NumElts);
14534 // TODO: Add support for Instruction::InsertValue.
14535 SmallVector<int> Mask;
14536 if (!E->ReorderIndices.empty()) {
14537 inversePermutation(E->ReorderIndices, Mask);
14538 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14539 } else {
14540 Mask.assign(VecSz, PoisonMaskElem);
14541 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14542 }
14543 bool IsIdentity = true;
14544 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14545 Mask.swap(PrevMask);
14546 for (unsigned I = 0; I < NumScalars; ++I) {
14547 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14548 DemandedElts.setBit(InsertIdx);
14549 IsIdentity &= InsertIdx - OffsetBeg == I;
14550 Mask[InsertIdx - OffsetBeg] = I;
14551 }
14552 assert(Offset < NumElts && "Failed to find vector index offset");
14553
14554 InstructionCost Cost = 0;
14555 Cost -=
14556 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14557 /*Insert*/ true, /*Extract*/ false, CostKind);
14558
14559 // First cost: resize to the actual vector size if this is not an identity
14560 // shuffle or the vector needs to be shifted.
14561 // Do not calculate the cost if the actual size is the register size and
14562 // this shuffle can be merged with the following SK_Select.
14563 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14564 if (!IsIdentity)
14565 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14566 InsertVecTy, Mask);
14567 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14568 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14569 }));
14570 // Second cost: a permutation with the subvector if some elements come from
14571 // the initial vector, or the insertion of a subvector.
14572 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14573 // subvector of ActualVecTy.
14574 SmallBitVector InMask =
14575 isUndefVector(FirstInsert->getOperand(0),
14576 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14577 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14578 if (InsertVecSz != VecSz) {
14579 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14580 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14581 CostKind, OffsetBeg - Offset, InsertVecTy);
14582 } else {
14583 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14584 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14585 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14586 I <= End; ++I)
14587 if (Mask[I] != PoisonMaskElem)
14588 Mask[I] = I + VecSz;
14589 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14590 Mask[I] =
14591 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14592 Cost +=
14593 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14594 }
14595 }
14596 return Cost;
14597 }
14598 case Instruction::ZExt:
14599 case Instruction::SExt:
14600 case Instruction::FPToUI:
14601 case Instruction::FPToSI:
14602 case Instruction::FPExt:
14603 case Instruction::PtrToInt:
14604 case Instruction::IntToPtr:
14605 case Instruction::SIToFP:
14606 case Instruction::UIToFP:
14607 case Instruction::Trunc:
14608 case Instruction::FPTrunc:
14609 case Instruction::BitCast: {
14610 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14611 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14612 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14613 unsigned Opcode = ShuffleOrOp;
14614 unsigned VecOpcode = Opcode;
14615 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14616 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14617 // Check if the values are candidates to demote.
14618 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14619 if (SrcIt != MinBWs.end()) {
14620 SrcBWSz = SrcIt->second.first;
14621 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14622 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14623 SrcVecTy =
14624 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14625 }
14626 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14627 if (BWSz == SrcBWSz) {
14628 VecOpcode = Instruction::BitCast;
14629 } else if (BWSz < SrcBWSz) {
14630 VecOpcode = Instruction::Trunc;
14631 } else if (It != MinBWs.end()) {
14632 assert(BWSz > SrcBWSz && "Invalid cast!");
14633 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14634 } else if (SrcIt != MinBWs.end()) {
14635 assert(BWSz > SrcBWSz && "Invalid cast!");
14636 VecOpcode =
14637 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14638 }
14639 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14640 !SrcIt->second.second) {
14641 VecOpcode = Instruction::UIToFP;
14642 }
14643 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14644 assert(Idx == 0 && "Expected 0 index only");
14645 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14646 VL0->getOperand(0)->getType(),
14648 };
14649 auto GetVectorCost = [=](InstructionCost CommonCost) {
14650 // Do not count cost here if minimum bitwidth is in effect and it is just
14651 // a bitcast (here it is just a noop).
14652 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14653 return CommonCost;
14654 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14655 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14656
14657 bool IsArithmeticExtendedReduction =
14658 E->Idx == 0 && UserIgnoreList &&
14659 all_of(*UserIgnoreList, [](Value *V) {
14660 auto *I = cast<Instruction>(V);
14661 return is_contained({Instruction::Add, Instruction::FAdd,
14662 Instruction::Mul, Instruction::FMul,
14663 Instruction::And, Instruction::Or,
14664 Instruction::Xor},
14665 I->getOpcode());
14666 });
14667 if (IsArithmeticExtendedReduction &&
14668 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14669 return CommonCost;
14670 return CommonCost +
14671 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14672 VecOpcode == Opcode ? VI : nullptr);
14673 };
14674 return GetCostDiff(GetScalarCost, GetVectorCost);
14675 }
14676 case Instruction::FCmp:
14677 case Instruction::ICmp:
14678 case Instruction::Select: {
14679 CmpPredicate VecPred, SwappedVecPred;
14680 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14681 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14682 match(VL0, MatchCmp))
14683 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14684 else
14685 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14686 ? CmpInst::BAD_FCMP_PREDICATE
14687 : CmpInst::BAD_ICMP_PREDICATE;
14688 auto GetScalarCost = [&](unsigned Idx) {
14689 if (isa<PoisonValue>(UniqueValues[Idx]))
14690 return TTI::TCC_Free;
14691
14692 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14693 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14694 ? CmpInst::BAD_FCMP_PREDICATE
14695 : CmpInst::BAD_ICMP_PREDICATE;
14696 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14697 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14698 !match(VI, MatchCmp)) ||
14699 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14700 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14701 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14702 ? CmpInst::BAD_FCMP_PREDICATE
14703 : CmpInst::BAD_ICMP_PREDICATE;
14704
14705 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14706 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14707 CostKind, getOperandInfo(VI->getOperand(0)),
14708 getOperandInfo(VI->getOperand(1)), VI);
14709 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14710 if (IntrinsicCost.isValid())
14711 ScalarCost = IntrinsicCost;
14712
14713 return ScalarCost;
14714 };
14715 auto GetVectorCost = [&](InstructionCost CommonCost) {
14716 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14717
14718 InstructionCost VecCost =
14719 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14720 CostKind, getOperandInfo(E->getOperand(0)),
14721 getOperandInfo(E->getOperand(1)), VL0);
14722 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14723 auto *CondType =
14724 getWidenedType(SI->getCondition()->getType(), VL.size());
14725 unsigned CondNumElements = CondType->getNumElements();
14726 unsigned VecTyNumElements = getNumElements(VecTy);
14727 assert(VecTyNumElements >= CondNumElements &&
14728 VecTyNumElements % CondNumElements == 0 &&
14729 "Cannot vectorize Instruction::Select");
14730 if (CondNumElements != VecTyNumElements) {
14731 // When the return type is i1 but the source is a fixed vector type, we
14732 // need to replicate the condition value.
14733 VecCost += ::getShuffleCost(
14734 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14735 createReplicatedMask(VecTyNumElements / CondNumElements,
14736 CondNumElements));
14737 }
14738 }
14739 return VecCost + CommonCost;
14740 };
14741 return GetCostDiff(GetScalarCost, GetVectorCost);
14742 }
14743 case TreeEntry::MinMax: {
14744 auto GetScalarCost = [&](unsigned Idx) {
14745 return GetMinMaxCost(OrigScalarTy);
14746 };
14747 auto GetVectorCost = [&](InstructionCost CommonCost) {
14748 InstructionCost VecCost = GetMinMaxCost(VecTy);
14749 return VecCost + CommonCost;
14750 };
14751 return GetCostDiff(GetScalarCost, GetVectorCost);
14752 }
14753 case TreeEntry::FMulAdd: {
14754 auto GetScalarCost = [&](unsigned Idx) {
14755 if (isa<PoisonValue>(UniqueValues[Idx]))
14756 return TTI::TCC_Free;
14757 return GetFMulAddCost(E->getOperations(),
14758 cast<Instruction>(UniqueValues[Idx]));
14759 };
14760 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14761 FastMathFlags FMF;
14762 FMF.set();
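        // Intersect the fast-math flags of every scalar fadd/fsub and of its
        // fmul operand; the vector fmuladd must not be more permissive than any
        // of the scalars it replaces.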
14763 for (Value *V : E->Scalars) {
14764 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14765 FMF &= FPCI->getFastMathFlags();
14766 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14767 FMF &= FPCIOp->getFastMathFlags();
14768 }
14769 }
14770 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14771 {VecTy, VecTy, VecTy}, FMF);
14772 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14773 return VecCost + CommonCost;
14774 };
14775 return GetCostDiff(GetScalarCost, GetVectorCost);
14776 }
14777 case Instruction::FNeg:
14778 case Instruction::Add:
14779 case Instruction::FAdd:
14780 case Instruction::Sub:
14781 case Instruction::FSub:
14782 case Instruction::Mul:
14783 case Instruction::FMul:
14784 case Instruction::UDiv:
14785 case Instruction::SDiv:
14786 case Instruction::FDiv:
14787 case Instruction::URem:
14788 case Instruction::SRem:
14789 case Instruction::FRem:
14790 case Instruction::Shl:
14791 case Instruction::LShr:
14792 case Instruction::AShr:
14793 case Instruction::And:
14794 case Instruction::Or:
14795 case Instruction::Xor: {
14796 auto GetScalarCost = [&](unsigned Idx) {
14797 if (isa<PoisonValue>(UniqueValues[Idx]))
14798 return TTI::TCC_Free;
14799
14800 // We cannot retrieve the operand from UniqueValues[Idx] because an
14801 // interchangeable instruction may be used. The order and the actual
14802 // operand might differ from what is retrieved from UniqueValues[Idx].
14803 Value *Op1 = E->getOperand(0)[Idx];
14804 Value *Op2;
14805 SmallVector<const Value *, 2> Operands(1, Op1);
14806 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14807 Op2 = Op1;
14808 } else {
14809 Op2 = E->getOperand(1)[Idx];
14810 Operands.push_back(Op2);
14811 }
14812 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14813 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14814 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14815 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14816 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14817 I && (ShuffleOrOp == Instruction::FAdd ||
14818 ShuffleOrOp == Instruction::FSub)) {
14819 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14820 if (IntrinsicCost.isValid())
14821 ScalarCost = IntrinsicCost;
14822 }
14823 return ScalarCost;
14824 };
14825 auto GetVectorCost = [=](InstructionCost CommonCost) {
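        // If this is an 'and' where one operand is all constants that keep at
        // least MinBW low bits set, the mask is a no-op once the expression is
        // truncated to MinBW bits, so only CommonCost is charged.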
14826 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14827 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14828 ArrayRef<Value *> Ops = E->getOperand(I);
14829 if (all_of(Ops, [&](Value *Op) {
14830 auto *CI = dyn_cast<ConstantInt>(Op);
14831 return CI && CI->getValue().countr_one() >= It->second.first;
14832 }))
14833 return CommonCost;
14834 }
14835 }
14836 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14837 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14838 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14839 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14840 Op2Info, {}, nullptr, TLI) +
14841 CommonCost;
14842 };
14843 return GetCostDiff(GetScalarCost, GetVectorCost);
14844 }
14845 case Instruction::GetElementPtr: {
14846 return CommonCost + GetGEPCostDiff(VL, VL0);
14847 }
14848 case Instruction::Load: {
14849 auto GetScalarCost = [&](unsigned Idx) {
14850 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14851 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14852 VI->getAlign(), VI->getPointerAddressSpace(),
14853 CostKind, TTI::OperandValueInfo(), VI);
14854 };
14855 auto *LI0 = cast<LoadInst>(VL0);
14856 auto GetVectorCost = [&](InstructionCost CommonCost) {
14857 InstructionCost VecLdCost;
14858 switch (E->State) {
14859 case TreeEntry::Vectorize:
14860 if (unsigned Factor = E->getInterleaveFactor()) {
14861 VecLdCost = TTI->getInterleavedMemoryOpCost(
14862 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14863 LI0->getPointerAddressSpace(), CostKind);
14864
14865 } else {
14866 VecLdCost = TTI->getMemoryOpCost(
14867 Instruction::Load, VecTy, LI0->getAlign(),
14868 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14869 }
14870 break;
14871 case TreeEntry::StridedVectorize: {
14872 Align CommonAlignment =
14873 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14874 VecLdCost = TTI->getStridedMemoryOpCost(
14875 Instruction::Load, VecTy, LI0->getPointerOperand(),
14876 /*VariableMask=*/false, CommonAlignment, CostKind);
14877 break;
14878 }
14879 case TreeEntry::CompressVectorize: {
14880 bool IsMasked;
14881 unsigned InterleaveFactor;
14882 SmallVector<int> CompressMask;
14883 VectorType *LoadVecTy;
14884 SmallVector<Value *> Scalars(VL);
14885 if (!E->ReorderIndices.empty()) {
14886 SmallVector<int> Mask(E->ReorderIndices.begin(),
14887 E->ReorderIndices.end());
14888 reorderScalars(Scalars, Mask);
14889 }
14890 SmallVector<Value *> PointerOps(Scalars.size());
14891 for (auto [I, V] : enumerate(Scalars))
14892 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14893 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
14894 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
14895 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
14896 CompressMask, LoadVecTy);
14897 assert(IsVectorized && "Failed to vectorize load");
14898 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
14899 InterleaveFactor, IsMasked);
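          // Cache the compress mask, load type and interleave/masked flags for
          // this entry so later stages can reuse the analysis result.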
14900 Align CommonAlignment = LI0->getAlign();
14901 if (InterleaveFactor) {
14902 VecLdCost = TTI->getInterleavedMemoryOpCost(
14903 Instruction::Load, LoadVecTy, InterleaveFactor, {},
14904 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
14905 } else if (IsMasked) {
14906 VecLdCost = TTI->getMaskedMemoryOpCost(
14907 Instruction::Load, LoadVecTy, CommonAlignment,
14908 LI0->getPointerAddressSpace(), CostKind);
14909 // TODO: include this cost into CommonCost.
14910 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14911 LoadVecTy, CompressMask, CostKind);
14912 } else {
14913 VecLdCost = TTI->getMemoryOpCost(
14914 Instruction::Load, LoadVecTy, CommonAlignment,
14915 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14916 // TODO: include this cost into CommonCost.
14917 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14918 LoadVecTy, CompressMask, CostKind);
14919 }
14920 break;
14921 }
14922 case TreeEntry::ScatterVectorize: {
14923 Align CommonAlignment =
14924 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14925 VecLdCost = TTI->getGatherScatterOpCost(
14926 Instruction::Load, VecTy, LI0->getPointerOperand(),
14927 /*VariableMask=*/false, CommonAlignment, CostKind);
14928 break;
14929 }
14930 case TreeEntry::CombinedVectorize:
14931 case TreeEntry::SplitVectorize:
14932 case TreeEntry::NeedToGather:
14933 llvm_unreachable("Unexpected vectorization state.");
14934 }
14935 return VecLdCost + CommonCost;
14936 };
14937
14938 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
14939 // If this node generates a masked gather load, it is not a terminal node.
14940 // Hence the address operand cost is estimated separately.
14941 if (E->State == TreeEntry::ScatterVectorize)
14942 return Cost;
14943
14944 // Estimate cost of GEPs since this tree node is a terminator.
14945 SmallVector<Value *> PointerOps(VL.size());
14946 for (auto [I, V] : enumerate(VL))
14947 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14948 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
14949 }
14950 case Instruction::Store: {
14951 bool IsReorder = !E->ReorderIndices.empty();
14952 auto GetScalarCost = [=](unsigned Idx) {
14953 auto *VI = cast<StoreInst>(VL[Idx]);
14954 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
14955 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
14956 VI->getAlign(), VI->getPointerAddressSpace(),
14957 CostKind, OpInfo, VI);
14958 };
14959 auto *BaseSI =
14960 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
14961 auto GetVectorCost = [=](InstructionCost CommonCost) {
14962 // We know that we can merge the stores. Calculate the cost.
14963 InstructionCost VecStCost;
14964 if (E->State == TreeEntry::StridedVectorize) {
14965 Align CommonAlignment =
14966 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
14967 VecStCost = TTI->getStridedMemoryOpCost(
14968 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
14969 /*VariableMask=*/false, CommonAlignment, CostKind);
14970 } else {
14971 assert(E->State == TreeEntry::Vectorize &&
14972 "Expected either strided or consecutive stores.");
14973 if (unsigned Factor = E->getInterleaveFactor()) {
14974 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
14975 "No reused shuffles expected");
14976 CommonCost = 0;
14977 VecStCost = TTI->getInterleavedMemoryOpCost(
14978 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
14979 BaseSI->getPointerAddressSpace(), CostKind);
14980 } else {
14981 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
14982 VecStCost = TTI->getMemoryOpCost(
14983 Instruction::Store, VecTy, BaseSI->getAlign(),
14984 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
14985 }
14986 }
14987 return VecStCost + CommonCost;
14988 };
14989 SmallVector<Value *> PointerOps(VL.size());
14990 for (auto [I, V] : enumerate(VL)) {
14991 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
14992 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
14993 }
14994
14995 return GetCostDiff(GetScalarCost, GetVectorCost) +
14996 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
14997 }
14998 case Instruction::Call: {
14999 auto GetScalarCost = [&](unsigned Idx) {
15000 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15001 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15002 if (ID != Intrinsic::not_intrinsic) {
15003 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15004 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15005 }
15006 return TTI->getCallInstrCost(CI->getCalledFunction(),
15007 CI->getFunctionType()->getReturnType(),
15008 CI->getFunctionType()->params(), CostKind);
15009 };
15010 auto GetVectorCost = [=](InstructionCost CommonCost) {
15011 auto *CI = cast<CallInst>(VL0);
15012 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15013 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15014 CI, ID, VecTy->getNumElements(),
15015 It != MinBWs.end() ? It->second.first : 0, TTI);
15016 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15017 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15018 };
15019 return GetCostDiff(GetScalarCost, GetVectorCost);
15020 }
15021 case Instruction::ShuffleVector: {
15022 if (!SLPReVec || E->isAltShuffle())
15023 assert(E->isAltShuffle() &&
15024 ((Instruction::isBinaryOp(E->getOpcode()) &&
15025 Instruction::isBinaryOp(E->getAltOpcode())) ||
15026 (Instruction::isCast(E->getOpcode()) &&
15027 Instruction::isCast(E->getAltOpcode())) ||
15028 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15029 "Invalid Shuffle Vector Operand");
15030 // Try to find the previous shuffle node with the same operands and same
15031 // main/alternate ops.
15032 auto TryFindNodeWithEqualOperands = [=]() {
15033 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15034 if (TE.get() == E)
15035 break;
15036 if (TE->hasState() && TE->isAltShuffle() &&
15037 ((TE->getOpcode() == E->getOpcode() &&
15038 TE->getAltOpcode() == E->getAltOpcode()) ||
15039 (TE->getOpcode() == E->getAltOpcode() &&
15040 TE->getAltOpcode() == E->getOpcode())) &&
15041 TE->hasEqualOperands(*E))
15042 return true;
15043 }
15044 return false;
15045 };
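// (Editor's note, not part of the original source: when such a "diamond" node
// exists, its main/alternate vector instructions are already paid for by that
// node, so the cost below reduces to the extra shuffle that blends them.)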
15046 auto GetScalarCost = [&](unsigned Idx) {
15047 if (isa<PoisonValue>(UniqueValues[Idx]))
15048 return InstructionCost(TTI::TCC_Free);
15049
15050 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15051 assert(E->getMatchingMainOpOrAltOp(VI) &&
15052 "Unexpected main/alternate opcode");
15053 (void)E;
15054 return TTI->getInstructionCost(VI, CostKind);
15055 };
15056 // Need to clear CommonCost since the final shuffle cost is included into
15057 // vector cost.
15058 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15059 // VecCost is equal to sum of the cost of creating 2 vectors
15060 // and the cost of creating shuffle.
15061 InstructionCost VecCost = 0;
15062 if (TryFindNodeWithEqualOperands()) {
15063 LLVM_DEBUG({
15064 dbgs() << "SLP: diamond match for alternate node found.\n";
15065 E->dump();
15066 });
15067 // No need to add new vector costs here since we're going to reuse
15068 // same main/alternate vector ops, just do different shuffling.
15069 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15070 VecCost =
15071 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15072 VecCost +=
15073 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15074 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15075 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15076 VecCost = TTIRef.getCmpSelInstrCost(
15077 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15078 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15079 VL0);
15080 VecCost += TTIRef.getCmpSelInstrCost(
15081 E->getOpcode(), VecTy, MaskTy,
15082 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15083 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15084 E->getAltOp());
15085 } else {
15086 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15087 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15088 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15089 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15090 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15091 unsigned SrcBWSz =
15092 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15093 if (SrcIt != MinBWs.end()) {
15094 SrcBWSz = SrcIt->second.first;
15095 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15096 SrcTy = getWidenedType(SrcSclTy, VL.size());
15097 }
15098 if (BWSz <= SrcBWSz) {
15099 if (BWSz < SrcBWSz)
15100 VecCost =
15101 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15102 TTI::CastContextHint::None, CostKind);
15103 LLVM_DEBUG({
15104 dbgs()
15105 << "SLP: alternate extension, which should be truncated.\n";
15106 E->dump();
15107 });
15108 return VecCost;
15109 }
15110 }
15111 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15112 TTI::CastContextHint::None, CostKind);
15113 VecCost +=
15114 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15115 TTI::CastContextHint::None, CostKind);
15116 }
15117 SmallVector<int> Mask;
15118 E->buildAltOpShuffleMask(
15119 [&](Instruction *I) {
15120 assert(E->getMatchingMainOpOrAltOp(I) &&
15121 "Unexpected main/alternate opcode");
15122 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15123 *TLI);
15124 },
15125 Mask);
15126 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15127 FinalVecTy, Mask, CostKind);
15128 // Patterns like [fadd,fsub] can be combined into a single instruction
15129 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15130 // need to take into account their order when looking for the most used
15131 // order.
15132 unsigned Opcode0 = E->getOpcode();
15133 unsigned Opcode1 = E->getAltOpcode();
15134 SmallBitVector OpcodeMask(
15135 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15136 // If this pattern is supported by the target then we consider the
15137 // order.
15138 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15139 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15140 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15141 return AltVecCost < VecCost ? AltVecCost : VecCost;
15142 }
15143 // TODO: Check the reverse order too.
15144 return VecCost;
15145 };
15146 if (SLPReVec && !E->isAltShuffle())
15147 return GetCostDiff(
15148 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15149 // If a group uses mask in order, the shufflevector can be
15150 // eliminated by instcombine. Then the cost is 0.
15152 "Not supported shufflevector usage.");
15153 auto *SV = cast<ShuffleVectorInst>(VL.front());
15154 unsigned SVNumElements =
15155 cast<FixedVectorType>(SV->getOperand(0)->getType())
15156 ->getNumElements();
15157 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15158 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15159 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15160 int NextIndex = 0;
15161 if (!all_of(Group, [&](Value *V) {
15163 "Not supported shufflevector usage.");
15164 auto *SV = cast<ShuffleVectorInst>(V);
15165 int Index;
15166 [[maybe_unused]] bool IsExtractSubvectorMask =
15167 SV->isExtractSubvectorMask(Index);
15168 assert(IsExtractSubvectorMask &&
15169 "Not supported shufflevector usage.");
15170 if (NextIndex != Index)
15171 return false;
15172 NextIndex += SV->getShuffleMask().size();
15173 return true;
15174 }))
15175 return ::getShuffleCost(
15176 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15177 calculateShufflevectorMask(E->Scalars));
15178 }
15179 return TTI::TCC_Free;
15180 });
15181 return GetCostDiff(GetScalarCost, GetVectorCost);
15182 }
15183 case Instruction::Freeze:
15184 return CommonCost;
15185 default:
15186 llvm_unreachable("Unknown instruction");
15187 }
15188}
15189
15190bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15191 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15192 << VectorizableTree.size() << " is fully vectorizable .\n");
15193
15194 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15195 SmallVector<int> Mask;
15196 return TE->isGather() &&
15197 !any_of(TE->Scalars,
15198 [this](Value *V) { return EphValues.contains(V); }) &&
15199 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15200 TE->Scalars.size() < Limit ||
15201 (((TE->hasState() &&
15202 TE->getOpcode() == Instruction::ExtractElement) ||
15203 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15204 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15205 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15206 !TE->isAltShuffle()) ||
15207 any_of(TE->Scalars, IsaPred<LoadInst>));
15208 };
15209
15210 // We only handle trees of heights 1 and 2.
15211 if (VectorizableTree.size() == 1 &&
15212 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15213 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15214 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15215 (ForReduction &&
15216 AreVectorizableGathers(VectorizableTree[0].get(),
15217 VectorizableTree[0]->Scalars.size()) &&
15218 VectorizableTree[0]->getVectorFactor() > 2)))
15219 return true;
15220
15221 if (VectorizableTree.size() != 2)
15222 return false;
15223
15224 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15225 // whose second node is a gather with fewer scalar operands than the initial
15226 // tree element (it may be profitable to shuffle the second gather), or whose
15227 // scalars are extractelements that form a shuffle.
15228 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15229 AreVectorizableGathers(VectorizableTree[1].get(),
15230 VectorizableTree[0]->Scalars.size()))
15231 return true;
15232
15233 // Gathering cost would be too much for tiny trees.
15234 if (VectorizableTree[0]->isGather() ||
15235 (VectorizableTree[1]->isGather() &&
15236 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15237 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15238 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15239 return false;
15240
15241 return true;
15242}
15243
15244static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15245 TargetTransformInfo *TTI,
15246 bool MustMatchOrInst) {
15247 // Look past the root to find a source value. Arbitrarily follow the
15248 // path through operand 0 of any 'or'. Also, peek through optional
15249 // shift-left-by-multiple-of-8-bits.
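// (Editor's note, illustrative only: the shape being matched is a byte-combine
// such as
//   or(shl(zext(load i8, p+1), 8), zext(load i8, p+0))
// possibly nested further; the backend's load combining can fold it into one
// wider load, so SLP should avoid vectorizing the scalar loads away here.)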
15250 Value *ZextLoad = Root;
15251 const APInt *ShAmtC;
15252 bool FoundOr = false;
15253 while (!isa<ConstantExpr>(ZextLoad) &&
15254 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15255 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15256 ShAmtC->urem(8) == 0))) {
15257 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15258 ZextLoad = BinOp->getOperand(0);
15259 if (BinOp->getOpcode() == Instruction::Or)
15260 FoundOr = true;
15261 }
15262 // Check if the input is an extended load of the required or/shift expression.
15263 Value *Load;
15264 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15265 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15266 return false;
15267
15268 // Require that the total load bit width is a legal integer type.
15269 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15270 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15271 Type *SrcTy = Load->getType();
15272 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15273 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15274 return false;
15275
15276 // Everything matched - assume that we can fold the whole sequence using
15277 // load combining.
15278 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15279 << *(cast<Instruction>(Root)) << "\n");
15280
15281 return true;
15282}
15283
15284bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15285 if (RdxKind != RecurKind::Or)
15286 return false;
15287
15288 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15289 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15290 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15291 /* MatchOr */ false);
15292}
15293
15293
15294bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15295 // Peek through a final sequence of stores and check if all operations are
15296 // likely to be load-combined.
15297 unsigned NumElts = Stores.size();
15298 for (Value *Scalar : Stores) {
15299 Value *X;
15300 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15301 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15302 return false;
15303 }
15304 return true;
15305}
15306
15307bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15308 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15309 return true;
15310
15311 // Graph is empty - do nothing.
15312 if (VectorizableTree.empty()) {
15313 assert(ExternalUses.empty() && "We shouldn't have any external users");
15314
15315 return true;
15316 }
15317
15318 // No need to vectorize inserts of gathered values.
15319 if (VectorizableTree.size() == 2 &&
15320 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15321 VectorizableTree[1]->isGather() &&
15322 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15323 !(isSplat(VectorizableTree[1]->Scalars) ||
15324 allConstant(VectorizableTree[1]->Scalars))))
15325 return true;
15326
15327 // If the graph includes only PHI nodes and gathers, it is definitely not
15328 // profitable for vectorization and we can skip it, if the cost threshold is
15329 // the default. The cost of vectorized PHI nodes is almost always 0 plus the
15330 // cost of gathers/buildvectors.
15331 constexpr int Limit = 4;
15332 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15333 !VectorizableTree.empty() &&
15334 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15335 return (TE->isGather() &&
15336 (!TE->hasState() ||
15337 TE->getOpcode() != Instruction::ExtractElement) &&
15338 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15339 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15340 }))
15341 return true;
15342
15343 // Do not vectorize small tree of phis only, if all vector phis are also
15344 // gathered.
15345 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15346 VectorizableTree.size() <= Limit &&
15347 all_of(VectorizableTree,
15348 [&](const std::unique_ptr<TreeEntry> &TE) {
15349 return (TE->isGather() &&
15350 (!TE->hasState() ||
15351 TE->getOpcode() != Instruction::ExtractElement) &&
15352 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15353 Limit) ||
15354 (TE->hasState() &&
15355 (TE->getOpcode() == Instruction::InsertElement ||
15356 (TE->getOpcode() == Instruction::PHI &&
15357 all_of(TE->Scalars, [&](Value *V) {
15358 return isa<PoisonValue>(V) || MustGather.contains(V);
15359 }))));
15360 }) &&
15361 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15362 return TE->State == TreeEntry::Vectorize &&
15363 TE->getOpcode() == Instruction::PHI;
15364 }))
15365 return true;
15366
15367 // If the tree contains only phis, buildvectors, split nodes and
15368 // small nodes with reuses, we can skip it.
15369 SmallVector<const TreeEntry *> StoreLoadNodes;
15370 unsigned NumGathers = 0;
15371 constexpr int LimitTreeSize = 36;
15372 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15373 all_of(VectorizableTree,
15374 [&](const std::unique_ptr<TreeEntry> &TE) {
15375 if (!TE->isGather() && TE->hasState() &&
15376 (TE->getOpcode() == Instruction::Load ||
15377 TE->getOpcode() == Instruction::Store)) {
15378 StoreLoadNodes.push_back(TE.get());
15379 return true;
15380 }
15381 if (TE->isGather())
15382 ++NumGathers;
15383 return TE->State == TreeEntry::SplitVectorize ||
15384 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15385 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15386 VectorizableTree.size() > LimitTreeSize) ||
15387 (TE->isGather() &&
15388 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15389 (TE->hasState() &&
15390 (TE->getOpcode() == Instruction::PHI ||
15391 (TE->hasCopyableElements() &&
15392 static_cast<unsigned>(count_if(
15393 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15394 TE->Scalars.size() / 2) ||
15395 ((!TE->ReuseShuffleIndices.empty() ||
15396 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15397 TE->Scalars.size() == 2)));
15398 }) &&
15399 (StoreLoadNodes.empty() ||
15400 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15401 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15402 return TE->getOpcode() == Instruction::Store ||
15403 all_of(TE->Scalars, [&](Value *V) {
15404 return !isa<LoadInst>(V) ||
15405 areAllUsersVectorized(cast<Instruction>(V));
15406 });
15407 })))))
15408 return true;
15409
15410 // If the tree contains only buildvector, 2 non-buildvectors (with root user
15411 // tree node) and other buildvectors, we can skip it.
15412 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15413 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15414 VectorizableTree.size() >= Limit &&
15415 count_if(ArrayRef(VectorizableTree).drop_front(),
15416 [&](const std::unique_ptr<TreeEntry> &TE) {
15417 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15418 TE->UserTreeIndex.UserTE->Idx == 0;
15419 }) == 2)
15420 return true;
15421
15422 // If the tree contains only vectorization of the phi node from the
15423 // buildvector - skip it.
15424 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15425 VectorizableTree.size() > 2 &&
15426 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15427 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15428 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15429 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15430 all_of(
15431 ArrayRef(VectorizableTree).drop_front(2),
15432 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15433 return true;
15434
15435 // We can vectorize the tree if its size is greater than or equal to the
15436 // minimum size specified by the MinTreeSize command line option.
15437 if (VectorizableTree.size() >= MinTreeSize)
15438 return false;
15439
15440 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15441 // can vectorize it if we can prove it fully vectorizable.
15442 if (isFullyVectorizableTinyTree(ForReduction))
15443 return false;
15444
15445 // Check if any of the gather node forms an insertelement buildvector
15446 // somewhere.
15447 bool IsAllowedSingleBVNode =
15448 VectorizableTree.size() > 1 ||
15449 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15450 !VectorizableTree.front()->isAltShuffle() &&
15451 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15452 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15453 allSameBlock(VectorizableTree.front()->Scalars));
15454 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15455 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15456 return isa<ExtractElementInst, Constant>(V) ||
15457 (IsAllowedSingleBVNode &&
15458 !V->hasNUsesOrMore(UsesLimit) &&
15459 any_of(V->users(), IsaPred<InsertElementInst>));
15460 });
15461 }))
15462 return false;
15463
15464 if (VectorizableTree.back()->isGather() &&
15465 VectorizableTree.back()->hasState() &&
15466 VectorizableTree.back()->isAltShuffle() &&
15467 VectorizableTree.back()->getVectorFactor() > 2 &&
15468 allSameBlock(VectorizableTree.back()->Scalars) &&
15469 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15470 TTI->getScalarizationOverhead(
15471 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15472 VectorizableTree.back()->getVectorFactor()),
15473 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15474 /*Insert=*/true, /*Extract=*/false,
15476 return false;
15477
15478 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15479 // vectorizable.
15480 return true;
15481}
15482
15483bool BoUpSLP::isTreeNotExtendable() const {
15484 if (getCanonicalGraphSize() != getTreeSize()) {
15485 constexpr unsigned SmallTree = 3;
15486 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15487 getCanonicalGraphSize() <= SmallTree &&
15488 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15489 [](const std::unique_ptr<TreeEntry> &TE) {
15490 return TE->isGather() && TE->hasState() &&
15491 TE->getOpcode() == Instruction::Load &&
15492 !allSameBlock(TE->Scalars);
15493 }) == 1)
15494 return true;
15495 return false;
15496 }
15497 bool Res = false;
15498 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15499 TreeEntry &E = *VectorizableTree[Idx];
15500 if (E.State == TreeEntry::SplitVectorize)
15501 return false;
15502 if (!E.isGather())
15503 continue;
15504 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15505 (!E.hasState() &&
15506 all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
15507 (isa<ExtractElementInst>(E.Scalars.front()) &&
15508 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15509 return false;
15510 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15511 continue;
15512 Res = true;
15513 }
15514 return Res;
15515}
15516
15517InstructionCost BoUpSLP::getSpillCost() {
15518 // Walk from the bottom of the tree to the top, tracking which values are
15519 // live. When we see a call instruction that is not part of our tree,
15520 // query TTI to see if there is a cost to keeping values live over it
15521 // (for example, if spills and fills are required).
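// (Editor's note, illustrative only: if a vectorized value is live across a
// call that is neither vectorized nor a cheap intrinsic, the vector register
// is assumed to be spilled and reloaded around that call, and
// getCostOfKeepingLiveOverCall() below charges for it.)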
15522
15523 const TreeEntry *Root = VectorizableTree.front().get();
15524 if (Root->isGather())
15525 return 0;
15526
15527 InstructionCost Cost = 0;
15528 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15529 EntriesToOperands;
15530 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15531 SmallPtrSet<const Instruction *, 8> LastInstructions;
15532 for (const auto &TEPtr : VectorizableTree) {
15533 if (!TEPtr->isGather()) {
15534 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15535 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15536 LastInstructions.insert(LastInst);
15537 }
15538 if (TEPtr->UserTreeIndex)
15539 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15540 }
15541
15542 auto NoCallIntrinsic = [this](const Instruction *I) {
15543 const auto *II = dyn_cast<IntrinsicInst>(I);
15544 if (!II)
15545 return false;
15546 if (II->isAssumeLikeIntrinsic())
15547 return true;
15548 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15549 InstructionCost IntrCost =
15550 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15551 InstructionCost CallCost = TTI->getCallInstrCost(
15552 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15553 return IntrCost < CallCost;
15554 };
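// (Editor's note: the lambda above treats an intrinsic as call-free for spill
// purposes when TTI reports it cheaper than an equivalent real call, i.e. it
// is expected to be lowered inline rather than clobbering registers.)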
15555
15556 // Maps the last instruction of an entry to the last instruction of one of
15557 // its operand entries, plus a flag. If the flag is true, there are no calls
15558 // in between these instructions.
15559 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15560 CheckedInstructions;
15561 unsigned Budget = 0;
15562 const unsigned BudgetLimit =
15563 ScheduleRegionSizeBudget / VectorizableTree.size();
15564 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15565 const Instruction *Last) {
15566 assert(First->getParent() == Last->getParent() &&
15567 "Expected instructions in same block.");
15568 if (auto It = CheckedInstructions.find(Last);
15569 It != CheckedInstructions.end()) {
15570 const Instruction *Checked = It->second.getPointer();
15571 if (Checked == First || Checked->comesBefore(First))
15572 return It->second.getInt() != 0;
15573 Last = Checked;
15574 } else if (Last == First || Last->comesBefore(First)) {
15575 return true;
15576 }
15577 BasicBlock::const_reverse_iterator InstIt =
15578 ++First->getIterator().getReverse(),
15579 PrevInstIt =
15580 Last->getIterator().getReverse();
15581 SmallVector<const Instruction *> LastInstsInRange;
15582 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15583 // Debug information does not impact spill cost.
15584 // Vectorized calls, represented as vector intrinsics, do not impact spill
15585 // cost.
15586 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15587 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15588 for (const Instruction *LastInst : LastInstsInRange)
15589 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15590 return false;
15591 }
15592 if (LastInstructions.contains(&*PrevInstIt))
15593 LastInstsInRange.push_back(&*PrevInstIt);
15594
15595 ++PrevInstIt;
15596 ++Budget;
15597 }
15598 for (const Instruction *LastInst : LastInstsInRange)
15599 CheckedInstructions.try_emplace(
15600 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15601 Budget <= BudgetLimit ? 1 : 0);
15602 return Budget <= BudgetLimit;
15603 };
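// (Editor's note: the lambda above scans backwards from Last towards First
// within one block, under a budget derived from the scheduling region size,
// and memoizes per last-instruction how far it got and whether a
// non-vectorized call was found, so overlapping queries are not rescanned.)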
15604 auto AddCosts = [&](const TreeEntry *Op) {
15605 Type *ScalarTy = Op->Scalars.front()->getType();
15606 auto It = MinBWs.find(Op);
15607 if (It != MinBWs.end())
15608 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15609 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15610 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15611 if (ScalarTy->isVectorTy()) {
15612 // Handle revec dead vector instructions.
15613 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15614 }
15615 };
15616 // Memoize the relationship between blocks, i.e. if there is (at least one)
15617 // non-vectorized call between the blocks. This allows to skip the analysis of
15618 // the same block paths multiple times.
15619 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15620 ParentOpParentToPreds;
15621 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15622 BasicBlock *OpParent) {
15623 auto Key = std::make_pair(Root, OpParent);
15624 if (auto It = ParentOpParentToPreds.find(Key);
15625 It != ParentOpParentToPreds.end())
15626 return It->second;
15627 SmallVector<BasicBlock *> Worklist;
15628 if (Pred)
15629 Worklist.push_back(Pred);
15630 else
15631 Worklist.append(pred_begin(Root), pred_end(Root));
15632 SmallPtrSet<BasicBlock *, 16> Visited;
15633 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 4>
15634 ParentsPairsToAdd;
15635 bool Res = false;
15636 auto Cleanup = make_scope_exit([&]() {
15637 for (const auto &KeyPair : ParentsPairsToAdd) {
15638 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15639 "Should not have been added before.");
15640 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15641 }
15642 });
15643 while (!Worklist.empty()) {
15644 BasicBlock *BB = Worklist.pop_back_val();
15645 if (BB == OpParent || !Visited.insert(BB).second)
15646 continue;
15647 auto Pair = std::make_pair(BB, OpParent);
15648 if (auto It = ParentOpParentToPreds.find(Pair);
15649 It != ParentOpParentToPreds.end()) {
15650 Res = It->second;
15651 return Res;
15652 }
15653 ParentsPairsToAdd.insert(Pair);
15654 unsigned BlockSize = BB->size();
15655 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15656 return Res;
15657 Budget += BlockSize;
15658 if (Budget > BudgetLimit)
15659 return Res;
15660 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15661 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15662 BB->getTerminator()))
15663 return Res;
15664 Worklist.append(pred_begin(BB), pred_end(BB));
15665 }
15666 Res = true;
15667 return Res;
15668 };
15669 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15670 while (!LiveEntries.empty()) {
15671 const TreeEntry *Entry = LiveEntries.pop_back_val();
15672 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15673 if (Operands.empty())
15674 continue;
15675 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15676 BasicBlock *Parent = LastInst->getParent();
15677 for (const TreeEntry *Op : Operands) {
15678 if (!Op->isGather())
15679 LiveEntries.push_back(Op);
15680 if (Entry->State == TreeEntry::SplitVectorize ||
15681 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15682 (Op->isGather() && allConstant(Op->Scalars)))
15683 continue;
15684 Budget = 0;
15685 BasicBlock *Pred = nullptr;
15686 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15687 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15688 BasicBlock *OpParent;
15689 Instruction *OpLastInst;
15690 if (Op->isGather()) {
15691 assert(Entry->getOpcode() == Instruction::PHI &&
15692 "Expected phi node only.");
15693 OpParent = cast<PHINode>(Entry->getMainOp())
15694 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15695 OpLastInst = OpParent->getTerminator();
15696 for (Value *V : Op->Scalars) {
15697 auto *Inst = dyn_cast<Instruction>(V);
15698 if (!Inst)
15699 continue;
15700 if (isVectorized(V)) {
15701 OpParent = Inst->getParent();
15702 OpLastInst = Inst;
15703 break;
15704 }
15705 }
15706 } else {
15707 OpLastInst = EntriesToLastInstruction.at(Op);
15708 OpParent = OpLastInst->getParent();
15709 }
15710 // Check the call instructions within the same basic blocks.
15711 if (OpParent == Parent) {
15712 if (Entry->getOpcode() == Instruction::PHI) {
15713 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15714 AddCosts(Op);
15715 continue;
15716 }
15717 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15718 AddCosts(Op);
15719 continue;
15720 }
15721 // Check for call instruction in between blocks.
15722 // 1. Check entry's block to the head.
15723 if (Entry->getOpcode() != Instruction::PHI &&
15724 !CheckForNonVecCallsInSameBlock(
15725 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15726 LastInst)) {
15727 AddCosts(Op);
15728 continue;
15729 }
15730 // 2. Check op's block from the end.
15731 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15732 OpParent->getTerminator())) {
15733 AddCosts(Op);
15734 continue;
15735 }
15736 // 3. Check the predecessors of entry's block till op's block.
15737 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15738 AddCosts(Op);
15739 continue;
15740 }
15741 }
15742 }
15743
15744 return Cost;
15745}
15746
15747/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
15748/// the buildvector sequence.
15749static bool isFirstInsertElement(const InsertElementInst *IE1,
15750 const InsertElementInst *IE2) {
15751 if (IE1 == IE2)
15752 return false;
15753 const auto *I1 = IE1;
15754 const auto *I2 = IE2;
15755 const InsertElementInst *PrevI1;
15756 const InsertElementInst *PrevI2;
15757 unsigned Idx1 = *getElementIndex(IE1);
15758 unsigned Idx2 = *getElementIndex(IE2);
15759 do {
15760 if (I2 == IE1)
15761 return true;
15762 if (I1 == IE2)
15763 return false;
15764 PrevI1 = I1;
15765 PrevI2 = I2;
15766 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15767 getElementIndex(I1).value_or(Idx2) != Idx2)
15768 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15769 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15770 getElementIndex(I2).value_or(Idx1) != Idx1)
15771 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15772 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15773 llvm_unreachable("Two different buildvectors not expected.");
15774}
15775
15776namespace {
15777/// Returns incoming Value *, if the requested type is Value * too, or a default
15778/// value, otherwise.
15779struct ValueSelect {
15780 template <typename U>
15781 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15782 return V;
15783 }
15784 template <typename U>
15785 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15786 return U();
15787 }
15788};
15789} // namespace
15790
15791/// Does the analysis of the provided shuffle masks and performs the requested
15792/// actions on the vectors with the given shuffle masks. It tries to do it in
15793/// several steps.
15794/// 1. If the Base vector is not an undef vector, resize the very first mask to
15795/// have a common VF and perform the action for 2 input vectors (including the
15796/// non-undef Base). Other shuffle masks are combined with the result of the
15797/// first stage and processed as a shuffle of 2 elements.
15798/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15799/// the action only for 1 vector with the given mask, if it is not the identity
15800/// mask.
15801/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15802/// vectors, combining the masks properly between the steps.
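/// (Editor's note, illustrative example not from the original source: with an
/// undef Base and two masks of VF 4, {0,1,P,P} over V1 and {P,P,0,1} over V2,
/// where P is poison, step 3 combines them into the single two-source mask
/// {0,1,4,5} and invokes Action once for the pair {V1, V2}.)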
15803template <typename T>
15804static T *performExtractsShuffleAction(
15805 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15806 function_ref<unsigned(T *)> GetVF,
15807 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15808 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15809 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15810 SmallVector<int> Mask(ShuffleMask.begin()->second);
15811 auto VMIt = std::next(ShuffleMask.begin());
15812 T *Prev = nullptr;
15813 SmallBitVector UseMask =
15814 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15815 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15816 if (!IsBaseUndef.all()) {
15817 // Base is not undef, need to combine it with the next subvectors.
15818 std::pair<T *, bool> Res =
15819 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15820 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15821 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15822 if (Mask[Idx] == PoisonMaskElem)
15823 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15824 else
15825 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15826 }
15827 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15828 assert((!V || GetVF(V) == Mask.size()) &&
15829 "Expected base vector of VF number of elements.");
15830 Prev = Action(Mask, {nullptr, Res.first});
15831 } else if (ShuffleMask.size() == 1) {
15832 // Base is undef and only 1 vector is shuffled - perform the action only for
15833 // single vector, if the mask is not the identity mask.
15834 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15835 /*ForSingleMask=*/true);
15836 if (Res.second)
15837 // Identity mask is found.
15838 Prev = Res.first;
15839 else
15840 Prev = Action(Mask, {ShuffleMask.begin()->first});
15841 } else {
15842 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15843 // shuffles step by step, combining shuffle between the steps.
15844 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15845 unsigned Vec2VF = GetVF(VMIt->first);
15846 if (Vec1VF == Vec2VF) {
15847 // No need to resize the input vectors since they are of the same size, we
15848 // can shuffle them directly.
15849 ArrayRef<int> SecMask = VMIt->second;
15850 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15851 if (SecMask[I] != PoisonMaskElem) {
15852 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15853 Mask[I] = SecMask[I] + Vec1VF;
15854 }
15855 }
15856 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15857 } else {
15858 // Vectors of different sizes - resize and reshuffle.
15859 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15860 /*ForSingleMask=*/false);
15861 std::pair<T *, bool> Res2 =
15862 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15863 ArrayRef<int> SecMask = VMIt->second;
15864 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15865 if (Mask[I] != PoisonMaskElem) {
15866 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15867 if (Res1.second)
15868 Mask[I] = I;
15869 } else if (SecMask[I] != PoisonMaskElem) {
15870 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15871 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15872 }
15873 }
15874 Prev = Action(Mask, {Res1.first, Res2.first});
15875 }
15876 VMIt = std::next(VMIt);
15877 }
15878 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15879 // Perform requested actions for the remaining masks/vectors.
15880 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
15881 // Shuffle other input vectors, if any.
15882 std::pair<T *, bool> Res =
15883 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15884 ArrayRef<int> SecMask = VMIt->second;
15885 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15886 if (SecMask[I] != PoisonMaskElem) {
15887 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
15888 "Multiple uses of scalars.");
15889 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
15890 } else if (Mask[I] != PoisonMaskElem) {
15891 Mask[I] = I;
15892 }
15893 }
15894 Prev = Action(Mask, {Prev, Res.first});
15895 }
15896 return Prev;
15897}
15898
15899namespace {
15900/// Data type for handling buildvector sequences with the reused scalars from
15901/// other tree entries.
15902template <typename T> struct ShuffledInsertData {
15903 /// List of insertelements to be replaced by shuffles.
15904 SmallVector<InsertElementInst *> InsertElements;
15905 /// The parent vectors and shuffle mask for the given list of inserts.
15906 MapVector<T, SmallVector<int>> ValueMasks;
15907};
15908} // namespace
15909
15910InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
15911 InstructionCost ReductionCost) {
15912 InstructionCost Cost = ReductionCost;
15913 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
15914 << VectorizableTree.size() << ".\n");
15915
15916 SmallPtrSet<Value *, 4> CheckedExtracts;
15917 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
15918 TreeEntry &TE = *VectorizableTree[I];
15919 // No need to count the cost for combined entries, they are combined and
15920 // just skip their cost.
15921 if (TE.State == TreeEntry::CombinedVectorize) {
15922 LLVM_DEBUG(
15923 dbgs() << "SLP: Skipping cost for combined node that starts with "
15924 << *TE.Scalars[0] << ".\n";
15925 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
15926 continue;
15927 }
15928 if (TE.hasState() &&
15929 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
15930 if (const TreeEntry *E =
15931 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
15932 E && E->getVectorFactor() == TE.getVectorFactor()) {
15933 // Some gather nodes might be absolutely the same as some vectorizable
15934 // nodes after reordering, need to handle it.
15935 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
15936 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15937 << "SLP: Current total cost = " << Cost << "\n");
15938 continue;
15939 }
15940 }
15941
15942 // Exclude cost of gather loads nodes which are not used. These nodes were
15943 // built as part of the final attempt to vectorize gathered loads.
15944 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
15945 "Expected gather nodes with users only.");
15946
15947 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
15948 Cost += C;
15949 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
15950 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15951 << "SLP: Current total cost = " << Cost << "\n");
15952 }
15953
15954 if (Cost >= -SLPCostThreshold &&
15955 none_of(ExternalUses, [](const ExternalUser &EU) {
15956 return isa_and_nonnull<InsertElementInst>(EU.User);
15957 }))
15958 return Cost;
15959
15960 SmallPtrSet<Value *, 16> ExtractCostCalculated;
15961 InstructionCost ExtractCost = 0;
15962 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
15963 SmallVector<APInt> DemandedElts;
15964 SmallDenseSet<Value *, 4> UsedInserts;
15965 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
15966 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
15967 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
15968 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
15969 // Keep track {Scalar, Index, User} tuple.
15970 // On AArch64, this helps in fusing a mov instruction, associated with
15971 // extractelement, with fmul in the backend so that extractelement is free.
15972 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
15973 for (ExternalUser &EU : ExternalUses) {
15974 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
15975 }
15976 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
15977 for (ExternalUser &EU : ExternalUses) {
15978 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
15979 << EU.E.Idx << " in lane " << EU.Lane << "\n");
15980 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
15981 else dbgs() << " User: nullptr\n");
15982 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
15983
15984 // Uses by ephemeral values are free (because the ephemeral value will be
15985 // removed prior to code generation, and so the extraction will be
15986 // removed as well).
15987 if (EphValues.count(EU.User))
15988 continue;
15989
15990 // Check if the scalar for the given user or all users is accounted already.
15991 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
15992 (EU.User &&
15993 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
15994 continue;
15995
15996 // Used in unreachable blocks or in EH pads (rarely executed) or is
15997 // terminated with unreachable instruction.
15998 if (BasicBlock *UserParent =
15999 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16000 UserParent &&
16001 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16002 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16003 continue;
16004
16005 // We only add extract cost once for the same scalar.
16006 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16007 !ExtractCostCalculated.insert(EU.Scalar).second)
16008 continue;
16009
16010 // No extract cost for vector "scalar" if REVEC is disabled
16011 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16012 continue;
16013
16014 // If found user is an insertelement, do not calculate extract cost but try
16015 // to detect it as a final shuffled/identity match.
16016 // TODO: what if a user is insertvalue when REVEC is enabled?
16017 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16018 VU && VU->getOperand(1) == EU.Scalar) {
16019 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16020 if (!UsedInserts.insert(VU).second)
16021 continue;
16022 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16023 if (InsertIdx) {
16024 const TreeEntry *ScalarTE = &EU.E;
16025 auto *It = find_if(
16026 ShuffledInserts,
16027 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16028 // Checks if 2 insertelements are from the same buildvector.
16029 InsertElementInst *VecInsert = Data.InsertElements.front();
16030 return areTwoInsertFromSameBuildVector(
16031 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16032 Value *Op0 = II->getOperand(0);
16033 if (isVectorized(II) && !isVectorized(Op0))
16034 return nullptr;
16035 return Op0;
16036 });
16037 });
16038 int VecId = -1;
16039 if (It == ShuffledInserts.end()) {
16040 auto &Data = ShuffledInserts.emplace_back();
16041 Data.InsertElements.emplace_back(VU);
16042 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16043 VecId = ShuffledInserts.size() - 1;
16044 auto It = MinBWs.find(ScalarTE);
16045 if (It != MinBWs.end() &&
16046 VectorCasts
16047 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16048 .second) {
16049 unsigned BWSz = It->second.first;
16050 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16051 unsigned VecOpcode;
16052 if (DstBWSz < BWSz)
16053 VecOpcode = Instruction::Trunc;
16054 else
16055 VecOpcode =
16056 It->second.second ? Instruction::SExt : Instruction::ZExt;
16057 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16058 InstructionCost C = TTI->getCastInstrCost(
16059 VecOpcode, FTy,
16060 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16061 FTy->getNumElements()),
16062 TTI::CastContextHint::None, CostKind);
16063 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16064 << " for extending externally used vector with "
16065 "non-equal minimum bitwidth.\n");
16066 Cost += C;
16067 }
16068 } else {
16069 if (isFirstInsertElement(VU, It->InsertElements.front()))
16070 It->InsertElements.front() = VU;
16071 VecId = std::distance(ShuffledInserts.begin(), It);
16072 }
16073 int InIdx = *InsertIdx;
16074 SmallVectorImpl<int> &Mask =
16075 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16076 if (Mask.empty())
16077 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16078 Mask[InIdx] = EU.Lane;
16079 DemandedElts[VecId].setBit(InIdx);
16080 continue;
16081 }
16082 }
16083 }
16084
16085 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16086 // If we plan to rewrite the tree in a smaller type, we will need to sign
16087 // extend the extracted value back to the original type. Here, we account
16088 // for the extract and the added cost of the sign extend if needed.
16089 InstructionCost ExtraCost = TTI::TCC_Free;
16090 auto *ScalarTy = EU.Scalar->getType();
16091 const unsigned BundleWidth = EU.E.getVectorFactor();
16092 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16093 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16094 const TreeEntry *Entry = &EU.E;
16095 auto It = MinBWs.find(Entry);
16096 if (It != MinBWs.end()) {
16097 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16098 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16099 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16100 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16101 ? Instruction::ZExt
16102 : Instruction::SExt;
16103 VecTy = getWidenedType(MinTy, BundleWidth);
16104 ExtraCost =
16105 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16106 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16107 << ExtraCost << "\n");
16108 } else {
16109 ExtraCost =
16110 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16111 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16112 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16113 << *VecTy << ": " << ExtraCost << "\n");
16114 }
16115 // Leave the scalar instructions as is if they are cheaper than extracts.
16116 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16117 Entry->getOpcode() == Instruction::Load) {
16118 // Checks if the user of the external scalar is phi in loop body.
16119 auto IsPhiInLoop = [&](const ExternalUser &U) {
16120 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16121 auto *I = cast<Instruction>(U.Scalar);
16122 const Loop *L = LI->getLoopFor(Phi->getParent());
16123 return L && (Phi->getParent() == I->getParent() ||
16124 L == LI->getLoopFor(I->getParent()));
16125 }
16126 return false;
16127 };
16128 if (!ValueToExtUses) {
16129 ValueToExtUses.emplace();
16130 for (const auto &P : enumerate(ExternalUses)) {
16131 // Ignore phis in loops.
16132 if (IsPhiInLoop(P.value()))
16133 continue;
16134
16135 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16136 }
16137 }
16138 // Can use original instruction, if no operands vectorized or they are
16139 // marked as externally used already.
16140 auto *Inst = cast<Instruction>(EU.Scalar);
16141 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16142 auto OperandIsScalar = [&](Value *V) {
16143 if (!isVectorized(V)) {
16144 // Some extractelements might be not vectorized, but
16145 // transformed into shuffle and removed from the function,
16146 // consider it here.
16147 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16148 return !EE->hasOneUse() || !MustGather.contains(EE);
16149 return true;
16150 }
16151 return ValueToExtUses->contains(V);
16152 };
16153 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16154 bool CanBeUsedAsScalarCast = false;
16155 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16156 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16157 Op && all_of(Op->operands(), OperandIsScalar)) {
16158 InstructionCost OpCost =
16159 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16160 ? TTI->getInstructionCost(Op, CostKind)
16161 : 0;
16162 if (ScalarCost + OpCost <= ExtraCost) {
16163 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16164 ScalarCost += OpCost;
16165 }
16166 }
16167 }
16168 if (CanBeUsedAsScalar) {
16169 bool KeepScalar = ScalarCost <= ExtraCost;
16170 // Try to keep the original scalar if the user is a phi node from the
16171 // same block as the root phis currently being vectorized. This keeps
16172 // better ordering info for the PHIs being vectorized.
16173 bool IsProfitablePHIUser =
16174 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16175 VectorizableTree.front()->Scalars.size() > 2)) &&
16176 VectorizableTree.front()->hasState() &&
16177 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16178 !Inst->hasNUsesOrMore(UsesLimit) &&
16179 none_of(Inst->users(),
16180 [&](User *U) {
16181 auto *PHIUser = dyn_cast<PHINode>(U);
16182 return (!PHIUser ||
16183 PHIUser->getParent() !=
16184 cast<Instruction>(
16185 VectorizableTree.front()->getMainOp())
16186 ->getParent()) &&
16187 !isVectorized(U);
16188 }) &&
16189 count_if(Entry->Scalars, [&](Value *V) {
16190 return ValueToExtUses->contains(V);
16191 }) <= 2;
16192 if (IsProfitablePHIUser) {
16193 KeepScalar = true;
16194 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16195 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16196 (!GatheredLoadsEntriesFirst.has_value() ||
16197 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16198 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16199 return ValueToExtUses->contains(V);
16200 });
16201 auto It = ExtractsCount.find(Entry);
16202 if (It != ExtractsCount.end()) {
16203 assert(ScalarUsesCount >= It->getSecond().size() &&
16204 "Expected total number of external uses not less than "
16205 "number of scalar uses.");
16206 ScalarUsesCount -= It->getSecond().size();
16207 }
16208 // Keep the original scalar if the number of externally used
16209 // instructions in the same entry is not a power of 2. It may help to
16210 // do some extra vectorization for now.
16211 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16212 }
16213 if (KeepScalar) {
16214 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16215 for (Value *V : Inst->operands()) {
16216 auto It = ValueToExtUses->find(V);
16217 if (It != ValueToExtUses->end()) {
16218 // Replace all uses to avoid compiler crash.
16219 ExternalUses[It->second].User = nullptr;
16220 }
16221 }
16222 ExtraCost = ScalarCost;
16223 if (!IsPhiInLoop(EU))
16224 ExtractsCount[Entry].insert(Inst);
16225 if (CanBeUsedAsScalarCast) {
16226 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16227 // Update the users of the operands of the cast operand to avoid
16228 // compiler crash.
16229 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16230 for (Value *V : IOp->operands()) {
16231 auto It = ValueToExtUses->find(V);
16232 if (It != ValueToExtUses->end()) {
16233 // Replace all uses to avoid compiler crash.
16234 ExternalUses[It->second].User = nullptr;
16235 }
16236 }
16237 }
16238 }
16239 }
16240 }
16241 }
16242
16243 ExtractCost += ExtraCost;
16244 }
16245 // Insert externals for extract of operands of casts to be emitted as scalars
16246 // instead of extractelement.
16247 for (Value *V : ScalarOpsFromCasts) {
16248 ExternalUsesAsOriginalScalar.insert(V);
16249 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16250 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16251 TEs.front()->findLaneForValue(V));
16252 }
16253 }
16254 // Add reduced value cost, if resized.
16255 if (!VectorizedVals.empty()) {
16256 const TreeEntry &Root = *VectorizableTree.front();
16257 auto BWIt = MinBWs.find(&Root);
16258 if (BWIt != MinBWs.end()) {
16259 Type *DstTy = Root.Scalars.front()->getType();
16260 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16261 unsigned SrcSz =
16262 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16263 if (OriginalSz != SrcSz) {
16264 unsigned Opcode = Instruction::Trunc;
16265 if (OriginalSz > SrcSz)
16266 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16267 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16268 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16269 assert(SLPReVec && "Only supported by REVEC.");
16270 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16271 }
16272 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16273 TTI::CastContextHint::None,
16274 TTI::TCK_RecipThroughput);
16275 }
16276 }
16277 }
16278
16279 Cost += ExtractCost;
16280 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16281 bool ForSingleMask) {
16282 InstructionCost C = 0;
16283 unsigned VF = Mask.size();
16284 unsigned VecVF = TE->getVectorFactor();
16285 bool HasLargeIndex =
16286 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16287 if ((VF != VecVF && HasLargeIndex) ||
16288 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16289
16290 if (HasLargeIndex) {
16291 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16292 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16293 OrigMask.begin());
16294 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16295 getWidenedType(TE->getMainOp()->getType(), VecVF),
16296 OrigMask);
16297 LLVM_DEBUG(
16298 dbgs() << "SLP: Adding cost " << C
16299 << " for final shuffle of insertelement external users.\n";
16300 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16301 Cost += C;
16302 return std::make_pair(TE, true);
16303 }
16304
16305 if (!ForSingleMask) {
16306 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16307 for (unsigned I = 0; I < VF; ++I) {
16308 if (Mask[I] != PoisonMaskElem)
16309 ResizeMask[Mask[I]] = Mask[I];
16310 }
16311 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16312 C = ::getShuffleCost(
16313 *TTI, TTI::SK_PermuteSingleSrc,
16314 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16315 LLVM_DEBUG(
16316 dbgs() << "SLP: Adding cost " << C
16317 << " for final shuffle of insertelement external users.\n";
16318 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16319
16320 Cost += C;
16321 }
16322 }
16323 return std::make_pair(TE, false);
16324 };
16325 // Calculate the cost of the reshuffled vectors, if any.
16326 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16327 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16328 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16329 unsigned VF = 0;
16330 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16332 assert((TEs.size() == 1 || TEs.size() == 2) &&
16333 "Expected exactly 1 or 2 tree entries.");
16334 if (TEs.size() == 1) {
16335 if (VF == 0)
16336 VF = TEs.front()->getVectorFactor();
16337 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16338 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16339 !all_of(enumerate(Mask), [=](const auto &Data) {
16340 return Data.value() == PoisonMaskElem ||
16341 (Data.index() < VF &&
16342 static_cast<int>(Data.index()) == Data.value());
16343 })) {
16344 InstructionCost C =
16345 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16346 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16347 << " for final shuffle of insertelement "
16348 "external users.\n";
16349 TEs.front()->dump();
16350 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16351 Cost += C;
16352 }
16353 } else {
16354 if (VF == 0) {
16355 if (TEs.front() &&
16356 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16357 VF = TEs.front()->getVectorFactor();
16358 else
16359 VF = Mask.size();
16360 }
16361 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16362 InstructionCost C =
16363 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16364 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16365 << " for final shuffle of vector node and external "
16366 "insertelement users.\n";
16367 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16368 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16369 Cost += C;
16370 }
16371 VF = Mask.size();
16372 return TEs.back();
16373 };
16374 (void)performExtractsShuffleAction<const TreeEntry>(
16375 MutableArrayRef(Vector.data(), Vector.size()), Base,
16376 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16377 EstimateShufflesCost);
16378 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16379 cast<FixedVectorType>(
16380 ShuffledInserts[I].InsertElements.front()->getType()),
16381 DemandedElts[I],
16382 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16383 Cost -= InsertCost;
16384 }
16385
16386 // Add the cost for reduced value resize (if required).
16387 if (ReductionBitWidth != 0) {
16388 assert(UserIgnoreList && "Expected reduction tree.");
16389 const TreeEntry &E = *VectorizableTree.front();
16390 auto It = MinBWs.find(&E);
16391 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16392 unsigned SrcSize = It->second.first;
16393 unsigned DstSize = ReductionBitWidth;
16394 unsigned Opcode = Instruction::Trunc;
16395 if (SrcSize < DstSize) {
16396 bool IsArithmeticExtendedReduction =
16397 all_of(*UserIgnoreList, [](Value *V) {
16398 auto *I = cast<Instruction>(V);
16399 return is_contained({Instruction::Add, Instruction::FAdd,
16400 Instruction::Mul, Instruction::FMul,
16401 Instruction::And, Instruction::Or,
16402 Instruction::Xor},
16403 I->getOpcode());
16404 });
16405 if (IsArithmeticExtendedReduction)
16406 Opcode =
16407 Instruction::BitCast; // Handle it by getExtendedReductionCost
16408 else
16409 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16410 }
16411 if (Opcode != Instruction::BitCast) {
16412 auto *SrcVecTy =
16413 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16414 auto *DstVecTy =
16415 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16416 TTI::CastContextHint CCH = getCastContextHint(E);
16417 InstructionCost CastCost;
16418 switch (E.getOpcode()) {
16419 case Instruction::SExt:
16420 case Instruction::ZExt:
16421 case Instruction::Trunc: {
16422 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16423 CCH = getCastContextHint(*OpTE);
16424 break;
16425 }
16426 default:
16427 break;
16428 }
16429 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16430 TTI::TCK_RecipThroughput);
16431 Cost += CastCost;
16432 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16433 << " for final resize for reduction from " << SrcVecTy
16434 << " to " << DstVecTy << "\n";
16435 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16436 }
16437 }
16438 }
16439
16440 std::optional<InstructionCost> SpillCost;
16441 if (Cost < -SLPCostThreshold) {
16442 SpillCost = getSpillCost();
16443 Cost += *SpillCost;
16444 }
16445#ifndef NDEBUG
16446 SmallString<256> Str;
16447 {
16448 raw_svector_ostream OS(Str);
16449 OS << "SLP: Spill Cost = ";
16450 if (SpillCost)
16451 OS << *SpillCost;
16452 else
16453 OS << "<skipped>";
16454 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16455 << "SLP: Total Cost = " << Cost << ".\n";
16456 }
16457 LLVM_DEBUG(dbgs() << Str);
16458 if (ViewSLPTree)
16459 ViewGraph(this, "SLP" + F->getName(), false, Str);
16460#endif
16461
16462 return Cost;
16463}
16464
16465 /// Tries to find extractelement instructions with constant indices from a
16466 /// fixed vector type and gathers such instructions into a bunch, which is
16467 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
16468 /// attempt was successful, the matched scalars are replaced by poison values
16469 /// in \p VL for future analysis.
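/// For illustration (a hypothetical example, not taken from the sources):
/// given the gathered scalars {%e0, %e1} with
/// \code
///   %e0 = extractelement <4 x float> %v, i32 1
///   %e1 = extractelement <4 x float> %v, i32 0
/// \endcode
/// the gather can be modelled as a single-source shuffle of %v with mask
/// <1, 0>; both scalars are then replaced by poison in \p VL and the detected
/// shuffle kind is returned.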
16470std::optional<TTI::ShuffleKind>
16471BoUpSLP::tryToGatherSingleRegisterExtractElements(
16472 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16473 // Scan list of gathered scalars for extractelements that can be represented
16474 // as shuffles.
16475 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16476 SmallVector<int> UndefVectorExtracts;
16477 for (int I = 0, E = VL.size(); I < E; ++I) {
16478 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16479 if (!EI) {
16480 if (isa<UndefValue>(VL[I]))
16481 UndefVectorExtracts.push_back(I);
16482 continue;
16483 }
16484 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16485 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16486 continue;
16487 std::optional<unsigned> Idx = getExtractIndex(EI);
16488 // Undefined index.
16489 if (!Idx) {
16490 UndefVectorExtracts.push_back(I);
16491 continue;
16492 }
16493 if (Idx >= VecTy->getNumElements()) {
16494 UndefVectorExtracts.push_back(I);
16495 continue;
16496 }
16497 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16498 ExtractMask.reset(*Idx);
16499 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16500 UndefVectorExtracts.push_back(I);
16501 continue;
16502 }
16503 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16504 }
16505 // Sort the vector operands by the maximum number of uses in extractelements.
16506 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16507 VectorOpToIdx.takeVector();
16508 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16509 return P1.second.size() > P2.second.size();
16510 });
16511 // Find the best pair of the vectors or a single vector.
16512 const int UndefSz = UndefVectorExtracts.size();
16513 unsigned SingleMax = 0;
16514 unsigned PairMax = 0;
16515 if (!Vectors.empty()) {
16516 SingleMax = Vectors.front().second.size() + UndefSz;
16517 if (Vectors.size() > 1) {
16518 auto *ItNext = std::next(Vectors.begin());
16519 PairMax = SingleMax + ItNext->second.size();
16520 }
16521 }
16522 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16523 return std::nullopt;
16524 // Check whether it is better to perform a shuffle of 2 vectors or just of a
16525 // single vector.
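// For example (illustrative only): with 5 extracts from one vector, 2 extracts
// from a second vector and 1 undef slot, SingleMax = 5 + 1 = 6 and
// PairMax = 6 + 2 = 8, so both vectors are kept as shuffle sources.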
16526 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16527 SmallVector<Value *> GatheredExtracts(
16528 VL.size(), PoisonValue::get(VL.front()->getType()));
16529 if (SingleMax >= PairMax && SingleMax) {
16530 for (int Idx : Vectors.front().second)
16531 std::swap(GatheredExtracts[Idx], VL[Idx]);
16532 } else if (!Vectors.empty()) {
16533 for (unsigned Idx : {0, 1})
16534 for (int Idx : Vectors[Idx].second)
16535 std::swap(GatheredExtracts[Idx], VL[Idx]);
16536 }
16537 // Add extracts from undefs too.
16538 for (int Idx : UndefVectorExtracts)
16539 std::swap(GatheredExtracts[Idx], VL[Idx]);
16540 // Check that the gather of extractelements can be represented as just a
16541 // shuffle of one or two vectors from which the scalars are extracted.
16542 std::optional<TTI::ShuffleKind> Res =
16543 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16544 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16545 // TODO: try to check other subsets if possible.
16546 // Restore the original VL if attempt was not successful.
16547 copy(SavedVL, VL.begin());
16548 return std::nullopt;
16549 }
16550 // Restore unused scalars from mask, if some of the extractelements were not
16551 // selected for shuffle.
16552 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16553 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16554 isa<UndefValue>(GatheredExtracts[I])) {
16555 std::swap(VL[I], GatheredExtracts[I]);
16556 continue;
16557 }
16558 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16559 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16560 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16561 is_contained(UndefVectorExtracts, I))
16562 continue;
16563 }
16564 return Res;
16565}
16566
16567 /// Tries to find extractelement instructions with constant indices from a
16568 /// fixed vector type and gathers such instructions into a bunch, which is
16569 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
16570 /// attempt was successful, the matched scalars are replaced by poison values
16571 /// in \p VL for future analysis.
16572 SmallVector<std::optional<TTI::ShuffleKind>>
16573 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16574 SmallVectorImpl<int> &Mask,
16575 unsigned NumParts) const {
16576 assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
16577 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16578 Mask.assign(VL.size(), PoisonMaskElem);
16579 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16580 for (unsigned Part : seq<unsigned>(NumParts)) {
16581 // Scan list of gathered scalars for extractelements that can be represented
16582 // as shuffles.
16583 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16584 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16585 SmallVector<int> SubMask;
16586 std::optional<TTI::ShuffleKind> Res =
16587 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16588 ShufflesRes[Part] = Res;
16589 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16590 }
16591 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16592 return Res.has_value();
16593 }))
16594 ShufflesRes.clear();
16595 return ShufflesRes;
16596}
16597
16598std::optional<TargetTransformInfo::ShuffleKind>
16599BoUpSLP::isGatherShuffledSingleRegisterEntry(
16600 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16601 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16602 Entries.clear();
16603 // TODO: currently checking only for Scalars in the tree entry, need to count
16604 // reused elements too for better cost estimation.
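// GetUserEntry walks up past users recorded with EdgeIdx == UINT_MAX to find
// the user entry whose insertion point actually constrains this gather node;
// the root entry is returned as its own user. HasGatherUser reports whether
// such an UINT_MAX edge occurs anywhere on the chain up to the root.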
16605 auto GetUserEntry = [&](const TreeEntry *TE) {
16606 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16607 TE = TE->UserTreeIndex.UserTE;
16608 if (TE == VectorizableTree.front().get())
16609 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16610 return TE->UserTreeIndex;
16611 };
16612 auto HasGatherUser = [&](const TreeEntry *TE) {
16613 while (TE->Idx != 0 && TE->UserTreeIndex) {
16614 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16615 return true;
16616 TE = TE->UserTreeIndex.UserTE;
16617 }
16618 return false;
16619 };
16620 const EdgeInfo TEUseEI = GetUserEntry(TE);
16621 if (!TEUseEI)
16622 return std::nullopt;
16623 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16624 const BasicBlock *TEInsertBlock = nullptr;
16625 // Main node of PHI entries keeps the correct order of operands/incoming
16626 // blocks.
16627 if (auto *PHI = dyn_cast_or_null<PHINode>(
16628 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16629 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16630 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16631 TEInsertPt = TEInsertBlock->getTerminator();
16632 } else {
16633 TEInsertBlock = TEInsertPt->getParent();
16634 }
16635 if (!DT->isReachableFromEntry(TEInsertBlock))
16636 return std::nullopt;
16637 auto *NodeUI = DT->getNode(TEInsertBlock);
16638 assert(NodeUI && "Should only process reachable instructions");
16639 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16640 auto CheckOrdering = [&](const Instruction *InsertPt) {
16641 // Argument InsertPt is an instruction where vector code for some other
16642 // tree entry (one that shares one or more scalars with TE) is going to be
16643 // generated. This lambda returns true if insertion point of vector code
16644 // for the TE dominates that point (otherwise dependency is the other way
16645 // around). The other node is not limited to be of a gather kind. Gather
16646 // nodes are not scheduled and their vector code is inserted before their
16647 // first user. If user is PHI, that is supposed to be at the end of a
16648 // predecessor block. Otherwise it is the last instruction among scalars of
16649 // the user node. So, instead of checking dependency between instructions
16650 // themselves, we check dependency between their insertion points for vector
16651 // code (since each scalar instruction ends up as a lane of a vector
16652 // instruction).
16653 const BasicBlock *InsertBlock = InsertPt->getParent();
16654 auto *NodeEUI = DT->getNode(InsertBlock);
16655 if (!NodeEUI)
16656 return false;
16657 assert((NodeUI == NodeEUI) ==
16658 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16659 "Different nodes should have different DFS numbers");
16660 // Check the order of the gather nodes users.
16661 if (TEInsertPt->getParent() != InsertBlock &&
16662 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16663 return false;
16664 if (TEInsertPt->getParent() == InsertBlock &&
16665 TEInsertPt->comesBefore(InsertPt))
16666 return false;
16667 return true;
16668 };
16669 // Find all tree entries used by the gathered values. If no common entries
16670 // are found - this is not a shuffle.
16671 // Here we build a set of tree nodes for each gathered value and try to
16672 // find the intersection between these sets. If we have at least one common
16673 // tree node for each gathered value - we have just a permutation of a
16674 // single vector. If we end up with 2 different sets, we are in a situation
16675 // where we have a permutation of 2 input vectors.
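// For instance (illustrative only): with VL = {a, b, c, d}, if {a, b} are
// found in tree entry T1 and {c, d} in tree entry T2, the gather is a
// permutation of the two source vectors T1 and T2; if all four values map to
// one common entry, it is just a permutation of that single vector.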
16676 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16677 SmallDenseMap<Value *, int> UsedValuesEntry;
16678 SmallPtrSet<const Value *, 16> VisitedValue;
16679 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16680 // The node is reused - exit.
16681 if ((TEPtr->getVectorFactor() != VL.size() &&
16682 TEPtr->Scalars.size() != VL.size()) ||
16683 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16684 return false;
16685 UsedTEs.clear();
16686 UsedTEs.emplace_back().insert(TEPtr);
16687 for (Value *V : VL) {
16688 if (isConstant(V))
16689 continue;
16690 UsedValuesEntry.try_emplace(V, 0);
16691 }
16692 return true;
16693 };
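// CheckParentNodes walks up the user chains of both entries to their first
// common ancestor and returns true if User1's chain enters that ancestor
// through a smaller operand index than User2's chain does, i.e. User1's
// subtree is emitted earlier.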
16694 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16695 unsigned EdgeIdx) {
16696 const TreeEntry *Ptr1 = User1;
16697 const TreeEntry *Ptr2 = User2;
16698 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16699 while (Ptr2) {
16700 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16701 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16702 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16703 }
16704 while (Ptr1) {
16705 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16706 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16707 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16708 return Idx < It->second;
16709 }
16710 return false;
16711 };
16712 for (Value *V : VL) {
16713 if (isConstant(V) || !VisitedValue.insert(V).second)
16714 continue;
16715 // Build a list of tree entries where V is used.
16716 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16717 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16718 if (TEPtr == TE || TEPtr->Idx == 0)
16719 continue;
16720 assert(any_of(TEPtr->Scalars,
16721 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16722 "Must contain at least single gathered value.");
16723 assert(TEPtr->UserTreeIndex &&
16724 "Expected only single user of a gather node.");
16725 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16726
16727 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16728 UseEI.UserTE->hasState())
16729 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16730 : nullptr;
16731 Instruction *InsertPt =
16732 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16733 : &getLastInstructionInBundle(UseEI.UserTE);
16734 if (TEInsertPt == InsertPt) {
16735 // Check nodes, which might be emitted first.
16736 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16737 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16738 TEUseEI.UserTE->isAltShuffle()) &&
16739 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16740 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16741 (UseEI.UserTE->hasState() &&
16742 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16743 !UseEI.UserTE->isAltShuffle()) ||
16744 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16745 continue;
16746 }
16747
16748 // If the schedulable insertion point is used in multiple entries - just
16749 // exit, no known ordering at this point, available only after real
16750 // scheduling.
16751 if (!doesNotNeedToBeScheduled(InsertPt) &&
16752 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16753 continue;
16754 // If the users are the PHI nodes with the same incoming blocks - skip.
16755 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16756 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16757 UseEI.UserTE->State == TreeEntry::Vectorize &&
16758 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16759 TEUseEI.UserTE != UseEI.UserTE)
16760 continue;
16761 // If 2 gathers are operands of the same entry (regardless of whether
16762 // the user is a PHI or not), compare the operand indices and use the
16763 // earlier one as the base.
16764 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16765 continue;
16766 // If the user instruction is used for some reason in different
16767 // vectorized nodes - make it depend on index.
16768 if (TEUseEI.UserTE != UseEI.UserTE &&
16769 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16770 HasGatherUser(TEUseEI.UserTE)))
16771 continue;
16772 // If the user node is the operand of the other user node - skip.
16773 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16774 continue;
16775 }
16776
16777 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16778 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16779 UseEI.UserTE->doesNotNeedToSchedule() &&
16780 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16781 continue;
16782 // Check if the user node of the TE comes after user node of TEPtr,
16783 // otherwise TEPtr depends on TE.
16784 if ((TEInsertBlock != InsertPt->getParent() ||
16785 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16786 !CheckOrdering(InsertPt))
16787 continue;
16788 // The node is reused - exit.
16789 if (CheckAndUseSameNode(TEPtr))
16790 break;
16791 VToTEs.insert(TEPtr);
16792 }
16793 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16794 const auto *It = find_if(
16795 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16796 if (It != VTEs.end()) {
16797 const TreeEntry *VTE = *It;
16798 if (none_of(TE->CombinedEntriesWithIndices,
16799 [&](const auto &P) { return P.first == VTE->Idx; })) {
16800 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16801 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16802 continue;
16803 }
16804 // The node is reused - exit.
16805 if (CheckAndUseSameNode(VTE))
16806 break;
16807 VToTEs.insert(VTE);
16808 }
16809 }
16810 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16811 const TreeEntry *VTE = VTEs.front();
16812 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16813 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16814 VTEs = VTEs.drop_front();
16815 // Iterate through all vectorized nodes.
16816 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16817 return MTE->State == TreeEntry::Vectorize;
16818 });
16819 if (MIt == VTEs.end())
16820 continue;
16821 VTE = *MIt;
16822 }
16823 if (none_of(TE->CombinedEntriesWithIndices,
16824 [&](const auto &P) { return P.first == VTE->Idx; })) {
16825 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16826 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16827 continue;
16828 }
16829 // The node is reused - exit.
16830 if (CheckAndUseSameNode(VTE))
16831 break;
16832 VToTEs.insert(VTE);
16833 }
16834 if (VToTEs.empty())
16835 continue;
16836 if (UsedTEs.empty()) {
16837 // The first iteration, just insert the list of nodes to vector.
16838 UsedTEs.push_back(VToTEs);
16839 UsedValuesEntry.try_emplace(V, 0);
16840 } else {
16841 // Need to check if there are any previously used tree nodes which use V.
16842 // If there are no such nodes, consider that we have another input
16843 // vector.
16844 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16845 unsigned Idx = 0;
16846 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16847 // Do we have a non-empty intersection of previously listed tree entries
16848 // and tree entries using current V?
16849 set_intersect(VToTEs, Set);
16850 if (!VToTEs.empty()) {
16851 // Yes, write the new subset and continue analysis for the next
16852 // scalar.
16853 Set.swap(VToTEs);
16854 break;
16855 }
16856 VToTEs = SavedVToTEs;
16857 ++Idx;
16858 }
16859 // No non-empty intersection found - need to add a second set of possible
16860 // source vectors.
16861 if (Idx == UsedTEs.size()) {
16862 // If the number of input vectors is greater than 2 - not a permutation,
16863 // fallback to the regular gather.
16864 // TODO: support multiple reshuffled nodes.
16865 if (UsedTEs.size() == 2)
16866 continue;
16867 UsedTEs.push_back(SavedVToTEs);
16868 Idx = UsedTEs.size() - 1;
16869 }
16870 UsedValuesEntry.try_emplace(V, Idx);
16871 }
16872 }
16873
16874 if (UsedTEs.empty()) {
16875 Entries.clear();
16876 return std::nullopt;
16877 }
16878
16879 unsigned VF = 0;
16880 if (UsedTEs.size() == 1) {
16881 // Keep the order to avoid non-determinism.
16882 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
16883 UsedTEs.front().end());
16884 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16885 return TE1->Idx < TE2->Idx;
16886 });
16887 // Try to find the perfect match in another gather node at first.
16888 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
16889 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
16890 });
16891 if (It != FirstEntries.end() &&
16892 ((*It)->getVectorFactor() == VL.size() ||
16893 ((*It)->getVectorFactor() == TE->Scalars.size() &&
16894 TE->ReuseShuffleIndices.size() == VL.size() &&
16895 (*It)->isSame(TE->Scalars)))) {
16896 Entries.push_back(*It);
16897 if ((*It)->getVectorFactor() == VL.size()) {
16898 std::iota(std::next(Mask.begin(), Part * VL.size()),
16899 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
16900 } else {
16901 SmallVector<int> CommonMask = TE->getCommonMask();
16902 copy(CommonMask, Mask.begin());
16903 }
16904 // Clear undef scalars.
16905 for (unsigned I : seq<unsigned>(VL.size()))
16906 if (isa<PoisonValue>(VL[I]))
16907 Mask[Part * VL.size() + I] = PoisonMaskElem;
16908 return TargetTransformInfo::SK_PermuteSingleSrc;
16909 }
16910 // No perfect match, just shuffle, so choose the first tree node from the
16911 // tree.
16912 Entries.push_back(FirstEntries.front());
16913 // Update mapping between values and corresponding tree entries.
16914 for (auto &P : UsedValuesEntry)
16915 P.second = 0;
16916 VF = FirstEntries.front()->getVectorFactor();
16917 } else {
16918 // Try to find nodes with the same vector factor.
16919 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
16920 // Keep the order of tree nodes to avoid non-determinism.
16921 DenseMap<int, const TreeEntry *> VFToTE;
16922 for (const TreeEntry *TE : UsedTEs.front()) {
16923 unsigned VF = TE->getVectorFactor();
16924 auto It = VFToTE.find(VF);
16925 if (It != VFToTE.end()) {
16926 if (It->second->Idx > TE->Idx)
16927 It->getSecond() = TE;
16928 continue;
16929 }
16930 VFToTE.try_emplace(VF, TE);
16931 }
16932 // Same, keep the order to avoid non-determinism.
16933 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
16934 UsedTEs.back().end());
16935 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16936 return TE1->Idx < TE2->Idx;
16937 });
16938 for (const TreeEntry *TE : SecondEntries) {
16939 auto It = VFToTE.find(TE->getVectorFactor());
16940 if (It != VFToTE.end()) {
16941 VF = It->first;
16942 Entries.push_back(It->second);
16943 Entries.push_back(TE);
16944 break;
16945 }
16946 }
16947 // No 2 source vectors with the same vector factor - just choose 2 with max
16948 // index.
16949 if (Entries.empty()) {
16950 Entries.push_back(*llvm::max_element(
16951 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
16952 return TE1->Idx < TE2->Idx;
16953 }));
16954 Entries.push_back(SecondEntries.front());
16955 VF = std::max(Entries.front()->getVectorFactor(),
16956 Entries.back()->getVectorFactor());
16957 } else {
16958 VF = Entries.front()->getVectorFactor();
16959 }
16960 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
16961 for (const TreeEntry *E : Entries)
16962 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
16963 E->Scalars.end());
16964 // Update mapping between values and corresponding tree entries.
16965 for (auto &P : UsedValuesEntry) {
16966 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
16967 if (ValuesToEntries[Idx].contains(P.first)) {
16968 P.second = Idx;
16969 break;
16970 }
16971 }
16972 }
16973
16974 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
16975 // Checks if the 2 PHIs are compatible in terms of being highly likely to
16976 // be vectorized together.
16977 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
16978 auto *PHI = cast<PHINode>(V);
16979 auto *PHI1 = cast<PHINode>(V1);
16980 // Check that all incoming values are compatible/from same parent (if they
16981 // are instructions).
16982 // The incoming values are compatible if they all are constants, or
16983 // instructions with the same/alternate opcodes from the same basic block.
16984 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
16985 Value *In = PHI->getIncomingValue(I);
16986 Value *In1 = PHI1->getIncomingValue(I);
16987 if (isConstant(In) && isConstant(In1))
16988 continue;
16989 if (!getSameOpcode({In, In1}, *TLI))
16990 return false;
16991 if (cast<Instruction>(In)->getParent() !=
16992 cast<Instruction>(In1)->getParent())
16993 return false;
16994 }
16995 return true;
16996 };
16997 // Check if the value can be ignored during analysis for shuffled gathers.
16998 // We assume it is better to ignore instructions which do not form splats,
16999 // are not vectorized/not extractelements (these instructions will be handled
17000 // by extractelements processing) or may form a vector node in the future.
17001 auto MightBeIgnored = [=](Value *V) {
17002 auto *I = dyn_cast<Instruction>(V);
17003 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17004 !isVectorLikeInstWithConstOps(I) &&
17005 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17006 };
17007 // Check that the neighbor instruction may form a full vector node with the
17008 // current instruction V. This is possible if they have the same/alternate
17009 // opcode and the same parent basic block.
17010 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17011 Value *V1 = VL[Idx];
17012 bool UsedInSameVTE = false;
17013 auto It = UsedValuesEntry.find(V1);
17014 if (It != UsedValuesEntry.end())
17015 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17016 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17017 getSameOpcode({V, V1}, *TLI) &&
17018 cast<Instruction>(V)->getParent() ==
17019 cast<Instruction>(V1)->getParent() &&
17020 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17021 };
17022 // Build a shuffle mask for better cost estimation and vector emission.
17023 SmallBitVector UsedIdxs(Entries.size());
17024 SmallVector<std::pair<unsigned, int>> EntryLanes;
17025 for (int I = 0, E = VL.size(); I < E; ++I) {
17026 Value *V = VL[I];
17027 auto It = UsedValuesEntry.find(V);
17028 if (It == UsedValuesEntry.end())
17029 continue;
17030 // Do not try to shuffle scalars if they are constants, or instructions
17031 // that may be vectorized later as a result of the subsequent buildvector
17032 // vectorization.
17033 if (isConstant(V) || (MightBeIgnored(V) &&
17034 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17035 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17036 continue;
17037 unsigned Idx = It->second;
17038 EntryLanes.emplace_back(Idx, I);
17039 UsedIdxs.set(Idx);
17040 }
17041 // Iterate through all shuffled scalars and select entries, which can be used
17042 // for final shuffle.
17043 SmallVector<const TreeEntry *> TempEntries;
17044 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17045 if (!UsedIdxs.test(I))
17046 continue;
17047 // Fix the entry number for the given scalar. If it is the first entry, set
17048 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17049 // These indices are used when calculating final shuffle mask as the vector
17050 // offset.
17051 for (std::pair<unsigned, int> &Pair : EntryLanes)
17052 if (Pair.first == I)
17053 Pair.first = TempEntries.size();
17054 TempEntries.push_back(Entries[I]);
17055 }
17056 Entries.swap(TempEntries);
17057 if (EntryLanes.size() == Entries.size() &&
17058 !VL.equals(ArrayRef(TE->Scalars)
17059 .slice(Part * VL.size(),
17060 std::min<int>(VL.size(), TE->Scalars.size())))) {
17061 // We may have here 1 or 2 entries only. If the number of scalars is equal
17062 // to the number of entries, no need to do the analysis, it is not very
17063 // profitable. Since VL is not the same as TE->Scalars, it means we already
17064 // have some shuffles before. Cut off not profitable case.
17065 Entries.clear();
17066 return std::nullopt;
17067 }
17068 // Build the final mask, check for the identity shuffle, if possible.
17069 bool IsIdentity = Entries.size() == 1;
17070 // Pair.first is the offset to the vector, while Pair.second is the index of
17071 // scalar in the list.
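// E.g. (illustrative): with two selected entries of vector factor VF = 4, a
// mask value of 5 in some lane means "element 1 of the second entry", since
// lanes of the second source are biased by Pair.first * VF = 4.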
17072 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17073 unsigned Idx = Part * VL.size() + Pair.second;
17074 Mask[Idx] =
17075 Pair.first * VF +
17076 (ForOrder ? std::distance(
17077 Entries[Pair.first]->Scalars.begin(),
17078 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17079 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17080 IsIdentity &= Mask[Idx] == Pair.second;
17081 }
17082 if (ForOrder || IsIdentity || Entries.empty()) {
17083 switch (Entries.size()) {
17084 case 1:
17085 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17086 return TargetTransformInfo::SK_PermuteSingleSrc;
17087 break;
17088 case 2:
17089 if (EntryLanes.size() > 2 || VL.size() <= 2)
17090 return TargetTransformInfo::SK_PermuteTwoSrc;
17091 break;
17092 default:
17093 break;
17094 }
17095 } else if (!isa<VectorType>(VL.front()->getType()) &&
17096 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17097 // Do the cost estimation to check if a shuffle is more beneficial than a buildvector.
17098 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17099 std::next(Mask.begin(), (Part + 1) * VL.size()));
17100 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17101 for (int Idx : SubMask) {
17102 if (Idx == PoisonMaskElem)
17103 continue;
17104 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17105 MinElement = Idx;
17106 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17107 MaxElement = Idx;
17108 }
17109 assert(MaxElement >= 0 && MinElement >= 0 &&
17110 MaxElement % VF >= MinElement % VF &&
17111 "Expected at least single element.");
17112 unsigned NewVF = std::max<unsigned>(
17113 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17114 (MaxElement % VF) -
17115 (MinElement % VF) + 1));
17116 if (NewVF < VF) {
17117 for (int &Idx : SubMask) {
17118 if (Idx == PoisonMaskElem)
17119 continue;
17120 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17121 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17122 }
17123 } else {
17124 NewVF = VF;
17125 }
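// Worked example (assuming the target widens a 3-element range to 4 lanes):
// with VF = 8, VL.size() = 4 and SubMask = {4, 5, 6, poison} taken from a
// single source, MinElement = 4 and MaxElement = 6, so NewVF = 4 < VF and the
// mask is rebased to {0, 1, 2, poison}.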
17126
17127 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17128 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17129 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17130 auto GetShuffleCost = [&,
17131 &TTI = *TTI](ArrayRef<int> Mask,
17132 ArrayRef<const TreeEntry *> Entries,
17133 VectorType *VecTy) -> InstructionCost {
17134 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17135 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17136 Mask, Entries.front()->getInterleaveFactor()))
17137 return TTI::TCC_Free;
17138 return ::getShuffleCost(TTI,
17139 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17140 : TTI::SK_PermuteSingleSrc,
17141 VecTy, Mask, CostKind);
17142 };
17143 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17144 InstructionCost FirstShuffleCost = 0;
17145 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17146 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17147 FirstShuffleCost = ShuffleCost;
17148 } else {
17149 // Transform the mask to include only the first entry.
17150 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17151 bool IsIdentity = true;
17152 for (auto [I, Idx] : enumerate(FirstMask)) {
17153 if (Idx >= static_cast<int>(NewVF)) {
17154 Idx = PoisonMaskElem;
17155 } else {
17156 DemandedElts.clearBit(I);
17157 if (Idx != PoisonMaskElem)
17158 IsIdentity &= static_cast<int>(I) == Idx;
17159 }
17160 }
17161 if (!IsIdentity)
17162 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17163 FirstShuffleCost += getScalarizationOverhead(
17164 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17165 /*Extract=*/false, CostKind);
17166 }
17167 InstructionCost SecondShuffleCost = 0;
17168 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17169 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17170 SecondShuffleCost = ShuffleCost;
17171 } else {
17172 // Transform the mask to include only the second entry.
17173 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17174 bool IsIdentity = true;
17175 for (auto [I, Idx] : enumerate(SecondMask)) {
17176 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17177 Idx = PoisonMaskElem;
17178 } else {
17179 DemandedElts.clearBit(I);
17180 if (Idx != PoisonMaskElem) {
17181 Idx -= NewVF;
17182 IsIdentity &= static_cast<int>(I) == Idx;
17183 }
17184 }
17185 }
17186 if (!IsIdentity)
17187 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17188 SecondShuffleCost += getScalarizationOverhead(
17189 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17190 /*Extract=*/false, CostKind);
17191 }
17192 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17193 for (auto [I, Idx] : enumerate(SubMask))
17194 if (Idx == PoisonMaskElem)
17195 DemandedElts.clearBit(I);
17196 InstructionCost BuildVectorCost = getScalarizationOverhead(
17197 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17198 /*Extract=*/false, CostKind);
17199 const TreeEntry *BestEntry = nullptr;
17200 if (FirstShuffleCost < ShuffleCost) {
17201 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17202 std::next(Mask.begin(), (Part + 1) * VL.size()),
17203 [&](int &Idx) {
17204 if (Idx >= static_cast<int>(VF))
17205 Idx = PoisonMaskElem;
17206 });
17207 BestEntry = Entries.front();
17208 ShuffleCost = FirstShuffleCost;
17209 }
17210 if (SecondShuffleCost < ShuffleCost) {
17211 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17212 std::next(Mask.begin(), (Part + 1) * VL.size()),
17213 [&](int &Idx) {
17214 if (Idx < static_cast<int>(VF))
17215 Idx = PoisonMaskElem;
17216 else
17217 Idx -= VF;
17218 });
17219 BestEntry = Entries[1];
17220 ShuffleCost = SecondShuffleCost;
17221 }
17222 if (BuildVectorCost >= ShuffleCost) {
17223 if (BestEntry) {
17224 Entries.clear();
17225 Entries.push_back(BestEntry);
17226 }
17227 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17228 : TargetTransformInfo::SK_PermuteSingleSrc;
17229 }
17230 }
17231 Entries.clear();
17232 // Clear the corresponding mask elements.
17233 std::fill(std::next(Mask.begin(), Part * VL.size()),
17234 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17235 return std::nullopt;
17236}
17237
17238 SmallVector<std::optional<TTI::ShuffleKind>>
17239 BoUpSLP::isGatherShuffledEntry(
17240 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17241 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17242 bool ForOrder) {
17243 assert(NumParts > 0 && NumParts < VL.size() &&
17244 "Expected positive number of registers.");
17245 Entries.clear();
17246 // No need to check for the topmost gather node.
17247 if (TE == VectorizableTree.front().get() &&
17248 (!GatheredLoadsEntriesFirst.has_value() ||
17249 none_of(ArrayRef(VectorizableTree).drop_front(),
17250 [](const std::unique_ptr<TreeEntry> &TE) {
17251 return !TE->isGather();
17252 })))
17253 return {};
17254 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17255 // implemented yet.
17256 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17257 return {};
17258 Mask.assign(VL.size(), PoisonMaskElem);
17259 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17260 "Expected only single user of the gather node.");
17261 assert(VL.size() % NumParts == 0 &&
17262 "Number of scalars must be divisible by NumParts.");
17263 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17264 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17265 (TE->Idx == 0 ||
17266 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17267 isSplat(TE->Scalars) ||
17268 (TE->hasState() &&
17269 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17270 return {};
17271 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
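// E.g. a 16-element gather analyzed with NumParts = 2 is processed as two
// independent 8-element slices; each slice may be served by at most 2 tree
// entries of its own.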
17272 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17273 for (unsigned Part : seq<unsigned>(NumParts)) {
17274 ArrayRef<Value *> SubVL =
17275 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17276 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17277 std::optional<TTI::ShuffleKind> SubRes =
17278 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17279 ForOrder);
17280 if (!SubRes)
17281 SubEntries.clear();
17282 Res.push_back(SubRes);
17283 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17284 SubEntries.front()->getVectorFactor() == VL.size() &&
17285 (SubEntries.front()->isSame(TE->Scalars) ||
17286 SubEntries.front()->isSame(VL))) {
17287 SmallVector<const TreeEntry *> LocalSubEntries;
17288 LocalSubEntries.swap(SubEntries);
17289 Entries.clear();
17290 Res.clear();
17291 std::iota(Mask.begin(), Mask.end(), 0);
17292 // Clear undef scalars.
17293 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17294 if (isa<PoisonValue>(VL[I]))
17295 Mask[I] = PoisonMaskElem;
17296 Entries.emplace_back(1, LocalSubEntries.front());
17297 Res.push_back(TTI::SK_PermuteSingleSrc);
17298 return Res;
17299 }
17300 }
17301 if (all_of(Res,
17302 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17303 Entries.clear();
17304 return {};
17305 }
17306 return Res;
17307}
17308
17309InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17310 Type *ScalarTy) const {
17311 const unsigned VF = VL.size();
17312 auto *VecTy = getWidenedType(ScalarTy, VF);
17313 // Find the cost of inserting/extracting values from the vector.
17314 // Check if the same elements are inserted several times and count them as
17315 // shuffle candidates.
17316 APInt DemandedElements = APInt::getZero(VF);
17317 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17318 InstructionCost Cost;
17319 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17320 DemandedElements.setBit(I);
17321 if (V->getType() != ScalarTy)
17322 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17323 TTI::CastContextHint::None, CostKind);
17324 };
17325 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17326 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17327 for (auto [I, V] : enumerate(VL)) {
17328 // No need to shuffle duplicates for constants.
17329 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17330 continue;
17331
17332 if (isConstant(V)) {
17333 ConstantShuffleMask[I] = I + VF;
17334 continue;
17335 }
17336 EstimateInsertCost(I, V);
17337 }
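// At this point ConstantShuffleMask is an identity mask except for the lanes
// holding non-undef constants, which are redirected to indices I + VF, i.e.
// to a second source vector that materializes the constants.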
17338 // FIXME: add a cost for constant vector materialization.
17339 bool IsAnyNonUndefConst =
17340 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17341 // 1. Shuffle input source vector and constant vector.
17342 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17343 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
17344 ConstantShuffleMask);
17345 }
17346
17347 // 2. Insert unique non-constants.
17348 if (!DemandedElements.isZero())
17349 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17350 /*Insert=*/true,
17351 /*Extract=*/false, CostKind,
17352 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17353 return Cost;
17354}
17355
17356Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17357 auto It = EntryToLastInstruction.find(E);
17358 if (It != EntryToLastInstruction.end())
17359 return *cast<Instruction>(It->second);
17360 Instruction *Res = nullptr;
17361 // Get the basic block this bundle is in. All instructions in the bundle
17362 // should be in this block (except for extractelement-like instructions with
17363 // constant indices or gathered loads or copyables).
17364 Instruction *Front;
17365 unsigned Opcode;
17366 if (E->hasState()) {
17367 Front = E->getMainOp();
17368 Opcode = E->getOpcode();
17369 } else {
17370 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17371 Opcode = Front->getOpcode();
17372 }
17373 auto *BB = Front->getParent();
17374 assert(
17375 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17376 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17377 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17378 all_of(E->Scalars,
17379 [=](Value *V) -> bool {
17380 if (Opcode == Instruction::GetElementPtr &&
17381 !isa<GetElementPtrInst>(V))
17382 return true;
17383 auto *I = dyn_cast<Instruction>(V);
17384 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17385 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17386 })) &&
17387 "Expected gathered loads or GEPs or instructions from same basic "
17388 "block.");
17389
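// FindLastInst picks, among the non-copyable bundle scalars, the instruction
// that comes last in program order, using dominator-tree DFS-in numbers to
// order instructions from different (reachable) blocks.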
17390 auto FindLastInst = [&]() {
17391 Instruction *LastInst = Front;
17392 for (Value *V : E->Scalars) {
17393 auto *I = dyn_cast<Instruction>(V);
17394 if (!I)
17395 continue;
17396 if (E->isCopyableElement(I))
17397 continue;
17398 if (LastInst->getParent() == I->getParent()) {
17399 if (LastInst->comesBefore(I))
17400 LastInst = I;
17401 continue;
17402 }
17403 assert(((Opcode == Instruction::GetElementPtr &&
17404 !isa<GetElementPtrInst>(I)) ||
17405 E->State == TreeEntry::SplitVectorize ||
17406 (isVectorLikeInstWithConstOps(LastInst) &&
17407 isVectorLikeInstWithConstOps(I)) ||
17408 (GatheredLoadsEntriesFirst.has_value() &&
17409 Opcode == Instruction::Load && E->isGather() &&
17410 E->Idx < *GatheredLoadsEntriesFirst)) &&
17411 "Expected vector-like or non-GEP in GEP node insts only.");
17412 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17413 LastInst = I;
17414 continue;
17415 }
17416 if (!DT->isReachableFromEntry(I->getParent()))
17417 continue;
17418 auto *NodeA = DT->getNode(LastInst->getParent());
17419 auto *NodeB = DT->getNode(I->getParent());
17420 assert(NodeA && "Should only process reachable instructions");
17421 assert(NodeB && "Should only process reachable instructions");
17422 assert((NodeA == NodeB) ==
17423 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17424 "Different nodes should have different DFS numbers");
17425 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17426 LastInst = I;
17427 }
17428 BB = LastInst->getParent();
17429 return LastInst;
17430 };
17431
17432 auto FindFirstInst = [&]() {
17433 Instruction *FirstInst = Front;
17434 for (Value *V : E->Scalars) {
17435 auto *I = dyn_cast<Instruction>(V);
17436 if (!I)
17437 continue;
17438 if (E->isCopyableElement(I))
17439 continue;
17440 if (FirstInst->getParent() == I->getParent()) {
17441 if (I->comesBefore(FirstInst))
17442 FirstInst = I;
17443 continue;
17444 }
17445 assert(((Opcode == Instruction::GetElementPtr &&
17446 !isa<GetElementPtrInst>(I)) ||
17447 (isVectorLikeInstWithConstOps(FirstInst) &&
17449 "Expected vector-like or non-GEP in GEP node insts only.");
17450 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17451 FirstInst = I;
17452 continue;
17453 }
17454 if (!DT->isReachableFromEntry(I->getParent()))
17455 continue;
17456 auto *NodeA = DT->getNode(FirstInst->getParent());
17457 auto *NodeB = DT->getNode(I->getParent());
17458 assert(NodeA && "Should only process reachable instructions");
17459 assert(NodeB && "Should only process reachable instructions");
17460 assert((NodeA == NodeB) ==
17461 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17462 "Different nodes should have different DFS numbers");
17463 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17464 FirstInst = I;
17465 }
17466 return FirstInst;
17467 };
17468
17469 if (E->State == TreeEntry::SplitVectorize) {
17470 Res = FindLastInst();
17471 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17472 for (auto *E : Entries) {
17473 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17474 if (!I)
17475 I = &getLastInstructionInBundle(E);
17476 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17477 Res = I;
17478 }
17479 }
17480 EntryToLastInstruction.try_emplace(E, Res);
17481 return *Res;
17482 }
17483
17484 // Set insertpoint for gathered loads to the very first load.
17485 if (GatheredLoadsEntriesFirst.has_value() &&
17486 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17487 Opcode == Instruction::Load) {
17488 Res = FindFirstInst();
17489 EntryToLastInstruction.try_emplace(E, Res);
17490 return *Res;
17491 }
17492
17493 // Set the insert point to the beginning of the basic block if the entry
17494 // should not be scheduled.
17495 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17496 if (E->isGather())
17497 return nullptr;
17498 // It was found previously that the instructions do not need to be scheduled.
17499 const auto *It = BlocksSchedules.find(BB);
17500 if (It == BlocksSchedules.end())
17501 return nullptr;
17502 for (Value *V : E->Scalars) {
17503 auto *I = dyn_cast<Instruction>(V);
17504 if (!I || isa<PHINode>(I) ||
17505 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17506 continue;
17507 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17508 if (Bundles.empty())
17509 continue;
17510 const auto *It = find_if(
17511 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17512 if (It != Bundles.end())
17513 return *It;
17514 }
17515 return nullptr;
17516 };
17517 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17518 if (!E->isGather() && !Bundle) {
17519 if ((Opcode == Instruction::GetElementPtr &&
17520 any_of(E->Scalars,
17521 [](Value *V) {
17522 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17523 })) ||
17524 all_of(E->Scalars, [&](Value *V) {
17525 return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
17526 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17527 }))
17528 Res = FindLastInst();
17529 else
17530 Res = FindFirstInst();
17531 EntryToLastInstruction.try_emplace(E, Res);
17532 return *Res;
17533 }
17534
17535 // Find the last instruction. The common case should be that BB has been
17536 // scheduled, and the last instruction is VL.back(). So we start with
17537 // VL.back() and iterate over schedule data until we reach the end of the
17538 // bundle. The end of the bundle is marked by null ScheduleData.
17539 if (Bundle) {
17540 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17541 Res = Bundle->getBundle().back()->getInst();
17542 EntryToLastInstruction.try_emplace(E, Res);
17543 return *Res;
17544 }
17545
17546 // LastInst can still be null at this point if there's either not an entry
17547 // for BB in BlocksSchedules or there's no ScheduleData available for
17548 // VL.back(). This can be the case if buildTreeRec aborts for various
17549 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17550 // size is reached, etc.). ScheduleData is initialized in the scheduling
17551 // "dry-run".
17552 //
17553 // If this happens, we can still find the last instruction by brute force. We
17554 // iterate forwards from Front (inclusive) until we either see all
17555 // instructions in the bundle or reach the end of the block. If Front is the
17556 // last instruction in program order, LastInst will be set to Front, and we
17557 // will visit all the remaining instructions in the block.
17558 //
17559 // One of the reasons we exit early from buildTreeRec is to place an upper
17560 // bound on compile-time. Thus, taking an additional compile-time hit here is
17561 // not ideal. However, this should be exceedingly rare since it requires that
17562 // we both exit early from buildTreeRec and that the bundle be out-of-order
17563 // (causing us to iterate all the way to the end of the block).
17564 if (!Res)
17565 Res = FindLastInst();
17566 assert(Res && "Failed to find last instruction in bundle");
17567 EntryToLastInstruction.try_emplace(E, Res);
17568 return *Res;
17569}
17570
17571void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17572 auto *Front = E->getMainOp();
17573 Instruction *LastInst = &getLastInstructionInBundle(E);
17574 assert(LastInst && "Failed to find last instruction in bundle");
17575 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17576 // If the instruction is PHI, set the insert point after all the PHIs.
17577 bool IsPHI = isa<PHINode>(LastInst);
17578 if (IsPHI) {
17579 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17580 if (LastInstIt != LastInst->getParent()->end() &&
17581 LastInstIt->getParent()->isLandingPad())
17582 LastInstIt = std::next(LastInstIt);
17583 }
17584 if (IsPHI ||
17585 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17586 E->doesNotNeedToSchedule()) ||
17587 (GatheredLoadsEntriesFirst.has_value() &&
17588 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17589 E->getOpcode() == Instruction::Load)) {
17590 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17591 } else {
17592 // Set the insertion point after the last instruction in the bundle. Set the
17593 // debug location to Front.
17594 Builder.SetInsertPoint(
17595 LastInst->getParent(),
17596 LastInst->getNextNode()->getIterator());
17597 }
17598 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17599}
17600
17601Value *BoUpSLP::gather(
17602 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17603 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17604 // List of instructions/lanes from the current block and/or the blocks which
17605 // are part of the current loop. These instructions will be inserted at the
17606 // end to make it possible to optimize loops and hoist invariant instructions
17607 // out of the loop body with better chances for success.
17608 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17609 SmallSet<int, 4> PostponedIndices;
17610 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17611 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17612 SmallPtrSet<BasicBlock *, 4> Visited;
17613 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17614 InsertBB = InsertBB->getSinglePredecessor();
17615 return InsertBB && InsertBB == InstBB;
17616 };
17617 for (int I = 0, E = VL.size(); I < E; ++I) {
17618 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17619 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17620 isVectorized(Inst) ||
17621 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17622 PostponedIndices.insert(I).second)
17623 PostponedInsts.emplace_back(Inst, I);
17624 }
17625
17626 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17627 Type *Ty) {
17628 Value *Scalar = V;
17629 if (Scalar->getType() != Ty) {
17630 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17631 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17632 Value *V = Scalar;
17633 if (auto *CI = dyn_cast<CastInst>(Scalar);
17635 Value *Op = CI->getOperand(0);
17636 if (auto *IOp = dyn_cast<Instruction>(Op);
17637 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17638 V = Op;
17639 }
17640 Scalar = Builder.CreateIntCast(
17641 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17642 }
17643
17644 Instruction *InsElt;
17645 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17646 assert(SLPReVec && "FixedVectorType is not expected.");
17647 Vec =
17648 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17649 auto *II = dyn_cast<Instruction>(Vec);
17650 if (!II)
17651 return Vec;
17652 InsElt = II;
17653 } else {
17654 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17655 InsElt = dyn_cast<InsertElementInst>(Vec);
17656 if (!InsElt)
17657 return Vec;
17658 }
17659 GatherShuffleExtractSeq.insert(InsElt);
17660 CSEBlocks.insert(InsElt->getParent());
17661 // Add to our 'need-to-extract' list.
17662 if (isa<Instruction>(V)) {
17663 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17664 // Find which lane we need to extract.
17665 User *UserOp = nullptr;
17666 if (Scalar != V) {
17667 if (auto *SI = dyn_cast<Instruction>(Scalar))
17668 UserOp = SI;
17669 } else {
17670 if (V->getType()->isVectorTy()) {
17671 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17672 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17673 // Find shufflevector, caused by resize.
17674 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17675 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17676 if (SV->getOperand(0) == V)
17677 return SV;
17678 if (SV->getOperand(1) == V)
17679 return SV;
17680 }
17681 return nullptr;
17682 };
17683 InsElt = nullptr;
17684 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17685 InsElt = User;
17686 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17687 InsElt = User;
17688 assert(InsElt &&
17689 "Failed to find shufflevector, caused by resize.");
17690 }
17691 }
17692 UserOp = InsElt;
17693 }
17694 if (UserOp) {
17695 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17696 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17697 }
17698 }
17699 }
17700 return Vec;
17701 };
17702 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17703 Value *Vec = PoisonValue::get(VecTy);
17704 SmallVector<int> NonConsts;
17705 SmallVector<int> Mask(VL.size());
17706 std::iota(Mask.begin(), Mask.end(), 0);
17707 Value *OriginalRoot = Root;
17708 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17709 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17710 SV->getOperand(0)->getType() == VecTy) {
17711 Root = SV->getOperand(0);
17712 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17713 }
17714 // Insert constant values at first.
17715 for (int I = 0, E = VL.size(); I < E; ++I) {
17716 if (PostponedIndices.contains(I))
17717 continue;
17718 if (!isConstant(VL[I])) {
17719 NonConsts.push_back(I);
17720 continue;
17721 }
17722 if (isa<PoisonValue>(VL[I]))
17723 continue;
17724 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17725 Mask[I] = I + E;
17726 }
17727 if (Root) {
17728 if (isa<PoisonValue>(Vec)) {
17729 Vec = OriginalRoot;
17730 } else {
17731 Vec = CreateShuffle(Root, Vec, Mask);
17732 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17733 OI && OI->use_empty() &&
17734 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17735 return TE->VectorizedValue == OI;
17736 }))
17737 eraseInstruction(OI);
17738 }
17739 }
17740 // Insert non-constant values.
17741 for (int I : NonConsts)
17742 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17743 // Append instructions, which are/may be part of the loop, in the end to make
17744 // it possible to hoist non-loop-based instructions.
17745 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17746 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17747
17748 return Vec;
17749}
17750
17751/// Merges shuffle masks and emits final shuffle instruction, if required. It
17752/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
17753/// when the actual shuffle instruction is generated only if this is actually
17754/// required. Otherwise, the shuffle instruction emission is delayed till the
17755/// end of the process, to reduce the number of emitted instructions and further
17756/// analysis/transformations.
17757 /// The class will also look through the previously emitted shuffle
17758 /// instructions and properly mark indices in the mask as undef.
17759/// For example, given the code
17760/// \code
17761/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17762/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17763/// \endcode
17764 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17765/// look through %s1 and %s2 and emit
17766/// \code
17767/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17768/// \endcode
17769/// instead.
17770/// If 2 operands are of different size, the smallest one will be resized and
17771/// the mask recalculated properly.
17772/// For example, given the code
17773/// \code
17774/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17775/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17776/// \endcode
17777 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17778/// look through %s1 and %s2 and emit
17779/// \code
17780/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17781/// \endcode
17782/// instead.
17783class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17784 bool IsFinalized = false;
17785 /// Combined mask for all applied operands and masks. It is built during
17786 /// analysis and actual emission of shuffle vector instructions.
17787 SmallVector<int> CommonMask;
17788 /// List of operands for the shuffle vector instruction. It holds at most 2
17789 /// operands. If the 3rd one is going to be added, the first 2 are combined
17790 /// into a shuffle with the \p CommonMask mask, the first operand is set to
17791 /// the resulting shuffle and the second operand is set to the newly added
17792 /// operand. The \p CommonMask is transformed in the proper way after that.
17793 SmallVector<Value *, 2> InVectors;
17794 IRBuilderBase &Builder;
17795 BoUpSLP &R;
17796
17797 class ShuffleIRBuilder {
17798 IRBuilderBase &Builder;
17799 /// Holds all of the instructions that we gathered.
17800 SetVector<Instruction *> &GatherShuffleExtractSeq;
17801 /// A list of blocks that we are going to CSE.
17802 DenseSet<BasicBlock *> &CSEBlocks;
17803 /// Data layout.
17804 const DataLayout &DL;
17805
17806 public:
17807 ShuffleIRBuilder(IRBuilderBase &Builder,
17808 SetVector<Instruction *> &GatherShuffleExtractSeq,
17809 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17810 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17811 CSEBlocks(CSEBlocks), DL(DL) {}
17812 ~ShuffleIRBuilder() = default;
17813 /// Creates shufflevector for the 2 operands with the given mask.
17814 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17815 if (V1->getType() != V2->getType()) {
17816 assert(V1->getType()->isIntOrIntVectorTy() &&
17817 V2->getType()->isIntOrIntVectorTy() &&
17818 "Expected integer vector types only.");
17819 if (V1->getType() != V2->getType()) {
17820 if (cast<VectorType>(V2->getType())
17821 ->getElementType()
17822 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17823 ->getElementType()
17824 ->getIntegerBitWidth())
17825 V2 = Builder.CreateIntCast(
17826 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17827 else
17828 V1 = Builder.CreateIntCast(
17829 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17830 }
17831 }
17832 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17833 if (auto *I = dyn_cast<Instruction>(Vec)) {
17834 GatherShuffleExtractSeq.insert(I);
17835 CSEBlocks.insert(I->getParent());
17836 }
17837 return Vec;
17838 }
17839 /// Creates a permutation of the single vector operand with the given mask, if
17840 /// it is not an identity mask.
17841 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17842 if (Mask.empty())
17843 return V1;
17844 unsigned VF = Mask.size();
17845 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17846 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17847 return V1;
17848 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17849 if (auto *I = dyn_cast<Instruction>(Vec)) {
17850 GatherShuffleExtractSeq.insert(I);
17851 CSEBlocks.insert(I->getParent());
17852 }
17853 return Vec;
17854 }
17855 Value *createIdentity(Value *V) { return V; }
17856 Value *createPoison(Type *Ty, unsigned VF) {
17857 return PoisonValue::get(getWidenedType(Ty, VF));
17858 }
17859 /// Resizes the 2 input vectors to match their sizes, if they are not equal
17860 /// yet. The smaller vector is resized to the size of the larger vector.
17861 void resizeToMatch(Value *&V1, Value *&V2) {
17862 if (V1->getType() == V2->getType())
17863 return;
17864 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17865 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17866 int VF = std::max(V1VF, V2VF);
17867 int MinVF = std::min(V1VF, V2VF);
17868 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17869 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17870 0);
17871 Value *&Op = MinVF == V1VF ? V1 : V2;
17872 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17873 if (auto *I = dyn_cast<Instruction>(Op)) {
17874 GatherShuffleExtractSeq.insert(I);
17875 CSEBlocks.insert(I->getParent());
17876 }
17877 if (MinVF == V1VF)
17878 V1 = Op;
17879 else
17880 V2 = Op;
17881 }
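// A hypothetical, self-contained sketch of the pad mask built by
// resizeToMatch above: a 2-lane operand widened to 4 lanes keeps its two
// elements and gets poison in the tail lanes (the widths are example values).
// \code
// #include <cstdio>
// #include <iterator>
// #include <numeric>
// #include <vector>
//
// int main() {
//   constexpr int PoisonMaskElem = -1;
//   int MinVF = 2, VF = 4; // example: smaller and larger vector factors
//   std::vector<int> IdentityMask(VF, PoisonMaskElem);
//   std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF), 0);
//   for (int I : IdentityMask)
//     std::printf("%d ", I); // prints 0 1 -1 -1
// }
// \endcode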
17882 };
17883
17884 /// Smart shuffle instruction emission, walks through shuffle trees and
17885 /// tries to find the best matching vector for the actual shuffle
17886 /// instruction.
17887 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
17888 assert(V1 && "Expected at least one vector value.");
17889 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17890 R.CSEBlocks, *R.DL);
17891 return BaseShuffleAnalysis::createShuffle<Value *>(
17892 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17893 }
17894
17895 /// Cast value \p V to the vector type with the same number of elements, but
17896 /// the base type \p ScalarTy.
17897 Value *castToScalarTyElem(Value *V,
17898 std::optional<bool> IsSigned = std::nullopt) {
17899 auto *VecTy = cast<VectorType>(V->getType());
17900 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
17901 if (VecTy->getElementType() == ScalarTy->getScalarType())
17902 return V;
17903 return Builder.CreateIntCast(
17904 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
17905 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
17906 }
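// A hypothetical scalar-level sketch of the signedness decision above: values
// not known to be non-negative are sign-extended so a minimized-bitwidth lane
// keeps its numeric value; known-non-negative lanes can be zero-extended.
// \code
// #include <cstdint>
// #include <cstdio>
//
// int main() {
//   int8_t Narrow = -5;            // example minimized-bitwidth lane value
//   bool KnownNonNegative = false; // assumed analysis result for this lane
//   int32_t Wide = KnownNonNegative ? int32_t(uint8_t(Narrow)) // zext
//                                   : int32_t(Narrow);         // sext
//   std::printf("%d\n", Wide); // prints -5
// }
// \endcode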
17907
17908 Value *getVectorizedValue(const TreeEntry &E) {
17909 Value *Vec = E.VectorizedValue;
17910 if (!Vec->getType()->isIntOrIntVectorTy())
17911 return Vec;
17912 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
17913 return !isa<PoisonValue>(V) &&
17914 !isKnownNonNegative(
17915 V, SimplifyQuery(*R.DL));
17916 }));
17917 }
17918
17919public:
17920 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
17921 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17922
17923 /// Adjusts extractelements after reusing them.
17924 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
17925 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17926 unsigned NumParts, bool &UseVecBaseAsInput) {
17927 UseVecBaseAsInput = false;
17928 SmallPtrSet<Value *, 4> UniqueBases;
17929 Value *VecBase = nullptr;
17930 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
17931 if (!E->ReorderIndices.empty()) {
17932 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
17933 E->ReorderIndices.end());
17934 reorderScalars(VL, ReorderMask);
17935 }
17936 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17937 int Idx = Mask[I];
17938 if (Idx == PoisonMaskElem)
17939 continue;
17940 auto *EI = cast<ExtractElementInst>(VL[I]);
17941 VecBase = EI->getVectorOperand();
17942 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
17943 VecBase = TEs.front()->VectorizedValue;
17944 assert(VecBase && "Expected vectorized value.");
17945 UniqueBases.insert(VecBase);
17946 // If the only use is vectorized, the extractelement itself can be
17947 // deleted.
17948 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17949 (NumParts != 1 && count(VL, EI) > 1) ||
17950 any_of(EI->users(), [&](User *U) {
17951 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17952 return UTEs.empty() || UTEs.size() > 1 ||
17953 (isa<GetElementPtrInst>(U) &&
17954 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17955 (!UTEs.empty() &&
17956 count_if(R.VectorizableTree,
17957 [&](const std::unique_ptr<TreeEntry> &TE) {
17958 return TE->UserTreeIndex.UserTE ==
17959 UTEs.front() &&
17960 is_contained(VL, EI);
17961 }) != 1);
17962 }))
17963 continue;
17964 R.eraseInstruction(EI);
17965 }
17966 if (NumParts == 1 || UniqueBases.size() == 1) {
17967 assert(VecBase && "Expected vectorized value.");
17968 return castToScalarTyElem(VecBase);
17969 }
17970 UseVecBaseAsInput = true;
17971 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
17972 for (auto [I, Idx] : enumerate(Mask))
17973 if (Idx != PoisonMaskElem)
17974 Idx = I;
17975 };
17976 // Perform a multi-register vector shuffle, joining the parts into a single
17977 // virtual long vector.
17978 // Need to shuffle each part independently and then insert all these parts
17979 // into a long virtual vector register, forming the original vector.
17980 Value *Vec = nullptr;
17981 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17982 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17983 for (unsigned Part : seq<unsigned>(NumParts)) {
17984 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
17985 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
17986 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
17987 constexpr int MaxBases = 2;
17988 SmallVector<Value *, MaxBases> Bases(MaxBases);
17989 auto VLMask = zip(SubVL, SubMask);
17990 const unsigned VF = std::accumulate(
17991 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
17992 if (std::get<1>(D) == PoisonMaskElem)
17993 return S;
17994 Value *VecOp =
17995 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
17996 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
17997 !TEs.empty())
17998 VecOp = TEs.front()->VectorizedValue;
17999 assert(VecOp && "Expected vectorized value.");
18000 const unsigned Size =
18001 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18002 return std::max(S, Size);
18003 });
18004 for (const auto [V, I] : VLMask) {
18005 if (I == PoisonMaskElem)
18006 continue;
18007 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18008 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18009 VecOp = TEs.front()->VectorizedValue;
18010 assert(VecOp && "Expected vectorized value.");
18011 VecOp = castToScalarTyElem(VecOp);
18012 Bases[I / VF] = VecOp;
18013 }
18014 if (!Bases.front())
18015 continue;
18016 Value *SubVec;
18017 if (Bases.back()) {
18018 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18019 TransformToIdentity(SubMask);
18020 } else {
18021 SubVec = Bases.front();
18022 }
18023 if (!Vec) {
18024 Vec = SubVec;
18025 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18026 [&](unsigned P) {
18027 ArrayRef<int> SubMask =
18028 Mask.slice(P * SliceSize,
18029 getNumElems(Mask.size(),
18030 SliceSize, P));
18031 return all_of(SubMask, [](int Idx) {
18032 return Idx == PoisonMaskElem;
18033 });
18034 })) &&
18035 "Expected first part or all previous parts masked.");
18036 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18037 } else {
18038 unsigned NewVF =
18039 cast<FixedVectorType>(Vec->getType())->getNumElements();
18040 if (Vec->getType() != SubVec->getType()) {
18041 unsigned SubVecVF =
18042 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18043 NewVF = std::max(NewVF, SubVecVF);
18044 }
18045 // Adjust SubMask.
18046 for (int &Idx : SubMask)
18047 if (Idx != PoisonMaskElem)
18048 Idx += NewVF;
18049 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18050 Vec = createShuffle(Vec, SubVec, VecMask);
18051 TransformToIdentity(VecMask);
18052 }
18053 }
18054 copy(VecMask, Mask.begin());
18055 return Vec;
18056 }
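// A compact sketch (with made-up values) of the per-part mask bookkeeping in
// adjustExtracts above: the second part's lanes are rebased past the width of
// the vector accumulated so far, and after the shuffle is emitted the used
// lanes of the combined mask are remapped to an identity.
// \code
// #include <cstdio>
// #include <vector>
//
// int main() {
//   constexpr int PoisonMaskElem = -1;
//   std::vector<int> VecMask = {0, 1, PoisonMaskElem, PoisonMaskElem};
//   std::vector<int> SubMask = {2, 3}; // part 1 of the extract mask
//   unsigned NewVF = 4;                // width of the accumulated vector
//   for (unsigned I = 0; I < SubMask.size(); ++I)
//     VecMask[2 + I] = SubMask[I] + NewVF; // lanes come from the second input
//   // TransformToIdentity: the shuffle result is now a single flat vector.
//   for (unsigned I = 0; I < VecMask.size(); ++I)
//     if (VecMask[I] != PoisonMaskElem)
//       VecMask[I] = I;
//   for (int V : VecMask)
//     std::printf("%d ", V); // prints 0 1 2 3
// }
// \endcode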
18057 /// Checks if the specified entry \p E needs to be delayed because of its
18058 /// dependency nodes.
18059 std::optional<Value *>
18060 needToDelay(const TreeEntry *E,
18061 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18062 // No need to delay emission if all deps are ready.
18063 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18064 return all_of(
18065 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18066 }))
18067 return std::nullopt;
18068 // Postpone gather emission, will be emitted after the end of the
18069 // process to keep correct order.
18070 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18071 return Builder.CreateAlignedLoad(
18072 ResVecTy,
18073 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18074 MaybeAlign());
18075 }
18076 /// Reset the builder to handle perfect diamond match.
18077 void resetForSameNode() {
18078 IsFinalized = false;
18079 CommonMask.clear();
18080 InVectors.clear();
18081 }
18082 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
18083 /// shuffling.
18084 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18085 Value *V1 = getVectorizedValue(E1);
18086 Value *V2 = getVectorizedValue(E2);
18087 add(V1, V2, Mask);
18088 }
18089 /// Adds a single input vector (in the form of a tree entry) and the mask for its
18090 /// shuffling.
18091 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18092 Value *V1 = getVectorizedValue(E1);
18093 add(V1, Mask);
18094 }
18095 /// Adds 2 input vectors and the mask for their shuffling.
18096 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18097 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18100 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18101 V1 = castToScalarTyElem(V1);
18102 V2 = castToScalarTyElem(V2);
18103 if (InVectors.empty()) {
18104 InVectors.push_back(V1);
18105 InVectors.push_back(V2);
18106 CommonMask.assign(Mask.begin(), Mask.end());
18107 return;
18108 }
18109 Value *Vec = InVectors.front();
18110 if (InVectors.size() == 2) {
18111 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18112 transformMaskAfterShuffle(CommonMask, CommonMask);
18113 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18114 Mask.size()) {
18115 Vec = createShuffle(Vec, nullptr, CommonMask);
18116 transformMaskAfterShuffle(CommonMask, CommonMask);
18117 }
18118 V1 = createShuffle(V1, V2, Mask);
18119 unsigned VF = std::max(getVF(V1), getVF(Vec));
18120 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18121 if (Mask[Idx] != PoisonMaskElem)
18122 CommonMask[Idx] = Idx + VF;
18123 InVectors.front() = Vec;
18124 if (InVectors.size() == 2)
18125 InVectors.back() = V1;
18126 else
18127 InVectors.push_back(V1);
18128 }
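// A small illustrative sketch (example widths only) of the CommonMask update
// done at the end of add() above: lanes supplied by the newly shuffled pair
// are addressed past the VF lanes of the vector accumulated so far.
// \code
// #include <cstdio>
// #include <vector>
//
// int main() {
//   constexpr int PoisonMaskElem = -1;
//   unsigned VF = 4; // width of the accumulated vector (assumed)
//   std::vector<int> CommonMask = {0, 1, PoisonMaskElem, PoisonMaskElem};
//   std::vector<int> Mask = {PoisonMaskElem, PoisonMaskElem, 0, 1};
//   for (unsigned Idx = 0; Idx < CommonMask.size(); ++Idx)
//     if (Mask[Idx] != PoisonMaskElem)
//       CommonMask[Idx] = Idx + VF; // taken from the newly added pair
//   for (int V : CommonMask)
//     std::printf("%d ", V); // prints 0 1 6 7
// }
// \endcode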
18129 /// Adds one more input vector and the mask for the shuffling.
18130 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18132 "castToScalarTyElem expects V1 to be FixedVectorType");
18133 V1 = castToScalarTyElem(V1);
18134 if (InVectors.empty()) {
18135 InVectors.push_back(V1);
18136 CommonMask.assign(Mask.begin(), Mask.end());
18137 return;
18138 }
18139 const auto *It = find(InVectors, V1);
18140 if (It == InVectors.end()) {
18141 if (InVectors.size() == 2 ||
18142 InVectors.front()->getType() != V1->getType()) {
18143 Value *V = InVectors.front();
18144 if (InVectors.size() == 2) {
18145 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18146 transformMaskAfterShuffle(CommonMask, CommonMask);
18147 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18148 CommonMask.size()) {
18149 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18150 transformMaskAfterShuffle(CommonMask, CommonMask);
18151 }
18152 unsigned VF = std::max(CommonMask.size(), Mask.size());
18153 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18154 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18155 CommonMask[Idx] = V->getType() != V1->getType()
18156 ? Idx + VF
18157 : Mask[Idx] + getVF(V1);
18158 if (V->getType() != V1->getType())
18159 V1 = createShuffle(V1, nullptr, Mask);
18160 InVectors.front() = V;
18161 if (InVectors.size() == 2)
18162 InVectors.back() = V1;
18163 else
18164 InVectors.push_back(V1);
18165 return;
18166 }
18167 // Check if the second vector is required, i.e. whether some used elements
18168 // are not already covered by the first one.
18169 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18170 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18171 InVectors.push_back(V1);
18172 break;
18173 }
18174 }
18175 unsigned VF = 0;
18176 for (Value *V : InVectors)
18177 VF = std::max(VF, getVF(V));
18178 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18179 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18180 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18181 }
18182 /// Adds one more input vector and the shuffle mask built from the given reorder indices.
18183 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18184 SmallVector<int> NewMask;
18185 inversePermutation(Order, NewMask);
18186 add(V1, NewMask);
18187 }
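// A standalone sketch of the permutation inversion used by addOrdered above
// (mirrors the effect of inversePermutation on an example order):
// \code
// #include <cstdio>
// #include <vector>
//
// int main() {
//   std::vector<unsigned> Order = {2, 0, 1}; // example reorder indices
//   std::vector<int> NewMask(Order.size(), -1);
//   for (unsigned I = 0; I < Order.size(); ++I)
//     NewMask[Order[I]] = I; // element I of the input goes to lane Order[I]
//   for (int V : NewMask)
//     std::printf("%d ", V); // prints 1 2 0
// }
// \endcode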
18188 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18189 Value *Root = nullptr) {
18190 return R.gather(VL, Root, ScalarTy,
18191 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18192 return createShuffle(V1, V2, Mask);
18193 });
18194 }
18195 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18196 /// Finalize emission of the shuffles.
18197 /// \param Action the action (if any) to be performed before the final
18198 /// application of the \p ExtMask mask.
18199 Value *finalize(
18200 ArrayRef<int> ExtMask,
18201 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18202 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18203 function_ref<void(Value *&, SmallVectorImpl<int> &,
18204 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18205 Action = {}) {
18206 IsFinalized = true;
18207 if (Action) {
18208 Value *Vec = InVectors.front();
18209 if (InVectors.size() == 2) {
18210 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18211 InVectors.pop_back();
18212 } else {
18213 Vec = createShuffle(Vec, nullptr, CommonMask);
18214 }
18215 transformMaskAfterShuffle(CommonMask, CommonMask);
18216 assert(VF > 0 &&
18217 "Expected vector length for the final value before action.");
18218 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18219 if (VecVF < VF) {
18220 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18221 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18222 Vec = createShuffle(Vec, nullptr, ResizeMask);
18223 }
18224 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18225 return createShuffle(V1, V2, Mask);
18226 });
18227 InVectors.front() = Vec;
18228 }
18229 if (!SubVectors.empty()) {
18230 Value *Vec = InVectors.front();
18231 if (InVectors.size() == 2) {
18232 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18233 InVectors.pop_back();
18234 } else {
18235 Vec = createShuffle(Vec, nullptr, CommonMask);
18236 }
18237 transformMaskAfterShuffle(CommonMask, CommonMask);
18238 auto CreateSubVectors = [&](Value *Vec,
18239 SmallVectorImpl<int> &CommonMask) {
18240 for (auto [E, Idx] : SubVectors) {
18241 Value *V = getVectorizedValue(*E);
18242 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18243 // Use the scalar version of ScalarTy to correctly handle shuffles
18244 // for revectorization. The revectorization mode operates on the
18245 // vectors, but here we need to operate on the scalars, because the
18246 // masks were already transformed for the vector elements and we don't
18247 // need to do this transformation again.
18248 Type *OrigScalarTy = ScalarTy;
18249 ScalarTy = ScalarTy->getScalarType();
18250 Vec = createInsertVector(
18251 Builder, Vec, V, InsertionIndex,
18252 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18253 _3));
18254 ScalarTy = OrigScalarTy;
18255 if (!CommonMask.empty()) {
18256 std::iota(std::next(CommonMask.begin(), Idx),
18257 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18258 Idx);
18259 }
18260 }
18261 return Vec;
18262 };
18263 if (SubVectorsMask.empty()) {
18264 Vec = CreateSubVectors(Vec, CommonMask);
18265 } else {
18266 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18267 copy(SubVectorsMask, SVMask.begin());
18268 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18269 if (I2 != PoisonMaskElem) {
18270 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18271 I1 = I2 + CommonMask.size();
18272 }
18273 }
18274 Value *InsertVec =
18275 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18276 Vec = createShuffle(InsertVec, Vec, SVMask);
18277 transformMaskAfterShuffle(CommonMask, SVMask);
18278 }
18279 InVectors.front() = Vec;
18280 }
18281
18282 if (!ExtMask.empty()) {
18283 if (CommonMask.empty()) {
18284 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18285 } else {
18286 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18287 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18288 if (ExtMask[I] == PoisonMaskElem)
18289 continue;
18290 NewMask[I] = CommonMask[ExtMask[I]];
18291 }
18292 CommonMask.swap(NewMask);
18293 }
18294 }
18295 if (CommonMask.empty()) {
18296 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18297 return InVectors.front();
18298 }
18299 if (InVectors.size() == 2)
18300 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18301 return createShuffle(InVectors.front(), nullptr, CommonMask);
18302 }
18303
18304 ~ShuffleInstructionBuilder() {
18305 assert((IsFinalized || CommonMask.empty()) &&
18306 "Shuffle construction must be finalized.");
18307 }
18308};
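// A hypothetical sketch (example masks only) of the ExtMask folding performed
// in finalize() above: each external lane re-reads the lane of the already
// combined mask it refers to, so only one shuffle is emitted at the end.
// \code
// #include <cstdio>
// #include <vector>
//
// int main() {
//   constexpr int PoisonMaskElem = -1;
//   std::vector<int> CommonMask = {2, 3, 0, 1};
//   std::vector<int> ExtMask = {1, 1, PoisonMaskElem, 0};
//   std::vector<int> NewMask(ExtMask.size(), PoisonMaskElem);
//   for (unsigned I = 0; I < ExtMask.size(); ++I)
//     if (ExtMask[I] != PoisonMaskElem)
//       NewMask[I] = CommonMask[ExtMask[I]];
//   for (int V : NewMask)
//     std::printf("%d ", V); // prints 3 3 -1 2
// }
// \endcode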
18309
18310Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18311 return vectorizeTree(getOperandEntry(E, NodeIdx));
18312}
18313
18314template <typename BVTy, typename ResTy, typename... Args>
18315ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18316 Args &...Params) {
18317 assert(E->isGather() && "Expected gather node.");
18318 unsigned VF = E->getVectorFactor();
18319
18320 bool NeedFreeze = false;
18321 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18322 // Clear values, to be replaced by insertvector instructions.
18323 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18324 for_each(MutableArrayRef(GatheredScalars)
18325 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18326 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18327 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18328 E->CombinedEntriesWithIndices.size());
18329 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18330 [&](const auto &P) {
18331 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18332 });
18333 // Build a mask out of the reorder indices and reorder scalars per this
18334 // mask.
18335 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18336 E->ReorderIndices.end());
18337 if (!ReorderMask.empty())
18338 reorderScalars(GatheredScalars, ReorderMask);
18339 SmallVector<int> SubVectorsMask;
18340 inversePermutation(E->ReorderIndices, SubVectorsMask);
18341 // Transform non-clustered elements in the mask to poison (-1).
18342 // "Clustered" operations will be reordered using this mask later.
18343 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18344 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18345 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18346 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18347 } else {
18348 SubVectorsMask.clear();
18349 }
18350 SmallVector<Value *> StoredGS(GatheredScalars);
18351 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18352 unsigned I, unsigned SliceSize,
18353 bool IsNotPoisonous) {
18354 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18355 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18356 }))
18357 return false;
18358 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18359 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18360 if (UserTE->getNumOperands() != 2)
18361 return false;
18362 if (!IsNotPoisonous) {
18363 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18364 [=](const std::unique_ptr<TreeEntry> &TE) {
18365 return TE->UserTreeIndex.UserTE == UserTE &&
18366 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18367 });
18368 if (It == VectorizableTree.end())
18369 return false;
18370 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18371 if (!(*It)->ReorderIndices.empty()) {
18372 inversePermutation((*It)->ReorderIndices, ReorderMask);
18373 reorderScalars(GS, ReorderMask);
18374 }
18375 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18376 Value *V0 = std::get<0>(P);
18377 Value *V1 = std::get<1>(P);
18378 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18379 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18380 is_contained(E->Scalars, V1));
18381 }))
18382 return false;
18383 }
18384 int Idx;
18385 if ((Mask.size() < InputVF &&
18386 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18387 Idx == 0) ||
18388 (Mask.size() == InputVF &&
18389 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18390 std::iota(
18391 std::next(Mask.begin(), I * SliceSize),
18392 std::next(Mask.begin(),
18393 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18394 0);
18395 } else {
18396 unsigned IVal =
18397 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18398 std::fill(
18399 std::next(Mask.begin(), I * SliceSize),
18400 std::next(Mask.begin(),
18401 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18402 IVal);
18403 }
18404 return true;
18405 };
18406 BVTy ShuffleBuilder(ScalarTy, Params...);
18407 ResTy Res = ResTy();
18408 SmallVector<int> Mask;
18409 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18410 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18411 Value *ExtractVecBase = nullptr;
18412 bool UseVecBaseAsInput = false;
18413 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18414 SmallVector<SmallVector<const TreeEntry *>> Entries;
18415 Type *OrigScalarTy = GatheredScalars.front()->getType();
18416 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18417 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18418 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18419 // Check for gathered extracts.
18420 bool Resized = false;
18421 ExtractShuffles =
18422 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18423 if (!ExtractShuffles.empty()) {
18424 SmallVector<const TreeEntry *> ExtractEntries;
18425 for (auto [Idx, I] : enumerate(ExtractMask)) {
18426 if (I == PoisonMaskElem)
18427 continue;
18428 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18429 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18430 !TEs.empty())
18431 ExtractEntries.append(TEs.begin(), TEs.end());
18432 }
18433 if (std::optional<ResTy> Delayed =
18434 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18435 // Delay emission of gathers which are not ready yet.
18436 PostponedGathers.insert(E);
18437 // Postpone gather emission, will be emitted after the end of the
18438 // process to keep correct order.
18439 return *Delayed;
18440 }
18441 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18442 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18443 ExtractVecBase = VecBase;
18444 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18445 if (VF == VecBaseTy->getNumElements() &&
18446 GatheredScalars.size() != VF) {
18447 Resized = true;
18448 GatheredScalars.append(VF - GatheredScalars.size(),
18449 PoisonValue::get(OrigScalarTy));
18450 NumParts =
18451 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18452 }
18453 }
18454 }
18455 // Gather extracts only after we check for fully matched gathers.
18456 if (!ExtractShuffles.empty() || !E->hasState() ||
18457 E->getOpcode() != Instruction::Load ||
18458 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18459 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18460 any_of(E->Scalars,
18461 [this](Value *V) {
18462 return isa<LoadInst>(V) && isVectorized(V);
18463 })) ||
18464 (E->hasState() && E->isAltShuffle()) ||
18465 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18466 isSplat(E->Scalars) ||
18467 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18468 GatherShuffles =
18469 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18470 }
18471 if (!GatherShuffles.empty()) {
18472 if (std::optional<ResTy> Delayed =
18473 ShuffleBuilder.needToDelay(E, Entries)) {
18474 // Delay emission of gathers which are not ready yet.
18475 PostponedGathers.insert(E);
18476 // Postpone gather emission, will be emitted after the end of the
18477 // process to keep correct order.
18478 return *Delayed;
18479 }
18480 if (GatherShuffles.size() == 1 &&
18481 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18482 Entries.front().front()->isSame(E->Scalars)) {
18483 // Perfect match in the graph, will reuse the previously vectorized
18484 // node. Cost is 0.
18485 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18486 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18487 // Restore the mask for previous partially matched values.
18488 Mask.resize(E->Scalars.size());
18489 const TreeEntry *FrontTE = Entries.front().front();
18490 if (FrontTE->ReorderIndices.empty() &&
18491 ((FrontTE->ReuseShuffleIndices.empty() &&
18492 E->Scalars.size() == FrontTE->Scalars.size()) ||
18493 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18494 std::iota(Mask.begin(), Mask.end(), 0);
18495 } else {
18496 for (auto [I, V] : enumerate(E->Scalars)) {
18497 if (isa<PoisonValue>(V)) {
18498 Mask[I] = PoisonMaskElem;
18499 continue;
18500 }
18501 Mask[I] = FrontTE->findLaneForValue(V);
18502 }
18503 }
18504 // Reset the builder(s) to correctly handle perfect diamond matched
18505 // nodes.
18506 ShuffleBuilder.resetForSameNode();
18507 ShuffleBuilder.add(*FrontTE, Mask);
18508 // Fully matched entry found, no need to insert subvectors.
18509 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18510 return Res;
18511 }
18512 if (!Resized) {
18513 if (GatheredScalars.size() != VF &&
18514 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18515 return any_of(TEs, [&](const TreeEntry *TE) {
18516 return TE->getVectorFactor() == VF;
18517 });
18518 }))
18519 GatheredScalars.append(VF - GatheredScalars.size(),
18520 PoisonValue::get(OrigScalarTy));
18521 }
18522 // Remove shuffled elements from list of gathers.
18523 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18524 if (Mask[I] != PoisonMaskElem)
18525 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18526 }
18527 }
18528 }
18529 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18530 SmallVectorImpl<int> &ReuseMask,
18531 bool IsRootPoison) {
18532 // For splats we can emit broadcasts instead of gathers, so try to find
18533 // such sequences.
18534 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18535 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18536 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18537 SmallVector<int> UndefPos;
18538 DenseMap<Value *, unsigned> UniquePositions;
18539 // Gather unique non-const values and all constant values.
18540 // For repeated values, just shuffle them.
18541 int NumNonConsts = 0;
18542 int SinglePos = 0;
18543 for (auto [I, V] : enumerate(Scalars)) {
18544 if (isa<UndefValue>(V)) {
18545 if (!isa<PoisonValue>(V)) {
18546 ReuseMask[I] = I;
18547 UndefPos.push_back(I);
18548 }
18549 continue;
18550 }
18551 if (isConstant(V)) {
18552 ReuseMask[I] = I;
18553 continue;
18554 }
18555 ++NumNonConsts;
18556 SinglePos = I;
18557 Value *OrigV = V;
18558 Scalars[I] = PoisonValue::get(OrigScalarTy);
18559 if (IsSplat) {
18560 Scalars.front() = OrigV;
18561 ReuseMask[I] = 0;
18562 } else {
18563 const auto Res = UniquePositions.try_emplace(OrigV, I);
18564 Scalars[Res.first->second] = OrigV;
18565 ReuseMask[I] = Res.first->second;
18566 }
18567 }
18568 if (NumNonConsts == 1) {
18569 // Restore single insert element.
18570 if (IsSplat) {
18571 ReuseMask.assign(VF, PoisonMaskElem);
18572 std::swap(Scalars.front(), Scalars[SinglePos]);
18573 if (!UndefPos.empty() && UndefPos.front() == 0)
18574 Scalars.front() = UndefValue::get(OrigScalarTy);
18575 }
18576 ReuseMask[SinglePos] = SinglePos;
18577 } else if (!UndefPos.empty() && IsSplat) {
18578 // For undef values, try to replace them with the simple broadcast.
18579 // We can do it if the broadcasted value is guaranteed to be
18580 // non-poisonous, or by freezing the incoming scalar value first.
18581 auto *It = find_if(Scalars, [this, E](Value *V) {
18582 return !isa<UndefValue>(V) &&
18583 (isGuaranteedNotToBePoison(V, AC) ||
18584 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18585 // Check if the value is already used in the same operation in
18586 // one of the nodes.
18587 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18588 is_contained(E->UserTreeIndex.UserTE->Scalars,
18589 U.getUser());
18590 })));
18591 });
18592 if (It != Scalars.end()) {
18593 // Replace undefs by the non-poisoned scalars and emit broadcast.
18594 int Pos = std::distance(Scalars.begin(), It);
18595 for (int I : UndefPos) {
18596 // Set the undef position to the non-poisoned scalar.
18597 ReuseMask[I] = Pos;
18598 // Replace the undef with poison; in the mask it is already replaced by
18599 // the non-poisoned scalar.
18600 if (I != Pos)
18601 Scalars[I] = PoisonValue::get(OrigScalarTy);
18602 }
18603 } else {
18604 // Replace undefs with poison, emit a broadcast and then emit a
18605 // freeze.
18606 for (int I : UndefPos) {
18607 ReuseMask[I] = PoisonMaskElem;
18608 if (isa<UndefValue>(Scalars[I]))
18609 Scalars[I] = PoisonValue::get(OrigScalarTy);
18610 }
18611 NeedFreeze = true;
18612 }
18613 }
18614 };
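// A hypothetical standalone sketch of the de-duplication done by
// TryPackScalars above (constants and undefs left out): repeated scalars keep
// a single copy in the build vector and the reuse mask re-reads its lane.
// \code
// #include <cstdio>
// #include <map>
// #include <vector>
//
// int main() {
//   std::vector<char> Scalars = {'a', 'b', 'a', 'c'}; // example scalars
//   std::vector<int> ReuseMask(Scalars.size(), -1);
//   std::map<char, unsigned> UniquePositions;
//   for (unsigned I = 0; I < Scalars.size(); ++I) {
//     auto Res = UniquePositions.try_emplace(Scalars[I], I);
//     if (!Res.second)
//       Scalars[I] = 0; // duplicate lane: placeholder (poison in the real code)
//     ReuseMask[I] = Res.first->second;
//   }
//   for (int V : ReuseMask)
//     std::printf("%d ", V); // prints 0 1 0 3
// }
// \endcode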
18615 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18616 bool IsNonPoisoned = true;
18617 bool IsUsedInExpr = true;
18618 Value *Vec1 = nullptr;
18619 if (!ExtractShuffles.empty()) {
18620 // A gather of extractelements can be represented as just a shuffle of
18621 // the one or two vectors the scalars are extracted from.
18622 // Find input vectors.
18623 Value *Vec2 = nullptr;
18624 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18625 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18626 ExtractMask[I] = PoisonMaskElem;
18627 }
18628 if (UseVecBaseAsInput) {
18629 Vec1 = ExtractVecBase;
18630 } else {
18631 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18632 if (ExtractMask[I] == PoisonMaskElem)
18633 continue;
18634 if (isa<UndefValue>(StoredGS[I]))
18635 continue;
18636 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18637 Value *VecOp = EI->getVectorOperand();
18638 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18639 !TEs.empty() && TEs.front()->VectorizedValue)
18640 VecOp = TEs.front()->VectorizedValue;
18641 if (!Vec1) {
18642 Vec1 = VecOp;
18643 } else if (Vec1 != VecOp) {
18644 assert((!Vec2 || Vec2 == VecOp) &&
18645 "Expected only 1 or 2 vectors shuffle.");
18646 Vec2 = VecOp;
18647 }
18648 }
18649 }
18650 if (Vec2) {
18651 IsUsedInExpr = false;
18652 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18653 isGuaranteedNotToBePoison(Vec2, AC);
18654 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18655 } else if (Vec1) {
18656 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18657 IsUsedInExpr &= FindReusedSplat(
18658 ExtractMask,
18659 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18660 ExtractMask.size(), IsNotPoisonedVec);
18661 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18662 IsNonPoisoned &= IsNotPoisonedVec;
18663 } else {
18664 IsUsedInExpr = false;
18665 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18666 /*ForExtracts=*/true);
18667 }
18668 }
18669 if (!GatherShuffles.empty()) {
18670 unsigned SliceSize =
18671 getPartNumElems(E->Scalars.size(),
18672 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18673 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18674 for (const auto [I, TEs] : enumerate(Entries)) {
18675 if (TEs.empty()) {
18676 assert(!GatherShuffles[I] &&
18677 "No shuffles with empty entries list expected.");
18678 continue;
18679 }
18680 assert((TEs.size() == 1 || TEs.size() == 2) &&
18681 "Expected shuffle of 1 or 2 entries.");
18682 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18683 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18684 VecMask.assign(VecMask.size(), PoisonMaskElem);
18685 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18686 if (TEs.size() == 1) {
18687 bool IsNotPoisonedVec =
18688 TEs.front()->VectorizedValue
18689 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18690 : true;
18691 IsUsedInExpr &=
18692 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18693 SliceSize, IsNotPoisonedVec);
18694 ShuffleBuilder.add(*TEs.front(), VecMask);
18695 IsNonPoisoned &= IsNotPoisonedVec;
18696 } else {
18697 IsUsedInExpr = false;
18698 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18699 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18700 IsNonPoisoned &=
18701 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18702 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18703 }
18704 }
18705 }
18706 // Try to figure out the best way to combine values: build a shuffle and insert
18707 // elements or just build several shuffles.
18708 // Insert non-constant scalars.
18709 SmallVector<Value *> NonConstants(GatheredScalars);
18710 int EMSz = ExtractMask.size();
18711 int MSz = Mask.size();
18712 // Try to build a constant vector and shuffle with it only if we currently
18713 // have a single permutation and more than 1 scalar constant.
18714 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18715 bool IsIdentityShuffle =
18716 ((UseVecBaseAsInput ||
18717 all_of(ExtractShuffles,
18718 [](const std::optional<TTI::ShuffleKind> &SK) {
18719 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18720 TTI::SK_PermuteSingleSrc;
18721 })) &&
18722 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18723 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18724 (!GatherShuffles.empty() &&
18725 all_of(GatherShuffles,
18726 [](const std::optional<TTI::ShuffleKind> &SK) {
18727 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18728 TTI::SK_PermuteSingleSrc;
18729 }) &&
18730 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18731 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18732 bool EnoughConstsForShuffle =
18733 IsSingleShuffle &&
18734 (none_of(GatheredScalars,
18735 [](Value *V) {
18736 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18737 }) ||
18738 any_of(GatheredScalars,
18739 [](Value *V) {
18740 return isa<Constant>(V) && !isa<UndefValue>(V);
18741 })) &&
18742 (!IsIdentityShuffle ||
18743 (GatheredScalars.size() == 2 &&
18744 any_of(GatheredScalars,
18745 [](Value *V) { return !isa<UndefValue>(V); })) ||
18746 count_if(GatheredScalars, [](Value *V) {
18747 return isa<Constant>(V) && !isa<PoisonValue>(V);
18748 }) > 1);
18749 // The NonConstants array contains just non-constant values, GatheredScalars
18750 // contains only constants used to build the final vector, which is then shuffled.
18751 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18752 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18753 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18754 else
18755 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18756 }
18757 // Generate constants for final shuffle and build a mask for them.
18758 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18759 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18760 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18761 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18762 ShuffleBuilder.add(BV, BVMask);
18763 }
18764 if (all_of(NonConstants, [=](Value *V) {
18765 return isa<PoisonValue>(V) ||
18766 (IsSingleShuffle && ((IsIdentityShuffle &&
18767 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18768 }))
18769 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18770 SubVectorsMask);
18771 else
18772 Res = ShuffleBuilder.finalize(
18773 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18774 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18775 bool IsSplat = isSplat(NonConstants);
18776 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18777 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18778 auto CheckIfSplatIsProfitable = [&]() {
18779 // Estimate the cost of splatting + shuffle and compare with
18780 // insert + shuffle.
18781 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18782 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18783 if (isa<ExtractElementInst>(V) || isVectorized(V))
18784 return false;
18785 InstructionCost SplatCost = TTI->getVectorInstrCost(
18786 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18787 PoisonValue::get(VecTy), V);
18788 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18789 for (auto [Idx, I] : enumerate(BVMask))
18790 if (I != PoisonMaskElem)
18791 NewMask[Idx] = Mask.size();
18792 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18793 NewMask, CostKind);
18794 InstructionCost BVCost = TTI->getVectorInstrCost(
18795 Instruction::InsertElement, VecTy, CostKind,
18796 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18797 Vec, V);
18798 // Shuffle required?
18799 if (count(BVMask, PoisonMaskElem) <
18800 static_cast<int>(BVMask.size() - 1)) {
18801 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18802 for (auto [Idx, I] : enumerate(BVMask))
18803 if (I != PoisonMaskElem)
18804 NewMask[Idx] = I;
18805 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18806 VecTy, NewMask, CostKind);
18807 }
18808 return SplatCost <= BVCost;
18809 };
18810 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18811 for (auto [Idx, I] : enumerate(BVMask))
18812 if (I != PoisonMaskElem)
18813 Mask[Idx] = I;
18814 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18815 } else {
18816 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18817 SmallVector<Value *> Values(NonConstants.size(),
18818 PoisonValue::get(ScalarTy));
18819 Values[0] = V;
18820 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18821 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18822 transform(BVMask, SplatMask.begin(), [](int I) {
18823 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18824 });
18825 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18826 BV = CreateShuffle(BV, nullptr, SplatMask);
18827 for (auto [Idx, I] : enumerate(BVMask))
18828 if (I != PoisonMaskElem)
18829 Mask[Idx] = BVMask.size() + Idx;
18830 Vec = CreateShuffle(Vec, BV, Mask);
18831 for (auto [Idx, I] : enumerate(Mask))
18832 if (I != PoisonMaskElem)
18833 Mask[Idx] = Idx;
18834 }
18835 });
18836 } else if (!allConstant(GatheredScalars)) {
18837 // Gather unique scalars and all constants.
18838 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18839 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18840 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18841 ShuffleBuilder.add(BV, ReuseMask);
18842 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18843 SubVectorsMask);
18844 } else {
18845 // Gather all constants.
18846 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18847 for (auto [I, V] : enumerate(GatheredScalars)) {
18848 if (!isa<PoisonValue>(V))
18849 Mask[I] = I;
18850 }
18851 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18852 ShuffleBuilder.add(BV, Mask);
18853 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18854 SubVectorsMask);
18855 }
18856
18857 if (NeedFreeze)
18858 Res = ShuffleBuilder.createFreeze(Res);
18859 return Res;
18860}
18861
18862Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18863 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18864 (void)vectorizeTree(VectorizableTree[EIdx].get());
18865 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18866 Builder, *this);
18867}
18868
18869/// \returns \p I after propagating metadata from \p VL only for instructions in
18870/// \p VL.
18871static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18872 SmallVector<Value *> Insts;
18873 for (Value *V : VL)
18874 if (isa<Instruction>(V))
18875 Insts.push_back(V);
18876 return llvm::propagateMetadata(Inst, Insts);
18877}
18878
18879static DebugLoc getDebugLocFromPHI(PHINode &PN) {
18880 if (DebugLoc DL = PN.getDebugLoc())
18881 return DL;
18882 return DebugLoc::getUnknown();
18883}
18884
18885Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
18886 IRBuilderBase::InsertPointGuard Guard(Builder);
18887
18888 Value *V = E->Scalars.front();
18889 Type *ScalarTy = V->getType();
18890 if (!isa<CmpInst>(V))
18891 ScalarTy = getValueType(V);
18892 auto It = MinBWs.find(E);
18893 if (It != MinBWs.end()) {
18894 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18895 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
18896 if (VecTy)
18897 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
18898 }
18899 if (E->VectorizedValue)
18900 return E->VectorizedValue;
18901 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
18902 if (E->isGather()) {
18903 // Set insert point for non-reduction initial nodes.
18904 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18905 setInsertPointAfterBundle(E);
18906 Value *Vec = createBuildVector(E, ScalarTy);
18907 E->VectorizedValue = Vec;
18908 return Vec;
18909 }
18910 if (E->State == TreeEntry::SplitVectorize) {
18911 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18912 "Expected exactly 2 combined entries.");
18913 setInsertPointAfterBundle(E);
18914 TreeEntry &OpTE1 =
18915 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18916 assert(OpTE1.isSame(
18917 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18918 "Expected same first part of scalars.");
18919 Value *Op1 = vectorizeTree(&OpTE1);
18920 TreeEntry &OpTE2 =
18921 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18922 assert(
18923 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18924 "Expected same second part of scalars.");
18925 Value *Op2 = vectorizeTree(&OpTE2);
18926 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
18927 bool IsSigned = false;
18928 auto It = MinBWs.find(OpE);
18929 if (It != MinBWs.end())
18930 IsSigned = It->second.second;
18931 else
18932 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18933 if (isa<PoisonValue>(R))
18934 return false;
18935 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18936 });
18937 return IsSigned;
18938 };
18939 if (cast<VectorType>(Op1->getType())->getElementType() !=
18940 ScalarTy->getScalarType()) {
18941 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18942 Op1 = Builder.CreateIntCast(
18943 Op1,
18944 getWidenedType(
18945 ScalarTy,
18946 cast<FixedVectorType>(Op1->getType())->getNumElements()),
18947 GetOperandSignedness(&OpTE1));
18948 }
18949 if (cast<VectorType>(Op2->getType())->getElementType() !=
18950 ScalarTy->getScalarType()) {
18951 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18952 Op2 = Builder.CreateIntCast(
18953 Op2,
18954 getWidenedType(
18955 ScalarTy,
18956 cast<FixedVectorType>(Op2->getType())->getNumElements()),
18957 GetOperandSignedness(&OpTE2));
18958 }
18959 if (E->ReorderIndices.empty()) {
18960 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
18961 std::iota(
18962 Mask.begin(),
18963 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
18964 0);
18965 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18966 if (ScalarTyNumElements != 1) {
18967 assert(SLPReVec && "Only supported by REVEC.");
18968 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
18969 }
18970 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
18971 Vec = createInsertVector(Builder, Vec, Op2,
18972 E->CombinedEntriesWithIndices.back().second *
18973 ScalarTyNumElements);
18974 E->VectorizedValue = Vec;
18975 return Vec;
18976 }
18977 unsigned CommonVF =
18978 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18979 if (getNumElements(Op1->getType()) != CommonVF) {
18980 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18981 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
18982 0);
18983 Op1 = Builder.CreateShuffleVector(Op1, Mask);
18984 }
18985 if (getNumElements(Op2->getType()) != CommonVF) {
18986 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18987 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
18988 0);
18989 Op2 = Builder.CreateShuffleVector(Op2, Mask);
18990 }
18991 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
18992 E->VectorizedValue = Vec;
18993 return Vec;
18994 }
18995
18996 bool IsReverseOrder =
18997 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
18998 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
18999 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19000 if (E->getOpcode() == Instruction::Store &&
19001 E->State == TreeEntry::Vectorize) {
19002 ArrayRef<int> Mask =
19003 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19004 E->ReorderIndices.size());
19005 ShuffleBuilder.add(V, Mask);
19006 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19007 E->State == TreeEntry::CompressVectorize) {
19008 ShuffleBuilder.addOrdered(V, {});
19009 } else {
19010 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19011 }
19012 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19013 E->CombinedEntriesWithIndices.size());
19014 transform(
19015 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19016 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19017 });
19018 assert(
19019 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19020 "Expected either combined subnodes or reordering");
19021 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19022 };
19023
19024 assert(!E->isGather() && "Unhandled state");
19025 unsigned ShuffleOrOp =
19026 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19027 Instruction *VL0 = E->getMainOp();
19028 auto GetOperandSignedness = [&](unsigned Idx) {
19029 const TreeEntry *OpE = getOperandEntry(E, Idx);
19030 bool IsSigned = false;
19031 auto It = MinBWs.find(OpE);
19032 if (It != MinBWs.end())
19033 IsSigned = It->second.second;
19034 else
19035 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19036 if (isa<PoisonValue>(R))
19037 return false;
19038 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19039 });
19040 return IsSigned;
19041 };
19042 switch (ShuffleOrOp) {
19043 case Instruction::PHI: {
19044 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19045 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19046 "PHI reordering is free.");
19047 auto *PH = cast<PHINode>(VL0);
19048 Builder.SetInsertPoint(PH->getParent(),
19049 PH->getParent()->getFirstNonPHIIt());
19050 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19051 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19052 Value *V = NewPhi;
19053
19054 // Adjust the insertion point once all PHIs have been generated.
19055 Builder.SetInsertPoint(PH->getParent(),
19056 PH->getParent()->getFirstInsertionPt());
19057 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19058
19059 V = FinalShuffle(V, E);
19060
19061 E->VectorizedValue = V;
19062 // If the phi node is fully emitted, exit.
19063 if (NewPhi->getNumIncomingValues() != 0)
19064 return NewPhi;
19065
19066 // PHINodes may have multiple entries from the same block. We want to
19067 // visit every block once.
19068 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19069
19070 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19071 BasicBlock *IBB = PH->getIncomingBlock(I);
19072
19073 // Stop emission if all incoming values are generated.
19074 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19075 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19076 return NewPhi;
19077 }
19078
19079 if (!VisitedBBs.insert(IBB).second) {
19080 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19081 NewPhi->addIncoming(VecOp, IBB);
19082 TreeEntry *OpTE = getOperandEntry(E, I);
19083 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19084 OpTE->VectorizedValue = VecOp;
19085 continue;
19086 }
19087
19088 Builder.SetInsertPoint(IBB->getTerminator());
19089 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19090 Value *Vec = vectorizeOperand(E, I);
19091 if (VecTy != Vec->getType()) {
19092 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19093 MinBWs.contains(getOperandEntry(E, I))) &&
19094 "Expected item in MinBWs.");
19095 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19096 }
19097 NewPhi->addIncoming(Vec, IBB);
19098 }
19099
19100 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19101 "Invalid number of incoming values");
19102 assert(E->VectorizedValue && "Expected vectorized value.");
19103 return E->VectorizedValue;
19104 }
19105
19106 case Instruction::ExtractElement: {
19107 Value *V = E->getSingleOperand(0);
19108 setInsertPointAfterBundle(E);
19109 V = FinalShuffle(V, E);
19110 E->VectorizedValue = V;
19111 return V;
19112 }
19113 case Instruction::ExtractValue: {
19114 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19115 Builder.SetInsertPoint(LI);
19116 Value *Ptr = LI->getPointerOperand();
19117 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19118 Value *NewV = ::propagateMetadata(V, E->Scalars);
19119 NewV = FinalShuffle(NewV, E);
19120 E->VectorizedValue = NewV;
19121 return NewV;
19122 }
19123 case Instruction::InsertElement: {
19124 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19125 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19126 Value *V = vectorizeOperand(E, 1);
19127 ArrayRef<Value *> Op = E->getOperand(1);
19128 Type *ScalarTy = Op.front()->getType();
19129 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19130 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19131 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19132 assert(Res.first > 0 && "Expected item in MinBWs.");
19133 V = Builder.CreateIntCast(
19134 V,
19135 getWidenedType(
19136 ScalarTy,
19137 cast<FixedVectorType>(V->getType())->getNumElements()),
19138 Res.second);
19139 }
19140
19141 // Create InsertVector shuffle if necessary
19142 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19143 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19144 }));
19145 const unsigned NumElts =
19146 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19147 const unsigned NumScalars = E->Scalars.size();
19148
19149 unsigned Offset = *getElementIndex(VL0);
19150 assert(Offset < NumElts && "Failed to find vector index offset");
19151
19152 // Create shuffle to resize vector
19153 SmallVector<int> Mask;
19154 if (!E->ReorderIndices.empty()) {
19155 inversePermutation(E->ReorderIndices, Mask);
19156 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19157 } else {
19158 Mask.assign(NumElts, PoisonMaskElem);
19159 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19160 }
19161 // Create InsertVector shuffle if necessary
19162 bool IsIdentity = true;
19163 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19164 Mask.swap(PrevMask);
19165 for (unsigned I = 0; I < NumScalars; ++I) {
19166 Value *Scalar = E->Scalars[PrevMask[I]];
19167 unsigned InsertIdx = *getElementIndex(Scalar);
19168 IsIdentity &= InsertIdx - Offset == I;
19169 Mask[InsertIdx - Offset] = I;
19170 }
19171 if (!IsIdentity || NumElts != NumScalars) {
19172 Value *V2 = nullptr;
19173 bool IsVNonPoisonous =
19174 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
19175 SmallVector<int> InsertMask(Mask);
19176 if (NumElts != NumScalars && Offset == 0) {
19177 // Follow all insert element instructions from the current buildvector
19178 // sequence.
19179 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19180 do {
19181 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19182 if (!InsertIdx)
19183 break;
19184 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19185 InsertMask[*InsertIdx] = *InsertIdx;
19186 if (!Ins->hasOneUse())
19187 break;
19188 Ins = dyn_cast_or_null<InsertElementInst>(
19189 Ins->getUniqueUndroppableUser());
19190 } while (Ins);
19191 SmallBitVector UseMask =
19192 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19193 SmallBitVector IsFirstPoison =
19194 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19195 SmallBitVector IsFirstUndef =
19196 isUndefVector(FirstInsert->getOperand(0), UseMask);
19197 if (!IsFirstPoison.all()) {
19198 unsigned Idx = 0;
19199 for (unsigned I = 0; I < NumElts; I++) {
19200 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19201 IsFirstUndef.test(I)) {
19202 if (IsVNonPoisonous) {
19203 InsertMask[I] = I < NumScalars ? I : 0;
19204 continue;
19205 }
19206 if (!V2)
19207 V2 = UndefValue::get(V->getType());
19208 if (Idx >= NumScalars)
19209 Idx = NumScalars - 1;
19210 InsertMask[I] = NumScalars + Idx;
19211 ++Idx;
19212 } else if (InsertMask[I] != PoisonMaskElem &&
19213 Mask[I] == PoisonMaskElem) {
19214 InsertMask[I] = PoisonMaskElem;
19215 }
19216 }
19217 } else {
19218 InsertMask = Mask;
19219 }
19220 }
19221 if (!V2)
19222 V2 = PoisonValue::get(V->getType());
19223 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19224 if (auto *I = dyn_cast<Instruction>(V)) {
19225 GatherShuffleExtractSeq.insert(I);
19226 CSEBlocks.insert(I->getParent());
19227 }
19228 }
19229
19230 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19231 for (unsigned I = 0; I < NumElts; I++) {
19232 if (Mask[I] != PoisonMaskElem)
19233 InsertMask[Offset + I] = I;
19234 }
19235 SmallBitVector UseMask =
19236 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19237 SmallBitVector IsFirstUndef =
19238 isUndefVector(FirstInsert->getOperand(0), UseMask);
19239 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19240 NumElts != NumScalars) {
19241 if (IsFirstUndef.all()) {
19242 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19243 SmallBitVector IsFirstPoison =
19244 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19245 if (!IsFirstPoison.all()) {
19246 for (unsigned I = 0; I < NumElts; I++) {
19247 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19248 InsertMask[I] = I + NumElts;
19249 }
19250 }
19251 V = Builder.CreateShuffleVector(
19252 V,
19253 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19254 : FirstInsert->getOperand(0),
19255 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19256 if (auto *I = dyn_cast<Instruction>(V)) {
19257 GatherShuffleExtractSeq.insert(I);
19258 CSEBlocks.insert(I->getParent());
19259 }
19260 }
19261 } else {
19262 SmallBitVector IsFirstPoison =
19263 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19264 for (unsigned I = 0; I < NumElts; I++) {
19265 if (InsertMask[I] == PoisonMaskElem)
19266 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19267 else
19268 InsertMask[I] += NumElts;
19269 }
19270 V = Builder.CreateShuffleVector(
19271 FirstInsert->getOperand(0), V, InsertMask,
19272 cast<Instruction>(E->Scalars.back())->getName());
19273 if (auto *I = dyn_cast<Instruction>(V)) {
19274 GatherShuffleExtractSeq.insert(I);
19275 CSEBlocks.insert(I->getParent());
19276 }
19277 }
19278 }
19279
19280 ++NumVectorInstructions;
19281 E->VectorizedValue = V;
19282 return V;
19283 }
19284 case Instruction::ZExt:
19285 case Instruction::SExt:
19286 case Instruction::FPToUI:
19287 case Instruction::FPToSI:
19288 case Instruction::FPExt:
19289 case Instruction::PtrToInt:
19290 case Instruction::IntToPtr:
19291 case Instruction::SIToFP:
19292 case Instruction::UIToFP:
19293 case Instruction::Trunc:
19294 case Instruction::FPTrunc:
19295 case Instruction::BitCast: {
19296 setInsertPointAfterBundle(E);
19297
19298 Value *InVec = vectorizeOperand(E, 0);
19299
19300 auto *CI = cast<CastInst>(VL0);
19301 Instruction::CastOps VecOpcode = CI->getOpcode();
19302 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19303 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19304 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19305 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19306 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19307 // Check if the values are candidates to demote.
19308 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19309 if (SrcIt != MinBWs.end())
19310 SrcBWSz = SrcIt->second.first;
19311 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19312 if (BWSz == SrcBWSz) {
19313 VecOpcode = Instruction::BitCast;
19314 } else if (BWSz < SrcBWSz) {
19315 VecOpcode = Instruction::Trunc;
19316 } else if (It != MinBWs.end()) {
19317 assert(BWSz > SrcBWSz && "Invalid cast!");
19318 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19319 } else if (SrcIt != MinBWs.end()) {
19320 assert(BWSz > SrcBWSz && "Invalid cast!");
19321 VecOpcode =
19322 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19323 }
19324 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19325 !SrcIt->second.second) {
19326 VecOpcode = Instruction::UIToFP;
19327 }
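      // Net effect of the demotion logic above: equal source and destination
      // widths degenerate the cast to a bitcast (skipped below when that
      // differs from the original opcode), a narrower destination becomes a
      // trunc, a wider one becomes sext/zext based on the recorded
      // signedness, and an sitofp whose demoted source is known unsigned is
      // turned into uitofp.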
19328 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19329 ? InVec
19330 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19331 V = FinalShuffle(V, E);
19332
19333 E->VectorizedValue = V;
19334 ++NumVectorInstructions;
19335 return V;
19336 }
19337 case Instruction::FCmp:
19338 case Instruction::ICmp: {
19339 setInsertPointAfterBundle(E);
19340
19341 Value *L = vectorizeOperand(E, 0);
19342 Value *R = vectorizeOperand(E, 1);
19343 if (L->getType() != R->getType()) {
19344 assert((getOperandEntry(E, 0)->isGather() ||
19345 getOperandEntry(E, 1)->isGather() ||
19346 MinBWs.contains(getOperandEntry(E, 0)) ||
19347 MinBWs.contains(getOperandEntry(E, 1))) &&
19348 "Expected item in MinBWs.");
19349 if (cast<VectorType>(L->getType())
19350 ->getElementType()
19351 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19352 ->getElementType()
19353 ->getIntegerBitWidth()) {
19354 Type *CastTy = R->getType();
19355 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19356 } else {
19357 Type *CastTy = L->getType();
19358 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19359 }
19360 }
19361
19362 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19363 Value *V = Builder.CreateCmp(P0, L, R);
19364 propagateIRFlags(V, E->Scalars, VL0);
19365 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19366 ICmp->setSameSign(/*B=*/false);
19367 // Do not cast for cmps.
19368 VecTy = cast<FixedVectorType>(V->getType());
19369 V = FinalShuffle(V, E);
19370
19371 E->VectorizedValue = V;
19372 ++NumVectorInstructions;
19373 return V;
19374 }
19375 case Instruction::Select: {
19376 setInsertPointAfterBundle(E);
19377
19378 Value *Cond = vectorizeOperand(E, 0);
19379 Value *True = vectorizeOperand(E, 1);
19380 Value *False = vectorizeOperand(E, 2);
19381 if (True->getType() != VecTy || False->getType() != VecTy) {
19382 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19383 getOperandEntry(E, 2)->isGather() ||
19384 MinBWs.contains(getOperandEntry(E, 1)) ||
19385 MinBWs.contains(getOperandEntry(E, 2))) &&
19386 "Expected item in MinBWs.");
19387 if (True->getType() != VecTy)
19388 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19389 if (False->getType() != VecTy)
19390 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19391 }
19392
19393 unsigned CondNumElements = getNumElements(Cond->getType());
19394 unsigned TrueNumElements = getNumElements(True->getType());
19395 assert(TrueNumElements >= CondNumElements &&
19396 TrueNumElements % CondNumElements == 0 &&
19397 "Cannot vectorize Instruction::Select");
19398 assert(TrueNumElements == getNumElements(False->getType()) &&
19399 "Cannot vectorize Instruction::Select");
19400 if (CondNumElements != TrueNumElements) {
19401 // When the return type is i1 but the source is a fixed vector type, we
19402 // need to duplicate the condition value.
19403 Cond = Builder.CreateShuffleVector(
19404 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19405 CondNumElements));
19406 }
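      // E.g. a 2-element condition selecting between 6-element operands is
      // widened with a replicated mask (roughly <0, 0, 0, 1, 1, 1>), so each
      // condition lane drives all lanes of its corresponding sub-vector.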
19407 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19408 "Cannot vectorize Instruction::Select");
19409 Value *V = Builder.CreateSelect(Cond, True, False);
19410 V = FinalShuffle(V, E);
19411
19412 E->VectorizedValue = V;
19413 ++NumVectorInstructions;
19414 return V;
19415 }
19416 case Instruction::FNeg: {
19417 setInsertPointAfterBundle(E);
19418
19419 Value *Op = vectorizeOperand(E, 0);
19420
19421 Value *V = Builder.CreateUnOp(
19422 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19423 propagateIRFlags(V, E->Scalars, VL0);
19424 if (auto *I = dyn_cast<Instruction>(V))
19425 V = ::propagateMetadata(I, E->Scalars);
19426
19427 V = FinalShuffle(V, E);
19428
19429 E->VectorizedValue = V;
19430 ++NumVectorInstructions;
19431
19432 return V;
19433 }
19434 case Instruction::Freeze: {
19435 setInsertPointAfterBundle(E);
19436
19437 Value *Op = vectorizeOperand(E, 0);
19438
19439 if (Op->getType() != VecTy) {
19440 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19441 MinBWs.contains(getOperandEntry(E, 0))) &&
19442 "Expected item in MinBWs.");
19443 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19444 }
19445 Value *V = Builder.CreateFreeze(Op);
19446 V = FinalShuffle(V, E);
19447
19448 E->VectorizedValue = V;
19449 ++NumVectorInstructions;
19450
19451 return V;
19452 }
19453 case Instruction::Add:
19454 case Instruction::FAdd:
19455 case Instruction::Sub:
19456 case Instruction::FSub:
19457 case Instruction::Mul:
19458 case Instruction::FMul:
19459 case Instruction::UDiv:
19460 case Instruction::SDiv:
19461 case Instruction::FDiv:
19462 case Instruction::URem:
19463 case Instruction::SRem:
19464 case Instruction::FRem:
19465 case Instruction::Shl:
19466 case Instruction::LShr:
19467 case Instruction::AShr:
19468 case Instruction::And:
19469 case Instruction::Or:
19470 case Instruction::Xor: {
19471 setInsertPointAfterBundle(E);
19472
19473 Value *LHS = vectorizeOperand(E, 0);
19474 Value *RHS = vectorizeOperand(E, 1);
19475 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19476 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19477 ArrayRef<Value *> Ops = E->getOperand(I);
19478 if (all_of(Ops, [&](Value *Op) {
19479 auto *CI = dyn_cast<ConstantInt>(Op);
19480 return CI && CI->getValue().countr_one() >= It->second.first;
19481 })) {
19482 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
19483 E->VectorizedValue = V;
19484 ++NumVectorInstructions;
19485 return V;
19486 }
19487 }
19488 }
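      // The early exit above catches masks that are no-ops after bitwidth
      // minimization: if one operand is all constants with at least
      // It->second.first trailing ones (e.g. `and i32 %x, 255` minimized to
      // i8), the `and` cannot change the demoted value, so the other operand
      // is reused directly.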
19489 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19490 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19491 getOperandEntry(E, 1)->isGather() ||
19492 MinBWs.contains(getOperandEntry(E, 0)) ||
19493 MinBWs.contains(getOperandEntry(E, 1))) &&
19494 "Expected item in MinBWs.");
19495 if (LHS->getType() != VecTy)
19496 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19497 if (RHS->getType() != VecTy)
19498 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19499 }
19500
19501 Value *V = Builder.CreateBinOp(
19502 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19503 RHS);
19504 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19505 if (auto *I = dyn_cast<Instruction>(V)) {
19506 V = ::propagateMetadata(I, E->Scalars);
19507 // Drop nuw flags for abs(sub(commutative), true).
19508 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19509 any_of(E->Scalars, [](Value *V) {
19510 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19511 }))
19512 I->setHasNoUnsignedWrap(/*b=*/false);
19513 }
19514
19515 V = FinalShuffle(V, E);
19516
19517 E->VectorizedValue = V;
19518 ++NumVectorInstructions;
19519
19520 return V;
19521 }
19522 case Instruction::Load: {
19523 // Loads are inserted at the head of the tree because we don't want to
19524 // sink them all the way down past store instructions.
19525 setInsertPointAfterBundle(E);
19526
19527 LoadInst *LI = cast<LoadInst>(VL0);
19528 Instruction *NewLI;
19529 FixedVectorType *StridedLoadTy = nullptr;
19530 Value *PO = LI->getPointerOperand();
19531 if (E->State == TreeEntry::Vectorize) {
19532 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19533 } else if (E->State == TreeEntry::CompressVectorize) {
19534 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19535 CompressEntryToData.at(E);
19536 Align CommonAlignment = LI->getAlign();
19537 if (IsMasked) {
19538 unsigned VF = getNumElements(LoadVecTy);
19539 SmallVector<Constant *> MaskValues(
19540 VF / getNumElements(LI->getType()),
19541 ConstantInt::getFalse(VecTy->getContext()));
19542 for (int I : CompressMask)
19543 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19544 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19545 assert(SLPReVec && "Only supported by REVEC.");
19546 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19547 }
19548 Constant *MaskValue = ConstantVector::get(MaskValues);
19549 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19550 MaskValue);
19551 } else {
19552 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19553 }
19554 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19555 // TODO: include this cost into CommonCost.
19556 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19557 assert(SLPReVec && "FixedVectorType is not expected.");
19558 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19559 CompressMask);
19560 }
19561 NewLI =
19562 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
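        // The wide (possibly masked) load reads a contiguous block of memory;
        // CompressMask then selects the lanes that correspond to the
        // vectorized scalars, e.g. CompressMask <0, 2, 5> picks elements 0, 2
        // and 5 of the loaded block, and MaskValues is true only at those
        // positions when the masked form is used.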
19563 } else if (E->State == TreeEntry::StridedVectorize) {
19564 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19565 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19566 PO = IsReverseOrder ? PtrN : Ptr0;
19567 Type *StrideTy = DL->getIndexType(PO->getType());
19568 Value *StrideVal;
19569 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19570 StridedLoadTy = SPtrInfo.Ty;
19571 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19572 unsigned StridedLoadEC =
19573 StridedLoadTy->getElementCount().getKnownMinValue();
19574
19575 Value *Stride = SPtrInfo.StrideVal;
19576 if (!Stride) {
19577 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19578 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19579 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19580 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19581 &*Builder.GetInsertPoint());
19582 }
19583 Value *NewStride =
19584 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19585 StrideVal = Builder.CreateMul(
19586 NewStride, ConstantInt::get(
19587 StrideTy, (IsReverseOrder ? -1 : 1) *
19588 static_cast<int>(
19589 DL->getTypeAllocSize(ScalarTy))));
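        // The stride handed to the strided-load intrinsic is in bytes: the
        // element stride is scaled by the allocation size of ScalarTy and
        // negated when the pointers are traversed in reverse order, e.g. an
        // element stride of 2 over 4-byte elements yields 8 (or -8 reversed).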
19590 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19591 auto *Inst = Builder.CreateIntrinsic(
19592 Intrinsic::experimental_vp_strided_load,
19593 {StridedLoadTy, PO->getType(), StrideTy},
19594 {PO, StrideVal,
19595 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19596 Builder.getInt32(StridedLoadEC)});
19597 Inst->addParamAttr(
19598 /*ArgNo=*/0,
19599 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19600 NewLI = Inst;
19601 } else {
19602 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19603 Value *VecPtr = vectorizeOperand(E, 0);
19604 if (isa<FixedVectorType>(ScalarTy)) {
19605 assert(SLPReVec && "FixedVectorType is not expected.");
19606 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
19607 // need to expand VecPtr if ScalarTy is a vector type.
19608 unsigned ScalarTyNumElements =
19609 cast<FixedVectorType>(ScalarTy)->getNumElements();
19610 unsigned VecTyNumElements =
19611 cast<FixedVectorType>(VecTy)->getNumElements();
19612 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19613 "Cannot expand getelementptr.");
19614 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19615 SmallVector<Constant *> Indices(VecTyNumElements);
19616 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19617 return Builder.getInt64(I % ScalarTyNumElements);
19618 });
19619 VecPtr = Builder.CreateGEP(
19620 VecTy->getElementType(),
19621 Builder.CreateShuffleVector(
19622 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19623 ConstantVector::get(Indices));
19624 }
19625 // Use the minimum alignment of the gathered loads.
19626 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19627 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19628 }
19629 Value *V = E->State == TreeEntry::CompressVectorize
19630 ? NewLI
19631 : ::propagateMetadata(NewLI, E->Scalars);
19632
19633 V = FinalShuffle(V, E);
19634 E->VectorizedValue = V;
19635 ++NumVectorInstructions;
19636 return V;
19637 }
19638 case Instruction::Store: {
19639 auto *SI = cast<StoreInst>(VL0);
19640
19641 setInsertPointAfterBundle(E);
19642
19643 Value *VecValue = vectorizeOperand(E, 0);
19644 if (VecValue->getType() != VecTy)
19645 VecValue =
19646 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19647 VecValue = FinalShuffle(VecValue, E);
19648
19649 Value *Ptr = SI->getPointerOperand();
19650 Instruction *ST;
19651 if (E->State == TreeEntry::Vectorize) {
19652 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19653 } else {
19654 assert(E->State == TreeEntry::StridedVectorize &&
19655 "Expected either strided or consecutive stores.");
19656 if (!E->ReorderIndices.empty()) {
19657 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19658 Ptr = SI->getPointerOperand();
19659 }
19660 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19661 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19662 auto *Inst = Builder.CreateIntrinsic(
19663 Intrinsic::experimental_vp_strided_store,
19664 {VecTy, Ptr->getType(), StrideTy},
19665 {VecValue, Ptr,
19666 ConstantInt::get(
19667 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19668 Builder.getAllOnesMask(VecTy->getElementCount()),
19669 Builder.getInt32(E->Scalars.size())});
19670 Inst->addParamAttr(
19671 /*ArgNo=*/1,
19672 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19673 ST = Inst;
19674 }
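      // In the strided-store path the stride is the constant -sizeof(ScalarTy)
      // bytes, i.e. the lanes of VecValue are written at consecutive,
      // decreasing addresses starting from the (possibly reordered) base
      // pointer.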
19675
19676 Value *V = ::propagateMetadata(ST, E->Scalars);
19677
19678 E->VectorizedValue = V;
19679 ++NumVectorInstructions;
19680 return V;
19681 }
19682 case Instruction::GetElementPtr: {
19683 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19684 setInsertPointAfterBundle(E);
19685
19686 Value *Op0 = vectorizeOperand(E, 0);
19687
19688 SmallVector<Value *> OpVecs;
19689 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19690 Value *OpVec = vectorizeOperand(E, J);
19691 OpVecs.push_back(OpVec);
19692 }
19693
19694 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19695 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19696 SmallVector<Value *> GEPs;
19697 for (Value *V : E->Scalars) {
19698 if (isa<GetElementPtrInst>(V))
19699 GEPs.push_back(V);
19700 }
19701 V = ::propagateMetadata(I, GEPs);
19702 }
19703
19704 V = FinalShuffle(V, E);
19705
19706 E->VectorizedValue = V;
19707 ++NumVectorInstructions;
19708
19709 return V;
19710 }
19711 case Instruction::Call: {
19712 CallInst *CI = cast<CallInst>(VL0);
19713 setInsertPointAfterBundle(E);
19714
19715 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19716
19717 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19718 CI, ID, VecTy->getNumElements(),
19719 It != MinBWs.end() ? It->second.first : 0, TTI);
19720 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19721 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19722 VecCallCosts.first <= VecCallCosts.second;
19723
19724 Value *ScalarArg = nullptr;
19725 SmallVector<Value *> OpVecs;
19726 SmallVector<Type *, 2> TysForDecl;
19727 // Add return type if intrinsic is overloaded on it.
19728 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19729 TysForDecl.push_back(VecTy);
19730 auto *CEI = cast<CallInst>(VL0);
19731 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19732 // Some intrinsics have scalar arguments. This argument should not be
19733 // vectorized.
19734 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19735 ScalarArg = CEI->getArgOperand(I);
19736 // If we decided to reduce the bitwidth of the abs intrinsic, its second
19737 // argument must be set to false (do not return poison if the value is the signed minimum).
19738 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19739 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19740 ScalarArg = Builder.getFalse();
19741 OpVecs.push_back(ScalarArg);
19742 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19743 TysForDecl.push_back(ScalarArg->getType());
19744 continue;
19745 }
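        // Rationale for the abs special case above: after narrowing, a lane
        // may equal the new signed minimum even though the original code never
        // produced INT_MIN, so the is-int-min-poison argument is cleared to
        // keep the vector result well defined.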
19746
19747 Value *OpVec = vectorizeOperand(E, I);
19748 ScalarArg = CEI->getArgOperand(I);
19749 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19750 ScalarArg->getType()->getScalarType() &&
19751 It == MinBWs.end()) {
19752 auto *CastTy =
19753 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19754 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19755 } else if (It != MinBWs.end()) {
19756 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19757 }
19758 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19759 OpVecs.push_back(OpVec);
19760 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19761 TysForDecl.push_back(OpVec->getType());
19762 }
19763
19764 Function *CF;
19765 if (!UseIntrinsic) {
19766 VFShape Shape =
19767 VFShape::get(CI->getFunctionType(),
19768 ElementCount::getFixed(VecTy->getNumElements()),
19769 false /*HasGlobalPred*/);
19770 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19771 } else {
19772 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19773 }
19774
19775 SmallVector<OperandBundleDef, 1> OpBundles;
19776 CI->getOperandBundlesAsDefs(OpBundles);
19777 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19778
19779 propagateIRFlags(V, E->Scalars, VL0);
19780 V = FinalShuffle(V, E);
19781
19782 E->VectorizedValue = V;
19783 ++NumVectorInstructions;
19784 return V;
19785 }
19786 case Instruction::ShuffleVector: {
19787 Value *V;
19788 if (SLPReVec && !E->isAltShuffle()) {
19789 setInsertPointAfterBundle(E);
19790 Value *Src = vectorizeOperand(E, 0);
19791 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19792 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19793 SmallVector<int> NewMask(ThisMask.size());
19794 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19795 return SVSrc->getShuffleMask()[Mask];
19796 });
19797 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19798 SVSrc->getOperand(1), NewMask);
19799 } else {
19800 V = Builder.CreateShuffleVector(Src, ThisMask);
19801 }
19802 propagateIRFlags(V, E->Scalars, VL0);
19803 if (auto *I = dyn_cast<Instruction>(V))
19804 V = ::propagateMetadata(I, E->Scalars);
19805 V = FinalShuffle(V, E);
19806 } else {
19807 assert(E->isAltShuffle() &&
19808 ((Instruction::isBinaryOp(E->getOpcode()) &&
19809 Instruction::isBinaryOp(E->getAltOpcode())) ||
19810 (Instruction::isCast(E->getOpcode()) &&
19811 Instruction::isCast(E->getAltOpcode())) ||
19812 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19813 "Invalid Shuffle Vector Operand");
19814
19815 Value *LHS = nullptr, *RHS = nullptr;
19816 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19817 setInsertPointAfterBundle(E);
19818 LHS = vectorizeOperand(E, 0);
19819 RHS = vectorizeOperand(E, 1);
19820 } else {
19821 setInsertPointAfterBundle(E);
19822 LHS = vectorizeOperand(E, 0);
19823 }
19824 if (LHS && RHS &&
19825 ((Instruction::isBinaryOp(E->getOpcode()) &&
19826 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19827 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19828 assert((It != MinBWs.end() ||
19829 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19830 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19831 MinBWs.contains(getOperandEntry(E, 0)) ||
19832 MinBWs.contains(getOperandEntry(E, 1))) &&
19833 "Expected item in MinBWs.");
19834 Type *CastTy = VecTy;
19835 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19836 if (cast<VectorType>(LHS->getType())
19837 ->getElementType()
19838 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19839 ->getElementType()
19840 ->getIntegerBitWidth())
19841 CastTy = RHS->getType();
19842 else
19843 CastTy = LHS->getType();
19844 }
19845 if (LHS->getType() != CastTy)
19846 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19847 if (RHS->getType() != CastTy)
19848 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19849 }
19850
19851 Value *V0, *V1;
19852 if (Instruction::isBinaryOp(E->getOpcode())) {
19853 V0 = Builder.CreateBinOp(
19854 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19855 V1 = Builder.CreateBinOp(
19856 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19857 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19858 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19859 auto *AltCI = cast<CmpInst>(E->getAltOp());
19860 CmpInst::Predicate AltPred = AltCI->getPredicate();
19861 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19862 } else {
19863 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19864 unsigned SrcBWSz = DL->getTypeSizeInBits(
19865 cast<VectorType>(LHS->getType())->getElementType());
19866 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19867 if (BWSz <= SrcBWSz) {
19868 if (BWSz < SrcBWSz)
19869 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
19870 assert(LHS->getType() == VecTy &&
19871 "Expected same type as operand.");
19872 if (auto *I = dyn_cast<Instruction>(LHS))
19873 LHS = ::propagateMetadata(I, E->Scalars);
19874 LHS = FinalShuffle(LHS, E);
19875 E->VectorizedValue = LHS;
19876 ++NumVectorInstructions;
19877 return LHS;
19878 }
19879 }
19880 V0 = Builder.CreateCast(
19881 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
19882 V1 = Builder.CreateCast(
19883 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
19884 }
19885 // Add V0 and V1 to later analysis to try to find and remove matching
19886 // instruction, if any.
19887 for (Value *V : {V0, V1}) {
19888 if (auto *I = dyn_cast<Instruction>(V)) {
19889 GatherShuffleExtractSeq.insert(I);
19890 CSEBlocks.insert(I->getParent());
19891 }
19892 }
19893
19894 // Create shuffle to take alternate operations from the vector.
19895 // Also, gather up main and alt scalar ops to propagate IR flags to
19896 // each vector operation.
19897 ValueList OpScalars, AltScalars;
19898 SmallVector<int> Mask;
19899 E->buildAltOpShuffleMask(
19900 [E, this](Instruction *I) {
19901 assert(E->getMatchingMainOpOrAltOp(I) &&
19902 "Unexpected main/alternate opcode");
19903 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
19904 *TLI);
19905 },
19906 Mask, &OpScalars, &AltScalars);
19907
19908 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
19909 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
19910 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
19911 // Drop nuw flags for abs(sub(commutative), true).
19912 if (auto *I = dyn_cast<Instruction>(Vec);
19913 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
19914 any_of(E->Scalars, [](Value *V) {
19915 if (isa<PoisonValue>(V))
19916 return false;
19917 auto *IV = cast<Instruction>(V);
19918 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19919 }))
19920 I->setHasNoUnsignedWrap(/*b=*/false);
19921 };
19922 DropNuwFlag(V0, E->getOpcode());
19923 DropNuwFlag(V1, E->getAltOpcode());
19924
19925 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19926 assert(SLPReVec && "FixedVectorType is not expected.");
19927 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
19928 }
19929 V = Builder.CreateShuffleVector(V0, V1, Mask);
19930 if (auto *I = dyn_cast<Instruction>(V)) {
19931 V = ::propagateMetadata(I, E->Scalars);
19932 GatherShuffleExtractSeq.insert(I);
19933 CSEBlocks.insert(I->getParent());
19934 }
19935 }
19936
19937 E->VectorizedValue = V;
19938 ++NumVectorInstructions;
19939
19940 return V;
19941 }
19942 default:
19943 llvm_unreachable("unknown inst");
19944 }
19945 return nullptr;
19946}
19947
19948 Value *BoUpSLP::vectorizeTree() {
19949 ExtraValueToDebugLocsMap ExternallyUsedValues;
19950 return vectorizeTree(ExternallyUsedValues);
19951}
19952
19953 Value *BoUpSLP::vectorizeTree(
19954 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
19955 Instruction *ReductionRoot,
19956 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
19957 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
19958 // need to rebuild it.
19959 EntryToLastInstruction.clear();
19960 // All blocks must be scheduled before any instructions are inserted.
19961 for (auto &BSIter : BlocksSchedules)
19962 scheduleBlock(*this, BSIter.second.get());
19963 // Cache last instructions for the nodes to avoid side effects, which may
19964 // appear during vectorization, like extra uses, etc.
19965 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19966 if (TE->isGather())
19967 continue;
19968 (void)getLastInstructionInBundle(TE.get());
19969 }
19970
19971 if (ReductionRoot)
19972 Builder.SetInsertPoint(ReductionRoot->getParent(),
19973 ReductionRoot->getIterator());
19974 else
19975 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
19976
19977 // Vectorize gather operands of the nodes with the external uses only.
19978 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
19979 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19980 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
19981 TE->UserTreeIndex.UserTE->hasState() &&
19982 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
19983 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
19984 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
19985 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
19986 all_of(TE->UserTreeIndex.UserTE->Scalars,
19987 [](Value *V) { return isUsedOutsideBlock(V); })) {
19988 Instruction &LastInst =
19989 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
19990 GatherEntries.emplace_back(TE.get(), &LastInst);
19991 }
19992 }
19993 for (auto &Entry : GatherEntries) {
19994 IRBuilderBase::InsertPointGuard Guard(Builder);
19995 Builder.SetInsertPoint(Entry.second);
19996 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
19997 (void)vectorizeTree(Entry.first);
19998 }
19999 // Emit gathered loads first to emit better code for the users of those
20000 // gathered loads.
20001 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20002 if (GatheredLoadsEntriesFirst.has_value() &&
20003 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20004 (!TE->isGather() || TE->UserTreeIndex)) {
20005 assert((TE->UserTreeIndex ||
20006 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20007 "Expected gathered load node.");
20008 (void)vectorizeTree(TE.get());
20009 }
20010 }
20011 (void)vectorizeTree(VectorizableTree[0].get());
20012 // Run through the list of postponed gathers and emit them, replacing the temp
20013 // emitted allocas with actual vector instructions.
20014 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20015 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20016 for (const TreeEntry *E : PostponedNodes) {
20017 auto *TE = const_cast<TreeEntry *>(E);
20018 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20019 TE->VectorizedValue = nullptr;
20020 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20021 // If the user is a PHI node, its vector code has to be inserted right
20022 // before the block terminator. Since the node was delayed, there were some
20023 // unresolved dependencies at the moment when the stub instruction was
20024 // emitted. If any of these dependencies turns out to be an operand of
20025 // another PHI coming from this same block, the position of the stub
20026 // instruction becomes invalid. This is because the source vector that is
20027 // supposed to feed this gather node was inserted at the end of the block
20028 // [after the stub instruction]. So we need to adjust the insertion point
20029 if (isa<PHINode>(UserI)) {
20030 // Insert before all users.
20031 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20032 for (User *U : PrevVec->users()) {
20033 if (U == UserI)
20034 continue;
20035 auto *UI = dyn_cast<Instruction>(U);
20036 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20037 continue;
20038 if (UI->comesBefore(InsertPt))
20039 InsertPt = UI;
20040 }
20041 Builder.SetInsertPoint(InsertPt);
20042 } else {
20043 Builder.SetInsertPoint(PrevVec);
20044 }
20045 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20046 Value *Vec = vectorizeTree(TE);
20047 if (auto *VecI = dyn_cast<Instruction>(Vec);
20048 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20049 Builder.GetInsertPoint()->comesBefore(VecI))
20050 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20051 Builder.GetInsertPoint());
20052 if (Vec->getType() != PrevVec->getType()) {
20053 assert(Vec->getType()->isIntOrIntVectorTy() &&
20054 PrevVec->getType()->isIntOrIntVectorTy() &&
20055 "Expected integer vector types only.");
20056 std::optional<bool> IsSigned;
20057 for (Value *V : TE->Scalars) {
20058 if (isVectorized(V)) {
20059 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20060 auto It = MinBWs.find(MNTE);
20061 if (It != MinBWs.end()) {
20062 IsSigned = IsSigned.value_or(false) || It->second.second;
20063 if (*IsSigned)
20064 break;
20065 }
20066 }
20067 if (IsSigned.value_or(false))
20068 break;
20069 // Scan through gather nodes.
20070 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20071 auto It = MinBWs.find(BVE);
20072 if (It != MinBWs.end()) {
20073 IsSigned = IsSigned.value_or(false) || It->second.second;
20074 if (*IsSigned)
20075 break;
20076 }
20077 }
20078 if (IsSigned.value_or(false))
20079 break;
20080 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20081 IsSigned =
20082 IsSigned.value_or(false) ||
20083 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20084 continue;
20085 }
20086 if (IsSigned.value_or(false))
20087 break;
20088 }
20089 }
20090 if (IsSigned.value_or(false)) {
20091 // Final attempt - check user node.
20092 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20093 if (It != MinBWs.end())
20094 IsSigned = It->second.second;
20095 }
20096 assert(IsSigned &&
20097 "Expected user node or perfect diamond match in MinBWs.");
20098 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20099 }
20100 PrevVec->replaceAllUsesWith(Vec);
20101 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20102 // Replace the stub vector node, if it was used before for one of the
20103 // buildvector nodes already.
20104 auto It = PostponedValues.find(PrevVec);
20105 if (It != PostponedValues.end()) {
20106 for (TreeEntry *VTE : It->getSecond())
20107 VTE->VectorizedValue = Vec;
20108 }
20109 eraseInstruction(PrevVec);
20110 }
20111
20112 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20113 << " values .\n");
20114
20115 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20116 // Maps vector instruction to original insertelement instruction
20117 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20118 // Maps extract Scalar to the corresponding extractelement instruction in the
20119 // basic block. Only one extractelement per block should be emitted.
20120 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20121 ScalarToEEs;
20122 SmallDenseSet<Value *, 4> UsedInserts;
20123 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20124 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20125 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20126 // Extract all of the elements with the external uses.
20127 for (const auto &ExternalUse : ExternalUses) {
20128 Value *Scalar = ExternalUse.Scalar;
20129 llvm::User *User = ExternalUse.User;
20130
20131 // Skip users that we already RAUW. This happens when one instruction
20132 // has multiple uses of the same value.
20133 if (User && !is_contained(Scalar->users(), User))
20134 continue;
20135 const TreeEntry *E = &ExternalUse.E;
20136 assert(E && "Invalid scalar");
20137 assert(!E->isGather() && "Extracting from a gather list");
20138 // Non-instruction pointers are not deleted, just skip them.
20139 if (E->getOpcode() == Instruction::GetElementPtr &&
20140 !isa<GetElementPtrInst>(Scalar))
20141 continue;
20142
20143 Value *Vec = E->VectorizedValue;
20144 assert(Vec && "Can't find vectorizable value");
20145
20146 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20147 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20148 if (Scalar->getType() != Vec->getType()) {
20149 Value *Ex = nullptr;
20150 Value *ExV = nullptr;
20151 auto *Inst = dyn_cast<Instruction>(Scalar);
20152 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20153 auto It = ScalarToEEs.find(Scalar);
20154 if (It != ScalarToEEs.end()) {
20155 // No need to emit many extracts, just move the only one in the
20156 // current block.
20157 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20158 : Builder.GetInsertBlock());
20159 if (EEIt != It->second.end()) {
20160 Value *PrevV = EEIt->second.first;
20161 if (auto *I = dyn_cast<Instruction>(PrevV);
20162 I && !ReplaceInst &&
20163 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20164 Builder.GetInsertPoint()->comesBefore(I)) {
20165 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20166 Builder.GetInsertPoint());
20167 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20168 CI->moveAfter(I);
20169 }
20170 Ex = PrevV;
20171 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20172 }
20173 }
20174 if (!Ex) {
20175 // "Reuse" the existing extract to improve final codegen.
20176 if (ReplaceInst) {
20177 // Leave the instruction as is, if it extracts more cheaply and all
20178 // operands are scalar.
20179 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20180 IgnoredExtracts.insert(EE);
20181 Ex = EE;
20182 } else {
20183 auto *CloneInst = Inst->clone();
20184 CloneInst->insertBefore(Inst->getIterator());
20185 if (Inst->hasName())
20186 CloneInst->takeName(Inst);
20187 Ex = CloneInst;
20188 }
20189 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20190 ES && isa<Instruction>(Vec)) {
20191 Value *V = ES->getVectorOperand();
20192 auto *IVec = cast<Instruction>(Vec);
20193 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20194 V = ETEs.front()->VectorizedValue;
20195 if (auto *IV = dyn_cast<Instruction>(V);
20196 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20197 IV->comesBefore(IVec))
20198 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20199 else
20200 Ex = Builder.CreateExtractElement(Vec, Lane);
20201 } else if (auto *VecTy =
20202 dyn_cast<FixedVectorType>(Scalar->getType())) {
20203 assert(SLPReVec && "FixedVectorType is not expected.");
20204 unsigned VecTyNumElements = VecTy->getNumElements();
20205 // When REVEC is enabled, we need to extract a vector.
20206 // Note: The element size of Scalar may be different from the
20207 // element size of Vec.
20208 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20209 ExternalUse.Lane * VecTyNumElements);
20210 } else {
20211 Ex = Builder.CreateExtractElement(Vec, Lane);
20212 }
20213 // If necessary, sign-extend or zero-extend ScalarRoot
20214 // to the larger type.
20215 ExV = Ex;
20216 if (Scalar->getType() != Ex->getType())
20217 ExV = Builder.CreateIntCast(
20218 Ex, Scalar->getType(),
20219 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20220 auto *I = dyn_cast<Instruction>(Ex);
20221 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20222 : &F->getEntryBlock(),
20223 std::make_pair(Ex, ExV));
20224 }
20225 // The then-branch of the previous if may produce constants, since
20226 // operand 0 might be a constant.
20227 if (auto *ExI = dyn_cast<Instruction>(Ex);
20228 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20229 GatherShuffleExtractSeq.insert(ExI);
20230 CSEBlocks.insert(ExI->getParent());
20231 }
20232 return ExV;
20233 }
20234 assert(isa<FixedVectorType>(Scalar->getType()) &&
20235 isa<InsertElementInst>(Scalar) &&
20236 "In-tree scalar of vector type is not insertelement?");
20237 auto *IE = cast<InsertElementInst>(Scalar);
20238 VectorToInsertElement.try_emplace(Vec, IE);
20239 return Vec;
20240 };
20241 // If User == nullptr, the Scalar remains as scalar in vectorized
20242 // instructions or is used as extra arg. Generate ExtractElement instruction
20243 // and update the record for this scalar in ExternallyUsedValues.
20244 if (!User) {
20245 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20246 continue;
20247 assert(
20248 (ExternallyUsedValues.count(Scalar) ||
20249 ExternalUsesWithNonUsers.count(Scalar) ||
20250 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20251 any_of(
20252 Scalar->users(),
20253 [&, TTI = TTI](llvm::User *U) {
20254 if (ExternalUsesAsOriginalScalar.contains(U))
20255 return true;
20256 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20257 return !UseEntries.empty() &&
20258 (E->State == TreeEntry::Vectorize ||
20259 E->State == TreeEntry::StridedVectorize ||
20260 E->State == TreeEntry::CompressVectorize) &&
20261 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20262 return (UseEntry->State == TreeEntry::Vectorize ||
20263 UseEntry->State ==
20264 TreeEntry::StridedVectorize ||
20265 UseEntry->State ==
20266 TreeEntry::CompressVectorize) &&
20267 doesInTreeUserNeedToExtract(
20268 Scalar, getRootEntryInstruction(*UseEntry),
20269 TLI, TTI);
20270 });
20271 })) &&
20272 "Scalar with nullptr User must be registered in "
20273 "ExternallyUsedValues map or remain as scalar in vectorized "
20274 "instructions");
20275 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20276 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20277 if (PHI->getParent()->isLandingPad())
20278 Builder.SetInsertPoint(
20279 PHI->getParent(),
20280 std::next(
20281 PHI->getParent()->getLandingPadInst()->getIterator()));
20282 else
20283 Builder.SetInsertPoint(PHI->getParent(),
20284 PHI->getParent()->getFirstNonPHIIt());
20285 } else {
20286 Builder.SetInsertPoint(VecI->getParent(),
20287 std::next(VecI->getIterator()));
20288 }
20289 } else {
20290 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20291 }
20292 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20293 // Required to update internally referenced instructions.
20294 if (Scalar != NewInst) {
20295 assert((!isa<ExtractElementInst>(Scalar) ||
20296 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20297 "Extractelements should not be replaced.");
20298 Scalar->replaceAllUsesWith(NewInst);
20299 }
20300 continue;
20301 }
20302
20303 if (auto *VU = dyn_cast<InsertElementInst>(User);
20304 VU && VU->getOperand(1) == Scalar) {
20305 // Skip if the scalar is another vector op or Vec is not an instruction.
20306 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20307 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20308 if (!UsedInserts.insert(VU).second)
20309 continue;
20310 // Need to use original vector, if the root is truncated.
20311 auto BWIt = MinBWs.find(E);
20312 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20313 auto *ScalarTy = FTy->getElementType();
20314 auto Key = std::make_pair(Vec, ScalarTy);
20315 auto VecIt = VectorCasts.find(Key);
20316 if (VecIt == VectorCasts.end()) {
20317 IRBuilderBase::InsertPointGuard Guard(Builder);
20318 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20319 if (IVec->getParent()->isLandingPad())
20320 Builder.SetInsertPoint(IVec->getParent(),
20321 std::next(IVec->getParent()
20322 ->getLandingPadInst()
20323 ->getIterator()));
20324 else
20325 Builder.SetInsertPoint(
20326 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20327 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20328 Builder.SetInsertPoint(IVec->getNextNode());
20329 }
20330 Vec = Builder.CreateIntCast(
20331 Vec,
20332 getWidenedType(
20333 ScalarTy,
20334 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20335 BWIt->second.second);
20336 VectorCasts.try_emplace(Key, Vec);
20337 } else {
20338 Vec = VecIt->second;
20339 }
20340 }
20341
20342 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20343 if (InsertIdx) {
20344 auto *It = find_if(
20345 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20346 // Checks if 2 insertelements are from the same buildvector.
20347 InsertElementInst *VecInsert = Data.InsertElements.front();
20348 return areTwoInsertFromSameBuildVector(
20349 VU, VecInsert,
20350 [](InsertElementInst *II) { return II->getOperand(0); });
20351 });
20352 unsigned Idx = *InsertIdx;
20353 if (It == ShuffledInserts.end()) {
20354 (void)ShuffledInserts.emplace_back();
20355 It = std::next(ShuffledInserts.begin(),
20356 ShuffledInserts.size() - 1);
20357 }
20358 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20359 if (Mask.empty())
20360 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20361 Mask[Idx] = ExternalUse.Lane;
20362 It->InsertElements.push_back(cast<InsertElementInst>(User));
20363 continue;
20364 }
20365 }
20366 }
20367 }
20368
20369 // Generate extracts for out-of-tree users.
20370 // Find the insertion point for the extractelement lane.
20371 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20372 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20373 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20374 if (PH->getIncomingValue(I) == Scalar) {
20375 Instruction *IncomingTerminator =
20376 PH->getIncomingBlock(I)->getTerminator();
20377 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20378 Builder.SetInsertPoint(VecI->getParent(),
20379 std::next(VecI->getIterator()));
20380 } else {
20381 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20382 }
20383 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20384 PH->setOperand(I, NewInst);
20385 }
20386 }
20387 } else {
20388 Builder.SetInsertPoint(cast<Instruction>(User));
20389 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20390 User->replaceUsesOfWith(Scalar, NewInst);
20391 }
20392 } else {
20393 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20394 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20395 User->replaceUsesOfWith(Scalar, NewInst);
20396 }
20397
20398 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20399 }
20400
20401 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20402 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20403 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20404 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20405 for (int I = 0, E = Mask.size(); I < E; ++I) {
20406 if (Mask[I] < VF)
20407 CombinedMask1[I] = Mask[I];
20408 else
20409 CombinedMask2[I] = Mask[I] - VF;
20410 }
20411 ShuffleInstructionBuilder ShuffleBuilder(
20412 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20413 ShuffleBuilder.add(V1, CombinedMask1);
20414 if (V2)
20415 ShuffleBuilder.add(V2, CombinedMask2);
20416 return ShuffleBuilder.finalize({}, {}, {});
20417 };
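  // CreateShuffle splits a single two-source mask into per-operand masks for
  // the shuffle builder, e.g. with VF == 4 the mask <0, 5, 2, 7> becomes
  // <0, poison, 2, poison> for V1 and <poison, 1, poison, 3> for V2.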
20418
20419 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20420 bool ForSingleMask) {
20421 unsigned VF = Mask.size();
20422 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20423 if (VF != VecVF) {
20424 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20425 Vec = CreateShuffle(Vec, nullptr, Mask);
20426 return std::make_pair(Vec, true);
20427 }
20428 if (!ForSingleMask) {
20429 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20430 for (unsigned I = 0; I < VF; ++I) {
20431 if (Mask[I] != PoisonMaskElem)
20432 ResizeMask[Mask[I]] = Mask[I];
20433 }
20434 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20435 }
20436 }
20437
20438 return std::make_pair(Vec, false);
20439 };
20440 // Perform shuffling of the vectorize tree entries for better handling of
20441 // external extracts.
20442 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20443 // Find the first and the last instruction in the list of insertelements.
20444 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20445 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20446 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20447 Builder.SetInsertPoint(LastInsert);
20448 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20449 Value *NewInst = performExtractsShuffleAction<Value>(
20450 MutableArrayRef(Vector.data(), Vector.size()),
20451 FirstInsert->getOperand(0),
20452 [](Value *Vec) {
20453 return cast<VectorType>(Vec->getType())
20454 ->getElementCount()
20455 .getKnownMinValue();
20456 },
20457 ResizeToVF,
20458 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20459 ArrayRef<Value *> Vals) {
20460 assert((Vals.size() == 1 || Vals.size() == 2) &&
20461 "Expected exactly 1 or 2 input values.");
20462 if (Vals.size() == 1) {
20463 // Do not create shuffle if the mask is a simple identity
20464 // non-resizing mask.
20465 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20466 ->getNumElements() ||
20467 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20468 return CreateShuffle(Vals.front(), nullptr, Mask);
20469 return Vals.front();
20470 }
20471 return CreateShuffle(Vals.front() ? Vals.front()
20472 : FirstInsert->getOperand(0),
20473 Vals.back(), Mask);
20474 });
20475 auto It = ShuffledInserts[I].InsertElements.rbegin();
20476 // Rebuild buildvector chain.
20477 InsertElementInst *II = nullptr;
20478 if (It != ShuffledInserts[I].InsertElements.rend())
20479 II = *It;
20480 SmallVector<Instruction *> Inserts;
20481 while (It != ShuffledInserts[I].InsertElements.rend()) {
20482 assert(II && "Must be an insertelement instruction.");
20483 if (*It == II)
20484 ++It;
20485 else
20486 Inserts.push_back(cast<Instruction>(II));
20487 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20488 }
20489 for (Instruction *II : reverse(Inserts)) {
20490 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20491 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20492 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20493 II->moveAfter(NewI);
20494 NewInst = II;
20495 }
20496 LastInsert->replaceAllUsesWith(NewInst);
20497 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20498 IE->replaceUsesOfWith(IE->getOperand(0),
20499 PoisonValue::get(IE->getOperand(0)->getType()));
20500 IE->replaceUsesOfWith(IE->getOperand(1),
20501 PoisonValue::get(IE->getOperand(1)->getType()));
20502 eraseInstruction(IE);
20503 }
20504 CSEBlocks.insert(LastInsert->getParent());
20505 }
20506
20507 SmallVector<Instruction *> RemovedInsts;
20508 // For each vectorized value:
20509 for (auto &TEPtr : VectorizableTree) {
20510 TreeEntry *Entry = TEPtr.get();
20511
20512 // No need to handle users of gathered values.
20513 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20514 continue;
20515
20516 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20517
20518 // For each lane:
20519 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20520 Value *Scalar = Entry->Scalars[Lane];
20521
20522 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20523 !isa<GetElementPtrInst>(Scalar))
20524 continue;
20525 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20526 EE && IgnoredExtracts.contains(EE))
20527 continue;
20528 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20529 continue;
20530#ifndef NDEBUG
20531 Type *Ty = Scalar->getType();
20532 if (!Ty->isVoidTy()) {
20533 for (User *U : Scalar->users()) {
20534 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20535
20536 // It is legal to delete users in the ignorelist.
20537 assert((isVectorized(U) ||
20538 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20541 "Deleting out-of-tree value");
20542 }
20543 }
20544#endif
20545 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20546 auto *I = cast<Instruction>(Scalar);
20547 RemovedInsts.push_back(I);
20548 }
20549 }
20550
20551 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20552 // new vector instruction.
20553 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20554 V->mergeDIAssignID(RemovedInsts);
20555
20556 // Clear up reduction references, if any.
20557 if (UserIgnoreList) {
20558 for (Instruction *I : RemovedInsts) {
20559 const TreeEntry *IE = getTreeEntries(I).front();
20560 if (IE->Idx != 0 &&
20561 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20562 (ValueToGatherNodes.lookup(I).contains(
20563 VectorizableTree.front().get()) ||
20564 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20565 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20566 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20567 IE->UserTreeIndex &&
20568 is_contained(VectorizableTree.front()->Scalars, I)) &&
20569 !(GatheredLoadsEntriesFirst.has_value() &&
20570 IE->Idx >= *GatheredLoadsEntriesFirst &&
20571 VectorizableTree.front()->isGather() &&
20572 is_contained(VectorizableTree.front()->Scalars, I)) &&
20573 !(!VectorizableTree.front()->isGather() &&
20574 VectorizableTree.front()->isCopyableElement(I)))
20575 continue;
20576 SmallVector<SelectInst *> LogicalOpSelects;
20577 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20578 // Do not replace condition of the logical op in form select <cond>.
20579 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20580 (match(U.getUser(), m_LogicalAnd()) ||
20581 match(U.getUser(), m_LogicalOr())) &&
20582 U.getOperandNo() == 0;
20583 if (IsPoisoningLogicalOp) {
20584 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20585 return false;
20586 }
20587 return UserIgnoreList->contains(U.getUser());
20588 });
20589 // Replace conditions of the poisoning logical ops with the non-poison
20590 // constant value.
20591 for (SelectInst *SI : LogicalOpSelects)
20592 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20593 }
20594 }
20595 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20596 // cache correctness.
20597 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
20598 // - instructions are not deleted until later.
20599 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20600
20601 Builder.ClearInsertionPoint();
20602 InstrElementSize.clear();
20603
20604 const TreeEntry &RootTE = *VectorizableTree.front();
20605 Value *Vec = RootTE.VectorizedValue;
20606 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20607 It != MinBWs.end() &&
20608 ReductionBitWidth != It->second.first) {
20609 IRBuilder<>::InsertPointGuard Guard(Builder);
20610 Builder.SetInsertPoint(ReductionRoot->getParent(),
20611 ReductionRoot->getIterator());
20612 Vec = Builder.CreateIntCast(
20613 Vec,
20614 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20615 cast<VectorType>(Vec->getType())->getElementCount()),
20616 It->second.second);
20617 }
20618 return Vec;
20619}
20620
20621 void BoUpSLP::optimizeGatherSequence() {
20622 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20623 << " gather sequence instructions.\n");
20624 // LICM InsertElementInst sequences.
20625 for (Instruction *I : GatherShuffleExtractSeq) {
20626 if (isDeleted(I))
20627 continue;
20628
20629 // Check if this block is inside a loop.
20630 Loop *L = LI->getLoopFor(I->getParent());
20631 if (!L)
20632 continue;
20633
20634 // Check if it has a preheader.
20635 BasicBlock *PreHeader = L->getLoopPreheader();
20636 if (!PreHeader)
20637 continue;
20638
20639 // If the vector or the element that we insert into it are
20640 // instructions that are defined in this basic block then we can't
20641 // hoist this instruction.
20642 if (any_of(I->operands(), [L](Value *V) {
20643 auto *OpI = dyn_cast<Instruction>(V);
20644 return OpI && L->contains(OpI);
20645 }))
20646 continue;
20647
20648 // We can hoist this instruction. Move it to the pre-header.
20649 I->moveBefore(PreHeader->getTerminator()->getIterator());
20650 CSEBlocks.insert(PreHeader);
20651 }
20652
20653 // Make a list of all reachable blocks in our CSE queue.
20654 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20655 CSEWorkList.reserve(CSEBlocks.size());
20656 for (BasicBlock *BB : CSEBlocks)
20657 if (DomTreeNode *N = DT->getNode(BB)) {
20658 assert(DT->isReachableFromEntry(N));
20659 CSEWorkList.push_back(N);
20660 }
20661
20662 // Sort blocks by domination. This ensures we visit a block after all blocks
20663 // dominating it are visited.
20664 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20665 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20666 "Different nodes should have different DFS numbers");
20667 return A->getDFSNumIn() < B->getDFSNumIn();
20668 });
20669
20670 // Less defined shuffles can be replaced by the more defined copies.
20671 // Between two shuffles one is less defined if it has the same vector operands
20672 // and its mask indices are the same as in the first one or undefs. E.g.
20673 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20674 // poison, <0, 0, 0, 0>.
20675 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20676 Instruction *I2,
20677 SmallVectorImpl<int> &NewMask) {
20678 if (I1->getType() != I2->getType())
20679 return false;
20680 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20681 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20682 if (!SI1 || !SI2)
20683 return I1->isIdenticalTo(I2);
20684 if (SI1->isIdenticalTo(SI2))
20685 return true;
20686 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20687 if (SI1->getOperand(I) != SI2->getOperand(I))
20688 return false;
20689 // Check if the second instruction is more defined than the first one.
20690 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20691 ArrayRef<int> SM1 = SI1->getShuffleMask();
20692 // Count trailing undefs in the mask to check the final number of used
20693 // registers.
20694 unsigned LastUndefsCnt = 0;
20695 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20696 if (SM1[I] == PoisonMaskElem)
20697 ++LastUndefsCnt;
20698 else
20699 LastUndefsCnt = 0;
20700 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20701 NewMask[I] != SM1[I])
20702 return false;
20703 if (NewMask[I] == PoisonMaskElem)
20704 NewMask[I] = SM1[I];
20705 }
20706 // Check if the last undefs actually change the final number of used vector
20707 // registers.
20708 return SM1.size() - LastUndefsCnt > 1 &&
20709 ::getNumberOfParts(*TTI, SI1->getType()) ==
20710 ::getNumberOfParts(
20711 *TTI, getWidenedType(SI1->getType()->getElementType(),
20712 SM1.size() - LastUndefsCnt));
20713 };
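  // E.g. a shuffle In with mask <0, 1, poison, poison> is identical-or-less
  // defined than an earlier shuffle V of the same operands with mask
  // <0, 1, 2, poison>: NewMask keeps V's defined lanes, fills its poison
  // lanes from In's mask, and In can then be replaced by V (with V's mask
  // updated to NewMask).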
20714 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20715 // instructions. TODO: We can further optimize this scan if we split the
20716 // instructions into different buckets based on the insert lane.
20717 SmallVector<Instruction *, 16> Visited;
20718 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20719 assert(*I &&
20720 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20721 "Worklist not sorted properly!");
20722 BasicBlock *BB = (*I)->getBlock();
20723 // For all instructions in blocks containing gather sequences:
20724 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20725 if (isDeleted(&In))
20726 continue;
20727 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20728 !GatherShuffleExtractSeq.contains(&In))
20729 continue;
20730
20731 // Check if we can replace this instruction with any of the
20732 // visited instructions.
20733 bool Replaced = false;
20734 for (Instruction *&V : Visited) {
20735 SmallVector<int> NewMask;
20736 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20737 DT->dominates(V->getParent(), In.getParent())) {
20738 In.replaceAllUsesWith(V);
20739 eraseInstruction(&In);
20740 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20741 if (!NewMask.empty())
20742 SI->setShuffleMask(NewMask);
20743 Replaced = true;
20744 break;
20745 }
20746 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20747 GatherShuffleExtractSeq.contains(V) &&
20748 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20749 DT->dominates(In.getParent(), V->getParent())) {
20750 In.moveAfter(V);
20751 V->replaceAllUsesWith(&In);
20752 eraseInstruction(V);
20753 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20754 if (!NewMask.empty())
20755 SI->setShuffleMask(NewMask);
20756 V = &In;
20757 Replaced = true;
20758 break;
20759 }
20760 }
20761 if (!Replaced) {
20762 assert(!is_contained(Visited, &In));
20763 Visited.push_back(&In);
20764 }
20765 }
20766 }
20767 CSEBlocks.clear();
20768 GatherShuffleExtractSeq.clear();
20769}
20770
20771BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20772 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20773 auto &BundlePtr =
20774 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20775 for (Value *V : VL) {
20776 if (S.isNonSchedulable(V))
20777 continue;
20778 auto *I = cast<Instruction>(V);
20779 if (S.isCopyableElement(V)) {
20780 // Add a copyable element model.
20781 ScheduleCopyableData &SD =
20782 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20783 // Group the instructions to a bundle.
20784 BundlePtr->add(&SD);
20785 continue;
20786 }
20787 ScheduleData *BundleMember = getScheduleData(V);
20788 assert(BundleMember && "no ScheduleData for bundle member "
20789 "(maybe not in same basic block)");
20790 // Group the instructions to a bundle.
20791 BundlePtr->add(BundleMember);
20792 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20793 BundlePtr.get());
20794 }
20795 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20796 return *BundlePtr;
20797}
20798
20799 // Groups the instructions into a bundle (which is then a single scheduling entity)
20800// and schedules instructions until the bundle gets ready.
20801std::optional<BoUpSLP::ScheduleBundle *>
20802BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20803 const InstructionsState &S,
20804 const EdgeInfo &EI) {
20805 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20806 // instructions.
20807 bool HasCopyables = S.areInstructionsWithCopyableElements();
20808 if (isa<PHINode>(S.getMainOp()) ||
20809 isVectorLikeInstWithConstOps(S.getMainOp()) ||
20810 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
20811 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
20812 return nullptr;
20813
20814 // Initialize the instruction bundle.
20815 Instruction *OldScheduleEnd = ScheduleEnd;
20816 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20817
20818 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20819 // Clear deps or recalculate the region, if the memory instruction is a
20820 // copyable. It may have memory deps, which must be recalculated.
20821 SmallVector<ScheduleData *> ControlDependentMembers;
20822 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20823 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20824 for (ScheduleEntity *SE : Bundle.getBundle()) {
20825 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20826 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20827 BundleMember && BundleMember->hasValidDependencies()) {
20828 BundleMember->clearDirectDependencies();
20829 if (RegionHasStackSave ||
20830 !isGuaranteedToTransferExecutionToSuccessor(
20831 BundleMember->getInst()))
20832 ControlDependentMembers.push_back(BundleMember);
20833 }
20834 continue;
20835 }
20836 auto *SD = cast<ScheduleData>(SE);
20837 if (SD->hasValidDependencies() &&
20838 (!S.areInstructionsWithCopyableElements() ||
20839 !S.isCopyableElement(SD->getInst())) &&
20840 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
20841 EI.UserTE->hasState() &&
20842 (!EI.UserTE->hasCopyableElements() ||
20843 !EI.UserTE->isCopyableElement(SD->getInst())))
20844 SD->clearDirectDependencies();
20845 for (const Use &U : SD->getInst()->operands()) {
20846 unsigned &NumOps =
20847 UserOpToNumOps
20848 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
20849 .first->getSecond();
20850 ++NumOps;
20851 if (auto *Op = dyn_cast<Instruction>(U.get());
20852 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
20853 *SLP, NumOps)) {
20854 if (ScheduleData *OpSD = getScheduleData(Op);
20855 OpSD && OpSD->hasValidDependencies()) {
20856 OpSD->clearDirectDependencies();
20857 if (RegionHasStackSave ||
20858 !isGuaranteedToTransferExecutionToSuccessor(Op))
20859 ControlDependentMembers.push_back(OpSD);
20860 }
20861 }
20862 }
20863 }
20864 };
20865 // The scheduling region got new instructions at the lower end (or it is a
20866 // new region for the first bundle). This makes it necessary to
20867 // recalculate all dependencies.
20868 // It is seldom that this needs to be done a second time after adding the
20869 // initial bundle to the region.
20870 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
20871 for_each(ScheduleDataMap, [&](auto &P) {
20872 if (BB != P.first->getParent())
20873 return;
20874 ScheduleData *SD = P.second;
20875 if (isInSchedulingRegion(*SD))
20876 SD->clearDependencies();
20877 });
20878 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
20879 for_each(P.second, [&](ScheduleCopyableData *SD) {
20880 if (isInSchedulingRegion(*SD))
20881 SD->clearDependencies();
20882 });
20883 });
20884 ReSchedule = true;
20885 }
20886 // Check if the bundle data already has deps for copyable elements. In
20887 // that case we need to reset the deps and recalculate them.
20888 if (Bundle && !Bundle.getBundle().empty()) {
20889 if (S.areInstructionsWithCopyableElements() ||
20890 !ScheduleCopyableDataMap.empty())
20891 CheckIfNeedToClearDeps(Bundle);
20892 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
20893 << BB->getName() << "\n");
20894 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
20895 ControlDependentMembers);
20896 } else if (!ControlDependentMembers.empty()) {
20897 ScheduleBundle Invalid = ScheduleBundle::invalid();
20898 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
20899 ControlDependentMembers);
20900 }
20901
20902 if (ReSchedule) {
20903 resetSchedule();
20904 initialFillReadyList(ReadyInsts);
20905 }
20906
20907 // Now try to schedule the new bundle or (if no bundle) just calculate
20908 // dependencies. As soon as the bundle is "ready" it means that there are no
20909 // cyclic dependencies and we can schedule it. Note that it is important
20910 // that we don't "schedule" the bundle yet.
20911 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
20912 !ReadyInsts.empty()) {
20913 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
20914 assert(Picked->isReady() && "must be ready to schedule");
20915 schedule(*SLP, S, EI, Picked, ReadyInsts);
20916 if (Picked == &Bundle)
20917 break;
20918 }
20919 };
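// Note: if the new bundle participates in a dependency cycle through other
// in-region instructions, the loop above drains the ready list without the
// bundle ever becoming ready, and the code further below then cancels the
// bundle and returns std::nullopt.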
20920
20921 // Make sure that the scheduling region contains all
20922 // instructions of the bundle.
20923 for (Value *V : VL) {
20924 if (S.isNonSchedulable(V))
20925 continue;
20926 if (!extendSchedulingRegion(V, S)) {
20927 // If the scheduling region got new instructions at the lower end (or it
20928 // is a new region for the first bundle), all dependencies must be
20929 // recalculated.
20930 // Otherwise the compiler may crash trying to calculate dependencies
20931 // incorrectly and emit instructions in the wrong order during the actual
20932 // scheduling.
20933 ScheduleBundle Invalid = ScheduleBundle::invalid();
20934 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
20935 return std::nullopt;
20936 }
20937 }
20938
20939 bool ReSchedule = false;
20940 for (Value *V : VL) {
20941 if (S.isNonSchedulable(V))
20942 continue;
20943 SmallVector<ScheduleCopyableData *> CopyableData =
20944 getScheduleCopyableData(cast<Instruction>(V));
20945 if (!CopyableData.empty()) {
20946 for (ScheduleCopyableData *SD : CopyableData)
20947 ReadyInsts.remove(SD);
20948 }
20949 ScheduleData *BundleMember = getScheduleData(V);
20950 assert((BundleMember || S.isCopyableElement(V)) &&
20951 "no ScheduleData for bundle member (maybe not in same basic block)");
20952 if (!BundleMember)
20953 continue;
20954
20955 // Make sure we don't leave the pieces of the bundle in the ready list when
20956 // the whole bundle might not be ready.
20957 ReadyInsts.remove(BundleMember);
20958 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
20959 !Bundles.empty()) {
20960 for (ScheduleBundle *B : Bundles)
20961 ReadyInsts.remove(B);
20962 }
20963
20964 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
20965 continue;
20966 // A bundle member was scheduled as a single instruction before and now
20967 // needs to be scheduled as part of the bundle. We just get rid of the
20968 // existing schedule.
20969 // Likewise, if a bundle member had its deps calculated before it became a
20970 // copyable element, we need to reschedule.
20971 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
20972 << " was already scheduled\n");
20973 ReSchedule = true;
20974 }
20975
20976 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
20977 TryScheduleBundleImpl(ReSchedule, Bundle);
20978 if (!Bundle.isReady()) {
20979 for (ScheduleEntity *BD : Bundle.getBundle()) {
20980 // Copyable data scheduling is just removed.
20981 if (isa<ScheduleCopyableData>(BD))
20982 continue;
20983 if (BD->isReady()) {
20984 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
20985 if (Bundles.empty()) {
20986 ReadyInsts.insert(BD);
20987 continue;
20988 }
20989 for (ScheduleBundle *B : Bundles)
20990 if (B->isReady())
20991 ReadyInsts.insert(B);
20992 }
20993 }
20994 ScheduledBundlesList.pop_back();
20995 SmallVector<ScheduleData *> ControlDependentMembers;
20996 SmallPtrSet<Instruction *, 4> Visited;
20997 for (Value *V : VL) {
20998 if (S.isNonSchedulable(V))
20999 continue;
21000 auto *I = cast<Instruction>(V);
21001 if (S.isCopyableElement(I)) {
21002 // Remove the copyable data from the scheduling region and restore
21003 // previous mappings.
21004 auto KV = std::make_pair(EI, I);
21005 assert(ScheduleCopyableDataMap.contains(KV) &&
21006 "no ScheduleCopyableData for copyable element");
21007 ScheduleCopyableData *SD =
21008 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21009 ScheduleCopyableDataMapByUsers[I].remove(SD);
21010 if (EI.UserTE) {
21011 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21012 const auto *It = find(Op, I);
21013 assert(It != Op.end() && "Lane not set");
21014 SmallPtrSet<Instruction *, 4> Visited;
21015 do {
21016 int Lane = std::distance(Op.begin(), It);
21017 assert(Lane >= 0 && "Lane not set");
21018 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21019 !EI.UserTE->ReorderIndices.empty())
21020 Lane = EI.UserTE->ReorderIndices[Lane];
21021 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21022 "Couldn't find extract lane");
21023 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21024 if (!Visited.insert(In).second) {
21025 It = find(make_range(std::next(It), Op.end()), I);
21026 break;
21027 }
21028 ScheduleCopyableDataMapByInstUser
21029 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21030 .pop_back();
21031 It = find(make_range(std::next(It), Op.end()), I);
21032 } while (It != Op.end());
21033 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21034 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21035 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21036 }
21037 if (ScheduleCopyableDataMapByUsers[I].empty())
21038 ScheduleCopyableDataMapByUsers.erase(I);
21039 ScheduleCopyableDataMap.erase(KV);
21040 // Need to recalculate dependencies for the actual schedule data.
21041 if (ScheduleData *OpSD = getScheduleData(I);
21042 OpSD && OpSD->hasValidDependencies()) {
21043 OpSD->clearDirectDependencies();
21044 if (RegionHasStackSave ||
21045 !isGuaranteedToTransferExecutionToSuccessor(I))
21046 ControlDependentMembers.push_back(OpSD);
21047 }
21048 continue;
21049 }
21050 ScheduledBundles.find(I)->getSecond().pop_back();
21051 }
21052 if (!ControlDependentMembers.empty()) {
21053 ScheduleBundle Invalid = ScheduleBundle::invalid();
21054 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21055 ControlDependentMembers);
21056 }
21057 return std::nullopt;
21058 }
21059 return &Bundle;
21060}
21061
21062BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21063 // Allocate a new ScheduleData for the instruction.
21064 if (ChunkPos >= ChunkSize) {
21065 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21066 ChunkPos = 0;
21067 }
21068 return &(ScheduleDataChunks.back()[ChunkPos++]);
21069}
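// Design note: ScheduleData objects are allocated in fixed-size chunks owned by
// ScheduleDataChunks, so pointers handed out earlier remain stable while the
// scheduling region keeps growing.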
21070
21071bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21072 Value *V, const InstructionsState &S) {
21073 Instruction *I = dyn_cast<Instruction>(V);
21074 assert(I && "bundle member must be an instruction");
21075 if (getScheduleData(I))
21076 return true;
21077 if (!ScheduleStart) {
21078 // It's the first instruction in the new region.
21079 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21080 ScheduleStart = I;
21081 ScheduleEnd = I->getNextNode();
21082 assert(ScheduleEnd && "tried to vectorize a terminator?");
21083 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21084 return true;
21085 }
21086 // Search up and down at the same time, because we don't know if the new
21087 // instruction is above or below the existing scheduling region.
21088 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are not
21089 // counted against the budget. Otherwise debug info could affect codegen.
21090 BasicBlock::reverse_iterator UpIter =
21091 ++ScheduleStart->getIterator().getReverse();
21092 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21093 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21094 BasicBlock::iterator LowerEnd = BB->end();
21095 auto IsAssumeLikeIntr = [](const Instruction &I) {
21096 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21097 return II->isAssumeLikeIntrinsic();
21098 return false;
21099 };
21100 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21101 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21102 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21103 &*DownIter != I) {
21104 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21105 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21106 return false;
21107 }
21108
21109 ++UpIter;
21110 ++DownIter;
21111
21112 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21113 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21114 }
21115 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21116 assert(I->getParent() == ScheduleStart->getParent() &&
21117 "Instruction is in wrong basic block.");
21118 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21119 ScheduleStart = I;
21120 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21121 << "\n");
21122 return true;
21123 }
21124 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21125 "Expected to reach top of the basic block or instruction down the "
21126 "lower end.");
21127 assert(I->getParent() == ScheduleEnd->getParent() &&
21128 "Instruction is in wrong basic block.");
21129 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21130 nullptr);
21131 ScheduleEnd = I->getNextNode();
21132 assert(ScheduleEnd && "tried to vectorize a terminator?");
21133 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21134 return true;
21135}
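// Note: the region is extended by scanning upwards and downwards at the same
// time, so the cost of adding an instruction is proportional to its distance
// from the current region, and ScheduleRegionSizeLimit bounds the total growth.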
21136
21137void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21138 Instruction *ToI,
21139 ScheduleData *PrevLoadStore,
21140 ScheduleData *NextLoadStore) {
21141 ScheduleData *CurrentLoadStore = PrevLoadStore;
21142 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21143 // No need to allocate data for non-schedulable instructions.
21144 if (isa<PHINode>(I))
21145 continue;
21146 ScheduleData *SD = ScheduleDataMap.lookup(I);
21147 if (!SD) {
21148 SD = allocateScheduleDataChunks();
21149 ScheduleDataMap[I] = SD;
21150 }
21151 assert(!isInSchedulingRegion(*SD) &&
21152 "new ScheduleData already in scheduling region");
21153 SD->init(SchedulingRegionID, I);
21154
21155 if (I->mayReadOrWriteMemory() &&
21156 (!isa<IntrinsicInst>(I) ||
21157 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21158 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21159 Intrinsic::pseudoprobe))) {
21160 // Update the linked list of memory accessing instructions.
21161 if (CurrentLoadStore) {
21162 CurrentLoadStore->setNextLoadStore(SD);
21163 } else {
21164 FirstLoadStoreInRegion = SD;
21165 }
21166 CurrentLoadStore = SD;
21167 }
21168
21169 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21170 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21171 RegionHasStackSave = true;
21172 }
21173 if (NextLoadStore) {
21174 if (CurrentLoadStore)
21175 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21176 } else {
21177 LastLoadStoreInRegion = CurrentLoadStore;
21178 }
21179}
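// Note: FirstLoadStoreInRegion/LastLoadStoreInRegion together with the
// per-node NextLoadStore links form a singly linked list of memory-accessing
// instructions; calculateDependencies() walks this list when adding memory
// dependencies.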
21180
21181void BoUpSLP::BlockScheduling::calculateDependencies(
21182 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21183 ArrayRef<ScheduleData *> ControlDeps) {
21184 SmallVector<ScheduleEntity *> WorkList;
21185 auto ProcessNode = [&](ScheduleEntity *SE) {
21186 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21187 if (CD->hasValidDependencies())
21188 return;
21189 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21190 CD->initDependencies();
21191 CD->resetUnscheduledDeps();
21192 const EdgeInfo &EI = CD->getEdgeInfo();
21193 if (EI.UserTE) {
21194 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21195 const auto *It = find(Op, CD->getInst());
21196 assert(It != Op.end() && "Lane not set");
21197 SmallPtrSet<Instruction *, 4> Visited;
21198 do {
21199 int Lane = std::distance(Op.begin(), It);
21200 assert(Lane >= 0 && "Lane not set");
21201 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21202 !EI.UserTE->ReorderIndices.empty())
21203 Lane = EI.UserTE->ReorderIndices[Lane];
21204 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21205 "Couldn't find extract lane");
21206 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21207 if (EI.UserTE->isCopyableElement(In)) {
21208 // We may not have related copyable scheduling data, if the
21209 // instruction is non-schedulable.
21210 if (ScheduleCopyableData *UseSD =
21211 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21212 CD->incDependencies();
21213 if (!UseSD->isScheduled())
21214 CD->incrementUnscheduledDeps(1);
21215 if (!UseSD->hasValidDependencies() ||
21216 (InsertInReadyList && UseSD->isReady()))
21217 WorkList.push_back(UseSD);
21218 }
21219 } else if (Visited.insert(In).second) {
21220 if (ScheduleData *UseSD = getScheduleData(In)) {
21221 CD->incDependencies();
21222 if (!UseSD->isScheduled())
21223 CD->incrementUnscheduledDeps(1);
21224 if (!UseSD->hasValidDependencies() ||
21225 (InsertInReadyList && UseSD->isReady()))
21226 WorkList.push_back(UseSD);
21227 }
21228 }
21229 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21230 } while (It != Op.end());
21231 if (CD->isReady() && CD->getDependencies() == 0 &&
21232 (EI.UserTE->hasState() &&
21233 (EI.UserTE->getMainOp()->getParent() !=
21234 CD->getInst()->getParent() ||
21235 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21236 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21237 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21238 auto *IU = dyn_cast<Instruction>(U);
21239 if (!IU)
21240 return true;
21241 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21242 })))))) {
21243 // If no uses in the block - mark as having pseudo-use, which cannot
21244 // be scheduled.
21245 // Prevents incorrect def-use tracking between external user and
21246 // actual instruction.
21247 CD->incDependencies();
21248 CD->incrementUnscheduledDeps(1);
21249 }
21250 }
21251 return;
21252 }
21253 auto *BundleMember = cast<ScheduleData>(SE);
21254 if (BundleMember->hasValidDependencies())
21255 return;
21256 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21257 BundleMember->initDependencies();
21258 BundleMember->resetUnscheduledDeps();
21259 // Handle def-use chain dependencies.
21260 SmallDenseMap<Value *, unsigned> UserToNumOps;
21261 for (User *U : BundleMember->getInst()->users()) {
21262 if (isa<PHINode>(U))
21263 continue;
21264 if (ScheduleData *UseSD = getScheduleData(U)) {
21265 // The operand is a copyable element - skip.
21266 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21267 ++NumOps;
21268 if (areAllOperandsReplacedByCopyableData(
21269 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21270 continue;
21271 BundleMember->incDependencies();
21272 if (!UseSD->isScheduled())
21273 BundleMember->incrementUnscheduledDeps(1);
21274 if (!UseSD->hasValidDependencies() ||
21275 (InsertInReadyList && UseSD->isReady()))
21276 WorkList.push_back(UseSD);
21277 }
21278 }
21279 for (ScheduleCopyableData *UseSD :
21280 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21281 BundleMember->incDependencies();
21282 if (!UseSD->isScheduled())
21283 BundleMember->incrementUnscheduledDeps(1);
21284 if (!UseSD->hasValidDependencies() ||
21285 (InsertInReadyList && UseSD->isReady()))
21286 WorkList.push_back(UseSD);
21287 }
21288
21289 SmallPtrSet<const Instruction *, 4> Visited;
21290 auto MakeControlDependent = [&](Instruction *I) {
21291 // Do not mark control dependent twice.
21292 if (!Visited.insert(I).second)
21293 return;
21294 auto *DepDest = getScheduleData(I);
21295 assert(DepDest && "must be in schedule window");
21296 DepDest->addControlDependency(BundleMember);
21297 BundleMember->incDependencies();
21298 if (!DepDest->isScheduled())
21299 BundleMember->incrementUnscheduledDeps(1);
21300 if (!DepDest->hasValidDependencies() ||
21301 (InsertInReadyList && DepDest->isReady()))
21302 WorkList.push_back(DepDest);
21303 };
21304
21305 // Any instruction which isn't safe to speculate at the beginning of the
21306 // block is control dependent on any early exit or non-willreturn call
21307 // which precedes it.
21308 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21309 for (Instruction *I = BundleMember->getInst()->getNextNode();
21310 I != ScheduleEnd; I = I->getNextNode()) {
21311 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21312 continue;
21313
21314 // Add the dependency
21315 MakeControlDependent(I);
21316
21317 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21318 // Everything past here must be control dependent on I.
21319 break;
21320 }
21321 }
21322
21323 if (RegionHasStackSave) {
21324 // If we have an inalloca alloca instruction, it needs to be scheduled
21325 // after any preceding stacksave. We also need to prevent any alloca
21326 // from reordering above a preceding stackrestore.
21327 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21328 match(BundleMember->getInst(),
21329 m_Intrinsic<Intrinsic::stackrestore>())) {
21330 for (Instruction *I = BundleMember->getInst()->getNextNode();
21331 I != ScheduleEnd; I = I->getNextNode()) {
21332 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21333 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21334 // Any allocas past here must be control dependent on I, and I
21335 // must be memory dependent on BundleMember->Inst.
21336 break;
21337
21338 if (!isa<AllocaInst>(I))
21339 continue;
21340
21341 // Add the dependency
21342 MakeControlDependent(I);
21343 }
21344 }
21345
21346 // In addition to the cases handled just above, we need to prevent
21347 // allocas and loads/stores from moving below a stacksave or a
21348 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21349 // thought to be merely conservative. Moving loads/stores below a stackrestore
21350 // can lead to incorrect code.
21351 if (isa<AllocaInst>(BundleMember->getInst()) ||
21352 BundleMember->getInst()->mayReadOrWriteMemory()) {
21353 for (Instruction *I = BundleMember->getInst()->getNextNode();
21354 I != ScheduleEnd; I = I->getNextNode()) {
21355 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21356 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21357 continue;
21358
21359 // Add the dependency
21360 MakeControlDependent(I);
21361 break;
21362 }
21363 }
21364 }
21365
21366 // Handle the memory dependencies (if any).
21367 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21368 if (!NextLoadStore)
21369 return;
21370 Instruction *SrcInst = BundleMember->getInst();
21371 assert(SrcInst->mayReadOrWriteMemory() &&
21372 "NextLoadStore list for non memory effecting bundle?");
21373 MemoryLocation SrcLoc = getLocation(SrcInst);
21374 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21375 unsigned NumAliased = 0;
21376 unsigned DistToSrc = 1;
21377 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21378
21379 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21380 DepDest = DepDest->getNextLoadStore()) {
21381 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21382
21383 // We have two limits to reduce the complexity:
21384 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21385 // SLP->isAliased (which is the expensive part in this loop).
21386 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21387 // the whole loop (even if the loop is fast, it's quadratic).
21388 // It's important for the loop break condition (see below) to
21389 // check this limit even between two read-only instructions.
21390 if (DistToSrc >= MaxMemDepDistance ||
21391 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21392 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21393 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21394
21395 // We increment the counter only if the locations are aliased
21396 // (instead of counting all alias checks). This gives a better
21397 // balance between reduced runtime and accurate dependencies.
21398 NumAliased++;
21399
21400 DepDest->addMemoryDependency(BundleMember);
21401 BundleMember->incDependencies();
21402 if (!DepDest->isScheduled())
21403 BundleMember->incrementUnscheduledDeps(1);
21404 if (!DepDest->hasValidDependencies() ||
21405 (InsertInReadyList && DepDest->isReady()))
21406 WorkList.push_back(DepDest);
21407 }
21408
21409 // Example, explaining the loop break condition: Let's assume our
21410 // starting instruction is i0 and MaxMemDepDistance = 3.
21411 //
21412 // +--------v--v--v
21413 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21414 // +--------^--^--^
21415 //
21416 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21417 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21418 // Previously we already added dependencies from i3 to i6,i7,i8
21419 // (because of MaxMemDepDistance). As we added a dependency from
21420 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21421 // and we can abort this loop at i6.
21422 if (DistToSrc >= 2 * MaxMemDepDistance)
21423 break;
21424 DistToSrc++;
21425 }
21426 };
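// Note: the combination of AliasedCheckLimit and MaxMemDepDistance above keeps
// the per-member memory-dependency walk bounded, at the price of occasionally
// adding conservative dependencies between locations that were never proven to
// alias.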
21427
21428 assert((Bundle || !ControlDeps.empty()) &&
21429 "expected at least one instruction to schedule");
21430 if (Bundle)
21431 WorkList.push_back(Bundle.getBundle().front());
21432 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21433 SmallPtrSet<ScheduleBundle *, 16> Visited;
21434 while (!WorkList.empty()) {
21435 ScheduleEntity *SD = WorkList.pop_back_val();
21436 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21437 ArrayRef<ScheduleBundle *> Bundles;
21438 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21439 CopyableBundle.push_back(&CD->getBundle());
21440 Bundles = CopyableBundle;
21441 } else {
21442 Bundles = getScheduleBundles(SD->getInst());
21443 }
21444 if (Bundles.empty()) {
21445 if (!SD->hasValidDependencies())
21446 ProcessNode(SD);
21447 if (InsertInReadyList && SD->isReady()) {
21448 ReadyInsts.insert(SD);
21449 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21450 }
21451 continue;
21452 }
21453 for (ScheduleBundle *Bundle : Bundles) {
21454 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21455 continue;
21456 assert(isInSchedulingRegion(*Bundle) &&
21457 "ScheduleData not in scheduling region");
21458 for_each(Bundle->getBundle(), ProcessNode);
21459 }
21460 if (InsertInReadyList && SD->isReady()) {
21461 for (ScheduleBundle *Bundle : Bundles) {
21462 assert(isInSchedulingRegion(*Bundle) &&
21463 "ScheduleData not in scheduling region");
21464 if (!Bundle->isReady())
21465 continue;
21466 ReadyInsts.insert(Bundle);
21467 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21468 << "\n");
21469 }
21470 }
21471 }
21472}
21473
21474void BoUpSLP::BlockScheduling::resetSchedule() {
21475 assert(ScheduleStart &&
21476 "tried to reset schedule on block which has not been scheduled");
21477 for_each(ScheduleDataMap, [&](auto &P) {
21478 if (BB != P.first->getParent())
21479 return;
21480 ScheduleData *SD = P.second;
21481 if (isInSchedulingRegion(*SD)) {
21482 SD->setScheduled(/*Scheduled=*/false);
21483 SD->resetUnscheduledDeps();
21484 }
21485 });
21486 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21487 for_each(P.second, [&](ScheduleCopyableData *SD) {
21488 if (isInSchedulingRegion(*SD)) {
21489 SD->setScheduled(/*Scheduled=*/false);
21490 SD->resetUnscheduledDeps();
21491 }
21492 });
21493 });
21494 for_each(ScheduledBundles, [&](auto &P) {
21495 for_each(P.second, [&](ScheduleBundle *Bundle) {
21496 if (isInSchedulingRegion(*Bundle))
21497 Bundle->setScheduled(/*Scheduled=*/false);
21498 });
21499 });
21500 // Reset schedule data for copyable elements.
21501 for (auto &P : ScheduleCopyableDataMap) {
21502 if (isInSchedulingRegion(*P.second)) {
21503 P.second->setScheduled(/*Scheduled=*/false);
21504 P.second->resetUnscheduledDeps();
21505 }
21506 }
21507 ReadyInsts.clear();
21508}
21509
21510void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21511 if (!BS->ScheduleStart)
21512 return;
21513
21514 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21515
21516 // A key point - if we got here, pre-scheduling was able to find a valid
21517 // scheduling of the sub-graph of the scheduling window which consists
21518 // of all vector bundles and their transitive users. As such, we do not
21519 // need to reschedule anything *outside of* that subgraph.
21520
21521 BS->resetSchedule();
21522
21523 // For the real scheduling we use a more sophisticated ready-list: it is
21524 // sorted by the original instruction location. This lets the final schedule
21525 // be as close as possible to the original instruction order.
21526 // WARNING: If changing this order causes a correctness issue, that means
21527 // there is some missing dependence edge in the schedule data graph.
21528 struct ScheduleDataCompare {
21529 bool operator()(const ScheduleEntity *SD1,
21530 const ScheduleEntity *SD2) const {
21531 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21532 }
21533 };
21534 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
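// Note: scheduling here proceeds bottom-up. An entity becomes ready once all of
// its in-region dependents (users, plus later control- and memory-dependent
// instructions) are scheduled; each picked instruction is moved immediately
// before the previously placed one, starting from ScheduleEnd, which keeps the
// emitted order close to the original source order.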
21535
21536 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21537 // and fill the ready-list with initial instructions.
21538 int Idx = 0;
21539 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21540 I = I->getNextNode()) {
21541 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21542 if (!Bundles.empty()) {
21543 for (ScheduleBundle *Bundle : Bundles) {
21544 Bundle->setSchedulingPriority(Idx++);
21545 if (!Bundle->hasValidDependencies())
21546 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21547 }
21548 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21549 for (ScheduleCopyableData *SD : reverse(SDs)) {
21550 ScheduleBundle &Bundle = SD->getBundle();
21551 Bundle.setSchedulingPriority(Idx++);
21552 if (!Bundle.hasValidDependencies())
21553 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21554 }
21555 continue;
21556 }
21558 BS->getScheduleCopyableDataUsers(I);
21559 if (ScheduleData *SD = BS->getScheduleData(I)) {
21560 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21561 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21562 SDTEs.front()->doesNotNeedToSchedule() ||
21563 doesNotNeedToBeScheduled(SD->getInst())) &&
21564 "scheduler and vectorizer bundle mismatch");
21565 SD->setSchedulingPriority(Idx++);
21566 if (!SD->hasValidDependencies() &&
21567 (!CopyableData.empty() ||
21568 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21569 assert(TE->isGather() && "expected gather node");
21570 return TE->hasState() && TE->hasCopyableElements() &&
21571 TE->isCopyableElement(I);
21572 }))) {
21573 // Need to calculate deps for these nodes to correctly handle copyable
21574 // dependencies, even if they were cancelled.
21575 // If copyables bundle was cancelled, the deps are cleared and need to
21576 // recalculate them.
21577 ScheduleBundle Bundle;
21578 Bundle.add(SD);
21579 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21580 }
21581 }
21582 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21583 ScheduleBundle &Bundle = SD->getBundle();
21584 Bundle.setSchedulingPriority(Idx++);
21585 if (!Bundle.hasValidDependencies())
21586 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21587 }
21588 }
21589 BS->initialFillReadyList(ReadyInsts);
21590
21591 Instruction *LastScheduledInst = BS->ScheduleEnd;
21592
21593 // Do the "real" scheduling.
21594 SmallPtrSet<Instruction *, 16> Scheduled;
21595 while (!ReadyInsts.empty()) {
21596 auto *Picked = *ReadyInsts.begin();
21597 ReadyInsts.erase(ReadyInsts.begin());
21598
21599 // Move the scheduled instruction(s) to their dedicated places, if not
21600 // there yet.
21601 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21602 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21603 Instruction *PickedInst = BundleMember->getInst();
21604 // If a copyable must be scheduled as part of something else, skip it.
21605 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21606 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21607 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21608 continue;
21609 if (PickedInst->getNextNode() != LastScheduledInst)
21610 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21611 LastScheduledInst = PickedInst;
21612 }
21613 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21614 LastScheduledInst);
21615 } else {
21616 auto *SD = cast<ScheduleData>(Picked);
21617 Instruction *PickedInst = SD->getInst();
21618 if (PickedInst->getNextNode() != LastScheduledInst)
21619 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21620 LastScheduledInst = PickedInst;
21621 }
21622 auto Invalid = InstructionsState::invalid();
21623 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21624 }
21625
21626 // Check that we didn't break any of our invariants.
21627#ifdef EXPENSIVE_CHECKS
21628 BS->verify();
21629#endif
21630
21631#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21632 // Check that all schedulable entities got scheduled
21633 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21634 I = I->getNextNode()) {
21635 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21636 assert(all_of(Bundles,
21637 [](const ScheduleBundle *Bundle) {
21638 return Bundle->isScheduled();
21639 }) &&
21640 "must be scheduled at this point");
21641 }
21642#endif
21643
21644 // Avoid duplicate scheduling of the block.
21645 BS->ScheduleStart = nullptr;
21646}
21647
21648 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21649 // If V is a store, just return the width of the stored value (or value
21650 // truncated just before storing) without traversing the expression tree.
21651 // This is the common case.
21652 if (auto *Store = dyn_cast<StoreInst>(V))
21653 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21654
21655 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21656 return getVectorElementSize(IEI->getOperand(1));
21657
21658 auto E = InstrElementSize.find(V);
21659 if (E != InstrElementSize.end())
21660 return E->second;
21661
21662 // If V is not a store, we can traverse the expression tree to find loads
21663 // that feed it. The type of the loaded value may indicate a more suitable
21664 // width than V's type. We want to base the vector element size on the width
21665 // of memory operations where possible.
21666 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21667 SmallPtrSet<Instruction *, 16> Visited;
21668 if (auto *I = dyn_cast<Instruction>(V)) {
21669 Worklist.emplace_back(I, I->getParent(), 0);
21670 Visited.insert(I);
21671 }
21672
21673 // Traverse the expression tree in bottom-up order looking for loads. If we
21674 // encounter an instruction we don't yet handle, we give up.
21675 auto Width = 0u;
21676 Value *FirstNonBool = nullptr;
21677 while (!Worklist.empty()) {
21678 auto [I, Parent, Level] = Worklist.pop_back_val();
21679
21680 // We should only be looking at scalar instructions here. If the current
21681 // instruction has a vector type, skip.
21682 auto *Ty = I->getType();
21683 if (isa<VectorType>(Ty))
21684 continue;
21685 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21686 FirstNonBool = I;
21687 if (Level > RecursionMaxDepth)
21688 continue;
21689
21690 // If the current instruction is a load, update MaxWidth to reflect the
21691 // width of the loaded value.
21692 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
21693 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21694
21695 // Otherwise, we need to visit the operands of the instruction. We only
21696 // handle the interesting cases from buildTree here. If an operand is an
21697 // instruction we haven't yet visited and from the same basic block as the
21698 // user or the use is a PHI node, we add it to the worklist.
21699 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
21700 BinaryOperator, UnaryOperator>(I)) {
21701 for (Use &U : I->operands()) {
21702 if (auto *J = dyn_cast<Instruction>(U.get()))
21703 if (Visited.insert(J).second &&
21704 (isa<PHINode>(I) || J->getParent() == Parent)) {
21705 Worklist.emplace_back(J, J->getParent(), Level + 1);
21706 continue;
21707 }
21708 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21709 FirstNonBool = U.get();
21710 }
21711 } else {
21712 break;
21713 }
21714 }
21715
21716 // If we didn't encounter a memory access in the expression tree, or if we
21717 // gave up for some reason, just return the width of V. Otherwise, return the
21718 // maximum width we found.
21719 if (!Width) {
21720 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21721 V = FirstNonBool;
21722 Width = DL->getTypeSizeInBits(V->getType());
21723 }
21724
21725 for (Instruction *I : Visited)
21726 InstrElementSize[I] = Width;
21727
21728 return Width;
21729}
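// Illustration: for "%l = load i8 ... ; %e = zext i8 %l to i32 ; store i32 %e"
// the store is answered directly with 32 bits, while querying the zext walks
// down to the load and returns 8; a smaller element size allows a larger
// maximum vectorization factor for the expression.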
21730
21731bool BoUpSLP::collectValuesToDemote(
21732 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21733 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21734 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21735 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21736 // We can always demote constants.
21737 if (all_of(E.Scalars, IsaPred<Constant>))
21738 return true;
21739
21740 unsigned OrigBitWidth =
21741 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21742 if (OrigBitWidth == BitWidth) {
21743 MaxDepthLevel = 1;
21744 return true;
21745 }
21746
21747 // Check if the node was analyzed already and must keep its original bitwidth.
21748 if (NodesToKeepBWs.contains(E.Idx))
21749 return false;
21750
21751 // If the value is not a vectorized instruction in the expression and not used
21752 // by the insertelement instruction and not used in multiple vector nodes, it
21753 // cannot be demoted.
21754 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21755 if (isa<PoisonValue>(R))
21756 return false;
21757 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21758 });
21759 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21760 if (isa<PoisonValue>(V))
21761 return true;
21762 if (getTreeEntries(V).size() > 1)
21763 return false;
21764 // For the last shuffle of sext/zext with many uses we need to check the extra
21765 // bit for unsigned values, otherwise we may get incorrect casting for reused
21766 // scalars.
21767 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21768 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21769 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21770 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21771 return true;
21772 }
21773 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21774 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21775 if (IsSignedNode)
21776 ++BitWidth1;
21777 if (auto *I = dyn_cast<Instruction>(V)) {
21778 APInt Mask = DB->getDemandedBits(I);
21779 unsigned BitWidth2 =
21780 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21781 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21782 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21783 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21784 break;
21785 BitWidth2 *= 2;
21786 }
21787 BitWidth1 = std::min(BitWidth1, BitWidth2);
21788 }
21789 BitWidth = std::max(BitWidth, BitWidth1);
21790 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21791 };
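// Worked example for IsPotentiallyTruncated: with OrigBitWidth = 32 and a value
// known to have 25 sign bits, BitWidth1 = 7 (8 for a signed node); combined
// with the demanded-bits estimate this becomes the number of low bits to keep,
// and demotion is only considered when OrigBitWidth >= 2 * BitWidth.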
21792 auto FinalAnalysis = [&, TTI = TTI]() {
21793 if (!IsProfitableToDemote)
21794 return false;
21795 bool Res = all_of(
21796 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21797 // Demote gathers.
21798 if (Res && E.isGather()) {
21799 if (E.hasState()) {
21800 if (const TreeEntry *SameTE =
21801 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21802 SameTE)
21803 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21804 ToDemote, Visited, NodesToKeepBWs,
21805 MaxDepthLevel, IsProfitableToDemote,
21806 IsTruncRoot)) {
21807 ToDemote.push_back(E.Idx);
21808 return true;
21809 }
21810 }
21811 // Check possible extractelement instructions bases and final vector
21812 // length.
21813 SmallPtrSet<Value *, 4> UniqueBases;
21814 for (Value *V : E.Scalars) {
21815 auto *EE = dyn_cast<ExtractElementInst>(V);
21816 if (!EE)
21817 continue;
21818 UniqueBases.insert(EE->getVectorOperand());
21819 }
21820 const unsigned VF = E.Scalars.size();
21821 Type *OrigScalarTy = E.Scalars.front()->getType();
21822 if (UniqueBases.size() <= 2 ||
21823 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21824 ::getNumberOfParts(
21825 *TTI,
21826 getWidenedType(
21827 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21828 VF))) {
21829 ToDemote.push_back(E.Idx);
21830 return true;
21831 }
21832 }
21833 return Res;
21834 };
21835 if (E.isGather() || !Visited.insert(&E).second ||
21836 any_of(E.Scalars, [&](Value *V) {
21837 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
21838 return isa<InsertElementInst>(U) && !isVectorized(U);
21839 });
21840 }))
21841 return FinalAnalysis();
21842
21843 if (any_of(E.Scalars, [&](Value *V) {
21844 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
21845 return isVectorized(U) ||
21846 (E.Idx == 0 && UserIgnoreList &&
21847 UserIgnoreList->contains(U)) ||
21848 (!isa<CmpInst>(U) && U->getType()->isSized() &&
21849 !U->getType()->isScalableTy() &&
21850 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
21851 }) && !IsPotentiallyTruncated(V, BitWidth);
21852 }))
21853 return false;
21854
21855 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
21856 bool &NeedToExit) {
21857 NeedToExit = false;
21858 unsigned InitLevel = MaxDepthLevel;
21859 for (const TreeEntry *Op : Operands) {
21860 unsigned Level = InitLevel;
21861 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
21862 ToDemote, Visited, NodesToKeepBWs, Level,
21863 IsProfitableToDemote, IsTruncRoot)) {
21864 if (!IsProfitableToDemote)
21865 return false;
21866 NeedToExit = true;
21867 if (!FinalAnalysis())
21868 return false;
21869 continue;
21870 }
21871 MaxDepthLevel = std::max(MaxDepthLevel, Level);
21872 }
21873 return true;
21874 };
21875 auto AttemptCheckBitwidth =
21876 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
21877 // Try all bitwidth < OrigBitWidth.
21878 NeedToExit = false;
21879 unsigned BestFailBitwidth = 0;
21880 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
21881 if (Checker(BitWidth, OrigBitWidth))
21882 return true;
21883 if (BestFailBitwidth == 0 && FinalAnalysis())
21884 BestFailBitwidth = BitWidth;
21885 }
21886 if (BitWidth >= OrigBitWidth) {
21887 if (BestFailBitwidth == 0) {
21888 BitWidth = OrigBitWidth;
21889 return false;
21890 }
21891 MaxDepthLevel = 1;
21892 BitWidth = BestFailBitwidth;
21893 NeedToExit = true;
21894 return true;
21895 }
21896 return false;
21897 };
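// Note: AttemptCheckBitwidth doubles BitWidth (e.g. 8 -> 16 -> 32) until the
// per-opcode Checker accepts a width or OrigBitWidth is reached; if nothing is
// accepted, BitWidth is restored to OrigBitWidth, or to the best width for
// which FinalAnalysis() succeeded.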
21898 auto TryProcessInstruction =
21899 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
21900 function_ref<bool(unsigned, unsigned)> Checker = {}) {
21901 if (Operands.empty()) {
21902 if (!IsTruncRoot)
21903 MaxDepthLevel = 1;
21904 for (Value *V : E.Scalars)
21905 (void)IsPotentiallyTruncated(V, BitWidth);
21906 } else {
21907 // Several vectorized uses? Check if we can truncate it, otherwise -
21908 // exit.
21909 if (any_of(E.Scalars, [&](Value *V) {
21910 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
21911 }))
21912 return false;
21913 bool NeedToExit = false;
21914 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
21915 return false;
21916 if (NeedToExit)
21917 return true;
21918 if (!ProcessOperands(Operands, NeedToExit))
21919 return false;
21920 if (NeedToExit)
21921 return true;
21922 }
21923
21924 ++MaxDepthLevel;
21925 // Record the entry that we can demote.
21926 ToDemote.push_back(E.Idx);
21927 return IsProfitableToDemote;
21928 };
21929
21930 if (E.State == TreeEntry::SplitVectorize)
21931 return TryProcessInstruction(
21932 BitWidth,
21933 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
21934 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
21935
21936 switch (E.getOpcode()) {
21937
21938 // We can always demote truncations and extensions. Since truncations can
21939 // seed additional demotion, we save the truncated value.
21940 case Instruction::Trunc:
21941 if (IsProfitableToDemoteRoot)
21942 IsProfitableToDemote = true;
21943 return TryProcessInstruction(BitWidth);
21944 case Instruction::ZExt:
21945 case Instruction::SExt:
21946 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
21947 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
21948 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
21949 return false;
21950 IsProfitableToDemote = true;
21951 return TryProcessInstruction(BitWidth);
21952
21953 // We can demote certain binary operations if we can demote both of their
21954 // operands.
21955 case Instruction::Add:
21956 case Instruction::Sub:
21957 case Instruction::Mul:
21958 case Instruction::And:
21959 case Instruction::Or:
21960 case Instruction::Xor: {
21961 return TryProcessInstruction(
21962 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
21963 }
21964 case Instruction::Freeze:
21965 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
21966 case Instruction::Shl: {
21967 // If we are truncating the result of this SHL, and if it's a shift of an
21968 // in-range amount, we can always perform a SHL in a smaller type.
21969 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
21970 return all_of(E.Scalars, [&](Value *V) {
21971 if (isa<PoisonValue>(V))
21972 return true;
21973 auto *I = cast<Instruction>(V);
21974 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21975 return AmtKnownBits.getMaxValue().ult(BitWidth);
21976 });
21977 };
21978 return TryProcessInstruction(
21979 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
21980 }
21981 case Instruction::LShr: {
21982 // If this is a truncate of a logical shr, we can truncate it to a smaller
21983 // lshr iff we know that the bits we would otherwise be shifting in are
21984 // already zeros.
21985 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21986 return all_of(E.Scalars, [&](Value *V) {
21987 if (isa<PoisonValue>(V))
21988 return true;
21989 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21990 if (E.isCopyableElement(V))
21991 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
21992 auto *I = cast<Instruction>(V);
21993 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21994 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
21995 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
21996 SimplifyQuery(*DL));
21997 });
21998 };
21999 return TryProcessInstruction(
22000 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22001 LShrChecker);
22002 }
22003 case Instruction::AShr: {
22004 // If this is a truncate of an arithmetic shr, we can truncate it to a
22005 // smaller ashr iff we know that all the bits from the sign bit of the
22006 // original type and the sign bit of the truncate type are similar.
22007 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22008 return all_of(E.Scalars, [&](Value *V) {
22009 if (isa<PoisonValue>(V))
22010 return true;
22011 auto *I = cast<Instruction>(V);
22012 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22013 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22014 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22015 ShiftedBits <
22016 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22017 });
22018 };
22019 return TryProcessInstruction(
22020 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22021 AShrChecker);
22022 }
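// Example for the AShr case: truncating a 32-bit ashr to 16 bits requires the
// shift amount to be known < 16 and the shifted operand to have more than 16
// sign bits, so the dropped high bits are all copies of the sign bit.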
22023 case Instruction::UDiv:
22024 case Instruction::URem: {
22025 // UDiv and URem can be truncated if all the truncated bits are zero.
22026 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22027 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22028 return all_of(E.Scalars, [&](Value *V) {
22029 auto *I = cast<Instruction>(V);
22030 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22031 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22032 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22033 });
22034 };
22035 return TryProcessInstruction(
22036 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22037 }
22038
22039 // We can demote selects if we can demote their true and false values.
22040 case Instruction::Select: {
22041 return TryProcessInstruction(
22042 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22043 }
22044
22045 // We can demote phis if we can demote all their incoming operands.
22046 case Instruction::PHI: {
22047 const unsigned NumOps = E.getNumOperands();
22048 SmallVector<const TreeEntry *> Ops(NumOps);
22049 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22050 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22051
22052 return TryProcessInstruction(BitWidth, Ops);
22053 }
22054
22055 case Instruction::Call: {
22056 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22057 if (!IC)
22058 break;
22059 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22060 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22061 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22062 break;
22063 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22064 function_ref<bool(unsigned, unsigned)> CallChecker;
22065 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22066 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22067 return all_of(E.Scalars, [&](Value *V) {
22068 auto *I = cast<Instruction>(V);
22069 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22070 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22071 return MaskedValueIsZero(I->getOperand(0), Mask,
22072 SimplifyQuery(*DL)) &&
22073 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22074 }
22075 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22076 "Expected min/max intrinsics only.");
22077 unsigned SignBits = OrigBitWidth - BitWidth;
22078 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22079 unsigned Op0SignBits =
22080 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22081 unsigned Op1SignBits =
22082 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22083 return SignBits <= Op0SignBits &&
22084 ((SignBits != Op0SignBits &&
22085 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22086 MaskedValueIsZero(I->getOperand(0), Mask,
22087 SimplifyQuery(*DL))) &&
22088 SignBits <= Op1SignBits &&
22089 ((SignBits != Op1SignBits &&
22090 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22091 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22092 });
22093 };
22094 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22095 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22096 return all_of(E.Scalars, [&](Value *V) {
22097 auto *I = cast<Instruction>(V);
22098 unsigned SignBits = OrigBitWidth - BitWidth;
22099 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22100 unsigned Op0SignBits =
22101 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22102 return SignBits <= Op0SignBits &&
22103 ((SignBits != Op0SignBits &&
22104 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22105 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22106 });
22107 };
22108 if (ID != Intrinsic::abs) {
22109 Operands.push_back(getOperandEntry(&E, 1));
22110 CallChecker = CompChecker;
22111 } else {
22112 CallChecker = AbsChecker;
22113 }
22114 InstructionCost BestCost =
22115 std::numeric_limits<InstructionCost::CostType>::max();
22116 unsigned BestBitWidth = BitWidth;
22117 unsigned VF = E.Scalars.size();
22118 // Choose the best bitwidth based on cost estimations.
22119 auto Checker = [&](unsigned BitWidth, unsigned) {
22120 unsigned MinBW = PowerOf2Ceil(BitWidth);
22121 SmallVector<Type *> ArgTys =
22122 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22123 auto VecCallCosts = getVectorCallCosts(
22124 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22125 TTI, TLI, ArgTys);
22126 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22127 if (Cost < BestCost) {
22128 BestCost = Cost;
22129 BestBitWidth = BitWidth;
22130 }
22131 return false;
22132 };
22133 [[maybe_unused]] bool NeedToExit;
22134 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22135 BitWidth = BestBitWidth;
22136 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22137 }
22138
22139 // Otherwise, conservatively give up.
22140 default:
22141 break;
22142 }
22143 MaxDepthLevel = 1;
22144 return FinalAnalysis();
22145}
22146
22147static RecurKind getRdxKind(Value *V);
22148
22150 // We only attempt to truncate integer expressions.
22151 bool IsStoreOrInsertElt =
22152 VectorizableTree.front()->hasState() &&
22153 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22154 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22155 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22156 ExtraBitWidthNodes.size() <= 1 &&
22157 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22158 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22159 return;
22160
22161 unsigned NodeIdx = 0;
22162 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22163 NodeIdx = 1;
22164
22165 // Ensure the roots of the vectorizable tree don't form a cycle.
22166 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22167 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22168 "Unexpected tree is graph.");
22169
22170 // If the first value node for a store/insertelement is a sext/zext/trunc,
22171 // skip it and resize to the final type.
22172 bool IsTruncRoot = false;
22173 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22174 SmallVector<unsigned> RootDemotes;
22175 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22176 if (NodeIdx != 0 &&
22177 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22178 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22179 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22180 IsTruncRoot = true;
22181 RootDemotes.push_back(NodeIdx);
22182 IsProfitableToDemoteRoot = true;
22183 ++NodeIdx;
22184 }
22185
22186 // The reduction was already analyzed and found not profitable - exit.
22187 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22188 return;
22189
22190 SmallVector<unsigned> ToDemote;
22191 auto ComputeMaxBitWidth =
22192 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22193 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22194 ToDemote.clear();
22195 // If the root is a trunc and the next node is a gather/buildvector, keep
22196 // the trunc in scalars, which is free in most cases.
22197 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22198 !NodesToKeepBWs.contains(E.Idx) &&
22199 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22200 all_of(E.Scalars, [&](Value *V) {
22201 return V->hasOneUse() || isa<Constant>(V) ||
22202 (!V->hasNUsesOrMore(UsesLimit) &&
22203 none_of(V->users(), [&](User *U) {
22204 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22205 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22206 if (TEs.empty() || is_contained(TEs, UserTE))
22207 return false;
22208 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22209 SelectInst>(U) ||
22210 isa<SIToFPInst, UIToFPInst>(U) ||
22211 (UserTE->hasState() &&
22212 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22213 SelectInst>(UserTE->getMainOp()) ||
22214 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22215 return true;
22216 unsigned UserTESz = DL->getTypeSizeInBits(
22217 UserTE->Scalars.front()->getType());
22218 if (all_of(TEs, [&](const TreeEntry *TE) {
22219 auto It = MinBWs.find(TE);
22220 return It != MinBWs.end() &&
22221 It->second.first > UserTESz;
22222 }))
22223 return true;
22224 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22225 }));
22226 })) {
22227 ToDemote.push_back(E.Idx);
22228 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22229 auto It = MinBWs.find(UserTE);
22230 if (It != MinBWs.end())
22231 return It->second.first;
22232 unsigned MaxBitWidth =
22233 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22234 MaxBitWidth = bit_ceil(MaxBitWidth);
22235 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22236 MaxBitWidth = 8;
22237 return MaxBitWidth;
22238 }
22239
22240 if (!E.hasState())
22241 return 0u;
22242
22243 unsigned VF = E.getVectorFactor();
22244 Type *ScalarTy = E.Scalars.front()->getType();
22245 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22246 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22247 if (!TreeRootIT)
22248 return 0u;
22249
22250 if (any_of(E.Scalars,
22251 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22252 return 0u;
22253
22254 unsigned NumParts = ::getNumberOfParts(
22255 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22256
22257 // The maximum bit width required to represent all the values that can be
22258 // demoted without loss of precision. It would be safe to truncate the roots
22259 // of the expression to this width.
22260 unsigned MaxBitWidth = 1u;
22261
22262 // True if the roots can be zero-extended back to their original type,
22263 // rather than sign-extended. We know that if the leading bits are not
22264 // demanded, we can safely zero-extend. So IsKnownPositive is true only
22265 // when the sign bit of every root is known to be zero.
22266 // For signed comparisons we conservatively treat the roots as possibly
22267 // negative, so IsKnownPositive is set to false.
22268 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22269 if (isa<PoisonValue>(R))
22270 return true;
22271 KnownBits Known = computeKnownBits(R, *DL);
22272 return Known.isNonNegative();
22273 });
22274
22275 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22276 E.UserTreeIndex.UserTE->hasState() &&
22277 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22278 MaxBitWidth =
22279 std::min(DL->getTypeSizeInBits(
22280 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22281 DL->getTypeSizeInBits(ScalarTy));
22282
22283 // We first check if all the bits of the roots are demanded. If they're not,
22284 // we can truncate the roots to this narrower type.
22285 for (Value *Root : E.Scalars) {
22286 if (isa<PoisonValue>(Root))
22287 continue;
22288 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22289 TypeSize NumTypeBits =
22290 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22291 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22292 // If we can't prove that the sign bit is zero, we must add one to the
22293 // maximum bit width to account for the unknown sign bit. This preserves
22294 // the existing sign bit so we can safely sign-extend the root back to the
22295 // original type. Otherwise, if we know the sign bit is zero, we will
22296 // zero-extend the root instead.
22297 //
22298 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22299 // one to the maximum bit width will yield a larger-than-necessary
22300 // type. In general, we need to add an extra bit only if we can't
22301 // prove that the upper bit of the original type is equal to the
22302 // upper bit of the proposed smaller type. If these two bits are
22303 // the same (either zero or one) we know that sign-extending from
22304 // the smaller type will result in the same value. Here, since we
22305 // can't yet prove this, we are just making the proposed smaller
22306 // type larger to ensure correctness.
22307 if (!IsKnownPositive)
22308 ++BitWidth1;
22309
22310 auto *I = dyn_cast<Instruction>(Root);
22311 if (!I) {
22312 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22313 continue;
22314 }
22315 APInt Mask = DB->getDemandedBits(I);
22316 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22317 MaxBitWidth =
22318 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22319 }
22320
22321 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22322 MaxBitWidth = 8;
22323
22324 // If the original type is large but the reduced type does not improve
22325 // register usage - ignore it.
22326 if (NumParts > 1 &&
22327 NumParts ==
22328 ::getNumberOfParts(
22329 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22330 bit_ceil(MaxBitWidth)),
22331 VF)))
22332 return 0u;
22333
22334 unsigned Opcode = E.getOpcode();
22335 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22336 Opcode == Instruction::SExt ||
22337 Opcode == Instruction::ZExt || NumParts > 1;
22338 // Conservatively determine if we can actually truncate the roots of the
22339 // expression. Collect the values that can be demoted in ToDemote and
22340 // remember the nodes that have already been analyzed in Visited.
22341 DenseSet<const TreeEntry *> Visited;
22342 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22343 bool NeedToDemote = IsProfitableToDemote;
22344
22345 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22346 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22347 NeedToDemote, IsTruncRoot) ||
22348 (MaxDepthLevel <= Limit &&
22349 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22350 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22351 DL->getTypeSizeInBits(TreeRootIT) /
22352 DL->getTypeSizeInBits(
22353 E.getMainOp()->getOperand(0)->getType()) >
22354 2)))))
22355 return 0u;
22356 // Round MaxBitWidth up to the next power-of-two.
22357 MaxBitWidth = bit_ceil(MaxBitWidth);
22358
22359 return MaxBitWidth;
22360 };
22361
22362 // If we can truncate the root, we must collect additional values that might
22363 // be demoted as a result. That is, those seeded by truncations we will
22364 // modify.
22365 // Add reduction ops sizes, if any.
22366 if (UserIgnoreList &&
22367 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22368 // Convert vector_reduce_add(ZExt(<n x i1>)) to
22369 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
22370 if (all_of(*UserIgnoreList,
22371 [](Value *V) {
22372 return isa<PoisonValue>(V) ||
22373 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22374 }) &&
22375 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22376 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22377 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22378 Builder.getInt1Ty()) {
22379 ReductionBitWidth = 1;
22380 } else {
22381 for (Value *V : *UserIgnoreList) {
22382 if (isa<PoisonValue>(V))
22383 continue;
22384 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22385 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22386 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22387 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22388 ++BitWidth1;
22389 unsigned BitWidth2 = BitWidth1;
22390 if (auto *I = dyn_cast<Instruction>(V)) {
22391 APInt Mask = DB->getDemandedBits(I);
22392 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22393 }
22394 ReductionBitWidth =
22395 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22396 }
22397 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22398 ReductionBitWidth = 8;
22399
22400 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22401 }
22402 }
22403 bool IsTopRoot = NodeIdx == 0;
22404 while (NodeIdx < VectorizableTree.size() &&
22405 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22406 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22407 RootDemotes.push_back(NodeIdx);
22408 ++NodeIdx;
22409 IsTruncRoot = true;
22410 }
22411 bool IsSignedCmp = false;
22412 if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
22413 return match(V, m_SMin(m_Value(), m_Value())) ||
22414 match(V, m_SMax(m_Value(), m_Value()));
22415 }))
22416 IsSignedCmp = true;
22417 while (NodeIdx < VectorizableTree.size()) {
22418 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22419 unsigned Limit = 2;
22420 if (IsTopRoot &&
22421 ReductionBitWidth ==
22422 DL->getTypeSizeInBits(
22423 VectorizableTree.front()->Scalars.front()->getType()))
22424 Limit = 3;
22425 unsigned MaxBitWidth = ComputeMaxBitWidth(
22426 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22427 IsTruncRoot, IsSignedCmp);
22428 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22429 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22430 ReductionBitWidth = bit_ceil(MaxBitWidth);
22431 else if (MaxBitWidth == 0)
22432 ReductionBitWidth = 0;
22433 }
22434
22435 for (unsigned Idx : RootDemotes) {
22436 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22437 uint32_t OrigBitWidth =
22438 DL->getTypeSizeInBits(V->getType()->getScalarType());
22439 if (OrigBitWidth > MaxBitWidth) {
22440 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22441 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22442 }
22443 return false;
22444 }))
22445 ToDemote.push_back(Idx);
22446 }
22447 RootDemotes.clear();
22448 IsTopRoot = false;
22449 IsProfitableToDemoteRoot = true;
22450
22451 if (ExtraBitWidthNodes.empty()) {
22452 NodeIdx = VectorizableTree.size();
22453 } else {
22454 unsigned NewIdx = 0;
22455 do {
22456 NewIdx = *ExtraBitWidthNodes.begin();
22457 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22458 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22459 NodeIdx = NewIdx;
22460 IsTruncRoot =
22461 NodeIdx < VectorizableTree.size() &&
22462 VectorizableTree[NodeIdx]->UserTreeIndex &&
22463 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22464 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22465 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22466 Instruction::Trunc &&
22467 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22468 IsSignedCmp =
22469 NodeIdx < VectorizableTree.size() &&
22470 VectorizableTree[NodeIdx]->UserTreeIndex &&
22471 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22472 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22473 Instruction::ICmp &&
22474 any_of(
22475 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22476 [&](Value *V) {
22477 auto *IC = dyn_cast<ICmpInst>(V);
22478 return IC && (IC->isSigned() ||
22479 !isKnownNonNegative(IC->getOperand(0),
22480 SimplifyQuery(*DL)) ||
22481 !isKnownNonNegative(IC->getOperand(1),
22482 SimplifyQuery(*DL)));
22483 });
22484 }
22485
22486 // If the maximum bit width we compute is less than the width of the roots'
22487 // type, we can proceed with the narrowing. Otherwise, do nothing.
22488 if (MaxBitWidth == 0 ||
22489 MaxBitWidth >=
22490 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22491 ->getBitWidth()) {
22492 if (UserIgnoreList)
22493 AnalyzedMinBWVals.insert_range(TreeRoot);
22494 NodesToKeepBWs.insert_range(ToDemote);
22495 continue;
22496 }
22497
22498 // Finally, map the values we can demote to the maximum bit width we
22499 // computed.
22500 for (unsigned Idx : ToDemote) {
22501 TreeEntry *TE = VectorizableTree[Idx].get();
22502 if (MinBWs.contains(TE))
22503 continue;
22504 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22505 if (isa<PoisonValue>(R))
22506 return false;
22507 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22508 });
22509 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22510 }
22511 }
22512}
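// For illustration only (hypothetical IR, not taken from a test), a tree where
// the analysis above applies:
//   %a0 = zext i8 %x0 to i32
//   %a1 = zext i8 %x1 to i32
//   %s  = add i32 %a0, %a1
//   %t  = trunc i32 %s to i16
//   store i16 %t, ptr %p
// The demoted bit width of the add node can be bounded by the 16-bit store, so
// MinBWs would record a narrower width (rounded up to a power of two) for the
// demotable entries, and the vectorized tree can work on narrow lanes with a
// single extension or truncation left at the roots.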
22513
22514PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22515 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22516 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22517 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22518 auto *AA = &AM.getResult<AAManager>(F);
22519 auto *LI = &AM.getResult<LoopAnalysis>(F);
22520 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22521 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22522 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22523 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22524
22525 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22526 if (!Changed)
22527 return PreservedAnalyses::all();
22528
22529 PreservedAnalyses PA;
22530 PA.preserveSet<CFGAnalyses>();
22531 return PA;
22532}
22533
22534bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22535 TargetTransformInfo *TTI_,
22536 TargetLibraryInfo *TLI_, AAResults *AA_,
22537 LoopInfo *LI_, DominatorTree *DT_,
22538 AssumptionCache *AC_, DemandedBits *DB_,
22539 OptimizationRemarkEmitter *ORE_) {
22540 if (!RunSLPVectorization)
22541 return false;
22542 SE = SE_;
22543 TTI = TTI_;
22544 TLI = TLI_;
22545 AA = AA_;
22546 LI = LI_;
22547 DT = DT_;
22548 AC = AC_;
22549 DB = DB_;
22550 DL = &F.getDataLayout();
22551
22552 Stores.clear();
22553 GEPs.clear();
22554 bool Changed = false;
22555
22556 // If the target claims to have no vector registers, don't attempt
22557 // vectorization.
22558 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22559 LLVM_DEBUG(
22560 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22561 return false;
22562 }
22563
22564 // Don't vectorize when the attribute NoImplicitFloat is used.
22565 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22566 return false;
22567
22568 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22569
22570 // Use the bottom-up SLP vectorizer to construct chains that start with
22571 // store instructions.
22572 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22573
22574 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22575 // delete instructions.
22576
22577 // Update DFS numbers now so that we can use them for ordering.
22578 DT->updateDFSNumbers();
22579
22580 // Scan the blocks in the function in post order.
22581 for (auto *BB : post_order(&F.getEntryBlock())) {
22582 if (!DT->isReachableFromEntry(BB))
22583 continue;
22584
22585 // Start new block - clear the list of reduction roots.
22586 R.clearReductionData();
22587 collectSeedInstructions(BB);
22588
22589 // Vectorize trees that end at stores.
22590 if (!Stores.empty()) {
22591 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22592 << " underlying objects.\n");
22593 Changed |= vectorizeStoreChains(R);
22594 }
22595
22596 // Vectorize trees that end at reductions.
22597 Changed |= vectorizeChainsInBlock(BB, R);
22598
22599 // Vectorize the index computations of getelementptr instructions. This
22600 // is primarily intended to catch gather-like idioms ending at
22601 // non-consecutive loads.
22602 if (!GEPs.empty()) {
22603 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22604 << " underlying objects.\n");
22605 Changed |= vectorizeGEPIndices(BB, R);
22606 }
22607 }
22608
22609 if (Changed) {
22610 R.optimizeGatherSequence();
22611 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22612 }
22613 return Changed;
22614}
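// The pass can be exercised on its own with the new pass manager, e.g.:
//   opt -passes=slp-vectorizer -S input.ll
// Debugging knobs declared near the top of this file, such as
// -slp-threshold=<n> or -slp-vectorize-hor=false, can be appended to that
// command to tweak the cost threshold or the horizontal-reduction handling.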
22615
22616std::optional<bool>
22617SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22618 unsigned Idx, unsigned MinVF,
22619 unsigned &Size) {
22620 Size = 0;
22621 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22622 << "\n");
22623 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22624 unsigned VF = Chain.size();
22625
22626 if (!has_single_bit(Sz) ||
22627 !hasFullVectorsOrPowerOf2(
22628 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22629 VF) ||
22630 VF < 2 || VF < MinVF) {
22631 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22632 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22633 // all vector lanes are used.
22634 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22635 return false;
22636 }
22637
22638 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22639 << "\n");
22640
22641 SetVector<Value *> ValOps;
22642 for (Value *V : Chain)
22643 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22644 // If the operands do not share the same/alternate opcode, or the number of unique values is not a power-of-2 - exit.
22645 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22646 InstructionsState S = Analysis.buildInstructionsState(
22647 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22648 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22649 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22650 bool IsAllowedSize =
22651 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22652 ValOps.size()) ||
22653 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22654 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22655 (!S.getMainOp()->isSafeToRemove() ||
22656 any_of(ValOps.getArrayRef(),
22657 [&](Value *V) {
22658 return !isa<ExtractElementInst>(V) &&
22659 (V->getNumUses() > Chain.size() ||
22660 any_of(V->users(), [&](User *U) {
22661 return !Stores.contains(U);
22662 }));
22663 }))) ||
22664 (ValOps.size() > Chain.size() / 2 && !S)) {
22665 Size = (!IsAllowedSize && S) ? 1 : 2;
22666 return false;
22667 }
22668 }
22669 if (R.isLoadCombineCandidate(Chain))
22670 return true;
22671 R.buildTree(Chain);
22672 // Check if the tree is tiny and the store itself or its value is not vectorized.
22673 if (R.isTreeTinyAndNotFullyVectorizable()) {
22674 if (R.isGathered(Chain.front()) ||
22675 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22676 return std::nullopt;
22677 Size = R.getCanonicalGraphSize();
22678 return false;
22679 }
22680 if (R.isProfitableToReorder()) {
22681 R.reorderTopToBottom();
22682 R.reorderBottomToTop();
22683 }
22684 R.transformNodes();
22685 R.buildExternalUses();
22686
22687 R.computeMinimumValueSizes();
22688
22689 Size = R.getCanonicalGraphSize();
22690 if (S && S.getOpcode() == Instruction::Load)
22691 Size = 2; // cut off masked gather small trees
22692 InstructionCost Cost = R.getTreeCost();
22693
22694 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22695 if (Cost < -SLPCostThreshold) {
22696 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22697
22698 using namespace ore;
22699
22700 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22701 cast<StoreInst>(Chain[0]))
22702 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22703 << " and with tree size "
22704 << NV("TreeSize", R.getTreeSize()));
22705
22706 R.vectorizeTree();
22707 return true;
22708 }
22709
22710 return false;
22711}
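// Note on the profitability check above: vectorization is triggered only when
// Cost < -SLPCostThreshold, so with the default threshold a tree must have a
// strictly negative modeled cost. Passing e.g. -slp-threshold=-10 turns the
// comparison into Cost < 10 and admits trees the model considers up to 10
// units unprofitable, which is occasionally handy when reducing test cases.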
22712
22713/// Checks if the quadratic mean deviation is less than 90% of the mean size.
22714static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22715 bool First) {
22716 unsigned Num = 0;
22717 uint64_t Sum = std::accumulate(
22718 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22719 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22720 unsigned Size = First ? Val.first : Val.second;
22721 if (Size == 1)
22722 return V;
22723 ++Num;
22724 return V + Size;
22725 });
22726 if (Num == 0)
22727 return true;
22728 uint64_t Mean = Sum / Num;
22729 if (Mean == 0)
22730 return true;
22731 uint64_t Dev = std::accumulate(
22732 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22733 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22734 unsigned P = First ? Val.first : Val.second;
22735 if (P == 1)
22736 return V;
22737 return V + (P - Mean) * (P - Mean);
22738 }) /
22739 Num;
22740 return Dev * 96 / (Mean * Mean) == 0;
22741}
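// Worked example for the check above, with hypothetical tree sizes: for
// {4, 4, 4, 4} the mean is 4, the accumulated squared deviation is 0 and
// 0 * 96 / 16 == 0, so the sizes are considered uniform enough. For
// {4, 4, 4, 12} the mean is 6, the squared deviation is (4+4+4+36)/4 = 12 and
// 12 * 96 / 36 = 32 != 0, so the check fails. Entries equal to 1 (ranges that
// were not vectorized yet) do not participate in either sum.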
22742
22743namespace {
22744
22745/// A group of stores that we'll try to bundle together using vector ops.
22746/// They are ordered using the signed distance of their address operand to the
22747/// address of this group's BaseInstr.
22748class RelatedStoreInsts {
22749public:
22750 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22751 : AllStores(AllStores) {
22752 reset(BaseInstrIdx);
22753 }
22754
22755 void reset(unsigned NewBaseInstr) {
22756 assert(NewBaseInstr < AllStores.size() &&
22757 "Instruction index out of bounds");
22758 BaseInstrIdx = NewBaseInstr;
22759 Instrs.clear();
22760 insertOrLookup(NewBaseInstr, 0);
22761 }
22762
22763 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22764 /// \p PtrDist.
22765 /// Does nothing if there is already a store with that \p PtrDist.
22766 /// \returns The previously associated Instruction index, or std::nullopt
22767 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22768 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22769 return Inserted ? std::nullopt : std::make_optional(It->second);
22770 }
22771
22772 using DistToInstMap = std::map<int64_t, unsigned>;
22773 const DistToInstMap &getStores() const { return Instrs; }
22774
22775 /// If \p SI is related to this group of stores, return the distance of its
22776 /// pointer operand to that of the group's BaseInstr.
22777 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22778 ScalarEvolution &SE) const {
22779 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22780 return getPointersDiff(
22781 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22782 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22783 /*StrictCheck=*/true);
22784 }
22785
22786 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22787 /// Stores whose index is less than \p MinSafeIdx will be dropped.
22788 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22789 int64_t DistFromCurBase) {
22790 DistToInstMap PrevSet = std::move(Instrs);
22791 reset(NewBaseInstIdx);
22792
22793 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22794 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22795 // reference.
22796 for (auto [Dist, InstIdx] : PrevSet) {
22797 if (InstIdx >= MinSafeIdx)
22798 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22799 }
22800 }
22801
22802 /// Remove all stores that have been vectorized from this group.
22803 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22804 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22805 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22806 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22807 });
22808
22809 // Get a forward iterator pointing after the last vectorized store and erase
22810 // all stores before it so we don't try to vectorize them again.
22811 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22812 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22813 }
22814
22815private:
22816 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22817 unsigned BaseInstrIdx;
22818
22819 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22820 DistToInstMap Instrs;
22821
22822 /// Reference to all the stores in the BB being analyzed.
22823 ArrayRef<StoreInst *> AllStores;
22824};
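// For illustration (hypothetical IR): with AllStores = [store to %p+1,
// store to %p, store to %p+2] and the store to %p picked as the base
// instruction, getPointerDiff() returns 1, 0 and 2 respectively, so Instrs
// ends up as the ordered map {0 -> 1, 1 -> 0, 2 -> 2} from signed element
// distance to the index of the corresponding store in AllStores.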
22825
22826} // end anonymous namespace
22827
22828bool SLPVectorizerPass::vectorizeStores(
22829 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22830 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
22831 &Visited) {
22832 // We may run into multiple chains that merge into a single chain. We mark the
22833 // stores that we vectorized so that we don't visit the same store twice.
22834 BoUpSLP::ValueSet VectorizedStores;
22835 bool Changed = false;
22836
22837 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
22838 int64_t PrevDist = -1;
22839 SmallVector<Value *> Operands;
22840 // Collect the chain into a list.
22841 for (auto [Idx, Data] : enumerate(StoreSeq)) {
22842 auto &[Dist, InstIdx] = Data;
22843 if (Operands.empty() || Dist - PrevDist == 1) {
22844 Operands.push_back(Stores[InstIdx]);
22845 PrevDist = Dist;
22846 if (Idx != StoreSeq.size() - 1)
22847 continue;
22848 }
22849 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
22850 Operands.clear();
22851 Operands.push_back(Stores[InstIdx]);
22852 PrevDist = Dist;
22853 });
22854
22855 if (Operands.size() <= 1 ||
22856 !Visited
22857 .insert({Operands.front(),
22858 cast<StoreInst>(Operands.front())->getValueOperand(),
22859 Operands.back(),
22860 cast<StoreInst>(Operands.back())->getValueOperand(),
22861 Operands.size()})
22862 .second)
22863 continue;
22864
22865 unsigned MaxVecRegSize = R.getMaxVecRegSize();
22866 unsigned EltSize = R.getVectorElementSize(Operands[0]);
22867 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
22868
22869 unsigned MaxVF =
22870 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
22871 auto *Store = cast<StoreInst>(Operands[0]);
22872 Type *StoreTy = Store->getValueOperand()->getType();
22873 Type *ValueTy = StoreTy;
22874 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
22875 ValueTy = Trunc->getSrcTy();
22876 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
22877 // getStoreMinimumVF only supports scalar types as arguments. As a result,
22878 // we need to use the element types of StoreTy and ValueTy to retrieve the
22879 // VF and then transform it back.
22880 // Remember: VF is defined as the number of values we want to vectorize,
22881 // not the number of elements in the final vector.
22882 Type *StoreScalarTy = StoreTy->getScalarType();
22883 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
22884 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
22885 ValueTy->getScalarType()));
22886 MinVF /= getNumElements(StoreTy);
22887 MinVF = std::max<unsigned>(2, MinVF);
22888
22889 if (MaxVF < MinVF) {
22890 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22891 << ") < "
22892 << "MinVF (" << MinVF << ")\n");
22893 continue;
22894 }
22895
22896 unsigned NonPowerOf2VF = 0;
22897 if (VectorizeNonPowerOf2) {
22898 // First try vectorizing with a non-power-of-2 VF. At the moment, only
22899 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
22900 // lanes are used.
22901 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
22902 if (has_single_bit(CandVF + 1)) {
22903 NonPowerOf2VF = CandVF;
22904 assert(NonPowerOf2VF != MaxVF &&
22905 "Non-power-of-2 VF should not be equal to MaxVF");
22906 }
22907 }
22908
22909 // MaxRegVF represents the number of instructions (scalar, or vector in
22910 // case of revec) that can be vectorized to naturally fit in a vector
22911 // register.
22912 unsigned MaxRegVF = MaxVF;
22913
22914 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
22915 if (MaxVF < MinVF) {
22916 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22917 << ") < "
22918 << "MinVF (" << MinVF << ")\n");
22919 continue;
22920 }
22921
22922 SmallVector<unsigned> CandidateVFs;
22923 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
22924 VF = divideCeil(VF, 2))
22925 CandidateVFs.push_back(VF);
22926
22927 unsigned End = Operands.size();
22928 unsigned Repeat = 0;
22929 constexpr unsigned MaxAttempts = 4;
22930 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
22931 for (std::pair<unsigned, unsigned> &P : RangeSizes)
22932 P.first = P.second = 1;
22933 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
22934 auto IsNotVectorized = [](bool First,
22935 const std::pair<unsigned, unsigned> &P) {
22936 return First ? P.first > 0 : P.second > 0;
22937 };
22938 auto IsVectorized = [](bool First,
22939 const std::pair<unsigned, unsigned> &P) {
22940 return First ? P.first == 0 : P.second == 0;
22941 };
22942 auto VFIsProfitable = [](bool First, unsigned Size,
22943 const std::pair<unsigned, unsigned> &P) {
22944 return First ? Size >= P.first : Size >= P.second;
22945 };
22946 auto FirstSizeSame = [](unsigned Size,
22947 const std::pair<unsigned, unsigned> &P) {
22948 return Size == P.first;
22949 };
22950 while (true) {
22951 ++Repeat;
22952 bool RepeatChanged = false;
22953 bool AnyProfitableGraph = false;
22954 for (unsigned VF : CandidateVFs) {
22955 AnyProfitableGraph = false;
22956 unsigned FirstUnvecStore =
22957 std::distance(RangeSizes.begin(),
22958 find_if(RangeSizes, std::bind(IsNotVectorized,
22959 VF >= MaxRegVF, _1)));
22960
22961 // Form slices of size VF starting from FirstUnvecStore and try to
22962 // vectorize them.
22963 while (FirstUnvecStore < End) {
22964 unsigned FirstVecStore = std::distance(
22965 RangeSizes.begin(),
22966 find_if(RangeSizes.drop_front(FirstUnvecStore),
22967 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
22968 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
22969 for (unsigned SliceStartIdx = FirstUnvecStore;
22970 SliceStartIdx + VF <= MaxSliceEnd;) {
22971 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
22972 VF >= MaxRegVF)) {
22973 ++SliceStartIdx;
22974 continue;
22975 }
22976 ArrayRef<Value *> Slice =
22977 ArrayRef(Operands).slice(SliceStartIdx, VF);
22978 assert(all_of(Slice,
22979 [&](Value *V) {
22980 return cast<StoreInst>(V)
22981 ->getValueOperand()
22982 ->getType() ==
22983 cast<StoreInst>(Slice.front())
22984 ->getValueOperand()
22985 ->getType();
22986 }) &&
22987 "Expected all operands of same type.");
22988 if (!NonSchedulable.empty()) {
22989 auto [NonSchedSizeMax, NonSchedSizeMin] =
22990 NonSchedulable.lookup(Slice.front());
22991 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
22992 // VF is too ambitious. Try to vectorize another slice before
22993 // trying a smaller VF.
22994 SliceStartIdx += NonSchedSizeMax;
22995 continue;
22996 }
22997 }
22998 unsigned TreeSize;
22999 std::optional<bool> Res =
23000 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23001 if (!Res) {
23002 // Update the range of non schedulable VFs for slices starting
23003 // at SliceStartIdx.
23004 NonSchedulable
23005 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23006 .first->getSecond()
23007 .second = VF;
23008 } else if (*Res) {
23009 // Mark the vectorized stores so that we don't vectorize them
23010 // again.
23011 VectorizedStores.insert_range(Slice);
23014 AnyProfitableGraph = RepeatChanged = Changed = true;
23015 // If we vectorized initial block, no need to try to vectorize
23016 // it again.
23017 for (std::pair<unsigned, unsigned> &P :
23018 RangeSizes.slice(SliceStartIdx, VF))
23019 P.first = P.second = 0;
23020 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23021 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23022 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23023 P.first = P.second = 0;
23024 FirstUnvecStore = SliceStartIdx + VF;
23025 }
23026 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23027 for (std::pair<unsigned, unsigned> &P :
23028 RangeSizes.slice(SliceStartIdx + VF,
23029 MaxSliceEnd - (SliceStartIdx + VF)))
23030 P.first = P.second = 0;
23031 if (MaxSliceEnd == End)
23032 End = SliceStartIdx;
23033 MaxSliceEnd = SliceStartIdx;
23034 }
23035 SliceStartIdx += VF;
23036 continue;
23037 }
23038 if (VF > 2 && Res &&
23039 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23040 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23041 _1))) {
23042 SliceStartIdx += VF;
23043 continue;
23044 }
23045 // For very big VFs, check that we are not rebuilding the same trees,
23046 // just with a larger number of elements.
23047 if (VF > MaxRegVF && TreeSize > 1 &&
23048 all_of(RangeSizes.slice(SliceStartIdx, VF),
23049 std::bind(FirstSizeSame, TreeSize, _1))) {
23050 SliceStartIdx += VF;
23051 while (SliceStartIdx != MaxSliceEnd &&
23052 RangeSizes[SliceStartIdx].first == TreeSize)
23053 ++SliceStartIdx;
23054 continue;
23055 }
23056 if (TreeSize > 1) {
23057 for (std::pair<unsigned, unsigned> &P :
23058 RangeSizes.slice(SliceStartIdx, VF)) {
23059 if (VF >= MaxRegVF)
23060 P.second = std::max(P.second, TreeSize);
23061 else
23062 P.first = std::max(P.first, TreeSize);
23063 }
23064 }
23065 ++SliceStartIdx;
23066 AnyProfitableGraph = true;
23067 }
23068 if (FirstUnvecStore >= End)
23069 break;
23070 if (MaxSliceEnd - FirstUnvecStore < VF &&
23071 MaxSliceEnd - FirstUnvecStore >= MinVF)
23072 AnyProfitableGraph = true;
23073 FirstUnvecStore = std::distance(
23074 RangeSizes.begin(),
23075 find_if(RangeSizes.drop_front(MaxSliceEnd),
23076 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23077 }
23078 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23079 break;
23080 }
23081 // All values vectorized - exit.
23082 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23083 return P.first == 0 && P.second == 0;
23084 }))
23085 break;
23086 // Stop if all attempts have been tried, or if further attempts are not needed at all.
23087 if (Repeat >= MaxAttempts ||
23088 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23089 break;
23090 constexpr unsigned StoresLimit = 64;
23091 const unsigned MaxTotalNum = std::min<unsigned>(
23092 Operands.size(),
23093 static_cast<unsigned>(
23094 End -
23095 std::distance(
23096 RangeSizes.begin(),
23097 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23098 1));
23099 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23100 unsigned Limit =
23101 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23102 CandidateVFs.clear();
23103 if (bit_floor(Limit) == VF)
23104 CandidateVFs.push_back(Limit);
23105 if (VF > MaxTotalNum || VF >= StoresLimit)
23106 break;
23107 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23108 if (P.first != 0)
23109 P.first = std::max(P.second, P.first);
23110 }
23111 // Last attempt: try to vectorize the maximum number of elements, if all
23112 // previous attempts were unsuccessful because of cost issues.
23113 CandidateVFs.push_back(VF);
23114 }
23115 }
23116 };
23117
23118 /// Groups of stores to vectorize
23119 SmallVector<RelatedStoreInsts> SortedStores;
23120
23121 // Inserts the specified store SI with the given index Idx into the set of
23122 // stores. If a store with the same distance is already present - stop the
23123 // insertion and try to vectorize the stores found so far. If some stores
23124 // from this sequence were not vectorized - try to vectorize them together
23125 // with the new store later. This logic is applied only to the stores that
23126 // come before the previous store with the same distance.
23127 // Example:
23128 // 1. store x, %p
23129 // 2. store y, %p+1
23130 // 3. store z, %p+2
23131 // 4. store a, %p
23132 // 5. store b, %p+3
23133 // - Scan this from the last to first store. The very first bunch of stores is
23134 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23135 // vector).
23136 // - The next store in the list - #1 - has the same distance from store #5 as
23137 // the store #4.
23138 // - Try to vectorize sequence of stores 4,2,3,5.
23139 // - If all these stores are vectorized - just drop them.
23140 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23141 // - Start new stores sequence.
23142 // The new bunch of stores is {1, {1, 0}}.
23143 // - Add the stores from the previous sequence that were not vectorized.
23144 // Here we consider the stores in reverse of the order they appear in the
23145 // IR (Stores is reversed already, see the vectorizeStoreChains() function).
23146 // Store #3 can be added -> it comes after store #4 with the same distance
23147 // as store #1.
23148 // Store #5 cannot be added - it comes before store #4.
23149 // This logic improves compile time: we assume that the stores after the
23150 // previous store with the same distance most likely have memory
23151 // dependencies, so there is no need to waste compile time vectorizing them.
23152 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23153 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23154 std::optional<int64_t> PtrDist;
23155 auto *RelatedStores = find_if(
23156 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23157 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23158 return PtrDist.has_value();
23159 });
23160
23161 // We did not find a comparable store, start a new group.
23162 if (RelatedStores == SortedStores.end()) {
23163 SortedStores.emplace_back(Idx, Stores);
23164 return;
23165 }
23166
23167 // If there is already a store in the group with the same PtrDiff, try to
23168 // vectorize the existing instructions before adding the current store.
23169 // Otherwise, insert this store and keep collecting.
23170 if (std::optional<unsigned> PrevInst =
23171 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23172 TryToVectorize(RelatedStores->getStores());
23173 RelatedStores->clearVectorizedStores(VectorizedStores);
23174 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23175 /*NewBaseInstIdx=*/Idx,
23176 /*DistFromCurBase=*/*PtrDist);
23177 }
23178 };
23179 Type *PrevValTy = nullptr;
23180 for (auto [I, SI] : enumerate(Stores)) {
23181 if (R.isDeleted(SI))
23182 continue;
23183 if (!PrevValTy)
23184 PrevValTy = SI->getValueOperand()->getType();
23185 // Check that we do not try to vectorize stores of different types.
23186 if (PrevValTy != SI->getValueOperand()->getType()) {
23187 for (RelatedStoreInsts &StoreSeq : SortedStores)
23188 TryToVectorize(StoreSeq.getStores());
23189 SortedStores.clear();
23190 PrevValTy = SI->getValueOperand()->getType();
23191 }
23192 FillStoresSet(I, SI);
23193 }
23194
23195 // Final vectorization attempt.
23196 for (RelatedStoreInsts &StoreSeq : SortedStores)
23197 TryToVectorize(StoreSeq.getStores());
23198
23199 return Changed;
23200}
23201
23202void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23203 // Initialize the collections. We will make a single pass over the block.
23204 Stores.clear();
23205 GEPs.clear();
23206
23207 // Visit the store and getelementptr instructions in BB and organize them in
23208 // Stores and GEPs according to the underlying objects of their pointer
23209 // operands.
23210 for (Instruction &I : *BB) {
23211 // Ignore store instructions that are volatile or have a pointer operand
23212 // that doesn't point to a scalar type.
23213 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23214 if (!SI->isSimple())
23215 continue;
23216 if (!isValidElementType(SI->getValueOperand()->getType()))
23217 continue;
23218 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23219 }
23220
23221 // Ignore getelementptr instructions that have more than one index, a
23222 // constant index, or a pointer operand that doesn't point to a scalar
23223 // type.
23224 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23225 if (GEP->getNumIndices() != 1)
23226 continue;
23227 Value *Idx = GEP->idx_begin()->get();
23228 if (isa<Constant>(Idx))
23229 continue;
23230 if (!isValidElementType(Idx->getType()))
23231 continue;
23232 if (GEP->getType()->isVectorTy())
23233 continue;
23234 GEPs[GEP->getPointerOperand()].push_back(GEP);
23235 }
23236 }
23237}
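// For illustration, in a block such as (hypothetical IR):
//   store i32 %a, ptr %p                        ; collected into Stores
//   store volatile i32 %b, ptr %q               ; skipped: not simple
//   %g = getelementptr i32, ptr %r, i64 %i      ; collected into GEPs
//   %h = getelementptr i32, ptr %r, i64 4       ; skipped: constant index
// the first store is grouped in Stores under the underlying object of %p, and
// only %g is recorded in GEPs under its pointer operand %r.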
23238
23239bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23240 bool MaxVFOnly) {
23241 if (VL.size() < 2)
23242 return false;
23243
23244 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23245 << VL.size() << ".\n");
23246
23247 // Check that all of the parts are instructions of the same type;
23248 // we permit an alternate opcode via InstructionsState.
23249 InstructionsState S = getSameOpcode(VL, *TLI);
23250 if (!S)
23251 return false;
23252
23253 Instruction *I0 = S.getMainOp();
23254 // Make sure invalid types (including vector type) are rejected before
23255 // determining vectorization factor for scalar instructions.
23256 for (Value *V : VL) {
23257 Type *Ty = V->getType();
23258 if (!isValidElementType(Ty)) {
23259 // NOTE: the following will give the user an internal LLVM type name,
23260 // which may not be useful.
23261 R.getORE()->emit([&]() {
23262 std::string TypeStr;
23263 llvm::raw_string_ostream OS(TypeStr);
23264 Ty->print(OS);
23265 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23266 << "Cannot SLP vectorize list: type "
23267 << TypeStr + " is unsupported by vectorizer";
23268 });
23269 return false;
23270 }
23271 }
23272
23273 Type *ScalarTy = getValueType(VL[0]);
23274 unsigned Sz = R.getVectorElementSize(I0);
23275 unsigned MinVF = R.getMinVF(Sz);
23276 unsigned MaxVF = std::max<unsigned>(
23277 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23278 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23279 if (MaxVF < 2) {
23280 R.getORE()->emit([&]() {
23281 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23282 << "Cannot SLP vectorize list: vectorization factor "
23283 << "less than 2 is not supported";
23284 });
23285 return false;
23286 }
23287
23288 bool Changed = false;
23289 bool CandidateFound = false;
23290 InstructionCost MinCost = SLPCostThreshold.getValue();
23291
23292 unsigned NextInst = 0, MaxInst = VL.size();
23293 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23294 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23295 // No actual vectorization should happen if the number of parts is the same
23296 // as the provided vectorization factor (i.e. the scalar type is used for
23297 // vector code during codegen).
23298 auto *VecTy = getWidenedType(ScalarTy, VF);
23299 if (TTI->getNumberOfParts(VecTy) == VF)
23300 continue;
23301 for (unsigned I = NextInst; I < MaxInst; ++I) {
23302 unsigned ActualVF = std::min(MaxInst - I, VF);
23303
23304 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23305 continue;
23306
23307 if (MaxVFOnly && ActualVF < MaxVF)
23308 break;
23309 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23310 break;
23311
23312 SmallVector<Value *> Ops(ActualVF, nullptr);
23313 unsigned Idx = 0;
23314 for (Value *V : VL.drop_front(I)) {
23315 // Check that a previous iteration of this loop did not delete the
23316 // Value.
23317 if (auto *Inst = dyn_cast<Instruction>(V);
23318 !Inst || !R.isDeleted(Inst)) {
23319 Ops[Idx] = V;
23320 ++Idx;
23321 if (Idx == ActualVF)
23322 break;
23323 }
23324 }
23325 // Not enough vectorizable instructions - exit.
23326 if (Idx != ActualVF)
23327 break;
23328
23329 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23330 << "\n");
23331
23332 R.buildTree(Ops);
23333 if (R.isTreeTinyAndNotFullyVectorizable())
23334 continue;
23335 if (R.isProfitableToReorder()) {
23336 R.reorderTopToBottom();
23337 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23338 }
23339 R.transformNodes();
23340 R.buildExternalUses();
23341
23342 R.computeMinimumValueSizes();
23343 InstructionCost Cost = R.getTreeCost();
23344 CandidateFound = true;
23345 MinCost = std::min(MinCost, Cost);
23346
23347 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23348 << " for VF=" << ActualVF << "\n");
23349 if (Cost < -SLPCostThreshold) {
23350 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23351 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23353 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23354 << " and with tree size "
23355 << ore::NV("TreeSize", R.getTreeSize()));
23356
23357 R.vectorizeTree();
23358 // Move to the next bundle.
23359 I += VF - 1;
23360 NextInst = I + 1;
23361 Changed = true;
23362 }
23363 }
23364 }
23365
23366 if (!Changed && CandidateFound) {
23367 R.getORE()->emit([&]() {
23368 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23369 << "List vectorization was possible but not beneficial with cost "
23370 << ore::NV("Cost", MinCost) << " >= "
23371 << ore::NV("Treshold", -SLPCostThreshold);
23372 });
23373 } else if (!Changed) {
23374 R.getORE()->emit([&]() {
23375 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23376 << "Cannot SLP vectorize list: vectorization was impossible"
23377 << " with available vectorization factors";
23378 });
23379 }
23380 return Changed;
23381}
23382
23383namespace {
23384
23385/// Model horizontal reductions.
23386///
23387/// A horizontal reduction is a tree of reduction instructions that has values
23388/// that can be put into a vector as its leaves. For example:
23389///
23390/// mul mul mul mul
23391/// \ / \ /
23392/// + +
23393/// \ /
23394/// +
23395/// This tree has "mul" as its leaf values and "+" as its reduction
23396/// instructions. A reduction can feed into a store or a binary operation
23397/// feeding a phi.
23398/// ...
23399/// \ /
23400/// +
23401/// |
23402/// phi +=
23403///
23404/// Or:
23405/// ...
23406/// \ /
23407/// +
23408/// |
23409/// *p =
23410///
23411class HorizontalReduction {
23412 using ReductionOpsType = SmallVector<Value *, 16>;
23413 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23414 ReductionOpsListType ReductionOps;
23415 /// List of possibly reduced values.
23416 SmallVector<SmallVector<Value *>> ReducedVals;
23417 /// Maps reduced value to the corresponding reduction operation.
23418 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23419 WeakTrackingVH ReductionRoot;
23420 /// The type of reduction operation.
23421 RecurKind RdxKind;
23422 /// Checks if the optimization of original scalar identity operations on
23423 /// matched horizontal reductions is enabled and allowed.
23424 bool IsSupportedHorRdxIdentityOp = false;
23425 /// The minimum number of the reduced values.
23426 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23427 /// Contains vector values for reduction including their scale factor and
23428 /// signedness.
23429 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23430
23431 static bool isCmpSelMinMax(Instruction *I) {
23432 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23433 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
23434 }
23435
23436 // And/or are potentially poison-safe logical patterns like:
23437 // select x, y, false
23438 // select x, true, y
23439 static bool isBoolLogicOp(Instruction *I) {
23440 return isa<SelectInst>(I) &&
23441 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23442 }
23443
23444 /// Checks if instruction is associative and can be vectorized.
23445 static bool isVectorizable(RecurKind Kind, Instruction *I,
23446 bool TwoElementReduction = false) {
23447 if (Kind == RecurKind::None)
23448 return false;
23449
23450 // Integer ops that map to select instructions or intrinsics are fine.
23451 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23452 isBoolLogicOp(I))
23453 return true;
23454
23455 // No need to check for associativity if there are only 2 reduced values.
23456 if (TwoElementReduction)
23457 return true;
23458
23459 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23460 // FP min/max are associative except for NaN and -0.0. We do not
23461 // have to rule out -0.0 here because the intrinsic semantics do not
23462 // specify a fixed result for it.
23463 return I->getFastMathFlags().noNaNs();
23464 }
23465
23466 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23467 return true;
23468
23469 return I->isAssociative();
23470 }
23471
23472 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23473 // Poison-safe 'or' takes the form: select X, true, Y
23474 // To make that work with the normal operand processing, we skip the
23475 // true value operand.
23476 // TODO: Change the code and data structures to handle this without a hack.
23477 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23478 return I->getOperand(2);
23479 return I->getOperand(Index);
23480 }
23481
23482 /// Creates reduction operation with the current opcode.
23483 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23484 Value *RHS, const Twine &Name, bool UseSelect) {
23485 Type *OpTy = LHS->getType();
23486 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23487 switch (Kind) {
23488 case RecurKind::Or: {
23489 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23490 return Builder.CreateSelect(
23491 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23492 RHS, Name);
23493 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23494 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23495 Name);
23496 }
23497 case RecurKind::And: {
23498 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23499 return Builder.CreateSelect(
23500 LHS, RHS,
23501 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
23502 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23503 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23504 Name);
23505 }
23506 case RecurKind::Add:
23507 case RecurKind::Mul:
23508 case RecurKind::Xor:
23509 case RecurKind::FAdd:
23510 case RecurKind::FMul: {
23511 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23512 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23513 Name);
23514 }
23515 case RecurKind::SMax:
23516 case RecurKind::SMin:
23517 case RecurKind::UMax:
23518 case RecurKind::UMin:
23519 if (UseSelect) {
23520 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23521 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23522 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
23523 }
23524 [[fallthrough]];
23525 case RecurKind::FMax:
23526 case RecurKind::FMin:
23527 case RecurKind::FMaximum:
23528 case RecurKind::FMinimum:
23529 case RecurKind::FMaximumNum:
23530 case RecurKind::FMinimumNum: {
23531 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
23532 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23533 }
23534 default:
23535 llvm_unreachable("Unknown reduction operation.");
23536 }
23537 }
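// For illustration, with i1 operands and UseSelect set, the two poison-safe
// forms produced above are (hypothetical value names):
//   RecurKind::Or  -> %r = select i1 %lhs, i1 true,  i1 %rhs
//   RecurKind::And -> %r = select i1 %lhs, i1 %rhs,  i1 false
// i.e. the logical, short-circuiting forms that do not propagate a poison
// RHS when the LHS already decides the result.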
23538
23539 /// Creates reduction operation with the current opcode with the IR flags
23540 /// from \p ReductionOps, dropping nuw/nsw flags.
23541 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23542 Value *RHS, const Twine &Name,
23543 const ReductionOpsListType &ReductionOps) {
23544 bool UseSelect = ReductionOps.size() == 2 ||
23545 // Logical or/and.
23546 (ReductionOps.size() == 1 &&
23547 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23548 assert((!UseSelect || ReductionOps.size() != 2 ||
23549 isa<SelectInst>(ReductionOps[1][0])) &&
23550 "Expected cmp + select pairs for reduction");
23551 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23552 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23553 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23554 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23555 /*IncludeWrapFlags=*/false);
23556 propagateIRFlags(Op, ReductionOps[1], nullptr,
23557 /*IncludeWrapFlags=*/false);
23558 return Op;
23559 }
23560 }
23561 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23562 return Op;
23563 }
23564
23565public:
23566 static RecurKind getRdxKind(Value *V) {
23567 auto *I = dyn_cast<Instruction>(V);
23568 if (!I)
23569 return RecurKind::None;
23570 if (match(I, m_Add(m_Value(), m_Value())))
23571 return RecurKind::Add;
23572 if (match(I, m_Mul(m_Value(), m_Value())))
23573 return RecurKind::Mul;
23574 if (match(I, m_And(m_Value(), m_Value())) ||
23575 match(I, m_LogicalAnd(m_Value(), m_Value())))
23576 return RecurKind::And;
23577 if (match(I, m_Or(m_Value(), m_Value())) ||
23578 match(I, m_LogicalOr(m_Value(), m_Value())))
23579 return RecurKind::Or;
23580 if (match(I, m_Xor(m_Value(), m_Value())))
23581 return RecurKind::Xor;
23582 if (match(I, m_FAdd(m_Value(), m_Value())))
23583 return RecurKind::FAdd;
23584 if (match(I, m_FMul(m_Value(), m_Value())))
23585 return RecurKind::FMul;
23586
23588 return RecurKind::FMax;
23590 return RecurKind::FMin;
23591
23592 if (match(I, m_FMaximum(m_Value(), m_Value())))
23593 return RecurKind::FMaximum;
23594 if (match(I, m_FMinimum(m_Value(), m_Value())))
23595 return RecurKind::FMinimum;
23596 // This matches either cmp+select or intrinsics. SLP is expected to handle
23597 // either form.
23598 // TODO: If we are canonicalizing to intrinsics, we can remove several
23599 // special-case paths that deal with selects.
23600 if (match(I, m_SMax(m_Value(), m_Value())))
23601 return RecurKind::SMax;
23602 if (match(I, m_SMin(m_Value(), m_Value())))
23603 return RecurKind::SMin;
23604 if (match(I, m_UMax(m_Value(), m_Value())))
23605 return RecurKind::UMax;
23606 if (match(I, m_UMin(m_Value(), m_Value())))
23607 return RecurKind::UMin;
23608
23609 if (auto *Select = dyn_cast<SelectInst>(I)) {
23610 // Try harder: look for min/max pattern based on instructions producing
23611 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23612 // During the intermediate stages of SLP, it's very common to have
23613 // pattern like this (since optimizeGatherSequence is run only once
23614 // at the end):
23615 // %1 = extractelement <2 x i32> %a, i32 0
23616 // %2 = extractelement <2 x i32> %a, i32 1
23617 // %cond = icmp sgt i32 %1, %2
23618 // %3 = extractelement <2 x i32> %a, i32 0
23619 // %4 = extractelement <2 x i32> %a, i32 1
23620 // %select = select i1 %cond, i32 %3, i32 %4
23621 CmpPredicate Pred;
23622 Instruction *L1;
23623 Instruction *L2;
23624
23625 Value *LHS = Select->getTrueValue();
23626 Value *RHS = Select->getFalseValue();
23627 Value *Cond = Select->getCondition();
23628
23629 // TODO: Support inverse predicates.
23630 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23631 if (!isa<ExtractElementInst>(RHS) ||
23632 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23633 return RecurKind::None;
23634 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23635 if (!isa<ExtractElementInst>(LHS) ||
23636 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23637 return RecurKind::None;
23638 } else {
23639 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23640 return RecurKind::None;
23641 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23642 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23643 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23644 return RecurKind::None;
23645 }
23646
23647 switch (Pred) {
23648 default:
23649 return RecurKind::None;
23650 case CmpInst::ICMP_SGT:
23651 case CmpInst::ICMP_SGE:
23652 return RecurKind::SMax;
23653 case CmpInst::ICMP_SLT:
23654 case CmpInst::ICMP_SLE:
23655 return RecurKind::SMin;
23656 case CmpInst::ICMP_UGT:
23657 case CmpInst::ICMP_UGE:
23658 return RecurKind::UMax;
23659 case CmpInst::ICMP_ULT:
23660 case CmpInst::ICMP_ULE:
23661 return RecurKind::UMin;
23662 }
23663 }
23664 return RecurKind::None;
23665 }
23666
23667 /// Get the index of the first operand.
23668 static unsigned getFirstOperandIndex(Instruction *I) {
23669 return isCmpSelMinMax(I) ? 1 : 0;
23670 }
23671
23672private:
23673 /// Total number of operands in the reduction operation.
23674 static unsigned getNumberOfOperands(Instruction *I) {
23675 return isCmpSelMinMax(I) ? 3 : 2;
23676 }
23677
23678 /// Checks if the instruction is in basic block \p BB.
23679 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23680 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23681 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23682 auto *Sel = cast<SelectInst>(I);
23683 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23684 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23685 }
23686 return I->getParent() == BB;
23687 }
23688
23689 /// Expected number of uses for reduction operations/reduced values.
23690 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23691 if (IsCmpSelMinMax) {
23692 // The SelectInst must be used twice, while the condition op must have a
23693 // single use only.
23694 if (auto *Sel = dyn_cast<SelectInst>(I))
23695 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23696 return I->hasNUses(2);
23697 }
23698
23699 // Arithmetic reduction operation must be used once only.
23700 return I->hasOneUse();
23701 }
23702
23703 /// Initializes the list of reduction operations.
23704 void initReductionOps(Instruction *I) {
23705 if (isCmpSelMinMax(I))
23706 ReductionOps.assign(2, ReductionOpsType());
23707 else
23708 ReductionOps.assign(1, ReductionOpsType());
23709 }
23710
23711 /// Add all reduction operations for the reduction instruction \p I.
23712 void addReductionOps(Instruction *I) {
23713 if (isCmpSelMinMax(I)) {
23714 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23715 ReductionOps[1].emplace_back(I);
23716 } else {
23717 ReductionOps[0].emplace_back(I);
23718 }
23719 }
23720
23721 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23722 int Sz = Data.size();
23723 auto *I = dyn_cast<Instruction>(Data.front());
23724 return Sz > 1 || isConstant(Data.front()) ||
23725 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23726 }
23727
23728public:
23729 HorizontalReduction() = default;
23730 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23731 : ReductionRoot(I), ReductionLimit(2) {
23732 RdxKind = HorizontalReduction::getRdxKind(I);
23733 ReductionOps.emplace_back().push_back(I);
23734 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23735 for (Value *V : Ops)
23736 ReducedValsToOps[V].push_back(I);
23737 }
23738
23739 bool matchReductionForOperands() const {
23740 // Analyze "regular" integer/FP types for reductions - no target-specific
23741 // types or pointers.
23742 assert(ReductionRoot && "Reduction root is not set!");
23743 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23744 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23745 return Ops.size() == 2;
23746 })))
23747 return false;
23748
23749 return true;
23750 }
23751
23752 /// Try to find a reduction tree.
23753 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23754 ScalarEvolution &SE, const DataLayout &DL,
23755 const TargetLibraryInfo &TLI) {
23756 RdxKind = HorizontalReduction::getRdxKind(Root);
23757 if (!isVectorizable(RdxKind, Root))
23758 return false;
23759
23760 // Analyze "regular" integer/FP types for reductions - no target-specific
23761 // types or pointers.
23762 Type *Ty = Root->getType();
23763 if (!isValidElementType(Ty) || Ty->isPointerTy())
23764 return false;
23765
23766 // Though the ultimate reduction may have multiple uses, its condition must
23767 // have only a single use.
23768 if (auto *Sel = dyn_cast<SelectInst>(Root))
23769 if (!Sel->getCondition()->hasOneUse())
23770 return false;
23771
23772 ReductionRoot = Root;
23773
23774 // Iterate through all the operands of the possible reduction tree and
23775 // gather all the reduced values, sorting them by their value id.
23776 BasicBlock *BB = Root->getParent();
23777 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23778 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23779 1, std::make_pair(Root, 0));
23780 // Checks if the operands of the \p TreeN instruction are also reduction
23781 // operations or should be treated as reduced values or an extra argument,
23782 // which is not part of the reduction.
23783 auto CheckOperands = [&](Instruction *TreeN,
23784 SmallVectorImpl<Value *> &PossibleReducedVals,
23785 SmallVectorImpl<Instruction *> &ReductionOps,
23786 unsigned Level) {
23787 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23788 getNumberOfOperands(TreeN)))) {
23789 Value *EdgeVal = getRdxOperand(TreeN, I);
23790 ReducedValsToOps[EdgeVal].push_back(TreeN);
23791 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23792 // If the edge is not an instruction, differs from the main reduction
23793 // opcode, or has too many uses, treat it as a possible reduced value.
23794 // Also, do not try to reduce constant values if the operation is not
23795 // foldable.
23796 if (!EdgeInst || Level > RecursionMaxDepth ||
23797 getRdxKind(EdgeInst) != RdxKind ||
23798 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23799 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23800 !isVectorizable(RdxKind, EdgeInst) ||
23801 (R.isAnalyzedReductionRoot(EdgeInst) &&
23802 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23803 PossibleReducedVals.push_back(EdgeVal);
23804 continue;
23805 }
23806 ReductionOps.push_back(EdgeInst);
23807 }
23808 };
23809 // Try to regroup the reduced values so that reducing them becomes more
23810 // profitable. Values are grouped by their value ids, instructions by their
23811 // opcode and/or alternate opcode, with extra analysis for loads (grouped
23812 // by the distance between pointers) and cmp instructions (grouped by the
23813 // predicate).
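 // Illustrative sketch only (values are hypothetical): loads from the same
 // base whose pointers differ by a known constant end up under the same
 // subkey, so e.g. load a[0], load a[1], load a[2] stay together and can
 // later form a single vector load, while unrelated scalars fall into
 // separate groups.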
23814 SmallMapVector<
23815 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23816 8>
23817 PossibleReducedVals;
23818 initReductionOps(Root);
23819 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23820 SmallSet<size_t, 2> LoadKeyUsed;
23821
23822 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23823 Key = hash_combine(hash_value(LI->getParent()), Key);
23824 Value *Ptr =
23825 getUnderlyingObject(LI->getPointerOperand());
23826 if (!LoadKeyUsed.insert(Key).second) {
23827 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23828 if (LIt != LoadsMap.end()) {
23829 for (LoadInst *RLI : LIt->second) {
23830 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
23831 LI->getType(), LI->getPointerOperand(), DL, SE,
23832 /*StrictCheck=*/true))
23833 return hash_value(RLI->getPointerOperand());
23834 }
23835 for (LoadInst *RLI : LIt->second) {
23836 if (arePointersCompatible(RLI->getPointerOperand(),
23837 LI->getPointerOperand(), TLI)) {
23838 hash_code SubKey = hash_value(RLI->getPointerOperand());
23839 return SubKey;
23840 }
23841 }
23842 if (LIt->second.size() > 2) {
23843 hash_code SubKey =
23844 hash_value(LIt->second.back()->getPointerOperand());
23845 return SubKey;
23846 }
23847 }
23848 }
23849 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
23850 .first->second.push_back(LI);
23851 return hash_value(LI->getPointerOperand());
23852 };
23853
23854 while (!Worklist.empty()) {
23855 auto [TreeN, Level] = Worklist.pop_back_val();
23856 SmallVector<Value *> PossibleRedVals;
23857 SmallVector<Instruction *> PossibleReductionOps;
23858 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23859 addReductionOps(TreeN);
23860 // Add reduction values. The values are sorted for better vectorization
23861 // results.
23862 for (Value *V : PossibleRedVals) {
23863 size_t Key, Idx;
23864 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
23865 /*AllowAlternate=*/false);
23866 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23867 }
23868 for (Instruction *I : reverse(PossibleReductionOps))
23869 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23870 }
23871 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23872 // Sort values by the total number of value kinds so that the reduction
23873 // starts from the longest possible sequences of reduced values.
23874 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23875 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23876 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
23877 for (auto &Slice : PossibleRedVals) {
23878 PossibleRedValsVect.emplace_back();
23879 auto RedValsVect = Slice.second.takeVector();
23880 stable_sort(RedValsVect, llvm::less_second());
23881 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23882 PossibleRedValsVect.back().append(Data.second, Data.first);
23883 }
23884 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23885 return P1.size() > P2.size();
23886 });
23887 bool First = true;
23888 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
23889 if (First) {
23890 First = false;
23891 ReducedVals.emplace_back();
23892 } else if (!isGoodForReduction(Data)) {
23893 auto *LI = dyn_cast<LoadInst>(Data.front());
23894 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
23895 if (!LI || !LastLI ||
23896 getUnderlyingObject(LI->getPointerOperand()) !=
23897 getUnderlyingObject(LastLI->getPointerOperand()))
23898 ReducedVals.emplace_back();
23899 }
23900 ReducedVals.back().append(Data.rbegin(), Data.rend());
23901 }
23902 }
23903 // Sort the reduced values by number of same/alternate opcode and/or pointer
23904 // operand.
23905 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
23906 return P1.size() > P2.size();
23907 });
23908 return true;
23909 }
23910
23911 /// Attempt to vectorize the tree found by matchAssociativeReduction.
23912 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23913 const TargetLibraryInfo &TLI, AssumptionCache *AC,
23914 DominatorTree &DT) {
23915 constexpr unsigned RegMaxNumber = 4;
23916 constexpr unsigned RedValsMaxNumber = 128;
23917 // If there are a sufficient number of reduction values, reduce
23918 // to a nearby power-of-2. We can safely generate oversized
23919 // vectors and rely on the backend to split them to legal sizes.
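 // For example (illustrative, target-dependent): a <16 x i32> reduce.add on
 // a target with 128-bit vector registers is legalized by the backend into
 // several <4 x i32> operations, so over-wide vectors are not a correctness
 // concern here.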
23920 if (unsigned NumReducedVals = std::accumulate(
23921 ReducedVals.begin(), ReducedVals.end(), 0,
23922 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
23923 if (!isGoodForReduction(Vals))
23924 return Num;
23925 return Num + Vals.size();
23926 });
23927 NumReducedVals < ReductionLimit &&
23928 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
23929 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
23930 })) {
23931 for (ReductionOpsType &RdxOps : ReductionOps)
23932 for (Value *RdxOp : RdxOps)
23933 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
23934 return nullptr;
23935 }
23936
23937 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
23938 TargetFolder(DL));
23939 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
23940
23941 // Track the reduced values in case they are replaced by an extractelement
23942 // because of the vectorization.
23943 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
23944 ReducedVals.front().size());
23945
23946 // The compare instruction of a min/max is the insertion point for new
23947 // instructions and may be replaced with a new compare instruction.
23948 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
23949 assert(isa<SelectInst>(RdxRootInst) &&
23950 "Expected min/max reduction to have select root instruction");
23951 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
23952 assert(isa<Instruction>(ScalarCond) &&
23953 "Expected min/max reduction to have compare condition");
23954 return cast<Instruction>(ScalarCond);
23955 };
23956
23957 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
23958 return isBoolLogicOp(cast<Instruction>(V));
23959 });
23960 // Return new VectorizedTree, based on previous value.
23961 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
23962 if (VectorizedTree) {
23963 // Update the final value in the reduction.
23964 Builder.SetCurrentDebugLocation(
23965 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
23966 if (AnyBoolLogicOp) {
23967 auto It = ReducedValsToOps.find(VectorizedTree);
23968 auto It1 = ReducedValsToOps.find(Res);
23969 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
23970 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
23971 (It != ReducedValsToOps.end() &&
23972 any_of(It->getSecond(), [&](Instruction *I) {
23973 return isBoolLogicOp(I) &&
23974 getRdxOperand(I, 0) == VectorizedTree;
23975 }))) {
23976 ;
23977 } else if (isGuaranteedNotToBePoison(Res, AC) ||
23978 (It1 != ReducedValsToOps.end() &&
23979 any_of(It1->getSecond(), [&](Instruction *I) {
23980 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
23981 }))) {
23982 std::swap(VectorizedTree, Res);
23983 } else {
23984 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
23985 }
23986 }
23987
23988 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
23989 ReductionOps);
23990 }
23991 // Initialize the final value in the reduction.
23992 return Res;
23993 };
23994 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
23995 ReductionOps.front().size());
23996 for (ReductionOpsType &RdxOps : ReductionOps)
23997 for (Value *RdxOp : RdxOps) {
23998 if (!RdxOp)
23999 continue;
24000 IgnoreList.insert(RdxOp);
24001 }
24002 // Intersect the fast-math-flags from all reduction operations.
24003 FastMathFlags RdxFMF;
24004 RdxFMF.set();
24005 for (Value *U : IgnoreList)
24006 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24007 RdxFMF &= FPMO->getFastMathFlags();
24008 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24009
24010 // Need to track the reduced values, as they may be changed during the
24011 // vectorization of subvectors.
24012 for (ArrayRef<Value *> Candidates : ReducedVals)
24013 for (Value *V : Candidates)
24014 TrackedVals.try_emplace(V, V);
24015
24016 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24017 Value *V) -> unsigned & {
24018 auto *It = MV.find(V);
24019 assert(It != MV.end() && "Unable to find given key.");
24020 return It->second;
24021 };
24022
24023 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24024 // List of the values that were reduced in other trees as part of gather
24025 // nodes and thus require an extract if fully vectorized in other trees.
24026 SmallPtrSet<Value *, 4> RequiredExtract;
24027 WeakTrackingVH VectorizedTree = nullptr;
24028 bool CheckForReusedReductionOps = false;
24029 // Try to vectorize elements based on their type.
24030 SmallVector<InstructionsState> States;
24031 for (ArrayRef<Value *> RV : ReducedVals)
24032 States.push_back(getSameOpcode(RV, TLI));
24033 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24034 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24035 InstructionsState S = States[I];
24036 SmallVector<Value *> Candidates;
24037 Candidates.reserve(2 * OrigReducedVals.size());
24038 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24039 for (Value *ReducedVal : OrigReducedVals) {
24040 Value *RdxVal = TrackedVals.at(ReducedVal);
24041 // Check whether the reduction value was overridden by an extractelement
24042 // instruction because of the vectorization, and exclude it if it is not
24043 // compatible with the other values.
24044 // Also check if the instruction was folded to a constant/other value.
24045 auto *Inst = dyn_cast<Instruction>(RdxVal);
24046 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24047 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24048 (S && !Inst))
24049 continue;
24050 Candidates.push_back(RdxVal);
24051 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24052 }
24053 bool ShuffledExtracts = false;
24054 // Try to handle shuffled extractelements.
24055 if (S && S.getOpcode() == Instruction::ExtractElement &&
24056 !S.isAltShuffle() && I + 1 < E) {
24057 SmallVector<Value *> CommonCandidates(Candidates);
24058 for (Value *RV : ReducedVals[I + 1]) {
24059 Value *RdxVal = TrackedVals.at(RV);
24060 // Check whether the reduction value was overridden by the
24061 // extractelement instruction because of the vectorization, and
24062 // exclude it if it is not compatible with the other values.
24063 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24064 if (!Inst)
24065 continue;
24066 CommonCandidates.push_back(RdxVal);
24067 TrackedToOrig.try_emplace(RdxVal, RV);
24068 }
24069 SmallVector<int> Mask;
24070 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24071 ++I;
24072 Candidates.swap(CommonCandidates);
24073 ShuffledExtracts = true;
24074 }
24075 }
24076
24077 // Emit code for constant values.
24078 if (Candidates.size() > 1 && allConstant(Candidates)) {
24079 Value *Res = Candidates.front();
24080 Value *OrigV = TrackedToOrig.at(Candidates.front());
24081 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24082 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24083 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24084 Value *OrigV = TrackedToOrig.at(VC);
24085 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24086 if (auto *ResI = dyn_cast<Instruction>(Res))
24087 V.analyzedReductionRoot(ResI);
24088 }
24089 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24090 continue;
24091 }
24092
24093 unsigned NumReducedVals = Candidates.size();
24094 if (NumReducedVals < ReductionLimit &&
24095 (NumReducedVals < 2 || !isSplat(Candidates)))
24096 continue;
24097
24098 // Check if we support the processing of repeated scalar values (an optimization
24099 // of original scalar identity operations on matched horizontal reductions).
24100 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24101 RdxKind != RecurKind::FMul &&
24102 RdxKind != RecurKind::FMulAdd;
24103 // Gather same values.
24104 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24105 if (IsSupportedHorRdxIdentityOp)
24106 for (Value *V : Candidates) {
24107 Value *OrigV = TrackedToOrig.at(V);
24108 ++SameValuesCounter.try_emplace(OrigV).first->second;
24109 }
24110 // Used to check if the reduced values are used the same number of times. In
24111 // this case the compiler may produce better code. E.g. if the reduced values
24112 // are aabbccdd (8 values), then the first node of the tree would have a node
24113 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24114 // Plus, the final reduction would be performed on <8 x aabbccdd>.
24115 // Instead, the compiler may build the <4 x abcd> tree immediately and emit
24116 // reduction(4 x abcd) * 2.
24117 // Currently only add/fadd/xor are handled; and/or/min/max do not require
24118 // this analysis, and other operations may require an extra estimation of
24119 // the profitability.
24120 bool SameScaleFactor = false;
24121 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24122 SameValuesCounter.size() != Candidates.size();
24123 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24124 if (OptReusedScalars) {
24125 SameScaleFactor =
24126 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24127 RdxKind == RecurKind::Xor) &&
24128 all_of(drop_begin(SameValuesCounter),
24129 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24130 return P.second == SameValuesCounter.front().second;
24131 });
24132 Candidates.resize(SameValuesCounter.size());
24133 transform(SameValuesCounter, Candidates.begin(),
24134 [&](const auto &P) { return TrackedVals.at(P.first); });
24135 NumReducedVals = Candidates.size();
24136 // Have a reduction of the same element.
24137 if (NumReducedVals == 1) {
24138 Value *OrigV = TrackedToOrig.at(Candidates.front());
24139 unsigned Cnt = At(SameValuesCounter, OrigV);
24140 Value *RedVal =
24141 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24142 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24143 VectorizedVals.try_emplace(OrigV, Cnt);
24144 ExternallyUsedValues.insert(OrigV);
24145 continue;
24146 }
24147 }
24148
24149 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24150 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24151 const unsigned MaxElts = std::clamp<unsigned>(
24152 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24153 RegMaxNumber * RedValsMaxNumber);
24154
24155 unsigned ReduxWidth = NumReducedVals;
24156 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24157 unsigned NumParts, NumRegs;
24158 Type *ScalarTy = Candidates.front()->getType();
24159 ReduxWidth =
24160 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24161 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24162 NumParts = ::getNumberOfParts(TTI, Tp);
24163 NumRegs =
24164 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24165 while (NumParts > NumRegs) {
24166 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24167 ReduxWidth = bit_floor(ReduxWidth - 1);
24168 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24169 NumParts = ::getNumberOfParts(TTI, Tp);
24170 NumRegs =
24171 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24172 }
24173 if (NumParts > NumRegs / 2)
24174 ReduxWidth = bit_floor(ReduxWidth);
24175 return ReduxWidth;
24176 };
24177 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24178 ReduxWidth = GetVectorFactor(ReduxWidth);
24179 ReduxWidth = std::min(ReduxWidth, MaxElts);
24180
24181 unsigned Start = 0;
24182 unsigned Pos = Start;
24183 // Restarts vectorization attempt with lower vector factor.
24184 unsigned PrevReduxWidth = ReduxWidth;
24185 bool CheckForReusedReductionOpsLocal = false;
24186 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24187 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24188 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24189 // Check if any of the reduction ops are gathered. If so, it is worth
24190 // trying again with a smaller number of reduction ops.
24191 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24192 }
24193 ++Pos;
24194 if (Pos < NumReducedVals - ReduxWidth + 1)
24195 return IsAnyRedOpGathered;
24196 Pos = Start;
24197 --ReduxWidth;
24198 if (ReduxWidth > 1)
24199 ReduxWidth = GetVectorFactor(ReduxWidth);
24200 return IsAnyRedOpGathered;
24201 };
24202 bool AnyVectorized = false;
24203 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24204 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24205 ReduxWidth >= ReductionLimit) {
24206 // Dependency in the tree of the reduction ops - drop this attempt and
24207 // try again later.
24208 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24209 Start == 0) {
24210 CheckForReusedReductionOps = true;
24211 break;
24212 }
24213 PrevReduxWidth = ReduxWidth;
24214 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24215 // Been analyzed already - skip.
24216 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24217 (!has_single_bit(ReduxWidth) &&
24218 (IgnoredCandidates.contains(
24219 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24220 IgnoredCandidates.contains(
24221 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24222 bit_floor(ReduxWidth))))) ||
24223 V.areAnalyzedReductionVals(VL)) {
24224 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24225 continue;
24226 }
24227 // Early exit if any of the reduction values were deleted during
24228 // previous vectorization attempts.
24229 if (any_of(VL, [&V](Value *RedVal) {
24230 auto *RedValI = dyn_cast<Instruction>(RedVal);
24231 return RedValI && V.isDeleted(RedValI);
24232 }))
24233 break;
24234 V.buildTree(VL, IgnoreList);
24235 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24236 if (!AdjustReducedVals())
24237 V.analyzedReductionVals(VL);
24238 continue;
24239 }
24240 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24241 if (!AdjustReducedVals())
24242 V.analyzedReductionVals(VL);
24243 continue;
24244 }
24245 V.reorderTopToBottom();
24246 // No need to reorder the root node at all for reassociative reduction.
24247 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24248 VL.front()->getType()->isIntOrIntVectorTy() ||
24249 ReductionLimit > 2);
24250 // Keep extracted other reduction values, if they are used in the
24251 // vectorization trees.
24252 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24253 ExternallyUsedValues);
24254 // The reduction root is used as the insertion point for new
24255 // instructions, so set it as externally used to prevent it from being
24256 // deleted.
24257 LocalExternallyUsedValues.insert(ReductionRoot);
24258 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24259 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24260 continue;
24261 for (Value *V : ReducedVals[Cnt])
24262 if (isa<Instruction>(V))
24263 LocalExternallyUsedValues.insert(TrackedVals[V]);
24264 }
24265 if (!IsSupportedHorRdxIdentityOp) {
24266 // Number of uses of the candidates in the vector of values.
24267 assert(SameValuesCounter.empty() &&
24268 "Reused values counter map is not empty");
24269 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24270 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24271 continue;
24272 Value *V = Candidates[Cnt];
24273 Value *OrigV = TrackedToOrig.at(V);
24274 ++SameValuesCounter.try_emplace(OrigV).first->second;
24275 }
24276 }
24277 V.transformNodes();
24278 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24279 // Gather externally used values.
24280 SmallPtrSet<Value *, 4> Visited;
24281 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24282 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24283 continue;
24284 Value *RdxVal = Candidates[Cnt];
24285 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24286 RdxVal = It->second;
24287 if (!Visited.insert(RdxVal).second)
24288 continue;
24289 // Check if the scalar was vectorized as part of the vectorization
24290 // tree but not the top node.
24291 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24292 LocalExternallyUsedValues.insert(RdxVal);
24293 continue;
24294 }
24295 Value *OrigV = TrackedToOrig.at(RdxVal);
24296 unsigned NumOps =
24297 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24298 if (NumOps != ReducedValsToOps.at(OrigV).size())
24299 LocalExternallyUsedValues.insert(RdxVal);
24300 }
24301 // Do not need the list of reused scalars in regular mode anymore.
24302 if (!IsSupportedHorRdxIdentityOp)
24303 SameValuesCounter.clear();
24304 for (Value *RdxVal : VL)
24305 if (RequiredExtract.contains(RdxVal))
24306 LocalExternallyUsedValues.insert(RdxVal);
24307 V.buildExternalUses(LocalExternallyUsedValues);
24308
24309 V.computeMinimumValueSizes();
24310
24311 // Estimate cost.
24312 InstructionCost ReductionCost =
24313 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24314 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24315 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24316 << " for reduction\n");
24317 if (!Cost.isValid())
24318 break;
24319 if (Cost >= -SLPCostThreshold) {
24320 V.getORE()->emit([&]() {
24321 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24322 ReducedValsToOps.at(VL[0]).front())
24323 << "Vectorizing horizontal reduction is possible "
24324 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24325 << " and threshold "
24326 << ore::NV("Threshold", -SLPCostThreshold);
24327 });
24328 if (!AdjustReducedVals()) {
24329 V.analyzedReductionVals(VL);
24330 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24331 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24332 // Add subvectors of VL to the list of the analyzed values.
24333 for (unsigned VF = getFloorFullVectorNumberOfElements(
24334 *TTI, VL.front()->getType(), ReduxWidth - 1);
24335 VF >= ReductionLimit;
24336 VF = getFloorFullVectorNumberOfElements(
24337 *TTI, VL.front()->getType(), VF - 1)) {
24338 if (has_single_bit(VF) &&
24339 V.getCanonicalGraphSize() != V.getTreeSize())
24340 continue;
24341 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24342 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24343 }
24344 }
24345 }
24346 continue;
24347 }
24348
24349 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24350 << Cost << ". (HorRdx)\n");
24351 V.getORE()->emit([&]() {
24352 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24353 ReducedValsToOps.at(VL[0]).front())
24354 << "Vectorized horizontal reduction with cost "
24355 << ore::NV("Cost", Cost) << " and with tree size "
24356 << ore::NV("TreeSize", V.getTreeSize());
24357 });
24358
24359 Builder.setFastMathFlags(RdxFMF);
24360
24361 // Emit a reduction. If the root is a select (min/max idiom), the insert
24362 // point is the compare condition of that select.
24363 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24364 Instruction *InsertPt = RdxRootInst;
24365 if (IsCmpSelMinMax)
24366 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24367
24368 // Vectorize a tree.
24369 Value *VectorizedRoot = V.vectorizeTree(
24370 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24371 // Update TrackedToOrig mapping, since the tracked values might be
24372 // updated.
24373 for (Value *RdxVal : Candidates) {
24374 Value *OrigVal = TrackedToOrig.at(RdxVal);
24375 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24376 if (TransformedRdxVal != RdxVal)
24377 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24378 }
24379
24380 Builder.SetInsertPoint(InsertPt);
24381
24382 // To prevent poison from leaking across what used to be sequential,
24383 // safe, scalar boolean logic operations, the reduction operand must be
24384 // frozen.
24385 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24386 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24387
24388 // Emit code to correctly handle reused reduced values, if required.
24389 if (OptReusedScalars && !SameScaleFactor) {
24390 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24391 SameValuesCounter, TrackedToOrig);
24392 }
24393
24394 Type *ScalarTy = VL.front()->getType();
24395 Type *VecTy = VectorizedRoot->getType();
24396 Type *RedScalarTy = VecTy->getScalarType();
24397 VectorValuesAndScales.emplace_back(
24398 VectorizedRoot,
24399 OptReusedScalars && SameScaleFactor
24400 ? SameValuesCounter.front().second
24401 : 1,
24402 RedScalarTy != ScalarTy->getScalarType()
24403 ? V.isSignedMinBitwidthRootNode()
24404 : true);
24405
24406 // Count vectorized reduced values to exclude them from final reduction.
24407 for (Value *RdxVal : VL) {
24408 Value *OrigV = TrackedToOrig.at(RdxVal);
24409 if (IsSupportedHorRdxIdentityOp) {
24410 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24411 continue;
24412 }
24413 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24414 if (!V.isVectorized(RdxVal))
24415 RequiredExtract.insert(RdxVal);
24416 }
24417 Pos += ReduxWidth;
24418 Start = Pos;
24419 ReduxWidth = NumReducedVals - Pos;
24420 if (ReduxWidth > 1)
24421 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24422 AnyVectorized = true;
24423 }
24424 if (OptReusedScalars && !AnyVectorized) {
24425 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24426 Value *RdxVal = TrackedVals.at(P.first);
24427 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24428 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24429 VectorizedVals.try_emplace(P.first, P.second);
24430 }
24431 continue;
24432 }
24433 }
24434 if (!VectorValuesAndScales.empty())
24435 VectorizedTree = GetNewVectorizedTree(
24436 VectorizedTree,
24437 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24438 if (VectorizedTree) {
24439 // Reorder the operands of a bool logic op into the natural order to avoid
24440 // possible problems with poison propagation. If reordering is not possible
24441 // (both operands were originally RHS operands), emit an extra freeze
24442 // instruction for the LHS operand.
24443 // I.e., if we have original code like this:
24444 // RedOp1 = select i1 ?, i1 LHS, i1 false
24445 // RedOp2 = select i1 RHS, i1 ?, i1 false
24446
24447 // Then, we swap LHS/RHS to create a new op that matches the poison
24448 // semantics of the original code.
24449
24450 // If we have original code like this and both values could be poison:
24451 // RedOp1 = select i1 ?, i1 LHS, i1 false
24452 // RedOp2 = select i1 ?, i1 RHS, i1 false
24453
24454 // Then, we must freeze LHS in the new op.
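 // A minimal sketch of that second case (IR names are illustrative only):
 //   %lhs.fr = freeze i1 %lhs
 //   %op.rdx = select i1 %lhs.fr, i1 %rhs, i1 false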
24455 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
24456 Instruction *RedOp1,
24457 Instruction *RedOp2,
24458 bool InitStep) {
24459 if (!AnyBoolLogicOp)
24460 return;
24461 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24462 getRdxOperand(RedOp1, 0) == LHS ||
24463 isGuaranteedNotToBePoison(LHS, AC)))
24464 return;
24465 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24466 getRdxOperand(RedOp2, 0) == RHS ||
24467 isGuaranteedNotToBePoison(RHS, AC))) {
24468 std::swap(LHS, RHS);
24469 return;
24470 }
24471 if (LHS != VectorizedTree)
24472 LHS = Builder.CreateFreeze(LHS);
24473 };
24474 // Finish the reduction.
24475 // Need to add the extra arguments and the possible reduction values that
24476 // were not vectorized.
24477 // Try to avoid dependencies between the scalar remainders after the
24478 // reductions.
24479 auto FinalGen =
24480 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24481 bool InitStep) {
24482 unsigned Sz = InstVals.size();
24483 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
24484 Sz % 2);
24485 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24486 Instruction *RedOp = InstVals[I + 1].first;
24487 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24488 Value *RdxVal1 = InstVals[I].second;
24489 Value *StableRdxVal1 = RdxVal1;
24490 auto It1 = TrackedVals.find(RdxVal1);
24491 if (It1 != TrackedVals.end())
24492 StableRdxVal1 = It1->second;
24493 Value *RdxVal2 = InstVals[I + 1].second;
24494 Value *StableRdxVal2 = RdxVal2;
24495 auto It2 = TrackedVals.find(RdxVal2);
24496 if (It2 != TrackedVals.end())
24497 StableRdxVal2 = It2->second;
24498 // To prevent poison from leaking across what used to be
24499 // sequential, safe, scalar boolean logic operations, the
24500 // reduction operand must be frozen.
24501 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24502 RedOp, InitStep);
24503 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24504 StableRdxVal2, "op.rdx", ReductionOps);
24505 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24506 }
24507 if (Sz % 2 == 1)
24508 ExtraReds[Sz / 2] = InstVals.back();
24509 return ExtraReds;
24510 };
24511 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24512 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24513 VectorizedTree);
24514 SmallPtrSet<Value *, 8> Visited;
24515 for (ArrayRef<Value *> Candidates : ReducedVals) {
24516 for (Value *RdxVal : Candidates) {
24517 if (!Visited.insert(RdxVal).second)
24518 continue;
24519 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24520 for (Instruction *RedOp :
24521 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24522 ExtraReductions.emplace_back(RedOp, RdxVal);
24523 }
24524 }
24525 // Iterate through all non-vectorized reduction values/extra arguments.
24526 bool InitStep = true;
24527 while (ExtraReductions.size() > 1) {
24528 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24529 FinalGen(ExtraReductions, InitStep);
24530 ExtraReductions.swap(NewReds);
24531 InitStep = false;
24532 }
24533 VectorizedTree = ExtraReductions.front().second;
24534
24535 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24536
24537 // The original scalar reduction is expected to have no remaining
24538 // uses outside the reduction tree itself. Assert that we got this
24539 // correct, replace internal uses with poison, and mark for eventual
24540 // deletion.
24541#ifndef NDEBUG
24542 SmallPtrSet<Value *, 4> IgnoreSet;
24543 for (ArrayRef<Value *> RdxOps : ReductionOps)
24544 IgnoreSet.insert_range(RdxOps);
24545#endif
24546 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24547 for (Value *Ignore : RdxOps) {
24548 if (!Ignore)
24549 continue;
24550#ifndef NDEBUG
24551 for (auto *U : Ignore->users()) {
24552 assert(IgnoreSet.count(U) &&
24553 "All users must be in the reduction ops list.");
24554 }
24555#endif
24556 if (!Ignore->use_empty()) {
24557 Value *P = PoisonValue::get(Ignore->getType());
24558 Ignore->replaceAllUsesWith(P);
24559 }
24560 }
24561 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24562 }
24563 } else if (!CheckForReusedReductionOps) {
24564 for (ReductionOpsType &RdxOps : ReductionOps)
24565 for (Value *RdxOp : RdxOps)
24566 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24567 }
24568 return VectorizedTree;
24569 }
24570
24571private:
24572 /// Creates the reduction from the given \p Vec vector value with the given
24573 /// scale \p Scale and signedness \p IsSigned.
24574 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24575 Value *Vec, unsigned Scale, bool IsSigned,
24576 Type *DestTy) {
24577 Value *Rdx;
24578 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24579 unsigned DestTyNumElements = getNumElements(VecTy);
24580 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24581 Rdx = PoisonValue::get(
24582 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24583 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24584 // Do reduction for each lane.
24585 // e.g., do reduce add for
24586 // VL[0] = <4 x Ty> <a, b, c, d>
24587 // VL[1] = <4 x Ty> <e, f, g, h>
24588 // Lane[0] = <2 x Ty> <a, e>
24589 // Lane[1] = <2 x Ty> <b, f>
24590 // Lane[2] = <2 x Ty> <c, g>
24591 // Lane[3] = <2 x Ty> <d, h>
24592 // result[0] = reduce add Lane[0]
24593 // result[1] = reduce add Lane[1]
24594 // result[2] = reduce add Lane[2]
24595 // result[3] = reduce add Lane[3]
24596 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24597 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24598 Rdx = Builder.CreateInsertElement(
24599 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24600 }
24601 } else {
24602 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24603 }
24604 if (Rdx->getType() != DestTy)
24605 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24606 // Improved analysis for add/fadd/xor reductions with same scale
24607 // factor for all operands of reductions. We can emit scalar ops for
24608 // them instead.
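 // e.g. (illustrative): with RdxKind == Add and Scale == 2, the reduced
 // scalar r for a+a+b+b+c+c+d+d is emitted as reduce.add(<a, b, c, d>) * 2
 // rather than widening the reduction to eight lanes.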
24609 if (Scale > 1)
24610 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24611 return Rdx;
24612 }
24613
24614 /// Calculate the cost of a reduction.
24615 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24616 ArrayRef<Value *> ReducedVals,
24617 bool IsCmpSelMinMax, FastMathFlags FMF,
24618 const BoUpSLP &R, DominatorTree &DT,
24619 const DataLayout &DL,
24620 const TargetLibraryInfo &TLI) {
24621 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24622 Type *ScalarTy = ReducedVals.front()->getType();
24623 unsigned ReduxWidth = ReducedVals.size();
24624 FixedVectorType *VectorTy = R.getReductionType();
24625 InstructionCost VectorCost = 0, ScalarCost;
24626 // If all of the reduced values are constant, the vector cost is 0, since
24627 // the reduction value can be calculated at compile time.
24628 bool AllConsts = allConstant(ReducedVals);
24629 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24630 InstructionCost Cost = 0;
24631 // Scalar cost is repeated for N-1 elements.
24632 int Cnt = ReducedVals.size();
24633 for (Value *RdxVal : ReducedVals) {
24634 if (Cnt == 1)
24635 break;
24636 --Cnt;
24637 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24638 Cost += GenCostFn();
24639 continue;
24640 }
24641 InstructionCost ScalarCost = 0;
24642 for (User *U : RdxVal->users()) {
24643 auto *RdxOp = cast<Instruction>(U);
24644 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24645 if (RdxKind == RecurKind::FAdd) {
24646 InstructionCost FMACost = canConvertToFMA(
24647 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24648 if (FMACost.isValid()) {
24649 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24650 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24651 // Also, exclude scalar fmul cost.
24652 InstructionCost FMulCost =
24653 TTI->getInstructionCost(I, CostKind);
24654 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24655 FMACost -= FMulCost;
24656 }
24657 ScalarCost += FMACost;
24658 continue;
24659 }
24660 }
24661 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24662 continue;
24663 }
24664 ScalarCost = InstructionCost::getInvalid();
24665 break;
24666 }
24667 if (ScalarCost.isValid())
24668 Cost += ScalarCost;
24669 else
24670 Cost += GenCostFn();
24671 }
24672 return Cost;
24673 };
24674 // Require the reduction cost if:
24675 // 1. This type is not a full register type and there is no other vector
24676 // with the same type in the storage (first vector with a small type).
24677 // 2. The storage does not have any vector with full vector use (first
24678 // vector with full register use).
24679 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24680 switch (RdxKind) {
24681 case RecurKind::Add:
24682 case RecurKind::Mul:
24683 case RecurKind::Or:
24684 case RecurKind::And:
24685 case RecurKind::Xor:
24686 case RecurKind::FAdd:
24687 case RecurKind::FMul: {
24688 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24689 if (!AllConsts) {
24690 if (DoesRequireReductionOp) {
24691 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24692 assert(SLPReVec && "FixedVectorType is not expected.");
24693 unsigned ScalarTyNumElements = VecTy->getNumElements();
24694 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24695 VectorCost += TTI->getShuffleCost(
24698 ReducedVals.size()),
24699 VectorTy,
24700 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24701 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24702 FMF, CostKind);
24703 }
24704 VectorCost += TTI->getScalarizationOverhead(
24705 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24706 /*Extract*/ false, TTI::TCK_RecipThroughput);
24707 } else {
24708 Type *RedTy = VectorTy->getElementType();
24709 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24710 std::make_pair(RedTy, true));
24711 if (RType == RedTy) {
24712 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24713 FMF, CostKind);
24714 } else {
24715 VectorCost = TTI->getExtendedReductionCost(
24716 RdxOpcode, !IsSigned, RedTy,
24717 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24718 }
24719 }
24720 } else {
24721 Type *RedTy = VectorTy->getElementType();
24722 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24723 std::make_pair(RedTy, true));
24724 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24725 InstructionCost FMACost = InstructionCost::getInvalid();
24726 if (RdxKind == RecurKind::FAdd) {
24727 // Check if the reduction operands can be converted to FMA.
24728 SmallVector<Value *> Ops;
24729 FastMathFlags FMF;
24730 FMF.set();
24731 for (Value *RdxVal : ReducedVals) {
24732 if (!RdxVal->hasOneUse()) {
24733 Ops.clear();
24734 break;
24735 }
24736 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24737 FMF &= FPCI->getFastMathFlags();
24738 Ops.push_back(RdxVal->user_back());
24739 }
24740 if (!Ops.empty()) {
24741 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24742 *TTI, TLI);
24743 if (FMACost.isValid()) {
24744 // Calculate actual FMAD cost.
24745 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
24746 {RVecTy, RVecTy, RVecTy}, FMF);
24747 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
24748
24749 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
24750 // Also, exclude vector fmul cost.
24751 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
24752 Instruction::FMul, RVecTy, CostKind);
24753 LLVM_DEBUG(dbgs()
24754 << "Minus vector FMul cost: " << FMulCost << "\n");
24755 FMACost -= FMulCost;
24756 }
24757 }
24758 }
24759 if (FMACost.isValid())
24760 VectorCost += FMACost;
24761 else
24762 VectorCost +=
24763 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24764 if (RType != RedTy) {
24765 unsigned Opcode = Instruction::Trunc;
24766 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24767 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24768 VectorCost += TTI->getCastInstrCost(
24769 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24770 }
24771 }
24772 }
24773 ScalarCost = EvaluateScalarCost([&]() {
24774 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24775 });
24776 break;
24777 }
24778 case RecurKind::FMax:
24779 case RecurKind::FMin:
24780 case RecurKind::FMaximum:
24781 case RecurKind::FMinimum:
24782 case RecurKind::SMax:
24783 case RecurKind::SMin:
24784 case RecurKind::UMax:
24785 case RecurKind::UMin: {
24786 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24787 if (!AllConsts) {
24788 if (DoesRequireReductionOp) {
24789 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24790 } else {
24791 // Check if the previous reduction already exists and account for it as a
24792 // series of operations + a single reduction.
24793 Type *RedTy = VectorTy->getElementType();
24794 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24795 std::make_pair(RedTy, true));
24796 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24797 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24798 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24799 if (RType != RedTy) {
24800 unsigned Opcode = Instruction::Trunc;
24801 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24802 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24803 VectorCost += TTI->getCastInstrCost(
24804 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24805 }
24806 }
24807 }
24808 ScalarCost = EvaluateScalarCost([&]() {
24809 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24810 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24811 });
24812 break;
24813 }
24814 default:
24815 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24816 }
24817
24818 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24819 << " for reduction of " << shortBundleName(ReducedVals)
24820 << " (It is a splitting reduction)\n");
24821 return VectorCost - ScalarCost;
24822 }
24823
24824 /// Splits the values stored in VectorValuesAndScales into registers/free
24825 /// sub-registers, combines them with the given reduction operation as a
24826 /// vector operation, and then performs a single (small enough) reduction.
24827 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24828 Type *DestTy) {
24829 Value *ReducedSubTree = nullptr;
24830 // Creates reduction and combines with the previous reduction.
24831 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24832 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24833 if (ReducedSubTree)
24834 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24835 "op.rdx", ReductionOps);
24836 else
24837 ReducedSubTree = Rdx;
24838 };
24839 if (VectorValuesAndScales.size() == 1) {
24840 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24841 CreateSingleOp(Vec, Scale, IsSigned);
24842 return ReducedSubTree;
24843 }
24844 // Scales Vec using the given Cnt scale factor and then combines the result
24845 // with the previous value of VecRes.
24846 Value *VecRes = nullptr;
24847 bool VecResSignedness = false;
24848 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24849 Type *ScalarTy = Vec->getType()->getScalarType();
24850 // Scale Vec using given Cnt scale factor.
24851 if (Cnt > 1) {
24852 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
24853 switch (RdxKind) {
24854 case RecurKind::Add: {
24855 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24856 unsigned VF = getNumElements(Vec->getType());
24857 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
24858 << ". (HorRdx)\n");
24859 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
24860 for (unsigned I : seq<unsigned>(Cnt))
24861 std::iota(std::next(Mask.begin(), VF * I),
24862 std::next(Mask.begin(), VF * (I + 1)), 0);
24863 ++NumVectorInstructions;
24864 Vec = Builder.CreateShuffleVector(Vec, Mask);
24865 break;
24866 }
24867 // res = mul vv, n
24868 if (ScalarTy != DestTy->getScalarType())
24869 Vec = Builder.CreateIntCast(
24870 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24871 IsSigned);
24872 Value *Scale = ConstantVector::getSplat(
24873 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
24874 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24875 << ". (HorRdx)\n");
24876 ++NumVectorInstructions;
24877 Vec = Builder.CreateMul(Vec, Scale);
24878 break;
24879 }
24880 case RecurKind::Xor: {
24881 // res = n % 2 ? 0 : vv
24883 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24884 if (Cnt % 2 == 0)
24885 Vec = Constant::getNullValue(Vec->getType());
24886 break;
24887 }
24888 case RecurKind::FAdd: {
24889 // res = fmul v, n
24890 Value *Scale =
24891 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
24892 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24893 << ". (HorRdx)\n");
24894 ++NumVectorInstructions;
24895 Vec = Builder.CreateFMul(Vec, Scale);
24896 break;
24897 }
24898 case RecurKind::And:
24899 case RecurKind::Or:
24900 case RecurKind::SMax:
24901 case RecurKind::SMin:
24902 case RecurKind::UMax:
24903 case RecurKind::UMin:
24904 case RecurKind::FMax:
24905 case RecurKind::FMin:
24906 case RecurKind::FMaximum:
24907 case RecurKind::FMinimum:
24908 // res = vv
24909 break;
24910 case RecurKind::Sub:
24911 case RecurKind::AddChainWithSubs:
24912 case RecurKind::Mul:
24913 case RecurKind::FMul:
24914 case RecurKind::FMulAdd:
24915 case RecurKind::AnyOf:
24916 case RecurKind::FindFirstIVSMin:
24917 case RecurKind::FindFirstIVUMin:
24918 case RecurKind::FindLastIVSMax:
24919 case RecurKind::FindLastIVUMax:
24920 case RecurKind::FMaxNum:
24921 case RecurKind::FMinNum:
24922 case RecurKind::FMaximumNum:
24923 case RecurKind::FMinimumNum:
24924 case RecurKind::None:
24925 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24926 }
24927 }
24928 // Combine Vec with the previous VecRes.
24929 if (!VecRes) {
24930 VecRes = Vec;
24931 VecResSignedness = IsSigned;
24932 } else {
24933 ++NumVectorInstructions;
24934 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
24935 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
24936 // Handle ctpop.
24937 unsigned VecResVF = getNumElements(VecRes->getType());
24938 unsigned VecVF = getNumElements(Vec->getType());
24939 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
24940 std::iota(Mask.begin(), Mask.end(), 0);
24941 // Ensure that VecRes is always larger than Vec
24942 if (VecResVF < VecVF) {
24943 std::swap(VecRes, Vec);
24944 std::swap(VecResVF, VecVF);
24945 }
24946 if (VecResVF != VecVF) {
24947 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
24948 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
24949 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
24950 }
24951 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
24952 return;
24953 }
24954 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
24955 VecRes = Builder.CreateIntCast(
24956 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
24957 VecResSignedness);
24958 if (ScalarTy != DestTy->getScalarType())
24959 Vec = Builder.CreateIntCast(
24960 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24961 IsSigned);
24962 unsigned VecResVF = getNumElements(VecRes->getType());
24963 unsigned VecVF = getNumElements(Vec->getType());
24964 // Ensure that VecRes is always larger than Vec
24965 if (VecResVF < VecVF) {
24966 std::swap(VecRes, Vec);
24967 std::swap(VecResVF, VecVF);
24968 }
24969 // extract + op + insert
24970 Value *Op = VecRes;
24971 if (VecResVF != VecVF)
24972 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
24973 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
24974 if (VecResVF != VecVF)
24975 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
24976 VecRes = Op;
24977 }
24978 };
24979 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
24980 CreateVecOp(Vec, Scale, IsSigned);
24981 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
24982
24983 return ReducedSubTree;
24984 }
24985
24986 /// Emit a horizontal reduction of the vectorized value.
24987 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
24988 const TargetTransformInfo *TTI, Type *DestTy) {
24989 assert(VectorizedValue && "Need to have a vectorized tree node");
24990 assert(RdxKind != RecurKind::FMulAdd &&
24991 "A call to the llvm.fmuladd intrinsic is not handled yet");
24992
24993 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
24994 if (FTy->getScalarType() == Builder.getInt1Ty() &&
24995 RdxKind == RecurKind::Add &&
24996 DestTy->getScalarType() != FTy->getScalarType()) {
24997 // Convert vector_reduce_add(ZExt(<n x i1>)) to
24998 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
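 // Illustrative IR only (names are hypothetical): reducing <4 x i1> %m with
 // add into an i32 result becomes roughly
 //   %int = bitcast <4 x i1> %m to i4
 //   %cnt = call i4 @llvm.ctpop.i4(i4 %int)
 //   %res = zext i4 %cnt to i32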
24999 Value *V = Builder.CreateBitCast(
25000 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25001 ++NumVectorInstructions;
25002 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25003 }
25004 ++NumVectorInstructions;
25005 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25006 }
25007
25008 /// Emits optimized code for unique scalar value reused \p Cnt times.
25009 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25010 unsigned Cnt) {
25011 assert(IsSupportedHorRdxIdentityOp &&
25012 "The optimization of matched scalar identity horizontal reductions "
25013 "must be supported.");
25014 if (Cnt == 1)
25015 return VectorizedValue;
25016 switch (RdxKind) {
25017 case RecurKind::Add: {
25018 // res = mul vv, n
25019 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25020 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25021 << VectorizedValue << ". (HorRdx)\n");
25022 return Builder.CreateMul(VectorizedValue, Scale);
25023 }
25024 case RecurKind::Xor: {
25025 // res = n % 2 ? 0 : vv
25026 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25027 << ". (HorRdx)\n");
25028 if (Cnt % 2 == 0)
25029 return Constant::getNullValue(VectorizedValue->getType());
25030 return VectorizedValue;
25031 }
25032 case RecurKind::FAdd: {
25033 // res = fmul v, n
25034 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25035 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25036 << VectorizedValue << ". (HorRdx)\n");
25037 return Builder.CreateFMul(VectorizedValue, Scale);
25038 }
25039 case RecurKind::And:
25040 case RecurKind::Or:
25041 case RecurKind::SMax:
25042 case RecurKind::SMin:
25043 case RecurKind::UMax:
25044 case RecurKind::UMin:
25045 case RecurKind::FMax:
25046 case RecurKind::FMin:
25047 case RecurKind::FMaximum:
25048 case RecurKind::FMinimum:
25049 // res = vv
25050 return VectorizedValue;
25051 case RecurKind::Sub:
25052 case RecurKind::AddChainWithSubs:
25053 case RecurKind::Mul:
25054 case RecurKind::FMul:
25055 case RecurKind::FMulAdd:
25056 case RecurKind::AnyOf:
25057 case RecurKind::FindFirstIVSMin:
25058 case RecurKind::FindFirstIVUMin:
25059 case RecurKind::FindLastIVSMax:
25060 case RecurKind::FindLastIVUMax:
25061 case RecurKind::FMaxNum:
25062 case RecurKind::FMinNum:
25063 case RecurKind::FMaximumNum:
25064 case RecurKind::FMinimumNum:
25065 case RecurKind::None:
25066 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25067 }
25068 return nullptr;
25069 }
25070
25071 /// Emits actual operation for the scalar identity values, found during
25072 /// horizontal reduction analysis.
25073 Value *
25074 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25075 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25076 const DenseMap<Value *, Value *> &TrackedToOrig) {
25077 assert(IsSupportedHorRdxIdentityOp &&
25078 "The optimization of matched scalar identity horizontal reductions "
25079 "must be supported.");
25080 ArrayRef<Value *> VL = R.getRootNodeScalars();
25081 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25082 if (VTy->getElementType() != VL.front()->getType()) {
25083 VectorizedValue = Builder.CreateIntCast(
25084 VectorizedValue,
25085 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25086 R.isSignedMinBitwidthRootNode());
25087 }
25088 switch (RdxKind) {
25089 case RecurKind::Add: {
25090 // root = mul prev_root, <1, 1, n, 1>
25091 SmallVector<Constant *> Vals;
25092 for (Value *V : VL) {
25093 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25094 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25095 }
25096 auto *Scale = ConstantVector::get(Vals);
25097 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25098 << VectorizedValue << ". (HorRdx)\n");
25099 return Builder.CreateMul(VectorizedValue, Scale);
25100 }
25101 case RecurKind::And:
25102 case RecurKind::Or:
25103 // No need for multiple or/and(s).
25104 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25105 << ". (HorRdx)\n");
25106 return VectorizedValue;
25107 case RecurKind::SMax:
25108 case RecurKind::SMin:
25109 case RecurKind::UMax:
25110 case RecurKind::UMin:
25111 case RecurKind::FMax:
25112 case RecurKind::FMin:
25113 case RecurKind::FMaximum:
25114 case RecurKind::FMinimum:
25115 // No need for multiple min/max(s) of the same value.
25116 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25117 << ". (HorRdx)\n");
25118 return VectorizedValue;
25119 case RecurKind::Xor: {
25120 // Replace values with an even number of repeats with 0, since
25121 // x xor x = 0.
25122 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25123 // 7>, if the 4th and 6th elements have an even number of repeats.
25124 SmallVector<int> Mask(
25125 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25126 PoisonMaskElem);
25127 std::iota(Mask.begin(), Mask.end(), 0);
25128 bool NeedShuffle = false;
25129 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25130 Value *V = VL[I];
25131 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25132 if (Cnt % 2 == 0) {
25133 Mask[I] = VF;
25134 NeedShuffle = true;
25135 }
25136 }
25137 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25138 : Mask) dbgs()
25139 << I << " ";
25140 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25141 if (NeedShuffle)
25142 VectorizedValue = Builder.CreateShuffleVector(
25143 VectorizedValue,
25144 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25145 return VectorizedValue;
25146 }
25147 case RecurKind::FAdd: {
25148 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25149 SmallVector<Constant *> Vals;
25150 for (Value *V : VL) {
25151 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25152 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25153 }
25154 auto *Scale = ConstantVector::get(Vals);
25155 return Builder.CreateFMul(VectorizedValue, Scale);
25156 }
25157 case RecurKind::Sub:
25158 case RecurKind::AddChainWithSubs:
25159 case RecurKind::Mul:
25160 case RecurKind::FMul:
25161 case RecurKind::FMulAdd:
25162 case RecurKind::AnyOf:
25163 case RecurKind::FindFirstIVSMin:
25164 case RecurKind::FindFirstIVUMin:
25165 case RecurKind::FindLastIVSMax:
25166 case RecurKind::FindLastIVUMax:
25167 case RecurKind::FMaxNum:
25168 case RecurKind::FMinNum:
25169 case RecurKind::FMaximumNum:
25170 case RecurKind::FMinimumNum:
25171 case RecurKind::None:
25172 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25173 }
25174 return nullptr;
25175 }
25176};
25177} // end anonymous namespace
25178
25179/// Gets recurrence kind from the specified value.
25180 static RecurKind getRdxKind(Value *V) {
25181 return HorizontalReduction::getRdxKind(V);
25182}
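// Illustrative only: for [2 x {float, float}] getAggregateSize below returns
// 4, for {<2 x float>, <2 x float>} it also returns 4, and for a
// non-homogeneous aggregate such as {float, i32} it returns std::nullopt.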
25183static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25184 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25185 return cast<FixedVectorType>(IE->getType())->getNumElements();
25186
25187 unsigned AggregateSize = 1;
25188 auto *IV = cast<InsertValueInst>(InsertInst);
25189 Type *CurrentType = IV->getType();
25190 do {
25191 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25192 for (auto *Elt : ST->elements())
25193 if (Elt != ST->getElementType(0)) // check homogeneity
25194 return std::nullopt;
25195 AggregateSize *= ST->getNumElements();
25196 CurrentType = ST->getElementType(0);
25197 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25198 AggregateSize *= AT->getNumElements();
25199 CurrentType = AT->getElementType();
25200 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25201 AggregateSize *= VT->getNumElements();
25202 return AggregateSize;
25203 } else if (CurrentType->isSingleValueType()) {
25204 return AggregateSize;
25205 } else {
25206 return std::nullopt;
25207 }
25208 } while (true);
25209}
25210
25211static void findBuildAggregateRec(Instruction *LastInsertInst,
25212 TargetTransformInfo *TTI,
25213 SmallVectorImpl<Value *> &BuildVectorOpds,
25214 SmallVectorImpl<Value *> &InsertElts,
25215 unsigned OperandOffset, const BoUpSLP &R) {
25216 do {
25217 Value *InsertedOperand = LastInsertInst->getOperand(1);
25218 std::optional<unsigned> OperandIndex =
25219 getElementIndex(LastInsertInst, OperandOffset);
25220 if (!OperandIndex || R.isDeleted(LastInsertInst))
25221 return;
25222 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25223 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25224 BuildVectorOpds, InsertElts, *OperandIndex, R);
25225
25226 } else {
25227 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25228 InsertElts[*OperandIndex] = LastInsertInst;
25229 }
25230 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25231 } while (LastInsertInst != nullptr &&
25232 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25233 LastInsertInst->hasOneUse());
25234}
25235
25236/// Recognize construction of vectors like
25237/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25238/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25239/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25240/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25241/// starting from the last insertelement or insertvalue instruction.
25242///
25243/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25244/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25245/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25246///
25247/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25248///
25249/// \return true if it matches.
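/// Illustrative example (not from the source): a homogeneous struct build
///   %s0 = insertvalue {float, float} poison, float %a, 0
///   %s1 = insertvalue {float, float} %s0, float %b, 1
/// is flattened into BuildVectorOpds = {%a, %b}, with the insertvalues
/// recorded in InsertElts.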
25250static bool findBuildAggregate(Instruction *LastInsertInst,
25251 TargetTransformInfo *TTI,
25252 SmallVectorImpl<Value *> &BuildVectorOpds,
25253 SmallVectorImpl<Value *> &InsertElts,
25254 const BoUpSLP &R) {
25255
25256 assert((isa<InsertElementInst>(LastInsertInst) ||
25257 isa<InsertValueInst>(LastInsertInst)) &&
25258 "Expected insertelement or insertvalue instruction!");
25259
25260 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25261 "Expected empty result vectors!");
25262
25263 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25264 if (!AggregateSize)
25265 return false;
25266 BuildVectorOpds.resize(*AggregateSize);
25267 InsertElts.resize(*AggregateSize);
25268
25269 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25270 llvm::erase(BuildVectorOpds, nullptr);
25271 llvm::erase(InsertElts, nullptr);
25272 if (BuildVectorOpds.size() >= 2)
25273 return true;
25274
25275 return false;
25276}
25277
25278/// Try and get a reduction instruction from a phi node.
25279///
25280/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25281/// if they come from either \p ParentBB or a containing loop latch.
25282///
25283/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25284/// if not possible.
25285 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25286 BasicBlock *ParentBB, LoopInfo *LI) {
25287 // There are situations where the reduction value is not dominated by the
25288 // reduction phi. Vectorizing such cases has been reported to cause
25289 // miscompiles. See PR25787.
25290 auto DominatedReduxValue = [&](Value *R) {
25291 return isa<Instruction>(R) &&
25292 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25293 };
25294
25295 Instruction *Rdx = nullptr;
25296
25297 // Return the incoming value if it comes from the same BB as the phi node.
25298 if (P->getIncomingBlock(0) == ParentBB) {
25299 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25300 } else if (P->getIncomingBlock(1) == ParentBB) {
25301 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25302 }
25303
25304 if (Rdx && DominatedReduxValue(Rdx))
25305 return Rdx;
25306
25307 // Otherwise, check whether we have a loop latch to look at.
25308 Loop *BBL = LI->getLoopFor(ParentBB);
25309 if (!BBL)
25310 return nullptr;
25311 BasicBlock *BBLatch = BBL->getLoopLatch();
25312 if (!BBLatch)
25313 return nullptr;
25314
25315 // There is a loop latch, return the incoming value if it comes from
25316 // that. This reduction pattern occasionally turns up.
25317 if (P->getIncomingBlock(0) == BBLatch) {
25318 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25319 } else if (P->getIncomingBlock(1) == BBLatch) {
25320 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25321 }
25322
25323 if (Rdx && DominatedReduxValue(Rdx))
25324 return Rdx;
25325
25326 return nullptr;
25327}
25328
25329static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25330 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25331 return true;
25332 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25333 return true;
25334 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25335 return true;
25336 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25337 return true;
25338 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25339 return true;
25341 return true;
25343 return true;
25345 return true;
25347 return true;
25348 return false;
25349}
25350
25351/// We could have an initial reduction that is not an add.
25352/// r *= v1 + v2 + v3 + v4
25353/// In such a case start looking for a tree rooted in the first '+'.
25354 /// \returns the new root if found, which may be nullptr if not an instruction.
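/// For instance (a sketch, not from the source): given
///   %sum = fadd float %v1, %v2
///   %r   = fmul float %phi, %sum
/// with \p Root being the fmul and \p Phi its phi operand, the fadd is
/// returned so reduction matching can restart there.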
25355 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25356 Instruction *Root) {
25357 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25358 isa<IntrinsicInst>(Root)) &&
25359 "Expected binop, select, or intrinsic for reduction matching");
25360 Value *LHS =
25361 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25362 Value *RHS =
25363 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25364 if (LHS == Phi)
25365 return dyn_cast<Instruction>(RHS);
25366 if (RHS == Phi)
25367 return dyn_cast<Instruction>(LHS);
25368 return nullptr;
25369}
25370
25371 /// \returns the first operand of \p I that does not match \p Phi. If the
25372 /// operand is not an instruction, returns nullptr.
25373 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25374 Value *Op0 = nullptr;
25375 Value *Op1 = nullptr;
25376 if (!matchRdxBop(I, Op0, Op1))
25377 return nullptr;
25378 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25379}
25380
25381 /// \returns true if \p I is a candidate instruction for reduction vectorization.
25382 static bool isReductionCandidate(Instruction *I) {
25383 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25384 Value *B0 = nullptr, *B1 = nullptr;
25385 bool IsBinop = matchRdxBop(I, B0, B1);
25386 return IsBinop || IsSelect;
25387}
25388
25389bool SLPVectorizerPass::vectorizeHorReduction(
25390 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25391 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25392 if (!ShouldVectorizeHor)
25393 return false;
25394 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25395
25396 if (Root->getParent() != BB || isa<PHINode>(Root))
25397 return false;
25398
25399 // If we can find a secondary reduction root, use that instead.
25400 auto SelectRoot = [&]() {
25401 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25402 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25403 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25404 return NewRoot;
25405 return Root;
25406 };
25407
25408 // Start the analysis from the Root instruction. If a horizontal reduction is
25409 // found, try to vectorize it. If it is not a horizontal reduction, or
25410 // vectorization is not possible or not effective, and the currently analyzed
25411 // instruction is a binary operation, try to vectorize the operands, using
25412 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25413 // the same procedure considering each operand as a possible root of the
25414 // horizontal reduction.
25415 // Interrupt the process if the Root instruction itself was vectorized or all
25416 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
25417 // If a horizontal reduction was not matched or vectorized, we collect
25418 // instructions for possible later attempts for vectorization.
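// Example (a sketch, not from the source): for r = (a + b) + (c + d), the
// outer add is analyzed first; if it cannot be reduced, the inner adds
// (a + b) and (c + d) are queued and later tried as candidate roots.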
25419 std::queue<std::pair<Instruction *, unsigned>> Stack;
25420 Stack.emplace(SelectRoot(), 0);
25421 SmallPtrSet<Value *, 8> VisitedInstrs;
25422 bool Res = false;
25423 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25424 if (R.isAnalyzedReductionRoot(Inst))
25425 return nullptr;
25426 if (!isReductionCandidate(Inst))
25427 return nullptr;
25428 HorizontalReduction HorRdx;
25429 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25430 return nullptr;
25431 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25432 };
25433 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25434 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25435 FutureSeed = getNonPhiOperand(Root, P);
25436 if (!FutureSeed)
25437 return false;
25438 }
25439 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25440 // analysis is done separately.
25441 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25442 PostponedInsts.push_back(FutureSeed);
25443 return true;
25444 };
25445
25446 while (!Stack.empty()) {
25447 Instruction *Inst;
25448 unsigned Level;
25449 std::tie(Inst, Level) = Stack.front();
25450 Stack.pop();
25451 // Do not try to analyze an instruction that has already been vectorized.
25452 // This may happen when we vectorize instruction operands on a previous
25453 // iteration while the stack was populated before that happened.
25454 if (R.isDeleted(Inst))
25455 continue;
25456 if (Value *VectorizedV = TryToReduce(Inst)) {
25457 Res = true;
25458 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25459 // Try to find another reduction.
25460 Stack.emplace(I, Level);
25461 continue;
25462 }
25463 if (R.isDeleted(Inst))
25464 continue;
25465 } else {
25466 // We could not vectorize `Inst` so try to use it as a future seed.
25467 if (!TryAppendToPostponedInsts(Inst)) {
25468 assert(Stack.empty() && "Expected empty stack");
25469 break;
25470 }
25471 }
25472
25473 // Try to vectorize operands.
25474 // Continue the analysis only for instructions from the same basic block,
25475 // to save compile time.
25476 if (++Level < RecursionMaxDepth)
25477 for (auto *Op : Inst->operand_values())
25478 if (VisitedInstrs.insert(Op).second)
25479 if (auto *I = dyn_cast<Instruction>(Op))
25480 // Do not try to vectorize CmpInst operands, this is done
25481 // separately.
25483 !R.isDeleted(I) && I->getParent() == BB)
25484 Stack.emplace(I, Level);
25485 }
25486 return Res;
25487}
25488
25489bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25490 if (!I)
25491 return false;
25492
25493 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25494 return false;
25495 // Skip potential FMA candidates.
25496 if ((I->getOpcode() == Instruction::FAdd ||
25497 I->getOpcode() == Instruction::FSub) &&
25498 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25499 .isValid())
25500 return false;
25501
25502 Value *P = I->getParent();
25503
25504 // Vectorize in current basic block only.
25505 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25506 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25507 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25508 R.isDeleted(Op0) || R.isDeleted(Op1))
25509 return false;
25510
25511 // First collect all possible candidates
25512 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25513 Candidates.emplace_back(Op0, Op1);
25514
25515 auto *A = dyn_cast<BinaryOperator>(Op0);
25516 auto *B = dyn_cast<BinaryOperator>(Op1);
25517 // Try to skip B.
25518 if (A && B && B->hasOneUse()) {
25519 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25520 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25521 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25522 Candidates.emplace_back(A, B0);
25523 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25524 Candidates.emplace_back(A, B1);
25525 }
25526 // Try to skip A.
25527 if (B && A && A->hasOneUse()) {
25528 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25529 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25530 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25531 Candidates.emplace_back(A0, B);
25532 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25533 Candidates.emplace_back(A1, B);
25534 }
25535
25536 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25537 ArrayRef<Value *> Ops) {
25538 if (!isReductionCandidate(Inst))
25539 return false;
25540 Type *Ty = Inst->getType();
25541 if (!isValidElementType(Ty) || Ty->isPointerTy())
25542 return false;
25543 HorizontalReduction HorRdx(Inst, Ops);
25544 if (!HorRdx.matchReductionForOperands())
25545 return false;
25546 // Check the cost of operations.
25547 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25548 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25549 InstructionCost ScalarCost =
25550 TTI.getScalarizationOverhead(
25551 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25552 /*Extract=*/true, CostKind) +
25553 TTI.getInstructionCost(Inst, CostKind);
25554 InstructionCost RedCost;
25555 switch (::getRdxKind(Inst)) {
25556 case RecurKind::Add:
25557 case RecurKind::Mul:
25558 case RecurKind::Or:
25559 case RecurKind::And:
25560 case RecurKind::Xor:
25561 case RecurKind::FAdd:
25562 case RecurKind::FMul: {
25563 FastMathFlags FMF;
25564 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25565 FMF = FPCI->getFastMathFlags();
25566 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25567 CostKind);
25568 break;
25569 }
25570 default:
25571 return false;
25572 }
25573 if (RedCost >= ScalarCost)
25574 return false;
25575
25576 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25577 };
25578 if (Candidates.size() == 1)
25579 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25580
25581 // We have multiple options. Try to pick the single best.
25582 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25583 if (!BestCandidate)
25584 return false;
25585 return (*BestCandidate == 0 &&
25586 TryToReduce(I, {Candidates[*BestCandidate].first,
25587 Candidates[*BestCandidate].second})) ||
25588 tryToVectorizeList({Candidates[*BestCandidate].first,
25589 Candidates[*BestCandidate].second},
25590 R);
25591}
25592
25593bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25594 BasicBlock *BB, BoUpSLP &R) {
25595 SmallVector<WeakTrackingVH> PostponedInsts;
25596 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25597 Res |= tryToVectorize(PostponedInsts, R);
25598 return Res;
25599}
25600
25601bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25602 BoUpSLP &R) {
25603 bool Res = false;
25604 for (Value *V : Insts)
25605 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25606 Res |= tryToVectorize(Inst, R);
25607 return Res;
25608}
25609
25610bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25611 BasicBlock *BB, BoUpSLP &R,
25612 bool MaxVFOnly) {
25613 if (!R.canMapToVector(IVI->getType()))
25614 return false;
25615
25616 SmallVector<Value *, 16> BuildVectorOpds;
25617 SmallVector<Value *, 16> BuildVectorInsts;
25618 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25619 return false;
25620
25621 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25622 R.getORE()->emit([&]() {
25623 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25624 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25625 "trying reduction first.";
25626 });
25627 return false;
25628 }
25629 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25630 // Aggregate value is unlikely to be processed in vector register.
25631 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25632}
25633
25634bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25635 BasicBlock *BB, BoUpSLP &R,
25636 bool MaxVFOnly) {
25637 SmallVector<Value *, 16> BuildVectorInsts;
25638 SmallVector<Value *, 16> BuildVectorOpds;
25639 SmallVector<int> Mask;
25640 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25641 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25642 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25643 return false;
25644
25645 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25646 R.getORE()->emit([&]() {
25647 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25648 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25649 "trying reduction first.";
25650 });
25651 return false;
25652 }
25653 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25654 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25655}
25656
25657template <typename T>
25658 static bool tryToVectorizeSequence(
25659 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25660 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25661 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25662 bool MaxVFOnly, BoUpSLP &R) {
25663 bool Changed = false;
25664 // Sort by type, parent, operands.
25665 stable_sort(Incoming, Comparator);
25666
25667 // Try to vectorize elements based on their type.
25668 SmallVector<T *> Candidates;
25669 SmallVector<T *> VL;
25670 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25671 VL.clear()) {
25672 // Look for the next elements with the same type, parent and operand
25673 // kinds.
25674 auto *I = dyn_cast<Instruction>(*IncIt);
25675 if (!I || R.isDeleted(I)) {
25676 ++IncIt;
25677 continue;
25678 }
25679 auto *SameTypeIt = IncIt;
25680 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25681 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25682 AreCompatible(VL, *SameTypeIt))) {
25683 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25684 ++SameTypeIt;
25685 if (I && !R.isDeleted(I))
25686 VL.push_back(cast<T>(I));
25687 }
25688
25689 // Try to vectorize them.
25690 unsigned NumElts = VL.size();
25691 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25692 << NumElts << ")\n");
25693 // The vectorization is a 3-stage attempt:
25694 // 1. Try to vectorize instructions with the same/alternate opcodes,
25695 // using the maximal register size, first.
25696 // 2. Try to vectorize the remaining instructions with the same type, if
25697 // possible. This may yield better vectorization results than trying to
25698 // vectorize only instructions with the same/alternate opcodes.
25699 // 3. As a final attempt, try to vectorize all instructions with the
25700 // same/alternate ops only; this may result in some extra final
25701 // vectorization.
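// Example (a sketch, not from the source): with a maximal VF of 4 and six
// compatible scalars, stage 1 may vectorize the first four as one group,
// while stages 2 and 3 retry the remaining two together with other
// leftover instructions of the same type.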
25702 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25703 // Success, start over because instructions might have been changed.
25704 Changed = true;
25705 VL.swap(Candidates);
25706 Candidates.clear();
25707 for (T *V : VL) {
25708 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25709 Candidates.push_back(V);
25710 }
25711 } else {
25712 /// \Returns the minimum number of elements that we will attempt to
25713 /// vectorize.
25714 auto GetMinNumElements = [&R](Value *V) {
25715 unsigned EltSize = R.getVectorElementSize(V);
25716 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25717 };
25718 if (NumElts < GetMinNumElements(*IncIt) &&
25719 (Candidates.empty() ||
25720 Candidates.front()->getType() == (*IncIt)->getType())) {
25721 for (T *V : VL) {
25722 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25723 Candidates.push_back(V);
25724 }
25725 }
25726 }
25727 // Final attempt to vectorize instructions with the same types.
25728 if (Candidates.size() > 1 &&
25729 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25730 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25731 // Success, start over because instructions might have been changed.
25732 Changed = true;
25733 } else if (MaxVFOnly) {
25734 // Try to vectorize using small vectors.
25735 SmallVector<T *> VL;
25736 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25737 VL.clear()) {
25738 auto *I = dyn_cast<Instruction>(*It);
25739 if (!I || R.isDeleted(I)) {
25740 ++It;
25741 continue;
25742 }
25743 auto *SameTypeIt = It;
25744 while (SameTypeIt != End &&
25745 (!isa<Instruction>(*SameTypeIt) ||
25746 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25747 AreCompatible(*SameTypeIt, *It))) {
25748 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25749 ++SameTypeIt;
25750 if (I && !R.isDeleted(I))
25751 VL.push_back(cast<T>(I));
25752 }
25753 unsigned NumElts = VL.size();
25754 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25755 /*MaxVFOnly=*/false))
25756 Changed = true;
25757 It = SameTypeIt;
25758 }
25759 }
25760 Candidates.clear();
25761 }
25762
25763 // Start over at the next instruction of a different type (or the end).
25764 IncIt = SameTypeIt;
25765 }
25766 return Changed;
25767}
25768
25769/// Compare two cmp instructions. If IsCompatibility is true, function returns
25770 /// true if 2 cmps have same/swapped predicates and most compatible corresponding
25771/// operands. If IsCompatibility is false, function implements strict weak
25772/// ordering relation between two cmp instructions, returning true if the first
25773/// instruction is "less" than the second, i.e. its predicate is less than the
25774/// predicate of the second or the operands IDs are less than the operands IDs
25775/// of the second cmp instruction.
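/// Illustrative example (not from the source): icmp slt %x, %y and
/// icmp sgt %y, %x use swapped predicates on the same operands, so in
/// compatibility mode they are treated as compatible.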
25776template <bool IsCompatibility>
25777static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25778 const DominatorTree &DT) {
25779 assert(isValidElementType(V->getType()) &&
25780 isValidElementType(V2->getType()) &&
25781 "Expected valid element types only.");
25782 if (V == V2)
25783 return IsCompatibility;
25784 auto *CI1 = cast<CmpInst>(V);
25785 auto *CI2 = cast<CmpInst>(V2);
25786 if (CI1->getOperand(0)->getType()->getTypeID() <
25787 CI2->getOperand(0)->getType()->getTypeID())
25788 return !IsCompatibility;
25789 if (CI1->getOperand(0)->getType()->getTypeID() >
25790 CI2->getOperand(0)->getType()->getTypeID())
25791 return false;
25792 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25793 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25794 return !IsCompatibility;
25795 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25796 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25797 return false;
25798 CmpInst::Predicate Pred1 = CI1->getPredicate();
25799 CmpInst::Predicate Pred2 = CI2->getPredicate();
25800 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
25801 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
25802 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25803 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25804 if (BasePred1 < BasePred2)
25805 return !IsCompatibility;
25806 if (BasePred1 > BasePred2)
25807 return false;
25808 // Compare operands.
25809 bool CI1Preds = Pred1 == BasePred1;
25810 bool CI2Preds = Pred2 == BasePred1;
25811 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25812 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25813 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25814 if (Op1 == Op2)
25815 continue;
25816 if (Op1->getValueID() < Op2->getValueID())
25817 return !IsCompatibility;
25818 if (Op1->getValueID() > Op2->getValueID())
25819 return false;
25820 if (auto *I1 = dyn_cast<Instruction>(Op1))
25821 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25822 if (IsCompatibility) {
25823 if (I1->getParent() != I2->getParent())
25824 return false;
25825 } else {
25826 // Try to compare nodes with same parent.
25827 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25828 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25829 if (!NodeI1)
25830 return NodeI2 != nullptr;
25831 if (!NodeI2)
25832 return false;
25833 assert((NodeI1 == NodeI2) ==
25834 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25835 "Different nodes should have different DFS numbers");
25836 if (NodeI1 != NodeI2)
25837 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25838 }
25839 InstructionsState S = getSameOpcode({I1, I2}, TLI);
25840 if (S && (IsCompatibility || !S.isAltShuffle()))
25841 continue;
25842 if (IsCompatibility)
25843 return false;
25844 if (I1->getOpcode() != I2->getOpcode())
25845 return I1->getOpcode() < I2->getOpcode();
25846 }
25847 }
25848 return IsCompatibility;
25849}
25850
25851template <typename ItT>
25852bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
25853 BasicBlock *BB, BoUpSLP &R) {
25854 bool Changed = false;
25855 // Try to find reductions first.
25856 for (CmpInst *I : CmpInsts) {
25857 if (R.isDeleted(I))
25858 continue;
25859 for (Value *Op : I->operands())
25860 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
25861 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25862 if (R.isDeleted(I))
25863 break;
25864 }
25865 }
25866 // Try to vectorize operands as vector bundles.
25867 for (CmpInst *I : CmpInsts) {
25868 if (R.isDeleted(I))
25869 continue;
25870 Changed |= tryToVectorize(I, R);
25871 }
25872 // Try to vectorize list of compares.
25873 // Sort by type, compare predicate, etc.
25874 auto CompareSorter = [&](Value *V, Value *V2) {
25875 if (V == V2)
25876 return false;
25877 return compareCmp<false>(V, V2, *TLI, *DT);
25878 };
25879
25880 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
25881 if (VL.empty() || VL.back() == V1)
25882 return true;
25883 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
25884 };
25885
25886 SmallVector<Value *> Vals;
25887 for (Instruction *V : CmpInsts)
25888 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
25889 Vals.push_back(V);
25890 if (Vals.size() <= 1)
25891 return Changed;
25892 Changed |= tryToVectorizeSequence<Value>(
25893 Vals, CompareSorter, AreCompatibleCompares,
25894 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
25895 // Exclude possible reductions from other blocks.
25896 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25897 return any_of(V->users(), [V](User *U) {
25898 auto *Select = dyn_cast<SelectInst>(U);
25899 return Select &&
25900 Select->getParent() != cast<Instruction>(V)->getParent();
25901 });
25902 });
25903 if (ArePossiblyReducedInOtherBlock)
25904 return false;
25905 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25906 },
25907 /*MaxVFOnly=*/true, R);
25908 return Changed;
25909}
25910
25911bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25912 BasicBlock *BB, BoUpSLP &R) {
25914 "This function only accepts Insert instructions");
25915 bool OpsChanged = false;
25916 SmallVector<WeakTrackingVH> PostponedInsts;
25917 for (auto *I : reverse(Instructions)) {
25918 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
25919 if (R.isDeleted(I) || isa<CmpInst>(I))
25920 continue;
25921 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25922 OpsChanged |=
25923 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
25924 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25925 OpsChanged |=
25926 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
25927 }
25928 // pass2 - try to vectorize reductions only
25929 if (R.isDeleted(I))
25930 continue;
25931 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
25932 if (R.isDeleted(I) || isa<CmpInst>(I))
25933 continue;
25934 // pass3 - try to match and vectorize a buildvector sequence.
25935 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25936 OpsChanged |=
25937 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
25938 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25939 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
25940 /*MaxVFOnly=*/false);
25941 }
25942 }
25943 // Now try to vectorize postponed instructions.
25944 OpsChanged |= tryToVectorize(PostponedInsts, R);
25945
25946 Instructions.clear();
25947 return OpsChanged;
25948}
25949
25950bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
25951 bool Changed = false;
25952 SmallVector<Value *, 4> Incoming;
25953 SmallPtrSet<Value *, 16> VisitedInstrs;
25954 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
25955 // node. This helps to identify the chains that can be vectorized more
25956 // effectively.
25957 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
25958 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
25959 assert(isValidElementType(V1->getType()) &&
25960 isValidElementType(V2->getType()) &&
25961 "Expected vectorizable types only.");
25962 if (V1 == V2)
25963 return false;
25964 // It is fine to compare type IDs here, since we expect only vectorizable
25965 // types, like ints, floats and pointers; we don't care about other types.
25966 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
25967 return true;
25968 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
25969 return false;
25970 if (V1->getType()->getScalarSizeInBits() <
25971 V2->getType()->getScalarSizeInBits())
25972 return true;
25973 if (V1->getType()->getScalarSizeInBits() >
25974 V2->getType()->getScalarSizeInBits())
25975 return false;
25976 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
25977 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
25978 if (Opcodes1.size() < Opcodes2.size())
25979 return true;
25980 if (Opcodes1.size() > Opcodes2.size())
25981 return false;
25982 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
25983 {
25984 // Instructions come first.
25985 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
25986 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
25987 if (I1 && I2) {
25988 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
25989 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
25990 if (!NodeI1)
25991 return NodeI2 != nullptr;
25992 if (!NodeI2)
25993 return false;
25994 assert((NodeI1 == NodeI2) ==
25995 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25996 "Different nodes should have different DFS numbers");
25997 if (NodeI1 != NodeI2)
25998 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25999 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26000 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26001 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26002 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26003 if (!E1 || !E2)
26004 continue;
26005
26006 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26007 // program order of the vector operands.
26008 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26009 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26010 if (V1 != V2) {
26011 if (V1 && !V2)
26012 return true;
26013 if (!V1 && V2)
26014 return false;
26015 DomTreeNodeBase<BasicBlock> *NodeI1 =
26016 DT->getNode(V1->getParent());
26017 DomTreeNodeBase<BasicBlock> *NodeI2 =
26018 DT->getNode(V2->getParent());
26019 if (!NodeI1)
26020 return NodeI2 != nullptr;
26021 if (!NodeI2)
26022 return false;
26023 assert((NodeI1 == NodeI2) ==
26024 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26025 "Different nodes should have different DFS numbers");
26026 if (NodeI1 != NodeI2)
26027 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26028 return V1->comesBefore(V2);
26029 }
26030 // If we have the same vector operand, try to sort by constant
26031 // index.
26032 std::optional<unsigned> Id1 = getExtractIndex(E1);
26033 std::optional<unsigned> Id2 = getExtractIndex(E2);
26034 // Bring constants to the top
26035 if (Id1 && !Id2)
26036 return true;
26037 if (!Id1 && Id2)
26038 return false;
26039 // First elements come first.
26040 if (Id1 && Id2)
26041 return *Id1 < *Id2;
26042
26043 continue;
26044 }
26045 if (I1->getOpcode() == I2->getOpcode())
26046 continue;
26047 return I1->getOpcode() < I2->getOpcode();
26048 }
26049 if (I1)
26050 return true;
26051 if (I2)
26052 return false;
26053 }
26054 {
26055 // Non-undef constants come next.
26056 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26057 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26058 if (C1 && C2)
26059 continue;
26060 if (C1)
26061 return true;
26062 if (C2)
26063 return false;
26064 }
26065 bool U1 = isa<UndefValue>(Opcodes1[I]);
26066 bool U2 = isa<UndefValue>(Opcodes2[I]);
26067 {
26068 // Non-constant non-instructions come next.
26069 if (!U1 && !U2) {
26070 auto ValID1 = Opcodes1[I]->getValueID();
26071 auto ValID2 = Opcodes2[I]->getValueID();
26072 if (ValID1 == ValID2)
26073 continue;
26074 if (ValID1 < ValID2)
26075 return true;
26076 if (ValID1 > ValID2)
26077 return false;
26078 }
26079 if (!U1)
26080 return true;
26081 if (!U2)
26082 return false;
26083 }
26084 // Undefs come last.
26085 assert(U1 && U2 && "The only thing left should be undef & undef.");
26086 }
26087 return false;
26088 };
26089 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26090 Value *V1) {
26091 if (VL.empty() || V1 == VL.back())
26092 return true;
26093 Value *V2 = VL.back();
26094 if (V1->getType() != V2->getType())
26095 return false;
26096 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26097 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26098 if (Opcodes1.size() != Opcodes2.size())
26099 return false;
26100 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26101 // Undefs are compatible with any other value.
26102 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26103 continue;
26104 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26105 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26106 if (R.isDeleted(I1) || R.isDeleted(I2))
26107 return false;
26108 if (I1->getParent() != I2->getParent())
26109 return false;
26110 if (getSameOpcode({I1, I2}, *TLI))
26111 continue;
26112 return false;
26113 }
26114 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26115 continue;
26116 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26117 return false;
26118 }
26119 return true;
26120 };
26121
26122 bool HaveVectorizedPhiNodes = false;
26123 do {
26124 // Collect the incoming values from the PHIs.
26125 Incoming.clear();
26126 for (Instruction &I : *BB) {
26127 auto *P = dyn_cast<PHINode>(&I);
26128 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26129 break;
26130
26131 // No need to analyze deleted, vectorized and non-vectorizable
26132 // instructions.
26133 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26134 isValidElementType(P->getType()))
26135 Incoming.push_back(P);
26136 }
26137
26138 if (Incoming.size() <= 1)
26139 break;
26140
26141 // Find the corresponding non-phi nodes for better matching when trying to
26142 // build the tree.
26143 for (Value *V : Incoming) {
26144 SmallVectorImpl<Value *> &Opcodes =
26145 PHIToOpcodes.try_emplace(V).first->getSecond();
26146 if (!Opcodes.empty())
26147 continue;
26148 SmallVector<Value *, 4> Nodes(1, V);
26149 SmallPtrSet<Value *, 4> Visited;
26150 while (!Nodes.empty()) {
26151 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26152 if (!Visited.insert(PHI).second)
26153 continue;
26154 for (Value *V : PHI->incoming_values()) {
26155 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26156 Nodes.push_back(PHI1);
26157 continue;
26158 }
26159 Opcodes.emplace_back(V);
26160 }
26161 }
26162 }
26163
26164 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26165 Incoming, PHICompare, AreCompatiblePHIs,
26166 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26167 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26168 },
26169 /*MaxVFOnly=*/true, R);
26170 Changed |= HaveVectorizedPhiNodes;
26171 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26172 auto *PHI = dyn_cast<PHINode>(P.first);
26173 return !PHI || R.isDeleted(PHI);
26174 }))
26175 PHIToOpcodes.clear();
26176 VisitedInstrs.insert_range(Incoming);
26177 } while (HaveVectorizedPhiNodes);
26178
26179 VisitedInstrs.clear();
26180
26181 InstSetVector PostProcessInserts;
26182 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26183 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26184 // also vectorizes `PostProcessCmps`.
26185 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26186 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26187 if (VectorizeCmps) {
26188 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26189 PostProcessCmps.clear();
26190 }
26191 PostProcessInserts.clear();
26192 return Changed;
26193 };
26194 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26195 auto IsInPostProcessInstrs = [&](Instruction *I) {
26196 if (auto *Cmp = dyn_cast<CmpInst>(I))
26197 return PostProcessCmps.contains(Cmp);
26198 return isa<InsertElementInst, InsertValueInst>(I) &&
26199 PostProcessInserts.contains(I);
26200 };
26201 // Returns true if `I` is an instruction without users, like a terminator, a
26202 // store, or a function call with an ignored return value. Unused instructions
26203 // are detected based on the instruction type, except for CallInst and InvokeInst.
26204 auto HasNoUsers = [](Instruction *I) {
26205 return I->use_empty() &&
26206 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26207 };
26208 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26209 // Skip instructions with scalable types. The number of elements is unknown
26210 // at compile time for scalable vector types.
26211 if (isa<ScalableVectorType>(It->getType()))
26212 continue;
26213
26214 // Skip instructions marked for deletion.
26215 if (R.isDeleted(&*It))
26216 continue;
26218 // We may go through BB multiple times, so skip the ones we have checked.
26218 if (!VisitedInstrs.insert(&*It).second) {
26219 if (HasNoUsers(&*It) &&
26220 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26221 // We would like to start over since some instructions are deleted
26222 // and the iterator may become invalid.
26223 Changed = true;
26224 It = BB->begin();
26225 E = BB->end();
26226 }
26227 continue;
26228 }
26229
26230 // Try to vectorize reductions that use PHINodes.
26231 if (PHINode *P = dyn_cast<PHINode>(It)) {
26232 // Check that the PHI is a reduction PHI.
26233 if (P->getNumIncomingValues() == 2) {
26234 // Try to match and vectorize a horizontal reduction.
26235 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26236 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26237 Changed = true;
26238 It = BB->begin();
26239 E = BB->end();
26240 continue;
26241 }
26242 }
26243 // Try to vectorize the incoming values of the PHI, to catch reductions
26244 // that feed into PHIs.
26245 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26246 // Skip if the incoming block is the current BB for now. Also, bypass
26247 // unreachable IR for efficiency and to avoid crashing.
26248 // TODO: Collect the skipped incoming values and try to vectorize them
26249 // after processing BB.
26250 if (BB == P->getIncomingBlock(I) ||
26251 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26252 continue;
26253
26254 // Postponed instructions should not be vectorized here, delay their
26255 // vectorization.
26256 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26257 PI && !IsInPostProcessInstrs(PI)) {
26258 bool Res =
26259 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26260 Changed |= Res;
26261 if (Res && R.isDeleted(P)) {
26262 It = BB->begin();
26263 E = BB->end();
26264 break;
26265 }
26266 }
26267 }
26268 continue;
26269 }
26270
26271 if (HasNoUsers(&*It)) {
26272 bool OpsChanged = false;
26273 auto *SI = dyn_cast<StoreInst>(It);
26274 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26275 if (SI) {
26276 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26277 // Try to vectorize chain in store, if this is the only store to the
26278 // address in the block.
26279 // TODO: This is just a temporary solution to save compile time. Need
26280 // to investigate if we can safely turn on slp-vectorize-hor-store
26281 // instead to allow lookup for reduction chains in all non-vectorized
26282 // stores (need to check side effects and compile time).
26283 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26284 SI->getValueOperand()->hasOneUse();
26285 }
26286 if (TryToVectorizeRoot) {
26287 for (auto *V : It->operand_values()) {
26288 // Postponed instructions should not be vectorized here, delay their
26289 // vectorization.
26290 if (auto *VI = dyn_cast<Instruction>(V);
26291 VI && !IsInPostProcessInstrs(VI))
26292 // Try to match and vectorize a horizontal reduction.
26293 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26294 }
26295 }
26296 // Start vectorization of post-process list of instructions from the
26297 // top-tree instructions to try to vectorize as many instructions as
26298 // possible.
26299 OpsChanged |=
26300 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26301 if (OpsChanged) {
26302 // We would like to start over since some instructions are deleted
26303 // and the iterator may become invalid.
26304 Changed = true;
26305 It = BB->begin();
26306 E = BB->end();
26307 continue;
26308 }
26309 }
26310
26311 if (isa<InsertElementInst, InsertValueInst>(It))
26312 PostProcessInserts.insert(&*It);
26313 else if (isa<CmpInst>(It))
26314 PostProcessCmps.insert(cast<CmpInst>(&*It));
26315 }
26316
26317 return Changed;
26318}
26319
26320bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26321 auto Changed = false;
26322 for (auto &Entry : GEPs) {
26323 // If the getelementptr list has fewer than two elements, there's nothing
26324 // to do.
26325 if (Entry.second.size() < 2)
26326 continue;
26327
26328 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26329 << Entry.second.size() << ".\n");
26330
26331 // Process the GEP list in chunks suitable for the target's supported
26332 // vector size. If a vector register can't hold 1 element, we are done. We
26333 // are trying to vectorize the index computations, so the maximum number of
26334 // elements is based on the size of the index expression, rather than the
26335 // size of the GEP itself (the target's pointer size).
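// For example (a sketch): with a 128-bit maximal vector register and i64
// index expressions, MaxElts below is 2, so the list is processed in
// chunks of two getelementptrs at a time.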
26336 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26337 return !R.isDeleted(GEP);
26338 });
26339 if (It == Entry.second.end())
26340 continue;
26341 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26342 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26343 if (MaxVecRegSize < EltSize)
26344 continue;
26345
26346 unsigned MaxElts = MaxVecRegSize / EltSize;
26347 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26348 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26349 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26350
26351 // Initialize a set of candidate getelementptrs. Note that we use a
26352 // SetVector here to preserve program order. If the index computations
26353 // are vectorizable and begin with loads, we want to minimize the chance
26354 // of having to reorder them later.
26355 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26356
26357 // Some of the candidates may have already been vectorized after we
26358 // initially collected them, or their index was optimized to a constant value.
26359 // If so, they are marked as deleted, so remove them from the set of
26360 // candidates.
26361 Candidates.remove_if([&R](Value *I) {
26362 return R.isDeleted(cast<Instruction>(I)) ||
26363 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26364 });
26365
26366 // Remove from the set of candidates all pairs of getelementptrs with
26367 // constant differences. Such getelementptrs are likely not good
26368 // candidates for vectorization in a bottom-up phase since one can be
26369 // computed from the other. We also ensure all candidate getelementptr
26370 // indices are unique.
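// For example (a sketch): %g0 = getelementptr i32, ptr %base, i64 %i and
// %g1 = getelementptr i32, ptr %base, i64 %j, where %j == %i + 1, have a
// constant SCEV difference, so both are removed from Candidates.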
26371 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26372 auto *GEPI = GEPList[I];
26373 if (!Candidates.count(GEPI))
26374 continue;
26375 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26376 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26377 auto *GEPJ = GEPList[J];
26378 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26379 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26380 Candidates.remove(GEPI);
26381 Candidates.remove(GEPJ);
26382 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26383 Candidates.remove(GEPJ);
26384 }
26385 }
26386 }
26387
26388 // We break out of the above computation as soon as we know there are
26389 // fewer than two candidates remaining.
26390 if (Candidates.size() < 2)
26391 continue;
26392
26393 // Add the single, non-constant index of each candidate to the bundle. We
26394 // ensured the indices met these constraints when we originally collected
26395 // the getelementptrs.
26396 SmallVector<Value *, 16> Bundle(Candidates.size());
26397 auto BundleIndex = 0u;
26398 for (auto *V : Candidates) {
26399 auto *GEP = cast<GetElementPtrInst>(V);
26400 auto *GEPIdx = GEP->idx_begin()->get();
26401 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26402 Bundle[BundleIndex++] = GEPIdx;
26403 }
26404
26405 // Try and vectorize the indices. We are currently only interested in
26406 // gather-like cases of the form:
26407 //
26408 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26409 //
26410 // where the loads of "a", the loads of "b", and the subtractions can be
26411 // performed in parallel. It's likely that detecting this pattern in a
26412 // bottom-up phase will be simpler and less costly than building a
26413 // full-blown top-down phase beginning at the consecutive loads.
26414 Changed |= tryToVectorizeList(Bundle, R);
26415 }
26416 }
26417 return Changed;
26418}
26419
26420bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26421 bool Changed = false;
26422 // Sort by type, base pointers and value operands. Value operands must be
26423 // compatible (have the same opcode, same parent), otherwise it is
26424 // definitely not profitable to try to vectorize them.
26425 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26426 if (V->getValueOperand()->getType()->getTypeID() <
26427 V2->getValueOperand()->getType()->getTypeID())
26428 return true;
26429 if (V->getValueOperand()->getType()->getTypeID() >
26430 V2->getValueOperand()->getType()->getTypeID())
26431 return false;
26432 if (V->getPointerOperandType()->getTypeID() <
26433 V2->getPointerOperandType()->getTypeID())
26434 return true;
26435 if (V->getPointerOperandType()->getTypeID() >
26436 V2->getPointerOperandType()->getTypeID())
26437 return false;
26438 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26439 V2->getValueOperand()->getType()->getScalarSizeInBits())
26440 return true;
26441 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26442 V2->getValueOperand()->getType()->getScalarSizeInBits())
26443 return false;
26444 // UndefValues are compatible with all other values.
26445 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26446 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26447 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26448 DT->getNode(I1->getParent());
26449 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26450 DT->getNode(I2->getParent());
26451 assert(NodeI1 && "Should only process reachable instructions");
26452 assert(NodeI2 && "Should only process reachable instructions");
26453 assert((NodeI1 == NodeI2) ==
26454 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26455 "Different nodes should have different DFS numbers");
26456 if (NodeI1 != NodeI2)
26457 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26458 return I1->getOpcode() < I2->getOpcode();
26459 }
26460 return V->getValueOperand()->getValueID() <
26461 V2->getValueOperand()->getValueID();
26462 };
26463
26464 bool SameParent = true;
26465 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26466 if (VL.empty()) {
26467 SameParent = true;
26468 return true;
26469 }
26470 StoreInst *V2 = VL.back();
26471 if (V1 == V2)
26472 return true;
26473 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26474 return false;
26475 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26476 return false;
26477 // Undefs are compatible with any other value.
26478 if (isa<UndefValue>(V1->getValueOperand()) ||
26479 isa<UndefValue>(V2->getValueOperand()))
26480 return true;
26481 if (isa<Constant>(V1->getValueOperand()) &&
26482 isa<Constant>(V2->getValueOperand()))
26483 return true;
26484 // Check if the operands of the stores can be vectorized. They can be
26485 // vectorized if they have compatible operands, or if their operands can
26486 // be vectorized as copyables.
26487 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26488 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26489 if (I1 || I2) {
26490 // Accept only tail-following non-compatible values for now.
26491 // TODO: investigate if it is possible to vectorize incompatible values,
26492 // if the copyables are first in the list.
26493 if (I1 && !I2)
26494 return false;
26495 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26496 SmallVector<Value *> NewVL(VL.size() + 1);
26497 for (auto [SI, V] : zip(VL, NewVL))
26498 V = SI->getValueOperand();
26499 NewVL.back() = V1->getValueOperand();
26500 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26501 InstructionsState S = Analysis.buildInstructionsState(
26502 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26503 /*SkipSameCodeCheck=*/!SameParent);
26504 if (S)
26505 return true;
26506 if (!SameParent)
26507 return false;
26508 }
26509 return V1->getValueOperand()->getValueID() ==
26510 V2->getValueOperand()->getValueID();
26511 };
26512
26513 // Attempt to sort and vectorize each of the store-groups.
26514 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26515 for (auto &Pair : Stores) {
26516 if (Pair.second.size() < 2)
26517 continue;
26518
26519 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26520 << Pair.second.size() << ".\n");
26521
26522 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26523 continue;
26524
26525 // Reverse stores to do bottom-to-top analysis. This is important if the
26526 // values are stored to the same addresses several times; in this case we
26527 // need to follow the store order (reversed to meet the memory dependencies).
26528 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26529 Pair.second.rend());
26530 Changed |= tryToVectorizeSequence<StoreInst>(
26531 ReversedStores, StoreSorter, AreCompatibleStores,
26532 [&](ArrayRef<StoreInst *> Candidates, bool) {
26533 return vectorizeStores(Candidates, R, Attempted);
26534 },
26535 /*MaxVFOnly=*/false, R);
26536 }
26537 return Changed;
26538}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
Definition ExpandFp.cpp:992
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
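As a hedged illustration of the pattern findBuildAggregate recognizes (not code from this file; the helper name emitBuildVector4xFloat is made up), the insertelement chain that it walks backwards from the last insert can be built with IRBuilder like this:
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Emits "ra = insertelement <4 x float> poison, float s0, i32 0" plus the three
// follow-up inserts for lanes 1..3.
static Value *emitBuildVector4xFloat(IRBuilderBase &B, Value *S0, Value *S1,
                                     Value *S2, Value *S3) {
  Value *Vec = PoisonValue::get(FixedVectorType::get(B.getFloatTy(), 4));
  Vec = B.CreateInsertElement(Vec, S0, B.getInt32(0));
  Vec = B.CreateInsertElement(Vec, S1, B.getInt32(1));
  Vec = B.CreateInsertElement(Vec, S2, B.getInt32(2));
  Vec = B.CreateInsertElement(Vec, S3, B.getInt32(3));
  return Vec;
}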
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instruction is followed by the IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
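A minimal sketch of the assumed semantics (the helper applyMaskToOrder is hypothetical; the real reorderOrder/reorderReuses also handle poison lanes and the BottomOrder case):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;
// Compose an existing ordering with a shuffle mask so downstream users see one
// combined permutation, e.g. Order {2,0,1,3} with Mask {3,2,1,0} -> {3,1,0,2}.
static SmallVector<unsigned> applyMaskToOrder(ArrayRef<unsigned> Order,
                                              ArrayRef<int> Mask) {
  SmallVector<unsigned> NewOrder(Mask.size());
  for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
    assert(Mask[I] >= 0 && (unsigned)Mask[I] < Order.size() &&
           "this sketch assumes no poison lanes");
    NewOrder[I] = Order[Mask[I]];
  }
  return NewOrder;
}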
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
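A rough sketch of the idea only (the helper below is hypothetical and ignores the SCEV-based distance computation the real buildCompressMask performs): for element offsets {0, 2, 3} relative to the first pointer, the compress mask selecting the live lanes of the wide load is {0, 2, 3}.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
// Turn sorted element offsets (in elements from the first pointer) into a
// shuffle mask that compresses the wide load down to the requested lanes.
static SmallVector<int> computeCompressMask(ArrayRef<int64_t> SortedOffsets,
                                            bool &NeedsMaskedLoad) {
  SmallVector<int> Mask;
  Mask.reserve(SortedOffsets.size());
  for (int64_t Off : SortedOffsets)
    Mask.push_back(static_cast<int>(Off));
  // Gaps between offsets mean some lanes of the wide load are never read, so a
  // plain wide load may be unsafe and a masked load may be needed instead.
  NeedsMaskedLoad = !Mask.empty() && Mask.back() + 1 != (int)Mask.size();
  return Mask;
}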
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void negate()
Negate this APInt in place.
Definition APInt.h:1468
unsigned logBase2() const
Definition APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
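For illustration, a demanded-elements mask of the kind passed to the scalarization-overhead queries can be built with the APInt calls listed above (the helper name demandAllButLane is made up):
#include "llvm/ADT/APInt.h"
using namespace llvm;
// All lanes demanded except one, e.g. NumElts = 4, DeadLane = 2 gives 0b1011.
static APInt demandAllButLane(unsigned NumElts, unsigned DeadLane) {
  APInt Demanded = APInt::getAllOnes(NumElts);
  Demanded.clearBit(DeadLane);
  return Demanded;
}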
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:162
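A small sketch (hypothetical helper) of how such views are typically sliced into register-sized parts without copying:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Value.h"
#include <algorithm>
using namespace llvm;
// Visit a bundle of scalars part by part; take_front/drop_front only adjust
// the view, they never copy the underlying array.
static void forEachPart(ArrayRef<Value *> VL, size_t PartNumElems) {
  while (!VL.empty()) {
    ArrayRef<Value *> Part = VL.take_front(std::min(PartNumElems, VL.size()));
    // ... process Part (e.g. cost it as one vector register) ...
    VL = VL.drop_front(Part.size());
  }
}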
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:666
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:829
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:791
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:187
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
bool erase(const KeyT &Val)
Definition DenseMap.h:303
unsigned size() const
Definition DenseMap.h:108
bool empty() const
Definition DenseMap.h:107
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:161
iterator end()
Definition DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:205
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:156
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
Implements a dense probed hash-table based set.
Definition DenseSet.h:269
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2571
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2637
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2204
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2593
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2439
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
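A short, self-contained example of the shuffle-building calls above (illustrative only; the function name is made up):
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Concatenate two <4 x i32> values into an <8 x i32> and then reverse it.
static Value *concatAndReverse(IRBuilderBase &B, Value *V1, Value *V2) {
  int ConcatMask[] = {0, 1, 2, 3, 4, 5, 6, 7};
  Value *Wide = B.CreateShuffleVector(V1, V2, ConcatMask, "concat");
  int ReverseMask[] = {7, 6, 5, 4, 3, 2, 1, 0};
  return B.CreateShuffleVector(Wide, ReverseMask, "rev");
}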
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:141
bool empty() const
Definition MapVector.h:75
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:107
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:99
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
T & front() const
front - Get the first element.
Definition ArrayRef.h:354
iterator end() const
Definition ArrayRef.h:348
iterator begin() const
Definition ArrayRef.h:347
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:104
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:149
void insert_range(Range &&R)
Definition SetVector.h:193
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:93
void clear()
Completely clear the SetVector.
Definition SetVector.h:284
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:168
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:269
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
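These static predicates take a plain mask plus the source element count, so mask shapes can be classified without materializing a shufflevector; a small sketch:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
static void classifyMask(ArrayRef<int> Mask, int NumSrcElts) {
  // {0,1,2,3} is an identity, {3,2,1,0} a reverse, {2,3} an extract at index 2.
  bool IsIdentity = ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts);
  bool IsReverse = ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
  int Index = 0;
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index);
  (void)IsIdentity;
  (void)IsReverse;
  (void)IsExtract;
}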
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
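A rough sketch in the spirit of buildUseMask above (simplified; the real helper also distinguishes first- vs. second-argument masks):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;
// One bit per lane of the first source; a bit is set if some mask element reads it.
static SmallBitVector firstSourceUses(ArrayRef<int> Mask, int VF) {
  SmallBitVector Used(VF);
  for (int M : Mask)
    if (M >= 0 && M < VF)
      Used.set(M);
  return Used; // query with Used.any(), Used.all(), Used.find_first(), ...
}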
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:356
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:226
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_type size() const
Definition SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
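Putting a couple of the queries above together (a minimal sketch using the signatures listed here; the helper name is made up):
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Model the reciprocal-throughput cost of reversing a vector and adding it to another.
static InstructionCost reverseThenAddCost(const TargetTransformInfo &TTI,
                                          FixedVectorType *VecTy) {
  auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost Cost = TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                            VecTy, VecTy, /*Mask=*/{}, CostKind);
  Cost += TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  return Cost;
}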
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:181
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:163
void insert_range(Range &&R)
Definition DenseSet.h:220
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:359
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks how the loads in the given sequence can be represented.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const bool IsAnyPointerUsedOutGraph, const int64_t Diff, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter-vectorized load, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
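The PatternMatch combinators above compose into tree-shaped matchers. A hedged sketch (the matched shape is made up for illustration, not something this pass looks for):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

// Recognize "store ((zext X) + C), Ptr" where the zext has a single use,
// binding X, Ptr and the constant C on success.
static bool matchStoreOfZExtPlusConst(llvm::Instruction *I) {
  using namespace llvm::PatternMatch;
  llvm::Value *X = nullptr, *Ptr = nullptr;
  const llvm::APInt *C = nullptr;
  return match(I, m_Store(m_Add(m_OneUse(m_ZExt(m_Value(X))), m_APInt(C)),
                          m_Value(Ptr)));
}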
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:311
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
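A minimal sketch of using the reduction helper above, assuming it is the LoopUtils declaration with the signature shown; the builder and source vector come from the caller:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// Emit an integer add-reduction of the vector value Src at the builder's
// current insertion point.
static llvm::Value *emitAddReduction(llvm::IRBuilderBase &B, llvm::Value *Src) {
  return llvm::createSimpleReduction(B, Src, llvm::RecurKind::Add);
}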
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:824
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2047
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1740
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
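The STLExtras range wrappers above (and several listed further down) take the container directly instead of an explicit begin/end iterator pair. A hypothetical self-contained demo:

#include <tuple>
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// stable_sort, all_of, find and zip/drop_begin all operate on ranges.
static bool demoRangeHelpers() {
  llvm::SmallVector<int, 8> A = {3, 1, 2, 2};
  llvm::stable_sort(A);                                   // {1, 2, 2, 3}
  bool AllPositive = llvm::all_of(A, [](int X) { return X > 0; });
  bool Sorted = llvm::all_of(
      llvm::zip(A, llvm::drop_begin(A)),
      [](auto P) { return std::get<0>(P) <= std::get<1>(P); });
  return AllPositive && Sorted && llvm::find(A, 2) != A.end();
}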
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1666
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2461
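enumerate pairs each element with its index, avoiding manual index bookkeeping. A hypothetical helper as a sketch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Return the index of the first negative element, or -1 if there is none.
static int firstNegativeIndex(const llvm::SmallVectorImpl<int> &V) {
  for (auto [Idx, Val] : llvm::enumerate(V))
    if (Val < 0)
      return static_cast<int>(Idx);
  return -1;
}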
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
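The casting utilities differ in failure behavior: isa only answers the type question, cast asserts on mismatch, and dyn_cast returns nullptr. A small hypothetical helper:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

// Use dyn_cast where the type is uncertain and cast only after isa has
// already established it.
static bool isVolatileAccess(llvm::Value *V) {
  if (auto *LI = llvm::dyn_cast<llvm::LoadInst>(V))
    return LI->isVolatile();
  if (llvm::isa<llvm::StoreInst>(V))
    return llvm::cast<llvm::StoreInst>(V)->isVolatile();
  return false;
}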
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1725
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2220
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:627
auto cast_or_null(const Y &Val)
Definition Casting.h:720
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1970
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
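A quick sketch of the power-of-two helpers bit_ceil and PowerOf2Ceil above (bit_floor and divideCeil appear further down in this index); the sample values are illustrative:

#include <cstdint>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

// Round 5 up/down to powers of two and compute an integer ceiling division.
static bool demoPow2Helpers() {
  unsigned Ceil = llvm::bit_ceil(5u);        // 8: smallest power of two >= 5
  unsigned Floor = llvm::bit_floor(5u);      // 4: largest power of two <= 5
  uint64_t P = llvm::PowerOf2Ceil(5);        // 8
  uint64_t D = llvm::divideCeil(10u, 4u);    // 3
  return Ceil == 8 && Floor == 4 && P == 8 && D == 3;
}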
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2117
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1957
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1721
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:401
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1633
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1752
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:431
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:675
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:288
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1728
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1408
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
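A hedged sketch of the pointer-distance helper above, assuming the distance is reported in units of the element type (as the SLP load checks use it); the helper name and the choice of StrictCheck are illustrative:

#include <optional>
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"

// Two loads of the same element type are consecutive if the second pointer
// is exactly one element past the first.
static bool areConsecutiveLoads(llvm::LoadInst *L0, llvm::LoadInst *L1,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      L0->getType(), L0->getPointerOperand(), L1->getType(),
      L1->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}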
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1909
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1943
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2019
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1824
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1418
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1950
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1747
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1886
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
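hash_combine folds several fields into one hash_code; hash_combine_range does the same for a sequence. A hypothetical getHashValue-style sketch:

#include "llvm/ADT/Hashing.h"

// Combine a pointer identity and an operand index into a single hash, the
// way a DenseMapInfo::getHashValue implementation typically does.
static unsigned hashEdge(const void *UserNode, unsigned OperandIdx) {
  return static_cast<unsigned>(llvm::hash_combine(UserNode, OperandIdx));
}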
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2077
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:836
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:249
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1436
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1445
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const