SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
127 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
150 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead; but unlike
186// the similar limit for operand ordering, this one is used less frequently, so
187// the impact of a higher value is less noticeable.
188static cl::opt<int> RootLookAheadMaxDepth(
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 ViewSLPTree("view-slp-tree", cl::Hidden,
203 cl::desc("Display the SLP trees with Graphviz"));
204
206 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
207 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
208
209/// Enables vectorization of copyable elements.
211 "slp-copyable-elements", cl::init(true), cl::Hidden,
212 cl::desc("Try to replace values with the idempotent instructions for "
213 "better vectorization."));
214
215// Limit the number of alias checks. The limit is chosen so that
216// it has no negative effect on the llvm benchmarks.
217static const unsigned AliasedCheckLimit = 10;
218
219// Limit of the number of uses for potentially transformed instructions/values,
220// used in checks to avoid compile-time explosion.
221static constexpr int UsesLimit = 64;
222
223// Another limit for the alias checks: The maximum distance between load/store
224// instructions where alias checks are done.
225// This limit is useful for very large basic blocks.
226static const unsigned MaxMemDepDistance = 160;
227
228/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
229/// regions to be handled.
230static const int MinScheduleRegionSize = 16;
231
232/// Maximum allowed number of operands in the PHI nodes.
233static const unsigned MaxPHINumOperands = 128;
234
235/// Predicate for the element types that the SLP vectorizer supports.
236///
237/// The most important things to filter here are types which are invalid in LLVM
238/// vectors. We also filter target-specific types which have absolutely no
239/// meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just
240/// avoids spending time checking the cost model and realizing that they will
241/// be inevitably scalarized.
242static bool isValidElementType(Type *Ty) {
243 // TODO: Support ScalableVectorType.
244 if (SLPReVec && isa<FixedVectorType>(Ty))
245 Ty = Ty->getScalarType();
246 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
247 !Ty->isPPC_FP128Ty();
248}
249
250/// Returns the type of the given value/instruction \p V. If it is a store,
251/// returns the type of its value operand; for Cmp - the type of the compare
252/// operands; and for insertelement - the type of the inserted operand.
253/// Otherwise, just the type of the value is returned.
254static Type *getValueType(Value *V) {
255 if (auto *SI = dyn_cast<StoreInst>(V))
256 return SI->getValueOperand()->getType();
257 if (auto *CI = dyn_cast<CmpInst>(V))
258 return CI->getOperand(0)->getType();
259 if (auto *IE = dyn_cast<InsertElementInst>(V))
260 return IE->getOperand(1)->getType();
261 return V->getType();
262}
263
264/// \returns the number of elements for Ty.
265static unsigned getNumElements(Type *Ty) {
266 assert(!isa<ScalableVectorType>(Ty) &&
267 "ScalableVectorType is not supported.");
268 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
269 return VecTy->getNumElements();
270 return 1;
271}
272
273/// \returns the vector type of ScalarTy based on vectorization factor.
274static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
275 return FixedVectorType::get(ScalarTy->getScalarType(),
276 VF * getNumElements(ScalarTy));
277}
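// Illustration: for scalar SLP, getWidenedType(i32, 4) is <4 x i32>; under
// REVEC, where ScalarTy may itself be a fixed vector, getWidenedType(<2 x i16>, 4)
// is <8 x i16> (VF * 2 elements of the scalar type i16).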
278
279/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
280/// which forms a type that \p TTI splits into whole vector types during
281/// legalization.
282static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
283 Type *Ty, unsigned Sz) {
284 if (!isValidElementType(Ty))
285 return bit_ceil(Sz);
286 // Find the number of elements, which forms full vectors.
287 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
288 if (NumParts == 0 || NumParts >= Sz)
289 return bit_ceil(Sz);
290 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
291}
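// Worked example (assuming a target where <6 x i32> is legalized into 2
// register-sized parts, i.e. TTI.getNumberOfParts returns 2): for Ty == i32 and
// Sz == 6, divideCeil(6, 2) == 3 and bit_ceil(3) == 4, so the result is
// 4 * 2 == 8 elements, i.e. two whole <4 x i32> registers.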
292
293/// Returns the number of elements of the given type \p Ty, not greater than \p
294/// Sz, which forms a type that \p TTI splits into whole vector types during
295/// legalization.
296static unsigned
297getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
298 unsigned Sz) {
299 if (!isValidElementType(Ty))
300 return bit_floor(Sz);
301 // Find the number of elements, which forms full vectors.
302 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
303 if (NumParts == 0 || NumParts >= Sz)
304 return bit_floor(Sz);
305 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
306 if (RegVF > Sz)
307 return bit_floor(Sz);
308 return (Sz / RegVF) * RegVF;
309}
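// Under the same assumption (2 parts for <6 x i32>), the floor variant computes
// RegVF == bit_ceil(divideCeil(6, 2)) == 4 <= 6 and returns (6 / 4) * 4 == 4,
// the largest element count not exceeding Sz that still fills whole registers.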
310
311static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
312 SmallVectorImpl<int> &Mask) {
313 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
314 // But the element has a different meaning for SLP (scalar) and REVEC
315 // (vector). We need to expand Mask into masks which shufflevector can use
316 // directly.
317 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
318 for (unsigned I : seq<unsigned>(Mask.size()))
319 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
320 I * VecTyNumElements, VecTyNumElements)))
321 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
322 : Mask[I] * VecTyNumElements + J;
323 Mask.swap(NewMask);
324}
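// Example: with VecTyNumElements == 2 and Mask == {1, 0}, every scalar index is
// expanded into a pair of vector-element indices, producing {2, 3, 0, 1}.
// PoisonMaskElem entries stay PoisonMaskElem for the whole expanded group.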
325
326/// \returns the number of groups of shufflevectors.
327/// A group has the following features:
328/// 1. All values in a group are shufflevectors.
329/// 2. The mask of each shufflevector is an extract-subvector mask.
330/// 3. Together, the masks in a group use all of the elements of the source.
331/// e.g., it is 1 group (%0)
332/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
333/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
334/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
335/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
336/// it is 2 groups (%3 and %4)
337/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
340/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
341/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
342/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
343/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
344/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
345/// it is 0 groups
346/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
347/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
348/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
351 if (VL.empty())
352 return 0;
353 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
354 return 0;
355 auto *SV = cast<ShuffleVectorInst>(VL.front());
356 unsigned SVNumElements =
357 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
358 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
359 if (SVNumElements % ShuffleMaskSize != 0)
360 return 0;
361 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
362 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
363 return 0;
364 unsigned NumGroup = 0;
365 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
366 auto *SV = cast<ShuffleVectorInst>(VL[I]);
367 Value *Src = SV->getOperand(0);
368 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
369 SmallBitVector ExpectedIndex(GroupSize);
370 if (!all_of(Group, [&](Value *V) {
371 auto *SV = cast<ShuffleVectorInst>(V);
372 // From the same source.
373 if (SV->getOperand(0) != Src)
374 return false;
375 int Index;
376 if (!SV->isExtractSubvectorMask(Index))
377 return false;
378 ExpectedIndex.set(Index / ShuffleMaskSize);
379 return true;
380 }))
381 return 0;
382 if (!ExpectedIndex.all())
383 return 0;
384 ++NumGroup;
385 }
386 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
387 return NumGroup;
388}
389
390/// \returns a shufflevector mask which is used to vectorize shufflevectors
391/// e.g.,
392/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
393/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
394/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
395/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
396/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
397/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
398/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
399/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
400/// the result is
401/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
402static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
403 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
404 auto *SV = cast<ShuffleVectorInst>(VL.front());
405 unsigned SVNumElements =
406 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
407 SmallVector<int> Mask;
408 unsigned AccumulateLength = 0;
409 for (Value *V : VL) {
410 auto *SV = cast<ShuffleVectorInst>(V);
411 for (int M : SV->getShuffleMask())
412 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
413 : AccumulateLength + M);
414 AccumulateLength += SVNumElements;
415 }
416 return Mask;
417}
418
419/// \returns True if the value is a constant (but not globals/constant
420/// expressions).
421static bool isConstant(Value *V) {
422 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
423}
424
425/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
426/// insertelement/extractelement with constant indices on a fixed vector type,
427/// or an extractvalue instruction.
428static bool isVectorLikeInstWithConstOps(Value *V) {
429 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
430 !isa<ExtractValueInst, UndefValue>(V))
431 return false;
432 auto *I = dyn_cast<Instruction>(V);
433 if (!I || isa<ExtractValueInst>(I))
434 return true;
435 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
436 return false;
437 if (isa<ExtractElementInst>(I))
438 return isConstant(I->getOperand(1));
439 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
440 return isConstant(I->getOperand(2));
441}
442
443/// Returns the power-of-2 number of elements in a single register (part), given
444/// the total number of elements \p Size and the number of registers (parts) \p
445/// NumParts.
446static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
447 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
448}
449
450/// Returns the correct remaining number of elements, considering the total
451/// amount \p Size, the (power-of-2) number of elements in a single register
452/// \p PartNumElems and the current register (part) \p Part.
453static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
454 unsigned Part) {
455 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
456}
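// Example: for Size == 10 and NumParts == 3, getPartNumElems returns
// min(10, bit_ceil(divideCeil(10, 3))) == 4, and getNumElems(10, 4, Part)
// yields 4, 4 and 2 elements for parts 0, 1 and 2 respectively.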
457
458#if !defined(NDEBUG)
459/// Print a short descriptor of the instruction bundle suitable for debug output.
460static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
461 std::string Result;
462 raw_string_ostream OS(Result);
463 if (Idx >= 0)
464 OS << "Idx: " << Idx << ", ";
465 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
466 return Result;
467}
468#endif
469
470/// \returns true if all of the instructions in \p VL are in the same block or
471/// false otherwise.
472static bool allSameBlock(ArrayRef<Value *> VL) {
473 auto *It = find_if(VL, IsaPred<Instruction>);
474 if (It == VL.end())
475 return false;
476 Instruction *I0 = cast<Instruction>(*It);
477 if (all_of(VL, isVectorLikeInstWithConstOps))
478 return true;
479
480 BasicBlock *BB = I0->getParent();
481 for (Value *V : iterator_range(It, VL.end())) {
482 if (isa<PoisonValue>(V))
483 continue;
484 auto *II = dyn_cast<Instruction>(V);
485 if (!II)
486 return false;
487
488 if (BB != II->getParent())
489 return false;
490 }
491 return true;
492}
493
494/// \returns True if all of the values in \p VL are constants (but not
495/// globals/constant expressions).
496static bool allConstant(ArrayRef<Value *> VL) {
497 // Constant expressions and globals can't be vectorized like normal integer/FP
498 // constants.
499 return all_of(VL, isConstant);
500}
501
502/// \returns True if all of the values in \p VL are identical or some of them
503/// are UndefValue.
504static bool isSplat(ArrayRef<Value *> VL) {
505 Value *FirstNonUndef = nullptr;
506 for (Value *V : VL) {
507 if (isa<UndefValue>(V))
508 continue;
509 if (!FirstNonUndef) {
510 FirstNonUndef = V;
511 continue;
512 }
513 if (V != FirstNonUndef)
514 return false;
515 }
516 return FirstNonUndef != nullptr;
517}
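// Example: isSplat({%a, undef, %a}) is true, while isSplat({undef, undef}) is
// false, because at least one non-undef value is required.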
518
519/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
520/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
521/// patterns that make it effectively commutative (like equality comparisons
522/// with zero).
523/// In most cases, users should not call this function directly (since \p I and
524/// \p ValWithUses are the same). However, when analyzing interchangeable
525/// instructions, we need to use the converted opcode along with the original
526/// uses.
527/// \param I The instruction to check for commutativity
528/// \param ValWithUses The value whose uses are analyzed for special
529/// patterns
530static bool isCommutative(Instruction *I, Value *ValWithUses) {
531 if (auto *Cmp = dyn_cast<CmpInst>(I))
532 return Cmp->isCommutative();
533 if (auto *BO = dyn_cast<BinaryOperator>(I))
534 return BO->isCommutative() ||
535 (BO->getOpcode() == Instruction::Sub &&
536 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
537 all_of(
538 ValWithUses->uses(),
539 [](const Use &U) {
540 // Commutative, if icmp eq/ne sub, 0
541 CmpPredicate Pred;
542 if (match(U.getUser(),
543 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
544 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
545 return true;
546 // Commutative, if abs(sub nsw, true) or abs(sub, false).
547 ConstantInt *Flag;
548 return match(U.getUser(),
549 m_Intrinsic<Intrinsic::abs>(
550 m_Specific(U.get()), m_ConstantInt(Flag))) &&
551 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
552 Flag->isOne());
553 })) ||
554 (BO->getOpcode() == Instruction::FSub &&
555 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
556 all_of(ValWithUses->uses(), [](const Use &U) {
557 return match(U.getUser(),
558 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
559 }));
560 return I->isCommutative();
561}
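// Example of the special-cased pattern above: a subtraction such as
//   %d = sub i32 %a, %b
// whose only users compare it for equality with zero (icmp eq/ne %d, 0) is
// treated as commutative, because swapping %a and %b cannot change the result
// of those users.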
562
563/// This is a helper function to check whether \p I is commutative.
564/// This is a convenience wrapper that calls the two-parameter version of
565/// isCommutative with the same instruction for both parameters. This is
566/// the common case where the instruction being checked for commutativity
567/// is the same as the instruction whose uses are analyzed for special
568/// patterns (see the two-parameter version above for details).
569/// \param I The instruction to check for commutativity
570/// \returns true if the instruction is commutative, false otherwise
571static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
572
573/// \returns the number of operands of \p I, considering commutativity. Returns 2
574/// for commutative intrinsics.
575/// \param I The instruction to check for commutativity
577 if (isa<IntrinsicInst>(I) && isCommutative(I)) {
578 // IntrinsicInst::isCommutative returns true if swapping the first "two"
579 // arguments to the intrinsic produces the same result.
580 constexpr unsigned IntrinsicNumOperands = 2;
581 return IntrinsicNumOperands;
582 }
583 return I->getNumOperands();
584}
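// Example: for a commutative intrinsic call such as
//   %m = call i32 @llvm.smax.i32(i32 %a, i32 %b)
// this returns 2 (the two swappable arguments), even though the call
// instruction itself carries an extra operand for the callee.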
585
586template <typename T>
587static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
588 unsigned Offset) {
589 static_assert(std::is_same_v<T, InsertElementInst> ||
590 std::is_same_v<T, ExtractElementInst>,
591 "unsupported T");
592 int Index = Offset;
593 if (const auto *IE = dyn_cast<T>(Inst)) {
594 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
595 if (!VT)
596 return std::nullopt;
597 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
598 if (!CI)
599 return std::nullopt;
600 if (CI->getValue().uge(VT->getNumElements()))
601 return std::nullopt;
602 Index *= VT->getNumElements();
603 Index += CI->getZExtValue();
604 return Index;
605 }
606 return std::nullopt;
607}
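// Example: for '%v2 = insertelement <4 x i8> %v, i8 %x, i32 2' this returns 2
// with Offset == 0 and 1 * 4 + 2 == 6 with Offset == 1, since the offset is
// scaled by the number of vector elements before the element index is added.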
608
609/// \returns inserting or extracting index of InsertElement, ExtractElement or
610/// InsertValue instruction, using Offset as base offset for index.
611/// \returns std::nullopt if the index is not an immediate.
612static std::optional<unsigned> getElementIndex(const Value *Inst,
613 unsigned Offset = 0) {
614 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
615 return Index;
616 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
617 return Index;
618
619 int Index = Offset;
620
621 const auto *IV = dyn_cast<InsertValueInst>(Inst);
622 if (!IV)
623 return std::nullopt;
624
625 Type *CurrentType = IV->getType();
626 for (unsigned I : IV->indices()) {
627 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
628 Index *= ST->getNumElements();
629 CurrentType = ST->getElementType(I);
630 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
631 Index *= AT->getNumElements();
632 CurrentType = AT->getElementType();
633 } else {
634 return std::nullopt;
635 }
636 Index += I;
637 }
638 return Index;
639}
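// Example for the insertvalue path: for
//   %r = insertvalue {[2 x i32], i32} %agg, i32 %x, 0, 1
// the flattened index is computed as ((0 * 2) + 0) * 2 + 1 == 1.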
640
641/// \returns true if all of the values in \p VL use the same opcode.
642/// For comparison instructions, also checks if predicates match.
643/// PoisonValues are considered matching.
644/// Interchangeable instructions are not considered.
646 auto *It = find_if(VL, IsaPred<Instruction>);
647 if (It == VL.end())
648 return true;
649 Instruction *MainOp = cast<Instruction>(*It);
650 unsigned Opcode = MainOp->getOpcode();
651 bool IsCmpOp = isa<CmpInst>(MainOp);
652 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
653 : CmpInst::BAD_ICMP_PREDICATE;
654 return std::all_of(It, VL.end(), [&](Value *V) {
655 if (auto *CI = dyn_cast<CmpInst>(V))
656 return BasePred == CI->getPredicate();
657 if (auto *I = dyn_cast<Instruction>(V))
658 return I->getOpcode() == Opcode;
659 return isa<PoisonValue>(V);
660 });
661}
662
663namespace {
664/// Specifies the way the mask should be analyzed for undefs/poisonous elements
665/// in the shuffle mask.
666enum class UseMask {
667 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
668 ///< check for the mask elements for the first argument (mask
669 ///< indices are in range [0:VF)).
670 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
671 ///< for the mask elements for the second argument (mask indices
672 ///< are in range [VF:2*VF))
673 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
674 ///< future shuffle elements and mark them as ones as being used
675 ///< in future. Non-undef elements are considered as unused since
676 ///< they're already marked as used in the mask.
677};
678} // namespace
679
680/// Prepares a use bitset for the given mask either for the first argument or
681/// for the second.
682static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
683 UseMask MaskArg) {
684 SmallBitVector UseMask(VF, true);
685 for (auto [Idx, Value] : enumerate(Mask)) {
686 if (Value == PoisonMaskElem) {
687 if (MaskArg == UseMask::UndefsAsMask)
688 UseMask.reset(Idx);
689 continue;
690 }
691 if (MaskArg == UseMask::FirstArg && Value < VF)
692 UseMask.reset(Value);
693 else if (MaskArg == UseMask::SecondArg && Value >= VF)
694 UseMask.reset(Value - VF);
695 }
696 return UseMask;
697}
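// Example: with VF == 4, Mask == {0, 5, PoisonMaskElem, 2} and
// UseMask::FirstArg, bits 0 and 2 are cleared and bits 1 and 3 stay set,
// i.e. lanes 1 and 3 of the first vector are not consumed by this mask.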
698
699/// Checks if the given value is actually an undefined constant vector.
700/// Also, if the \p UseMask is not empty, tries to check if the non-masked
701/// elements actually mask the insertelement buildvector, if any.
702template <bool IsPoisonOnly = false>
703static SmallBitVector isUndefVector(const Value *V,
704 const SmallBitVector &UseMask = {}) {
705 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
706 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
707 if (isa<T>(V))
708 return Res;
709 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
710 if (!VecTy)
711 return Res.reset();
712 auto *C = dyn_cast<Constant>(V);
713 if (!C) {
714 if (!UseMask.empty()) {
715 const Value *Base = V;
716 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
717 Base = II->getOperand(0);
718 if (isa<T>(II->getOperand(1)))
719 continue;
720 std::optional<unsigned> Idx = getElementIndex(II);
721 if (!Idx) {
722 Res.reset();
723 return Res;
724 }
725 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
726 Res.reset(*Idx);
727 }
728 // TODO: Add analysis for shuffles here too.
729 if (V == Base) {
730 Res.reset();
731 } else {
732 SmallBitVector SubMask(UseMask.size(), false);
733 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
734 }
735 } else {
736 Res.reset();
737 }
738 return Res;
739 }
740 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
741 if (Constant *Elem = C->getAggregateElement(I))
742 if (!isa<T>(Elem) &&
743 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
744 Res.reset(I);
745 }
746 return Res;
747}
748
749/// Checks if the vector of instructions can be represented as a shuffle, like:
750/// %x0 = extractelement <4 x i8> %x, i32 0
751/// %x3 = extractelement <4 x i8> %x, i32 3
752/// %y1 = extractelement <4 x i8> %y, i32 1
753/// %y2 = extractelement <4 x i8> %y, i32 2
754/// %x0x0 = mul i8 %x0, %x0
755/// %x3x3 = mul i8 %x3, %x3
756/// %y1y1 = mul i8 %y1, %y1
757/// %y2y2 = mul i8 %y2, %y2
758/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
759/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
760/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
761/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
762/// ret <4 x i8> %ins4
763/// can be transformed into:
764/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
765/// i32 6>
766/// %2 = mul <4 x i8> %1, %1
767/// ret <4 x i8> %2
768/// Mask will return the Shuffle Mask equivalent to the extracted elements.
769/// TODO: Can we split off and reuse the shuffle mask detection from
770/// ShuffleVectorInst/getShuffleCost?
771static std::optional<TargetTransformInfo::ShuffleKind>
772isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
773 AssumptionCache *AC) {
774 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
775 if (It == VL.end())
776 return std::nullopt;
777 unsigned Size =
778 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
779 auto *EI = dyn_cast<ExtractElementInst>(V);
780 if (!EI)
781 return S;
782 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
783 if (!VTy)
784 return S;
785 return std::max(S, VTy->getNumElements());
786 });
787
788 Value *Vec1 = nullptr;
789 Value *Vec2 = nullptr;
790 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
791 auto *EE = dyn_cast<ExtractElementInst>(V);
792 if (!EE)
793 return false;
794 Value *Vec = EE->getVectorOperand();
795 if (isa<UndefValue>(Vec))
796 return false;
797 return isGuaranteedNotToBePoison(Vec, AC);
798 });
799 enum ShuffleMode { Unknown, Select, Permute };
800 ShuffleMode CommonShuffleMode = Unknown;
801 Mask.assign(VL.size(), PoisonMaskElem);
802 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
803 // Undef can be represented as an undef element in a vector.
804 if (isa<UndefValue>(VL[I]))
805 continue;
806 auto *EI = cast<ExtractElementInst>(VL[I]);
807 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
808 return std::nullopt;
809 auto *Vec = EI->getVectorOperand();
810 // We can extractelement from undef or poison vector.
811 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
812 continue;
813 // All vector operands must have the same number of vector elements.
814 if (isa<UndefValue>(Vec)) {
815 Mask[I] = I;
816 } else {
817 if (isa<UndefValue>(EI->getIndexOperand()))
818 continue;
819 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
820 if (!Idx)
821 return std::nullopt;
822 // Undefined behavior if Idx is negative or >= Size.
823 if (Idx->getValue().uge(Size))
824 continue;
825 unsigned IntIdx = Idx->getValue().getZExtValue();
826 Mask[I] = IntIdx;
827 }
828 if (isUndefVector(Vec).all() && HasNonUndefVec)
829 continue;
830 // For correct shuffling we have to have at most 2 different vector operands
831 // in all extractelement instructions.
832 if (!Vec1 || Vec1 == Vec) {
833 Vec1 = Vec;
834 } else if (!Vec2 || Vec2 == Vec) {
835 Vec2 = Vec;
836 Mask[I] += Size;
837 } else {
838 return std::nullopt;
839 }
840 if (CommonShuffleMode == Permute)
841 continue;
842 // If the extract index is not the same as the operation number, it is a
843 // permutation.
844 if (Mask[I] % Size != I) {
845 CommonShuffleMode = Permute;
846 continue;
847 }
848 CommonShuffleMode = Select;
849 }
850 // If we're not crossing lanes in different vectors, consider it as blending.
851 if (CommonShuffleMode == Select && Vec2)
852 return TargetTransformInfo::SK_Select;
853 // If Vec2 was never used, we have a permutation of a single vector, otherwise
854 // we have a permutation of 2 vectors.
855 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
856 : TargetTransformInfo::SK_PermuteSingleSrc;
857}
858
859/// \returns True if Extract{Value,Element} instruction extracts element Idx.
860static std::optional<unsigned> getExtractIndex(const Instruction *E) {
861 unsigned Opcode = E->getOpcode();
862 assert((Opcode == Instruction::ExtractElement ||
863 Opcode == Instruction::ExtractValue) &&
864 "Expected extractelement or extractvalue instruction.");
865 if (Opcode == Instruction::ExtractElement) {
866 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
867 if (!CI)
868 return std::nullopt;
869 return CI->getZExtValue();
870 }
871 auto *EI = cast<ExtractValueInst>(E);
872 if (EI->getNumIndices() != 1)
873 return std::nullopt;
874 return *EI->idx_begin();
875}
876
877namespace llvm {
878/// Checks if the provided value does not require scheduling. It does not
879/// require scheduling if this is not an instruction or it is an instruction
880/// that does not read/write memory and all operands are either not instructions
881/// or phi nodes or instructions from different blocks.
882static bool areAllOperandsNonInsts(Value *V);
883/// Checks if the provided value does not require scheduling. It does not
884/// require scheduling if this is not an instruction or it is an instruction
885/// that does not read/write memory and all users are phi nodes or instructions
886/// from different blocks.
887static bool isUsedOutsideBlock(Value *V);
888/// Checks if the specified value does not require scheduling. It does not
889/// require scheduling if all operands and all users do not need to be scheduled
890/// in the current basic block.
891static bool doesNotNeedToBeScheduled(Value *V);
892} // namespace llvm
893
894namespace {
895/// \returns true if \p Opcode is allowed as part of the main/alternate
896/// instruction for SLP vectorization.
897///
898/// Example of unsupported opcode is SDIV that can potentially cause UB if the
899/// "shuffled out" lane would result in division by zero.
900bool isValidForAlternation(unsigned Opcode) {
901 return !Instruction::isIntDivRem(Opcode);
902}
903
904/// Helper class that determines whether VL can use the same opcode.
905/// Alternate instructions are supported. In addition, it supports interchangeable
906/// instructions. An interchangeable instruction is an instruction that can be
907/// converted to another instruction with the same semantics. For example, x << 1 is
908/// equal to x * 2. x * 1 is equal to x | 0.
909class BinOpSameOpcodeHelper {
910 using MaskType = std::uint_fast16_t;
911 /// Sort SupportedOp because it is used by binary_search.
912 constexpr static std::initializer_list<unsigned> SupportedOp = {
913 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
914 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
915 enum : MaskType {
916 ShlBIT = 0b1,
917 AShrBIT = 0b10,
918 MulBIT = 0b100,
919 AddBIT = 0b1000,
920 SubBIT = 0b10000,
921 AndBIT = 0b100000,
922 OrBIT = 0b1000000,
923 XorBIT = 0b10000000,
924 MainOpBIT = 0b100000000,
926 };
927 /// Return a non-nullptr if either operand of I is a ConstantInt.
928 /// The second return value represents the operand position. We check the
929 /// right-hand side first (1). If the right hand side is not a ConstantInt and
930 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
931 /// side (0).
932 static std::pair<ConstantInt *, unsigned>
933 isBinOpWithConstantInt(const Instruction *I) {
934 unsigned Opcode = I->getOpcode();
935 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
936 (void)SupportedOp;
937 auto *BinOp = cast<BinaryOperator>(I);
938 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
939 return {CI, 1};
940 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
941 Opcode == Instruction::AShr)
942 return {nullptr, 0};
943 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
944 return {CI, 0};
945 return {nullptr, 0};
946 }
947 struct InterchangeableInfo {
948 const Instruction *I = nullptr;
949 /// The bit it sets represents whether MainOp can be converted to.
950 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
951 MulBIT | AShrBIT | ShlBIT;
952 /// We cannot create an interchangeable instruction that does not exist in
953 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
954 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
955 /// 1]. SeenBefore is used to know what operations have been seen before.
956 MaskType SeenBefore = 0;
957 InterchangeableInfo(const Instruction *I) : I(I) {}
958 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
959 /// instruction. Directly setting the mask would destroy the mask state,
960 /// preventing us from determining which instruction it should convert to.
961 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
962 if (Mask & InterchangeableMask) {
963 SeenBefore |= OpcodeInMaskForm;
964 Mask &= InterchangeableMask;
965 return true;
966 }
967 return false;
968 }
969 bool equal(unsigned Opcode) {
970 if (Opcode == I->getOpcode())
971 return trySet(MainOpBIT, MainOpBIT);
972 return false;
973 }
974 unsigned getOpcode() const {
975 MaskType Candidate = Mask & SeenBefore;
976 if (Candidate & MainOpBIT)
977 return I->getOpcode();
978 if (Candidate & ShlBIT)
979 return Instruction::Shl;
980 if (Candidate & AShrBIT)
981 return Instruction::AShr;
982 if (Candidate & MulBIT)
983 return Instruction::Mul;
984 if (Candidate & AddBIT)
985 return Instruction::Add;
986 if (Candidate & SubBIT)
987 return Instruction::Sub;
988 if (Candidate & AndBIT)
989 return Instruction::And;
990 if (Candidate & OrBIT)
991 return Instruction::Or;
992 if (Candidate & XorBIT)
993 return Instruction::Xor;
994 llvm_unreachable("Cannot find interchangeable instruction.");
995 }
996
997 /// Return true if the instruction can be converted to \p Opcode.
998 bool hasCandidateOpcode(unsigned Opcode) const {
999 MaskType Candidate = Mask & SeenBefore;
1000 switch (Opcode) {
1001 case Instruction::Shl:
1002 return Candidate & ShlBIT;
1003 case Instruction::AShr:
1004 return Candidate & AShrBIT;
1005 case Instruction::Mul:
1006 return Candidate & MulBIT;
1007 case Instruction::Add:
1008 return Candidate & AddBIT;
1009 case Instruction::Sub:
1010 return Candidate & SubBIT;
1011 case Instruction::And:
1012 return Candidate & AndBIT;
1013 case Instruction::Or:
1014 return Candidate & OrBIT;
1015 case Instruction::Xor:
1016 return Candidate & XorBIT;
1017 case Instruction::LShr:
1018 case Instruction::FAdd:
1019 case Instruction::FSub:
1020 case Instruction::FMul:
1021 case Instruction::SDiv:
1022 case Instruction::UDiv:
1023 case Instruction::FDiv:
1024 case Instruction::SRem:
1025 case Instruction::URem:
1026 case Instruction::FRem:
1027 return false;
1028 default:
1029 break;
1030 }
1031 llvm_unreachable("Cannot find interchangeable instruction.");
1032 }
1033
1034 SmallVector<Value *> getOperand(const Instruction *To) const {
1035 unsigned ToOpcode = To->getOpcode();
1036 unsigned FromOpcode = I->getOpcode();
1037 if (FromOpcode == ToOpcode)
1038 return SmallVector<Value *>(I->operands());
1039 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1040 auto [CI, Pos] = isBinOpWithConstantInt(I);
1041 const APInt &FromCIValue = CI->getValue();
1042 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1043 APInt ToCIValue;
1044 switch (FromOpcode) {
1045 case Instruction::Shl:
1046 if (ToOpcode == Instruction::Mul) {
1047 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1048 FromCIValue.getZExtValue());
1049 } else {
1050 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1051 ToCIValue = ToOpcode == Instruction::And
1052 ? APInt::getAllOnes(FromCIValueBitWidth)
1053 : APInt::getZero(FromCIValueBitWidth);
1054 }
1055 break;
1056 case Instruction::Mul:
1057 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1058 if (ToOpcode == Instruction::Shl) {
1059 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1060 } else {
1061 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1062 ToCIValue = ToOpcode == Instruction::And
1063 ? APInt::getAllOnes(FromCIValueBitWidth)
1064 : APInt::getZero(FromCIValueBitWidth);
1065 }
1066 break;
1067 case Instruction::Add:
1068 case Instruction::Sub:
1069 if (FromCIValue.isZero()) {
1070 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1071 } else {
1072 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1073 "Cannot convert the instruction.");
1074 ToCIValue = FromCIValue;
1075 ToCIValue.negate();
1076 }
1077 break;
1078 case Instruction::And:
1079 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1080 ToCIValue = ToOpcode == Instruction::Mul
1081 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1082 : APInt::getZero(FromCIValueBitWidth);
1083 break;
1084 default:
1085 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1086 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1087 break;
1088 }
1089 Value *LHS = I->getOperand(1 - Pos);
1090 Constant *RHS =
1091 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1092 // constant + x cannot be -constant - x
1093 // instead, it should be x - -constant
1094 if (Pos == 1 ||
1095 (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
1096 return SmallVector<Value *>({LHS, RHS});
1097 return SmallVector<Value *>({RHS, LHS});
1098 }
1099 };
1100 InterchangeableInfo MainOp;
1101 InterchangeableInfo AltOp;
1102 bool isValidForAlternation(const Instruction *I) const {
1103 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1104 ::isValidForAlternation(I->getOpcode());
1105 }
1106 bool initializeAltOp(const Instruction *I) {
1107 if (AltOp.I)
1108 return true;
1109 if (!isValidForAlternation(I))
1110 return false;
1111 AltOp.I = I;
1112 return true;
1113 }
1114
1115public:
1116 BinOpSameOpcodeHelper(const Instruction *MainOp,
1117 const Instruction *AltOp = nullptr)
1118 : MainOp(MainOp), AltOp(AltOp) {
1119 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1120 }
1121 bool add(const Instruction *I) {
1122 assert(isa<BinaryOperator>(I) &&
1123 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1124 unsigned Opcode = I->getOpcode();
1125 MaskType OpcodeInMaskForm;
1126 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1127 switch (Opcode) {
1128 case Instruction::Shl:
1129 OpcodeInMaskForm = ShlBIT;
1130 break;
1131 case Instruction::AShr:
1132 OpcodeInMaskForm = AShrBIT;
1133 break;
1134 case Instruction::Mul:
1135 OpcodeInMaskForm = MulBIT;
1136 break;
1137 case Instruction::Add:
1138 OpcodeInMaskForm = AddBIT;
1139 break;
1140 case Instruction::Sub:
1141 OpcodeInMaskForm = SubBIT;
1142 break;
1143 case Instruction::And:
1144 OpcodeInMaskForm = AndBIT;
1145 break;
1146 case Instruction::Or:
1147 OpcodeInMaskForm = OrBIT;
1148 break;
1149 case Instruction::Xor:
1150 OpcodeInMaskForm = XorBIT;
1151 break;
1152 default:
1153 return MainOp.equal(Opcode) ||
1154 (initializeAltOp(I) && AltOp.equal(Opcode));
1155 }
1156 MaskType InterchangeableMask = OpcodeInMaskForm;
1157 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1158 if (CI) {
1159 constexpr MaskType CanBeAll =
1160 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1161 const APInt &CIValue = CI->getValue();
1162 switch (Opcode) {
1163 case Instruction::Shl:
1164 if (CIValue.ult(CIValue.getBitWidth()))
1165 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1166 break;
1167 case Instruction::Mul:
1168 if (CIValue.isOne()) {
1169 InterchangeableMask = CanBeAll;
1170 break;
1171 }
1172 if (CIValue.isPowerOf2())
1173 InterchangeableMask = MulBIT | ShlBIT;
1174 break;
1175 case Instruction::Add:
1176 case Instruction::Sub:
1177 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1178 break;
1179 case Instruction::And:
1180 if (CIValue.isAllOnes())
1181 InterchangeableMask = CanBeAll;
1182 break;
1183 default:
1184 if (CIValue.isZero())
1185 InterchangeableMask = CanBeAll;
1186 break;
1187 }
1188 }
1189 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1190 (initializeAltOp(I) &&
1191 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1192 }
1193 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1194 /// Checks if the list of potential opcodes includes \p Opcode.
1195 bool hasCandidateOpcode(unsigned Opcode) const {
1196 return MainOp.hasCandidateOpcode(Opcode);
1197 }
1198 bool hasAltOp() const { return AltOp.I; }
1199 unsigned getAltOpcode() const {
1200 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1201 }
1202 SmallVector<Value *> getOperand(const Instruction *I) const {
1203 return MainOp.getOperand(I);
1204 }
1205};
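// Illustration: for VL == {x << 1, y * 2} the helper records {Shl, Mul} as
// candidate opcodes for both values (2 is a power of two), settles on Shl as
// the main opcode, and getOperand() rewrites 'mul i32 %y, 2' into the operand
// list {%y, 1} of an equivalent 'shl'.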
1206
1207/// Main data required for vectorization of instructions.
1208class InstructionsState {
1209 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1210 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1211 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1212 /// isAltShuffle).
1213 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1214 /// from getMainAltOpsNoStateVL.
1215 /// For those InstructionsState that use alternate instructions, the resulting
1216 /// vectorized output ultimately comes from a shufflevector. For example,
1217 /// given a vector list (VL):
1218 /// VL[0] = add i32 a, e
1219 /// VL[1] = sub i32 b, f
1220 /// VL[2] = add i32 c, g
1221 /// VL[3] = sub i32 d, h
1222 /// The vectorized result would be:
1223 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1224 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1225 /// result = shufflevector <4 x i32> intermediated_0,
1226 /// <4 x i32> intermediated_1,
1227 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1228 /// Since shufflevector is used in the final result, when calculating the cost
1229 /// (getEntryCost), we must account for the usage of shufflevector in
1230 /// GetVectorCost.
1231 Instruction *MainOp = nullptr;
1232 Instruction *AltOp = nullptr;
1233 /// Whether the instruction state represents copyable instructions.
1234 bool HasCopyables = false;
1235
1236public:
1237 Instruction *getMainOp() const {
1238 assert(valid() && "InstructionsState is invalid.");
1239 return MainOp;
1240 }
1241
1242 Instruction *getAltOp() const {
1243 assert(valid() && "InstructionsState is invalid.");
1244 return AltOp;
1245 }
1246
1247 /// The main/alternate opcodes for the list of instructions.
1248 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1249
1250 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1251
1252 /// Some of the instructions in the list have alternate opcodes.
1253 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1254
1255 /// Checks if the instruction matches either the main or alternate opcode.
1256 /// \returns
1257 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1258 /// to it
1259 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1260 /// it
1261 /// - nullptr if \param I cannot be matched or converted to either opcode
1262 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1263 assert(MainOp && "MainOp cannot be nullptr.");
1264 if (I->getOpcode() == MainOp->getOpcode())
1265 return MainOp;
1266 // Prefer AltOp instead of interchangeable instruction of MainOp.
1267 assert(AltOp && "AltOp cannot be nullptr.");
1268 if (I->getOpcode() == AltOp->getOpcode())
1269 return AltOp;
1270 if (!I->isBinaryOp())
1271 return nullptr;
1272 BinOpSameOpcodeHelper Converter(MainOp);
1273 if (!Converter.add(I) || !Converter.add(MainOp))
1274 return nullptr;
1275 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1276 BinOpSameOpcodeHelper AltConverter(AltOp);
1277 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1278 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1279 return AltOp;
1280 }
1281 if (Converter.hasAltOp() && !isAltShuffle())
1282 return nullptr;
1283 return Converter.hasAltOp() ? AltOp : MainOp;
1284 }
1285
1286 /// Checks if main/alt instructions are shift operations.
1287 bool isShiftOp() const {
1288 return getMainOp()->isShift() && getAltOp()->isShift();
1289 }
1290
1291 /// Checks if main/alt instructions are bitwise logic operations.
1292 bool isBitwiseLogicOp() const {
1293 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1294 }
1295
1296 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1297 bool isMulDivLikeOp() const {
1298 constexpr std::array<unsigned, 8> MulDiv = {
1299 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1300 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1301 Instruction::URem, Instruction::FRem};
1302 return is_contained(MulDiv, getOpcode()) &&
1303 is_contained(MulDiv, getAltOpcode());
1304 }
1305
1306 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1307 bool isAddSubLikeOp() const {
1308 constexpr std::array<unsigned, 4> AddSub = {
1309 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1310 Instruction::FSub};
1311 return is_contained(AddSub, getOpcode()) &&
1312 is_contained(AddSub, getAltOpcode());
1313 }
1314
1315 /// Checks if main/alt instructions are cmp operations.
1316 bool isCmpOp() const {
1317 return (getOpcode() == Instruction::ICmp ||
1318 getOpcode() == Instruction::FCmp) &&
1319 getAltOpcode() == getOpcode();
1320 }
1321
1322 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1323 bool valid() const { return MainOp && AltOp; }
1324
1325 explicit operator bool() const { return valid(); }
1326
1327 InstructionsState() = delete;
1328 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1329 bool HasCopyables = false)
1330 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1331 static InstructionsState invalid() { return {nullptr, nullptr}; }
1332
1333 /// Checks if the value is a copyable element.
1334 bool isCopyableElement(Value *V) const {
1335 assert(valid() && "InstructionsState is invalid.");
1336 if (!HasCopyables)
1337 return false;
1338 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1339 return false;
1340 auto *I = dyn_cast<Instruction>(V);
1341 if (!I)
1342 return !isa<PoisonValue>(V);
1343 if (I->getParent() != MainOp->getParent() &&
1346 return true;
1347 if (I->getOpcode() == MainOp->getOpcode())
1348 return false;
1349 if (!I->isBinaryOp())
1350 return true;
1351 BinOpSameOpcodeHelper Converter(MainOp);
1352 return !Converter.add(I) || !Converter.add(MainOp) ||
1353 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1354 }
1355
1356 /// Checks if the value is non-schedulable.
1357 bool isNonSchedulable(Value *V) const {
1358 assert(valid() && "InstructionsState is invalid.");
1359 auto *I = dyn_cast<Instruction>(V);
1360 if (!HasCopyables)
1361 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1363 // MainOp for copyables is always schedulable to correctly identify
1364 // non-schedulable copyables.
1365 if (getMainOp() == V)
1366 return false;
1367 if (isCopyableElement(V)) {
1368 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1369 auto *I = dyn_cast<Instruction>(V);
1370 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1372 // If the copyable instruction comes after MainOp
1373 // (non-schedulable, but used in the block) - cannot vectorize
1374 // it, will possibly generate use before def.
1375 !MainOp->comesBefore(I));
1376 };
1377
1378 return IsNonSchedulableCopyableElement(V);
1379 }
1380 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1382 }
1383
1384 /// Checks if the state represents copyable instructions.
1385 bool areInstructionsWithCopyableElements() const {
1386 assert(valid() && "InstructionsState is invalid.");
1387 return HasCopyables;
1388 }
1389};
1390
1391std::pair<Instruction *, SmallVector<Value *>>
1392convertTo(Instruction *I, const InstructionsState &S) {
1393 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1394 assert(SelectedOp && "Cannot convert the instruction.");
1395 if (I->isBinaryOp()) {
1396 BinOpSameOpcodeHelper Converter(I);
1397 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1398 }
1399 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1400}
1401
1402} // end anonymous namespace
1403
1404static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1405 const TargetLibraryInfo &TLI);
1406
1407/// Find an instruction with a specific opcode in VL.
1408/// \param VL Array of values to search through. Must contain only Instructions
1409/// and PoisonValues.
1410/// \param Opcode The instruction opcode to search for
1411/// \returns
1412/// - The first instruction found with matching opcode
1413/// - nullptr if no matching instruction is found
1414static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1415 unsigned Opcode) {
1416 for (Value *V : VL) {
1417 if (isa<PoisonValue>(V))
1418 continue;
1419 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1420 auto *Inst = cast<Instruction>(V);
1421 if (Inst->getOpcode() == Opcode)
1422 return Inst;
1423 }
1424 return nullptr;
1425}
1426
1427/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1428/// compatible instructions or constants, or just some other regular values.
1429static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1430 Value *Op1, const TargetLibraryInfo &TLI) {
1431 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1432 (isConstant(BaseOp1) && isConstant(Op1)) ||
1433 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1434 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1435 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1436 getSameOpcode({BaseOp0, Op0}, TLI) ||
1437 getSameOpcode({BaseOp1, Op1}, TLI);
1438}
1439
1440/// \returns true if a compare instruction \p CI has similar "look" and
1441/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1442/// swapped, false otherwise.
1443static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1444 const TargetLibraryInfo &TLI) {
1445 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1446 "Assessing comparisons of different types?");
1447 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1448 CmpInst::Predicate Pred = CI->getPredicate();
1449 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1450
1451 Value *BaseOp0 = BaseCI->getOperand(0);
1452 Value *BaseOp1 = BaseCI->getOperand(1);
1453 Value *Op0 = CI->getOperand(0);
1454 Value *Op1 = CI->getOperand(1);
1455
1456 return (BasePred == Pred &&
1457 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1458 (BasePred == SwappedPred &&
1459 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1460}
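// Example: 'icmp sgt i32 %a, %b' and 'icmp slt i32 %b, %a' are treated as the
// same comparison here, since the second matches the first with both its
// predicate and its operands swapped.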
1461
1462/// \returns an analysis of the Instructions in \p VL, described in
1463/// InstructionsState: the Opcode under which we suppose the whole list
1464/// could be vectorized, even if its structure is diverse.
1465static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1466 const TargetLibraryInfo &TLI) {
1467 // Make sure these are all Instructions.
1468 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1469 return InstructionsState::invalid();
1470
1471 auto *It = find_if(VL, IsaPred<Instruction>);
1472 if (It == VL.end())
1473 return InstructionsState::invalid();
1474
1475 Instruction *MainOp = cast<Instruction>(*It);
1476 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1477 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1478 (VL.size() == 2 && InstCnt < 2))
1479 return InstructionsState::invalid();
1480
1481 bool IsCastOp = isa<CastInst>(MainOp);
1482 bool IsBinOp = isa<BinaryOperator>(MainOp);
1483 bool IsCmpOp = isa<CmpInst>(MainOp);
1484 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1485 : CmpInst::BAD_ICMP_PREDICATE;
1486 Instruction *AltOp = MainOp;
1487 unsigned Opcode = MainOp->getOpcode();
1488 unsigned AltOpcode = Opcode;
1489
1490 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1491 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1492 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1493 UniquePreds.insert(BasePred);
1494 UniqueNonSwappedPreds.insert(BasePred);
1495 for (Value *V : VL) {
1496 auto *I = dyn_cast<CmpInst>(V);
1497 if (!I)
1498 return false;
1499 CmpInst::Predicate CurrentPred = I->getPredicate();
1500 CmpInst::Predicate SwappedCurrentPred =
1501 CmpInst::getSwappedPredicate(CurrentPred);
1502 UniqueNonSwappedPreds.insert(CurrentPred);
1503 if (!UniquePreds.contains(CurrentPred) &&
1504 !UniquePreds.contains(SwappedCurrentPred))
1505 UniquePreds.insert(CurrentPred);
1506 }
1507 // The total number of predicates is > 2, but if swapped predicates are
1508 // considered compatible there are only 2; treat the swappable predicates as
1509 // compatible opcodes, not alternates.
1510 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1511 }();
1512 // Check for one alternate opcode from another BinaryOperator.
1513 // TODO - generalize to support all operators (types, calls etc.).
1514 Intrinsic::ID BaseID = 0;
1515 SmallVector<VFInfo> BaseMappings;
1516 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1517 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1518 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1519 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1520 return InstructionsState::invalid();
1521 }
1522 bool AnyPoison = InstCnt != VL.size();
1523 // Check MainOp too to be sure that it matches the requirements for the
1524 // instructions.
1525 for (Value *V : iterator_range(It, VL.end())) {
1526 auto *I = dyn_cast<Instruction>(V);
1527 if (!I)
1528 continue;
1529
1530 // Cannot combine poison and divisions.
1531 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1532 // intrinsics/functions only.
1533 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1534 return InstructionsState::invalid();
1535 unsigned InstOpcode = I->getOpcode();
1536 if (IsBinOp && isa<BinaryOperator>(I)) {
1537 if (BinOpHelper.add(I))
1538 continue;
1539 } else if (IsCastOp && isa<CastInst>(I)) {
1540 Value *Op0 = MainOp->getOperand(0);
1541 Type *Ty0 = Op0->getType();
1542 Value *Op1 = I->getOperand(0);
1543 Type *Ty1 = Op1->getType();
1544 if (Ty0 == Ty1) {
1545 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1546 continue;
1547 if (Opcode == AltOpcode) {
1548 assert(isValidForAlternation(Opcode) &&
1549 isValidForAlternation(InstOpcode) &&
1550 "Cast isn't safe for alternation, logic needs to be updated!");
1551 AltOpcode = InstOpcode;
1552 AltOp = I;
1553 continue;
1554 }
1555 }
1556 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1557 auto *BaseInst = cast<CmpInst>(MainOp);
1558 Type *Ty0 = BaseInst->getOperand(0)->getType();
1559 Type *Ty1 = Inst->getOperand(0)->getType();
1560 if (Ty0 == Ty1) {
1561 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1562 assert(InstOpcode == AltOpcode &&
1563 "Alternate instructions are only supported by BinaryOperator "
1564 "and CastInst.");
1565 // Check for compatible operands. If the corresponding operands are not
1566 // compatible - need to perform alternate vectorization.
1567 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1568 CmpInst::Predicate SwappedCurrentPred =
1569 CmpInst::getSwappedPredicate(CurrentPred);
1570
1571 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1572 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1573 continue;
1574
1575 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1576 continue;
1577 auto *AltInst = cast<CmpInst>(AltOp);
1578 if (MainOp != AltOp) {
1579 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1580 continue;
1581 } else if (BasePred != CurrentPred) {
1582 assert(
1583 isValidForAlternation(InstOpcode) &&
1584 "CmpInst isn't safe for alternation, logic needs to be updated!");
1585 AltOp = I;
1586 continue;
1587 }
1588 CmpInst::Predicate AltPred = AltInst->getPredicate();
1589 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1590 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1591 continue;
1592 }
1593 } else if (InstOpcode == Opcode) {
1594 assert(InstOpcode == AltOpcode &&
1595 "Alternate instructions are only supported by BinaryOperator and "
1596 "CastInst.");
1597 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1598 if (Gep->getNumOperands() != 2 ||
1599 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1600 return InstructionsState::invalid();
1601 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1602 if (!isVectorLikeInstWithConstOps(EI))
1603 return InstructionsState::invalid();
1604 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1605 auto *BaseLI = cast<LoadInst>(MainOp);
1606 if (!LI->isSimple() || !BaseLI->isSimple())
1607 return InstructionsState::invalid();
1608 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1609 auto *CallBase = cast<CallInst>(MainOp);
1610 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1611 return InstructionsState::invalid();
1612 if (Call->hasOperandBundles() &&
1613 (!CallBase->hasOperandBundles() ||
1614 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1615 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1616 CallBase->op_begin() +
1617 CallBase->getBundleOperandsStartIndex())))
1618 return InstructionsState::invalid();
1619 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1620 if (ID != BaseID)
1621 return InstructionsState::invalid();
1622 if (!ID) {
1623 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1624 if (Mappings.size() != BaseMappings.size() ||
1625 Mappings.front().ISA != BaseMappings.front().ISA ||
1626 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1627 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1628 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1629 Mappings.front().Shape.Parameters !=
1630 BaseMappings.front().Shape.Parameters)
1631 return InstructionsState::invalid();
1632 }
1633 }
1634 continue;
1635 }
1636 return InstructionsState::invalid();
1637 }
1638
1639 if (IsBinOp) {
1640 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1641 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1642 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1643 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1644 }
1645 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1646 "Incorrect implementation of allSameOpcode.");
1647 InstructionsState S(MainOp, AltOp);
1648 assert(all_of(VL,
1649 [&](Value *V) {
1650 return isa<PoisonValue>(V) ||
1651 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1652 }) &&
1653 "Invalid InstructionsState.");
1654 return S;
1655}
1656
1657/// \returns true if all of the values in \p VL have the same type or false
1658/// otherwise.
1659 static bool allSameType(ArrayRef<Value *> VL) {
1660 Type *Ty = VL.consume_front()->getType();
1661 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1662}
1663
1664/// \returns True if in-tree use also needs extract. This refers to
1665/// possible scalar operand in vectorized instruction.
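/// For example, a scalar used as the pointer operand of a scalar store user
/// still needs to be extracted from the vector, while the same scalar used as
/// the stored value does not.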
1666static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1667 TargetLibraryInfo *TLI,
1668 const TargetTransformInfo *TTI) {
1669 if (!UserInst)
1670 return false;
1671 unsigned Opcode = UserInst->getOpcode();
1672 switch (Opcode) {
1673 case Instruction::Load: {
1674 LoadInst *LI = cast<LoadInst>(UserInst);
1675 return (LI->getPointerOperand() == Scalar);
1676 }
1677 case Instruction::Store: {
1678 StoreInst *SI = cast<StoreInst>(UserInst);
1679 return (SI->getPointerOperand() == Scalar);
1680 }
1681 case Instruction::Call: {
1682 CallInst *CI = cast<CallInst>(UserInst);
1683 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1684 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1685 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1686 Arg.value().get() == Scalar;
1687 });
1688 }
1689 default:
1690 return false;
1691 }
1692}
1693
1694 /// \returns the AA location that is being accessed by the instruction.
1695 static MemoryLocation getLocation(Instruction *I) {
1696 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1697 return MemoryLocation::get(SI);
1698 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1699 return MemoryLocation::get(LI);
1700 return MemoryLocation();
1701}
1702
1703/// \returns True if the instruction is not a volatile or atomic load/store.
1704static bool isSimple(Instruction *I) {
1705 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1706 return LI->isSimple();
1707 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1708 return SI->isSimple();
1709 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1710 return !MI->isVolatile();
1711 return true;
1712}
1713
1714/// Shuffles \p Mask in accordance with the given \p SubMask.
1715/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1716/// one but two input vectors.
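/// For example, with Mask = {2, 0, 1} and SubMask = {1, 2, 0} the result is
/// {Mask[1], Mask[2], Mask[0]} = {0, 1, 2}.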
1717static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1718 bool ExtendingManyInputs = false) {
1719 if (SubMask.empty())
1720 return;
1721 assert(
1722 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1723 // Check if input scalars were extended to match the size of other node.
1724 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1725 "SubMask with many inputs support must be larger than the mask.");
1726 if (Mask.empty()) {
1727 Mask.append(SubMask.begin(), SubMask.end());
1728 return;
1729 }
1730 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1731 int TermValue = std::min(Mask.size(), SubMask.size());
1732 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1733 if (SubMask[I] == PoisonMaskElem ||
1734 (!ExtendingManyInputs &&
1735 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1736 continue;
1737 NewMask[I] = Mask[SubMask[I]];
1738 }
1739 Mask.swap(NewMask);
1740}
1741
1742/// Order may have elements assigned special value (size) which is out of
1743 /// bounds. Such indices only appear in places that correspond to undef values
1744 /// (see canReuseExtract for details) and are used to keep undef values from
1745 /// affecting the ordering of the operands.
1746/// The first loop below simply finds all unused indices and then the next loop
1747/// nest assigns these indices for undef values positions.
1748/// As an example below Order has two undef positions and they have assigned
1749/// values 3 and 7 respectively:
1750/// before: 6 9 5 4 9 2 1 0
1751/// after: 6 3 5 4 7 2 1 0
1752 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1753 const size_t Sz = Order.size();
1754 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1755 SmallBitVector MaskedIndices(Sz);
1756 for (unsigned I = 0; I < Sz; ++I) {
1757 if (Order[I] < Sz)
1758 UnusedIndices.reset(Order[I]);
1759 else
1760 MaskedIndices.set(I);
1761 }
1762 if (MaskedIndices.none())
1763 return;
1764 assert(UnusedIndices.count() == MaskedIndices.count() &&
1765 "Non-synced masked/available indices.");
1766 int Idx = UnusedIndices.find_first();
1767 int MIdx = MaskedIndices.find_first();
1768 while (MIdx >= 0) {
1769 assert(Idx >= 0 && "Indices must be synced.");
1770 Order[MIdx] = Idx;
1771 Idx = UnusedIndices.find_next(Idx);
1772 MIdx = MaskedIndices.find_next(MIdx);
1773 }
1774}
1775
1776/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1777/// Opcode1.
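/// For example, for a scalar i32 type and lane opcodes {Opcode0, Opcode1,
/// Opcode0, Opcode1} the resulting bitset is {0, 1, 0, 1}.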
1778 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1779 unsigned Opcode0, unsigned Opcode1) {
1780 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1781 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1782 for (unsigned Lane : seq<unsigned>(VL.size())) {
1783 if (isa<PoisonValue>(VL[Lane]))
1784 continue;
1785 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1786 OpcodeMask.set(Lane * ScalarTyNumElements,
1787 Lane * ScalarTyNumElements + ScalarTyNumElements);
1788 }
1789 return OpcodeMask;
1790}
1791
1792/// Replicates the given \p Val \p VF times.
1794 unsigned VF) {
1795 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1796 "Expected scalar constants.");
1797 SmallVector<Constant *> NewVal(Val.size() * VF);
1798 for (auto [I, V] : enumerate(Val))
1799 std::fill_n(NewVal.begin() + I * VF, VF, V);
1800 return NewVal;
1801}
1802
1803namespace llvm {
1804
1805 static void inversePermutation(ArrayRef<unsigned> Indices,
1806 SmallVectorImpl<int> &Mask) {
1807 Mask.clear();
1808 const unsigned E = Indices.size();
1809 Mask.resize(E, PoisonMaskElem);
1810 for (unsigned I = 0; I < E; ++I)
1811 Mask[Indices[I]] = I;
1812}
1813
1814/// Reorders the list of scalars in accordance with the given \p Mask.
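/// For example, Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a},
/// since each element moves from position I to position Mask[I].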
1815 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1816 ArrayRef<int> Mask) {
1817 assert(!Mask.empty() && "Expected non-empty mask.");
1818 SmallVector<Value *> Prev(Scalars.size(),
1819 PoisonValue::get(Scalars.front()->getType()));
1820 Prev.swap(Scalars);
1821 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1822 if (Mask[I] != PoisonMaskElem)
1823 Scalars[Mask[I]] = Prev[I];
1824}
1825
1826/// Checks if the provided value does not require scheduling. It does not
1827/// require scheduling if this is not an instruction or it is an instruction
1828 /// that does not read/write memory and all of its operands are either not
1829 /// instructions, or are phi nodes, or are instructions from other blocks.
1830 static bool areAllOperandsNonInsts(Value *V) {
1831 auto *I = dyn_cast<Instruction>(V);
1832 if (!I)
1833 return true;
1834 return !mayHaveNonDefUseDependency(*I) &&
1835 all_of(I->operands(), [I](Value *V) {
1836 auto *IO = dyn_cast<Instruction>(V);
1837 if (!IO)
1838 return true;
1839 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1840 });
1841}
1842
1843/// Checks if the provided value does not require scheduling. It does not
1844/// require scheduling if this is not an instruction or it is an instruction
1845/// that does not read/write memory and all users are phi nodes or instructions
1846 /// from other blocks.
1847static bool isUsedOutsideBlock(Value *V) {
1848 auto *I = dyn_cast<Instruction>(V);
1849 if (!I)
1850 return true;
1851 // Limits the number of uses to save compile time.
1852 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1853 all_of(I->users(), [I](User *U) {
1854 auto *IU = dyn_cast<Instruction>(U);
1855 if (!IU)
1856 return true;
1857 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1858 });
1859}
1860
1861/// Checks if the specified value does not require scheduling. It does not
1862/// require scheduling if all operands and all users do not need to be scheduled
1863/// in the current basic block.
1864 static bool doesNotNeedToBeScheduled(Value *V) {
1865 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1866 }
1867
1868/// Checks if the specified array of instructions does not require scheduling.
1869 /// This is so if either all instructions have operands that do not require
1870 /// scheduling, or all their users do not require scheduling because they are
1871 /// phis or reside in other basic blocks.
1872 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1873 return !VL.empty() &&
1874 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1875 }
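// For example, an instruction that does not touch memory, whose operands are
// all function arguments or constants, and whose users all live in other
// blocks has no same-block def-use dependencies, so the scheduler can skip it.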
1876
1877 /// Returns true if the widened type of \p Ty elements with size \p Sz
1878 /// represents a full vector type, i.e. adding an extra element results in
1879 /// extra parts upon type legalization.
1880 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1881 unsigned Sz) {
1882 if (Sz <= 1)
1883 return false;
1884 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1885 return false;
1886 if (has_single_bit(Sz))
1887 return true;
1888 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1889 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1890 Sz % NumParts == 0;
1891}
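// For example, 8 x i32 is always accepted (power-of-2 size), 12 x i32 is
// accepted only if the target legalizes it into whole registers (e.g. 3 parts
// of 4 elements each), and 7 x i32 is rejected.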
1892
1893 /// Returns the number of parts the type \p VecTy will be split into at the
1894 /// codegen phase. If the type is going to be scalarized or does not use whole
1895 /// registers, returns 1.
1896 static unsigned
1897 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1898 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1899 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1900 if (NumParts == 0 || NumParts >= Limit)
1901 return 1;
1902 unsigned Sz = getNumElements(VecTy);
1903 if (NumParts >= Sz || Sz % NumParts != 0 ||
1904 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1905 return 1;
1906 return NumParts;
1907}
1908
1909namespace slpvectorizer {
1910
1911/// Bottom Up SLP Vectorizer.
1912class BoUpSLP {
1913 class TreeEntry;
1914 class ScheduleEntity;
1915 class ScheduleData;
1916 class ScheduleCopyableData;
1917 class ScheduleBundle;
1920
1921public:
1922 /// Tracks the state we can represent the loads in the given sequence.
1923 enum class LoadsState {
1924 Gather,
1925 Vectorize,
1926 ScatterVectorize,
1927 StridedVectorize,
1928 CompressVectorize
1929 };
1930
1937
1938 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1939 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1940 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1941 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1942 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1943 AC(AC), DB(DB), DL(DL), ORE(ORE),
1944 Builder(Se->getContext(), TargetFolder(*DL)) {
1945 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1946 // Use the vector register size specified by the target unless overridden
1947 // by a command-line option.
1948 // TODO: It would be better to limit the vectorization factor based on
1949 // data type rather than just register size. For example, x86 AVX has
1950 // 256-bit registers, but it does not support integer operations
1951 // at that width (that requires AVX2).
1952 if (MaxVectorRegSizeOption.getNumOccurrences())
1953 MaxVecRegSize = MaxVectorRegSizeOption;
1954 else
1955 MaxVecRegSize =
1956 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1957 .getFixedValue();
1958
1959 if (MinVectorRegSizeOption.getNumOccurrences())
1960 MinVecRegSize = MinVectorRegSizeOption;
1961 else
1962 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1963 }
1964
1965 /// Vectorize the tree that starts with the elements in \p VL.
1966 /// Returns the vectorized root.
1967 Value *vectorizeTree();
1968
1969 /// Vectorize the tree but with the list of externally used values \p
1970 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1971 /// generated extractvalue instructions.
1972 Value *vectorizeTree(
1973 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1974 Instruction *ReductionRoot = nullptr,
1975 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1976
1977 /// \returns the cost incurred by unwanted spills and fills, caused by
1978 /// holding live values over call sites.
1979 InstructionCost getSpillCost();
1980
1981 /// \returns the vectorization cost of the subtree that starts at \p VL.
1982 /// A negative number means that this is profitable.
1983 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
1984 InstructionCost ReductionCost = TTI::TCC_Free);
1985
1986 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1987 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1988 void buildTree(ArrayRef<Value *> Roots,
1989 const SmallDenseSet<Value *> &UserIgnoreLst);
1990
1991 /// Construct a vectorizable tree that starts at \p Roots.
1992 void buildTree(ArrayRef<Value *> Roots);
1993
1994 /// Return the scalars of the root node.
1995 ArrayRef<Value *> getRootNodeScalars() const {
1996 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1997 return VectorizableTree.front()->Scalars;
1998 }
1999
2000 /// Returns the type/is-signed info for the root node in the graph without
2001 /// casting.
2002 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2003 const TreeEntry &Root = *VectorizableTree.front();
2004 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2005 !Root.Scalars.front()->getType()->isIntegerTy())
2006 return std::nullopt;
2007 auto It = MinBWs.find(&Root);
2008 if (It != MinBWs.end())
2009 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2010 It->second.first),
2011 It->second.second);
2012 if (Root.getOpcode() == Instruction::ZExt ||
2013 Root.getOpcode() == Instruction::SExt)
2014 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2015 Root.getOpcode() == Instruction::SExt);
2016 return std::nullopt;
2017 }
2018
2019 /// Checks if the root graph node can be emitted with narrower bitwidth at
2020 /// codegen and returns its signedness, if so.
2021 bool isSignedMinBitwidthRootNode() const {
2022 return MinBWs.at(VectorizableTree.front().get()).second;
2023 }
2024
2025 /// Returns the reduction type after minbitwidth analysis.
2026 FixedVectorType *getReductionType() const {
2027 if (ReductionBitWidth == 0 ||
2028 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2029 ReductionBitWidth >=
2030 DL->getTypeSizeInBits(
2031 VectorizableTree.front()->Scalars.front()->getType()))
2032 return getWidenedType(
2033 VectorizableTree.front()->Scalars.front()->getType(),
2034 VectorizableTree.front()->getVectorFactor());
2035 return getWidenedType(
2036 IntegerType::get(
2037 VectorizableTree.front()->Scalars.front()->getContext(),
2038 ReductionBitWidth),
2039 VectorizableTree.front()->getVectorFactor());
2040 }
2041
2042 /// Builds external uses of the vectorized scalars, i.e. the list of
2043 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2044 /// ExternallyUsedValues contains additional list of external uses to handle
2045 /// vectorization of reductions.
2046 void
2047 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2048
2049 /// Transforms graph nodes to target specific representations, if profitable.
2050 void transformNodes();
2051
2052 /// Clear the internal data structures that are created by 'buildTree'.
2053 void deleteTree() {
2054 VectorizableTree.clear();
2055 ScalarToTreeEntries.clear();
2056 OperandsToTreeEntry.clear();
2057 ScalarsInSplitNodes.clear();
2058 MustGather.clear();
2059 NonScheduledFirst.clear();
2060 EntryToLastInstruction.clear();
2061 LoadEntriesToVectorize.clear();
2062 IsGraphTransformMode = false;
2063 GatheredLoadsEntriesFirst.reset();
2064 CompressEntryToData.clear();
2065 ExternalUses.clear();
2066 ExternalUsesAsOriginalScalar.clear();
2067 ExternalUsesWithNonUsers.clear();
2068 for (auto &Iter : BlocksSchedules) {
2069 BlockScheduling *BS = Iter.second.get();
2070 BS->clear();
2071 }
2072 MinBWs.clear();
2073 ReductionBitWidth = 0;
2074 BaseGraphSize = 1;
2075 CastMaxMinBWSizes.reset();
2076 ExtraBitWidthNodes.clear();
2077 InstrElementSize.clear();
2078 UserIgnoreList = nullptr;
2079 PostponedGathers.clear();
2080 ValueToGatherNodes.clear();
2081 }
2082
2083 unsigned getTreeSize() const { return VectorizableTree.size(); }
2084
2085 /// Returns the base graph size, before any transformations.
2086 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2087
2088 /// Perform LICM and CSE on the newly generated gather sequences.
2089 void optimizeGatherSequence();
2090
2091 /// Does this non-empty order represent an identity order? Identity
2092 /// should be represented as an empty order, so this is used to
2093 /// decide if we can canonicalize a computed order. Undef elements
2094 /// (represented as size) are ignored.
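/// For example, {0, 1, 2, 3} and {0, Sz, 2, Sz} are identity orders, while
/// {1, 0, 2, 3} is not.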
2095 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2096 assert(!Order.empty() && "expected non-empty order");
2097 const unsigned Sz = Order.size();
2098 return all_of(enumerate(Order), [&](const auto &P) {
2099 return P.value() == P.index() || P.value() == Sz;
2100 });
2101 }
2102
2103 /// Checks if the specified gather tree entry \p TE can be represented as a
2104 /// shuffled vector entry + (possibly) permutation with other gathers. It
2105 /// implements the checks only for possibly ordered scalars (Loads,
2106 /// ExtractElement, ExtractValue), which can be part of the graph.
2107 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2108 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2109 /// node might be ignored.
2110 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2111 bool TopToBottom,
2112 bool IgnoreReorder);
2113
2114 /// Sort loads into increasing pointers offsets to allow greater clustering.
2115 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2116
2117 /// Gets reordering data for the given tree entry. If the entry is vectorized
2118 /// - just return ReorderIndices, otherwise check if the scalars can be
2119 /// reordered and return the most optimal order.
2120 /// \return std::nullopt if ordering is not important, empty order, if
2121 /// identity order is important, or the actual order.
2122 /// \param TopToBottom If true, include the order of vectorized stores and
2123 /// insertelement nodes, otherwise skip them.
2124 /// \param IgnoreReorder true, if the root node order can be ignored.
2125 std::optional<OrdersType>
2126 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2127
2128 /// Checks if it is profitable to reorder the current tree.
2129 /// If the tree does not contain many profitable reorderable nodes, it is
2130 /// better to skip it to save compile time.
2131 bool isProfitableToReorder() const;
2132
2133 /// Reorders the current graph to the most profitable order starting from the
2134 /// root node to the leaf nodes. The best order is chosen only from the nodes
2135 /// of the same size (vectorization factor). Smaller nodes are considered
2136 /// parts of subgraph with smaller VF and they are reordered independently. We
2137 /// can make it because we still need to extend smaller nodes to the wider VF
2138 /// and we can merge reordering shuffles with the widening shuffles.
2139 void reorderTopToBottom();
2140
2141 /// Reorders the current graph to the most profitable order starting from
2142 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2143 /// number of reshuffles if the leaf nodes use the same order. In this case we
2144 /// can merge the orders and just shuffle user node instead of shuffling its
2145 /// operands. Plus, even if the leaf nodes have different orders, it allows
2146 /// sinking the reordering in the graph closer to the root node and merging it
2147 /// later during analysis.
2148 void reorderBottomToTop(bool IgnoreReorder = false);
2149
2150 /// \return The vector element size in bits to use when vectorizing the
2151 /// expression tree ending at \p V. If V is a store, the size is the width of
2152 /// the stored value. Otherwise, the size is the width of the largest loaded
2153 /// value reaching V. This method is used by the vectorizer to calculate
2154 /// vectorization factors.
2155 unsigned getVectorElementSize(Value *V);
2156
2157 /// Compute the minimum type sizes required to represent the entries in a
2158 /// vectorizable tree.
2159 void computeMinimumValueSizes();
2160
2161 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2162 unsigned getMaxVecRegSize() const {
2163 return MaxVecRegSize;
2164 }
2165
2166 // \returns minimum vector register size as set by cl::opt.
2167 unsigned getMinVecRegSize() const {
2168 return MinVecRegSize;
2169 }
2170
2171 unsigned getMinVF(unsigned Sz) const {
2172 return std::max(2U, getMinVecRegSize() / Sz);
2173 }
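// For example, with a 128-bit minimum vector register size and 32-bit wide
// scalars, getMinVF(32) returns 4; the result is never smaller than 2.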
2174
2175 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2176 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2177 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2178 return MaxVF ? MaxVF : UINT_MAX;
2179 }
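// For example, if MaxVFOption is not set on the command line and
// TTI->getMaximumVF() returns 0, the factor is treated as unlimited (UINT_MAX).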
2180
2181 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2182 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2183 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2184 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2185 ///
2186 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2187 unsigned canMapToVector(Type *T) const;
2188
2189 /// \returns True if the VectorizableTree is both tiny and not fully
2190 /// vectorizable. We do not vectorize such trees.
2191 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2192
2193 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2194 /// It may happen, if all gather nodes are loads and they cannot be
2195 /// "clusterized". In this case even subgraphs cannot be vectorized more
2196 /// effectively than the base graph.
2197 bool isTreeNotExtendable() const;
2198
2199 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2200 /// can be load combined in the backend. Load combining may not be allowed in
2201 /// the IR optimizer, so we do not want to alter the pattern. For example,
2202 /// partially transforming a scalar bswap() pattern into vector code is
2203 /// effectively impossible for the backend to undo.
2204 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2205 /// may not be necessary.
2206 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2207
2208 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2209 /// can be load combined in the backend. Load combining may not be allowed in
2210 /// the IR optimizer, so we do not want to alter the pattern. For example,
2211 /// partially transforming a scalar bswap() pattern into vector code is
2212 /// effectively impossible for the backend to undo.
2213 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2214 /// may not be necessary.
2215 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2216
2217 /// Checks if the given array of loads can be represented as a vectorized,
2218 /// scatter or just simple gather.
2219 /// \param VL list of loads.
2220 /// \param VL0 main load value.
2221 /// \param Order returned order of load instructions.
2222 /// \param PointerOps returned list of pointer operands.
2223 /// \param BestVF return best vector factor, if recursive check found better
2224 /// vectorization sequences rather than masked gather.
2225 /// \param TryRecursiveCheck used to check if long masked gather can be
2226 /// represented as a series of loads/insert subvector, if profitable.
2227 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2228 SmallVectorImpl<unsigned> &Order,
2229 SmallVectorImpl<Value *> &PointerOps,
2230 unsigned *BestVF = nullptr,
2231 bool TryRecursiveCheck = true) const;
2232
2233 /// Registers non-vectorizable sequence of loads
2234 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2235 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2236 }
2237
2238 /// Checks if the given loads sequence is known as not vectorizable
2239 template <typename T>
2241 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2242 }
2243
2245
2246 /// This structure holds any data we need about the edges being traversed
2247 /// during buildTreeRec(). We keep track of:
2248 /// (i) the user TreeEntry index, and
2249 /// (ii) the index of the edge.
2250 struct EdgeInfo {
2251 EdgeInfo() = default;
2252 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2253 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2254 /// The user TreeEntry.
2255 TreeEntry *UserTE = nullptr;
2256 /// The operand index of the use.
2257 unsigned EdgeIdx = UINT_MAX;
2258#ifndef NDEBUG
2259 friend inline raw_ostream &operator<<(raw_ostream &OS,
2260 const BoUpSLP::EdgeInfo &EI) {
2261 EI.dump(OS);
2262 return OS;
2263 }
2264 /// Debug print.
2265 void dump(raw_ostream &OS) const {
2266 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2267 << " EdgeIdx:" << EdgeIdx << "}";
2268 }
2269 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2270#endif
2271 bool operator == (const EdgeInfo &Other) const {
2272 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2273 }
2274
2275 operator bool() const { return UserTE != nullptr; }
2276 };
2277 friend struct DenseMapInfo<EdgeInfo>;
2278
2279 /// A helper class used for scoring candidates for two consecutive lanes.
2280 class LookAheadHeuristics {
2281 const TargetLibraryInfo &TLI;
2282 const DataLayout &DL;
2283 ScalarEvolution &SE;
2284 const BoUpSLP &R;
2285 int NumLanes; // Total number of lanes (aka vectorization factor).
2286 int MaxLevel; // The maximum recursion depth for accumulating score.
2287
2288 public:
2289 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2290 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2291 int MaxLevel)
2292 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2293 MaxLevel(MaxLevel) {}
2294
2295 // The hard-coded scores listed here are not very important, though it shall
2296 // be higher for better matches to improve the resulting cost. When
2297 // computing the scores of matching one sub-tree with another, we are
2298 // basically counting the number of values that are matching. So even if all
2299 // scores are set to 1, we would still get a decent matching result.
2300 // However, sometimes we have to break ties. For example we may have to
2301 // choose between matching loads vs matching opcodes. This is what these
2302 // scores are helping us with: they provide the order of preference. Also,
2303 // this is important if the scalar is externally used or used in another
2304 // tree entry node in the different lane.
2305
2306 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2307 static const int ScoreConsecutiveLoads = 4;
2308 /// The same load multiple times. This should have a better score than
2309 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2310 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2311 /// for a vector load plus 1.0 for a broadcast.
2312 static const int ScoreSplatLoads = 3;
2313 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2314 static const int ScoreReversedLoads = 3;
2315 /// A load candidate for masked gather.
2316 static const int ScoreMaskedGatherCandidate = 1;
2317 /// ExtractElementInst from same vector and consecutive indexes.
2318 static const int ScoreConsecutiveExtracts = 4;
2319 /// ExtractElementInst from same vector and reversed indices.
2320 static const int ScoreReversedExtracts = 3;
2321 /// Constants.
2322 static const int ScoreConstants = 2;
2323 /// Instructions with the same opcode.
2324 static const int ScoreSameOpcode = 2;
2325 /// Instructions with alt opcodes (e.g, add + sub).
2326 static const int ScoreAltOpcodes = 1;
2327 /// Identical instructions (a.k.a. splat or broadcast).
2328 static const int ScoreSplat = 1;
2329 /// Matching with an undef is preferable to failing.
2330 static const int ScoreUndef = 1;
2331 /// Score for failing to find a decent match.
2332 static const int ScoreFail = 0;
2333 /// Score if all users are vectorized.
2334 static const int ScoreAllUserVectorized = 1;
2335
2336 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2337 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2338 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2339 /// MainAltOps.
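/// For example, loads from A[i] and A[i+1] score ScoreConsecutiveLoads, two
/// adds score ScoreSameOpcode, and an add paired with a sub scores
/// ScoreAltOpcodes.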
2340 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2341 ArrayRef<Value *> MainAltOps) const {
2342 if (!isValidElementType(V1->getType()) ||
2343 !isValidElementType(V2->getType()))
2344 return LookAheadHeuristics::ScoreFail;
2345
2346 if (V1 == V2) {
2347 if (isa<LoadInst>(V1)) {
2348 // Returns true if the users of V1 and V2 won't need to be extracted.
2349 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2350 // Bail out if we have too many uses to save compilation time.
2351 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2352 return false;
2353
2354 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2355 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2356 return U == U1 || U == U2 || R.isVectorized(U);
2357 });
2358 };
2359 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2360 };
2361 // A broadcast of a load can be cheaper on some targets.
2362 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2363 ElementCount::getFixed(NumLanes)) &&
2364 ((int)V1->getNumUses() == NumLanes ||
2365 AllUsersAreInternal(V1, V2)))
2366 return LookAheadHeuristics::ScoreSplatLoads;
2367 }
2368 return LookAheadHeuristics::ScoreSplat;
2369 }
2370
2371 auto CheckSameEntryOrFail = [&]() {
2372 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2373 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2374 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2375 !TEs2.empty() &&
2376 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2377 return LookAheadHeuristics::ScoreSplatLoads;
2378 }
2379 return LookAheadHeuristics::ScoreFail;
2380 };
2381
2382 auto *LI1 = dyn_cast<LoadInst>(V1);
2383 auto *LI2 = dyn_cast<LoadInst>(V2);
2384 if (LI1 && LI2) {
2385 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2386 !LI2->isSimple())
2387 return CheckSameEntryOrFail();
2388
2389 std::optional<int64_t> Dist = getPointersDiff(
2390 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2391 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2392 if (!Dist || *Dist == 0) {
2393 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2394 getUnderlyingObject(LI2->getPointerOperand()) &&
2395 R.TTI->isLegalMaskedGather(
2396 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2397 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2398 return CheckSameEntryOrFail();
2399 }
2400 // The distance is too large - still may be profitable to use masked
2401 // loads/gathers.
2402 if (std::abs(*Dist) > NumLanes / 2)
2403 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2404 // This still will detect consecutive loads, but we might have "holes"
2405 // in some cases. It is ok for non-power-2 vectorization and may produce
2406 // better results. It should not affect current vectorization.
2407 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2408 : LookAheadHeuristics::ScoreReversedLoads;
2409 }
2410
2411 auto *C1 = dyn_cast<Constant>(V1);
2412 auto *C2 = dyn_cast<Constant>(V2);
2413 if (C1 && C2)
2414 return LookAheadHeuristics::ScoreConstants;
2415
2416 // Consider constants and buildvector compatible.
2417 if ((C1 && isa<InsertElementInst>(V2)) ||
2418 (C2 && isa<InsertElementInst>(V1)))
2419 return LookAheadHeuristics::ScoreConstants;
2420
2421 // Extracts from consecutive indexes of the same vector better score as
2422 // the extracts could be optimized away.
2423 Value *EV1;
2424 ConstantInt *Ex1Idx;
2425 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2426 // Undefs are always profitable for extractelements.
2427 // Compiler can easily combine poison and extractelement <non-poison> or
2428 // undef and extractelement <poison>. But combining undef +
2429 // extractelement <non-poison-but-may-produce-poison> requires some
2430 // extra operations.
2431 if (isa<UndefValue>(V2))
2432 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2433 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2434 : LookAheadHeuristics::ScoreSameOpcode;
2435 Value *EV2 = nullptr;
2436 ConstantInt *Ex2Idx = nullptr;
2437 if (match(V2,
2438 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2439 m_Undef())))) {
2440 // Undefs are always profitable for extractelements.
2441 if (!Ex2Idx)
2442 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2443 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2444 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2445 if (EV2 == EV1) {
2446 int Idx1 = Ex1Idx->getZExtValue();
2447 int Idx2 = Ex2Idx->getZExtValue();
2448 int Dist = Idx2 - Idx1;
2449 // The distance is too large - still may be profitable to use
2450 // shuffles.
2451 if (std::abs(Dist) == 0)
2452 return LookAheadHeuristics::ScoreSplat;
2453 if (std::abs(Dist) > NumLanes / 2)
2454 return LookAheadHeuristics::ScoreSameOpcode;
2455 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2456 : LookAheadHeuristics::ScoreReversedExtracts;
2457 }
2458 return LookAheadHeuristics::ScoreAltOpcodes;
2459 }
2460 return CheckSameEntryOrFail();
2461 }
2462
2463 auto *I1 = dyn_cast<Instruction>(V1);
2464 auto *I2 = dyn_cast<Instruction>(V2);
2465 if (I1 && I2) {
2466 if (I1->getParent() != I2->getParent())
2467 return CheckSameEntryOrFail();
2468 SmallVector<Value *, 4> Ops(MainAltOps);
2469 Ops.push_back(I1);
2470 Ops.push_back(I2);
2471 InstructionsState S = getSameOpcode(Ops, TLI);
2472 // Note: Only consider instructions with <= 2 operands to avoid
2473 // complexity explosion.
2474 if (S &&
2475 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2476 !S.isAltShuffle()) &&
2477 all_of(Ops, [&S](Value *V) {
2478 return isa<PoisonValue>(V) ||
2479 cast<Instruction>(V)->getNumOperands() ==
2480 S.getMainOp()->getNumOperands();
2481 }))
2482 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2483 : LookAheadHeuristics::ScoreSameOpcode;
2484 }
2485
2486 if (I1 && isa<PoisonValue>(V2))
2487 return LookAheadHeuristics::ScoreSameOpcode;
2488
2489 if (isa<UndefValue>(V2))
2490 return LookAheadHeuristics::ScoreUndef;
2491
2492 return CheckSameEntryOrFail();
2493 }
2494
2495 /// Go through the operands of \p LHS and \p RHS recursively until
2496 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2497 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2498 /// of \p U1 and \p U2), except at the beginning of the recursion where
2499 /// these are set to nullptr.
2500 ///
2501 /// For example:
2502 /// \verbatim
2503 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2504 /// \ / \ / \ / \ /
2505 /// + + + +
2506 /// G1 G2 G3 G4
2507 /// \endverbatim
2508 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2509 /// each level recursively, accumulating the score. It starts from matching
2510 /// the additions at level 0, then moves on to the loads (level 1). The
2511 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2512 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2513 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2514 /// Please note that the order of the operands does not matter, as we
2515 /// evaluate the score of all profitable combinations of operands. In
2516 /// other words the score of G1 and G4 is the same as G1 and G2. This
2517 /// heuristic is based on ideas described in:
2518 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2519 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2520 /// Luís F. W. Góes
2521 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2522 Instruction *U2, int CurrLevel,
2523 ArrayRef<Value *> MainAltOps) const {
2524
2525 // Get the shallow score of V1 and V2.
2526 int ShallowScoreAtThisLevel =
2527 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2528
2529 // If reached MaxLevel,
2530 // or if V1 and V2 are not instructions,
2531 // or if they are SPLAT,
2532 // or if they are not consecutive,
2533 // or if profitable to vectorize loads or extractelements, early return
2534 // the current cost.
2535 auto *I1 = dyn_cast<Instruction>(LHS);
2536 auto *I2 = dyn_cast<Instruction>(RHS);
2537 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2538 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2539 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2540 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2541 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2542 ShallowScoreAtThisLevel))
2543 return ShallowScoreAtThisLevel;
2544 assert(I1 && I2 && "Should have early exited.");
2545
2546 // Contains the I2 operand indexes that got matched with I1 operands.
2547 SmallSet<unsigned, 4> Op2Used;
2548
2549 // Recursion towards the operands of I1 and I2. We are trying all possible
2550 // operand pairs, and keeping track of the best score.
2551 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2552 OpIdx1 != NumOperands1; ++OpIdx1) {
2553 // Try to pair op1I with the best operand of I2.
2554 int MaxTmpScore = 0;
2555 unsigned MaxOpIdx2 = 0;
2556 bool FoundBest = false;
2557 // If I2 is commutative try all combinations.
2558 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2559 unsigned ToIdx = isCommutative(I2)
2560 ? I2->getNumOperands()
2561 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2562 assert(FromIdx <= ToIdx && "Bad index");
2563 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2564 // Skip operands already paired with OpIdx1.
2565 if (Op2Used.count(OpIdx2))
2566 continue;
2567 // Recursively calculate the cost at each level
2568 int TmpScore =
2569 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2570 I1, I2, CurrLevel + 1, {});
2571 // Look for the best score.
2572 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2573 TmpScore > MaxTmpScore) {
2574 MaxTmpScore = TmpScore;
2575 MaxOpIdx2 = OpIdx2;
2576 FoundBest = true;
2577 }
2578 }
2579 if (FoundBest) {
2580 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2581 Op2Used.insert(MaxOpIdx2);
2582 ShallowScoreAtThisLevel += MaxTmpScore;
2583 }
2584 }
2585 return ShallowScoreAtThisLevel;
2586 }
2587 };
2588 /// A helper data structure to hold the operands of a vector of instructions.
2589 /// This supports a fixed vector length for all operand vectors.
2590 class VLOperands {
2591 /// For each operand we need (i) the value, and (ii) the opcode that it
2592 /// would be attached to if the expression was in a left-linearized form.
2593 /// This is required to avoid illegal operand reordering.
2594 /// For example:
2595 /// \verbatim
2596 /// 0 Op1
2597 /// |/
2598 /// Op1 Op2 Linearized + Op2
2599 /// \ / ----------> |/
2600 /// - -
2601 ///
2602 /// Op1 - Op2 (0 + Op1) - Op2
2603 /// \endverbatim
2604 ///
2605 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2606 ///
2607 /// Another way to think of this is to track all the operations across the
2608 /// path from the operand all the way to the root of the tree and to
2609 /// calculate the operation that corresponds to this path. For example, the
2610 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2611 /// corresponding operation is a '-' (which matches the one in the
2612 /// linearized tree, as shown above).
2613 ///
2614 /// For lack of a better term, we refer to this operation as Accumulated
2615 /// Path Operation (APO).
2616 struct OperandData {
2617 OperandData() = default;
2618 OperandData(Value *V, bool APO, bool IsUsed)
2619 : V(V), APO(APO), IsUsed(IsUsed) {}
2620 /// The operand value.
2621 Value *V = nullptr;
2622 /// TreeEntries only allow a single opcode, or an alternate sequence of
2623 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2624 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2625 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2626 /// (e.g., Add/Mul)
2627 bool APO = false;
2628 /// Helper data for the reordering function.
2629 bool IsUsed = false;
2630 };
2631
2632 /// During operand reordering, we are trying to select the operand at lane
2633 /// that matches best with the operand at the neighboring lane. Our
2634 /// selection is based on the type of value we are looking for. For example,
2635 /// if the neighboring lane has a load, we need to look for a load that is
2636 /// accessing a consecutive address. These strategies are summarized in the
2637 /// 'ReorderingMode' enumerator.
2638 enum class ReorderingMode {
2639 Load, ///< Matching loads to consecutive memory addresses
2640 Opcode, ///< Matching instructions based on opcode (same or alternate)
2641 Constant, ///< Matching constants
2642 Splat, ///< Matching the same instruction multiple times (broadcast)
2643 Failed, ///< We failed to create a vectorizable group
2644 };
2645
2647
2647 using OperandDataVec = SmallVector<OperandData, 2>;
2648 /// A vector of operand vectors.
2649 SmallVector<OperandDataVec, 4> OpsVec;
2650 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2651 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2652 unsigned ArgSize = 0;
2653
2654 const TargetLibraryInfo &TLI;
2655 const DataLayout &DL;
2656 ScalarEvolution &SE;
2657 const BoUpSLP &R;
2658 const Loop *L = nullptr;
2659
2660 /// \returns the operand data at \p OpIdx and \p Lane.
2661 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2662 return OpsVec[OpIdx][Lane];
2663 }
2664
2665 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2666 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2667 return OpsVec[OpIdx][Lane];
2668 }
2669
2670 /// Clears the used flag for all entries.
2671 void clearUsed() {
2672 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2673 OpIdx != NumOperands; ++OpIdx)
2674 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2675 ++Lane)
2676 OpsVec[OpIdx][Lane].IsUsed = false;
2677 }
2678
2679 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2680 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2681 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2682 }
2683
2684 /// \param Lane lane of the operands under analysis.
2685 /// \param OpIdx operand index in \p Lane lane we're looking the best
2686 /// candidate for.
2687 /// \param Idx operand index of the current candidate value.
2688 /// \returns The additional score due to possible broadcasting of the
2689 /// elements in the lane. It is more profitable to have power-of-2 unique
2690 /// elements in the lane, it will be vectorized with higher probability
2691 /// after removing duplicates. Currently the SLP vectorizer supports only
2692 /// vectorization of the power-of-2 number of unique scalars.
2693 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2694 const SmallBitVector &UsedLanes) const {
2695 Value *IdxLaneV = getData(Idx, Lane).V;
2696 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2697 isa<ExtractElementInst>(IdxLaneV))
2698 return 0;
2699 SmallDenseMap<Value *, unsigned, 4> Uniques;
2700 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2701 if (Ln == Lane)
2702 continue;
2703 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2704 if (!isa<Instruction>(OpIdxLnV))
2705 return 0;
2706 Uniques.try_emplace(OpIdxLnV, Ln);
2707 }
2708 unsigned UniquesCount = Uniques.size();
2709 auto IdxIt = Uniques.find(IdxLaneV);
2710 unsigned UniquesCntWithIdxLaneV =
2711 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2712 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2713 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2714 unsigned UniquesCntWithOpIdxLaneV =
2715 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2716 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2717 return 0;
2718 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2719 UniquesCntWithOpIdxLaneV,
2720 UniquesCntWithOpIdxLaneV -
2721 bit_floor(UniquesCntWithOpIdxLaneV)) -
2722 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2723 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2724 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2725 }
2726
2727 /// \param Lane lane of the operands under analysis.
2728 /// \param OpIdx operand index in \p Lane lane we're looking the best
2729 /// candidate for.
2730 /// \param Idx operand index of the current candidate value.
2731 /// \returns The additional score for the scalar which users are all
2732 /// vectorized.
2733 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2734 Value *IdxLaneV = getData(Idx, Lane).V;
2735 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2736 // Do not care about number of uses for vector-like instructions
2737 // (extractelement/extractvalue with constant indices), they are extracts
2738 // themselves and already externally used. Vectorization of such
2739 // instructions does not add extra extractelement instruction, just may
2740 // remove it.
2741 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2742 isVectorLikeInstWithConstOps(OpIdxLaneV))
2743 return LookAheadHeuristics::ScoreAllUserVectorized;
2744 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2745 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2746 return 0;
2747 return R.areAllUsersVectorized(IdxLaneI)
2748 ? LookAheadHeuristics::ScoreAllUserVectorized
2749 : 0;
2750 }
2751
2752 /// Score scaling factor for fully compatible instructions but with
2753 /// different number of external uses. Allows better selection of the
2754 /// instructions with less external uses.
2755 static const int ScoreScaleFactor = 10;
2756
2757 /// \Returns the look-ahead score, which tells us how much the sub-trees
2758 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2759 /// score. This helps break ties in an informed way when we cannot decide on
2760 /// the order of the operands by just considering the immediate
2761 /// predecessors.
2762 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2763 int Lane, unsigned OpIdx, unsigned Idx,
2764 bool &IsUsed, const SmallBitVector &UsedLanes) {
2765 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2766 LookAheadMaxDepth);
2767 // Keep track of the instruction stack as we recurse into the operands
2768 // during the look-ahead score exploration.
2769 int Score =
2770 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2771 /*CurrLevel=*/1, MainAltOps);
2772 if (Score) {
2773 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2774 if (Score <= -SplatScore) {
2775 // Failed score.
2776 Score = 0;
2777 } else {
2778 Score += SplatScore;
2779 // Scale score to see the difference between different operands
2780 // and similar operands but all vectorized/not all vectorized
2781 // uses. It does not affect actual selection of the best
2782 // compatible operand in general, just allows to select the
2783 // operand with all vectorized uses.
2784 Score *= ScoreScaleFactor;
2785 Score += getExternalUseScore(Lane, OpIdx, Idx);
2786 IsUsed = true;
2787 }
2788 }
2789 return Score;
2790 }
2791
2792 /// Best defined scores per lanes between the passes. Used to choose the
2793 /// best operand (with the highest score) between the passes.
2794 /// The key - {Operand Index, Lane}.
2795 /// The value - the best score between the passes for the lane and the
2796 /// operand.
2797 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2798 BestScoresPerLanes;
2799
2800 // Search all operands in Ops[*][Lane] for the one that matches best
2801 // Ops[OpIdx][LastLane] and return its operand index.
2802 // If no good match can be found, return std::nullopt.
2803 std::optional<unsigned>
2804 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2805 ArrayRef<ReorderingMode> ReorderingModes,
2806 ArrayRef<Value *> MainAltOps,
2807 const SmallBitVector &UsedLanes) {
2808 unsigned NumOperands = getNumOperands();
2809
2810 // The operand of the previous lane at OpIdx.
2811 Value *OpLastLane = getData(OpIdx, LastLane).V;
2812
2813 // Our strategy mode for OpIdx.
2814 ReorderingMode RMode = ReorderingModes[OpIdx];
2815 if (RMode == ReorderingMode::Failed)
2816 return std::nullopt;
2817
2818 // The linearized opcode of the operand at OpIdx, Lane.
2819 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2820
2821 // The best operand index and its score.
2822 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2823 // are using the score to differentiate between the two.
2824 struct BestOpData {
2825 std::optional<unsigned> Idx;
2826 unsigned Score = 0;
2827 } BestOp;
2828 BestOp.Score =
2829 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2830 .first->second;
2831
2832 // Track if the operand must be marked as used. If the operand is set to
2833 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2834 // want to reestimate the operands again on the following iterations).
2835 bool IsUsed = RMode == ReorderingMode::Splat ||
2836 RMode == ReorderingMode::Constant ||
2837 RMode == ReorderingMode::Load;
2838 // Iterate through all unused operands and look for the best.
2839 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2840 // Get the operand at Idx and Lane.
2841 OperandData &OpData = getData(Idx, Lane);
2842 Value *Op = OpData.V;
2843 bool OpAPO = OpData.APO;
2844
2845 // Skip already selected operands.
2846 if (OpData.IsUsed)
2847 continue;
2848
2849 // Skip if we are trying to move the operand to a position with a
2850 // different opcode in the linearized tree form. This would break the
2851 // semantics.
2852 if (OpAPO != OpIdxAPO)
2853 continue;
2854
2855 // Look for an operand that matches the current mode.
2856 switch (RMode) {
2857 case ReorderingMode::Load:
2858 case ReorderingMode::Opcode: {
2859 bool LeftToRight = Lane > LastLane;
2860 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2861 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2862 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2863 OpIdx, Idx, IsUsed, UsedLanes);
2864 if (Score > static_cast<int>(BestOp.Score) ||
2865 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2866 Idx == OpIdx)) {
2867 BestOp.Idx = Idx;
2868 BestOp.Score = Score;
2869 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2870 }
2871 break;
2872 }
2873 case ReorderingMode::Constant:
2874 if (isa<Constant>(Op) ||
2875 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2876 BestOp.Idx = Idx;
2877 if (isa<Constant>(Op)) {
2878 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2879 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2880 LookAheadHeuristics::ScoreConstants;
2881 }
2882 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2883 IsUsed = false;
2884 }
2885 break;
2886 case ReorderingMode::Splat:
2887 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2888 IsUsed = Op == OpLastLane;
2889 if (Op == OpLastLane) {
2890 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2891 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2892 LookAheadHeuristics::ScoreSplat;
2893 }
2894 BestOp.Idx = Idx;
2895 }
2896 break;
2897 case ReorderingMode::Failed:
2898 llvm_unreachable("Not expected Failed reordering mode.");
2899 }
2900 }
2901
2902 if (BestOp.Idx) {
2903 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2904 return BestOp.Idx;
2905 }
2906 // If we could not find a good match return std::nullopt.
2907 return std::nullopt;
2908 }
2909
2910 /// Helper for reorderOperandVecs.
2911 /// \returns the lane that we should start reordering from. This is the one
2912 /// which has the least number of operands that can freely move about or
2913 /// is less profitable to reorder because it already has the most optimal set of operands.
2914 unsigned getBestLaneToStartReordering() const {
2915 unsigned Min = UINT_MAX;
2916 unsigned SameOpNumber = 0;
2917 // std::pair<unsigned, unsigned> is used to implement a simple voting
2918 // algorithm and choose the lane with the least number of operands that
2919 // can freely move about or less profitable because it already has the
2920 // most optimal set of operands. The first unsigned is a counter for
2921 // voting, the second unsigned is the counter of lanes with instructions
2922 // with same/alternate opcodes and same parent basic block.
2923 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2924 // Try to be closer to the original results, if we have multiple lanes
2925 // with same cost. If 2 lanes have the same cost, use the one with the
2926 // highest index.
2927 for (int I = getNumLanes(); I > 0; --I) {
2928 unsigned Lane = I - 1;
2929 OperandsOrderData NumFreeOpsHash =
2930 getMaxNumOperandsThatCanBeReordered(Lane);
2931 // Compare the number of operands that can move and choose the one with
2932 // the least number.
2933 if (NumFreeOpsHash.NumOfAPOs < Min) {
2934 Min = NumFreeOpsHash.NumOfAPOs;
2935 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2936 HashMap.clear();
2937 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2938 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2939 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2940 // Select the most optimal lane in terms of number of operands that
2941 // should be moved around.
2942 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2943 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2944 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2945 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2946 auto [It, Inserted] =
2947 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2948 if (!Inserted)
2949 ++It->second.first;
2950 }
2951 }
2952 // Select the lane with the minimum counter.
2953 unsigned BestLane = 0;
2954 unsigned CntMin = UINT_MAX;
2955 for (const auto &Data : reverse(HashMap)) {
2956 if (Data.second.first < CntMin) {
2957 CntMin = Data.second.first;
2958 BestLane = Data.second.second;
2959 }
2960 }
2961 return BestLane;
2962 }
2963
2964 /// Data structure that helps to reorder operands.
2965 struct OperandsOrderData {
2966 /// The best number of operands with the same APOs, which can be
2967 /// reordered.
2968 unsigned NumOfAPOs = UINT_MAX;
2969 /// Number of operands with the same/alternate instruction opcode and
2970 /// parent.
2971 unsigned NumOpsWithSameOpcodeParent = 0;
2972 /// Hash for the actual operands ordering.
2973 /// Used to count operands, actually their position id and opcode
2974 /// value. It is used in the voting mechanism to find the lane with the
2975 /// least number of operands that can freely move about or less profitable
2976 /// because it already has the most optimal set of operands. Can be
2977 /// replaced with SmallVector<unsigned> instead but hash code is faster
2978 /// and requires less memory.
2979 unsigned Hash = 0;
2980 };
2981 /// \returns the maximum number of operands that are allowed to be reordered
2982 /// for \p Lane and the number of compatible instructions(with the same
2983 /// parent/opcode). This is used as a heuristic for selecting the first lane
2984 /// to start operand reordering.
2985 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2986 unsigned CntTrue = 0;
2987 unsigned NumOperands = getNumOperands();
2988 // Operands with the same APO can be reordered. We therefore need to count
2989 // how many of them we have for each APO, like this: Cnt[APO] = x.
2990 // Since we only have two APOs, namely true and false, we can avoid using
2991 // a map. Instead we can simply count the number of operands that
2992 // correspond to one of them (in this case the 'true' APO), and calculate
2993 // the other by subtracting it from the total number of operands.
2994 // Operands with the same instruction opcode and parent are more
2995 // profitable since we don't need to move them in many cases, with a high
2996 // probability such lane already can be vectorized effectively.
2997 bool AllUndefs = true;
2998 unsigned NumOpsWithSameOpcodeParent = 0;
2999 Instruction *OpcodeI = nullptr;
3000 BasicBlock *Parent = nullptr;
3001 unsigned Hash = 0;
3002 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3003 const OperandData &OpData = getData(OpIdx, Lane);
3004 if (OpData.APO)
3005 ++CntTrue;
3006 // Use Boyer-Moore majority voting for finding the majority opcode and
3007 // the number of times it occurs.
3008 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3009 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3010 I->getParent() != Parent) {
3011 if (NumOpsWithSameOpcodeParent == 0) {
3012 NumOpsWithSameOpcodeParent = 1;
3013 OpcodeI = I;
3014 Parent = I->getParent();
3015 } else {
3016 --NumOpsWithSameOpcodeParent;
3017 }
3018 } else {
3019 ++NumOpsWithSameOpcodeParent;
3020 }
3021 }
3022 Hash = hash_combine(
3023 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3024 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3025 }
3026 if (AllUndefs)
3027 return {};
3028 OperandsOrderData Data;
3029 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3030 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3031 Data.Hash = Hash;
3032 return Data;
3033 }
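  // For reference, a minimal standalone sketch of the Boyer-Moore majority
  // vote used above (illustrative only; the helper below is hypothetical and
  // not part of this file):
  // \code
  //   unsigned majorityCandidate(ArrayRef<unsigned> Vals) {
  //     unsigned Candidate = 0, Count = 0;
  //     for (unsigned V : Vals) {
  //       if (Count == 0) {
  //         Candidate = V;
  //         Count = 1;
  //       } else if (V == Candidate) {
  //         ++Count;
  //       } else {
  //         --Count;
  //       }
  //     }
  //     // Candidate is the majority element, if one exists; a second pass is
  //     // needed to verify it really occurs in more than half of Vals.
  //     return Candidate;
  //   }
  // \endcode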
3034
3035 /// Go through the instructions in VL and append their operands.
3036 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3037 const InstructionsState &S) {
3038 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3039 assert((empty() || all_of(Operands,
3040 [this](const ValueList &VL) {
3041 return VL.size() == getNumLanes();
3042 })) &&
3043 "Expected same number of lanes");
3044 assert(S.valid() && "InstructionsState is invalid.");
3045 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3046 // arguments to the intrinsic produces the same result.
3047 Instruction *MainOp = S.getMainOp();
3048 unsigned NumOperands = MainOp->getNumOperands();
3050 OpsVec.resize(ArgSize);
3051 unsigned NumLanes = VL.size();
3052 for (OperandDataVec &Ops : OpsVec)
3053 Ops.resize(NumLanes);
3054 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3055 // Our tree has just 3 nodes: the root and two operands.
3056 // It is therefore trivial to get the APO. We only need to check the
3057 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3058 // operand. The LHS operand of both add and sub is never attached to an
3059 // inverse operation in the linearized form, therefore its APO is
3060 // false. The RHS is true only if V is an inverse operation.
3061
3062 // Since operand reordering is performed on groups of commutative
3063 // operations or alternating sequences (e.g., +, -), we can safely tell
3064 // the inverse operations by checking commutativity.
3065 auto *I = dyn_cast<Instruction>(VL[Lane]);
3066 if (!I && isa<PoisonValue>(VL[Lane])) {
3067 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3068 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3069 continue;
3070 }
3071 bool IsInverseOperation = false;
3072 if (S.isCopyableElement(VL[Lane])) {
3073 // The value is a copyable element.
3074 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3075 } else {
3076 assert(I && "Expected instruction");
3077 auto [SelectedOp, Ops] = convertTo(I, S);
3078 // We cannot check commutativity by the converted instruction
3079 // (SelectedOp) because isCommutative also examines def-use
3080 // relationships.
3081 IsInverseOperation = !isCommutative(SelectedOp, I);
3082 }
3083 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3084 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3085 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3086 }
3087 }
3088 }
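  // Illustrative example (simplified, ignoring copyable elements): for the
  // two-lane bundle {A[0] = B[0] + C[0], A[1] = B[1] - C[1]} the operands are
  // laid out as
  //   OpsVec[0] = { {B[0], APO=false}, {B[1], APO=false} }  // LHS column
  //   OpsVec[1] = { {C[0], APO=false}, {C[1], APO=true } }  // RHS column
  // because sub is the non-commutative (inverse) operation of the linearized
  // +/- sequence, so only its RHS operand gets APO = true.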
3089
3090 /// \returns the number of operands.
3091 unsigned getNumOperands() const { return ArgSize; }
3092
3093 /// \returns the number of lanes.
3094 unsigned getNumLanes() const { return OpsVec[0].size(); }
3095
3096 /// \returns the operand value at \p OpIdx and \p Lane.
3097 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3098 return getData(OpIdx, Lane).V;
3099 }
3100
3101 /// \returns true if the data structure is empty.
3102 bool empty() const { return OpsVec.empty(); }
3103
3104 /// Clears the data.
3105 void clear() { OpsVec.clear(); }
3106
3107 /// \returns true if there are enough operands identical to \p Op to fill
3108 /// the whole vector (possibly mixed with constants or loop-invariant values).
3109 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3110 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3111 assert(Op == getValue(OpIdx, Lane) &&
3112 "Op is expected to be getValue(OpIdx, Lane).");
3113 // Small number of loads - try load matching.
3114 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3115 return false;
3116 bool OpAPO = getData(OpIdx, Lane).APO;
3117 bool IsInvariant = L && L->isLoopInvariant(Op);
3118 unsigned Cnt = 0;
3119 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3120 if (Ln == Lane)
3121 continue;
3122 // This is set to true if we found a candidate for broadcast at Lane.
3123 bool FoundCandidate = false;
3124 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3125 OperandData &Data = getData(OpI, Ln);
3126 if (Data.APO != OpAPO || Data.IsUsed)
3127 continue;
3128 Value *OpILane = getValue(OpI, Lane);
3129 bool IsConstantOp = isa<Constant>(OpILane);
3130 // Consider the broadcast candidate if:
3131 // 1. Same value is found in one of the operands.
3132 if (Data.V == Op ||
3133 // 2. The operand in the given lane is not constant but there is a
3134 // constant operand in another lane (which can be moved to the
3135 // given lane). In this case we can represent it as a simple
3136 // permutation of constant and broadcast.
3137 (!IsConstantOp &&
3138 ((Lns > 2 && isa<Constant>(Data.V)) ||
3139 // 2.1. If we have only 2 lanes, need to check that value in the
3140 // next lane does not build same opcode sequence.
3141 (Lns == 2 &&
3142 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3143 isa<Constant>(Data.V)))) ||
3144 // 3. The operand in the current lane is loop invariant (can be
3145 // hoisted out) and another operand is also a loop invariant
3146 // (though not a constant). In this case the whole vector can be
3147 // hoisted out.
3148 // FIXME: need to teach the cost model about this case for better
3149 // estimation.
3150 (IsInvariant && !isa<Constant>(Data.V) &&
3151 !getSameOpcode({Op, Data.V}, TLI) &&
3152 L->isLoopInvariant(Data.V))) {
3153 FoundCandidate = true;
3154 Data.IsUsed = Data.V == Op;
3155 if (Data.V == Op)
3156 ++Cnt;
3157 break;
3158 }
3159 }
3160 if (!FoundCandidate)
3161 return false;
3162 }
3163 return getNumLanes() == 2 || Cnt > 1;
3164 }
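  // Illustrative example (hypothetical values): with 4 lanes and an operand
  // column {X, X, X, C}, where C is a constant, shouldBroadcast(X, OpIdx, 0)
  // finds X itself in lanes 1 and 2 (Cnt == 2) and accepts the constant C in
  // lane 3 via rule 2 (it can be blended into a broadcast of X), so it returns
  // true, assuming no other operand column matches first.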
3165
3166 /// Checks if there is at least one operand in a lane other than \p Lane that
3167 /// is compatible with the operand \p Op.
3168 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3169 assert(Op == getValue(OpIdx, Lane) &&
3170 "Op is expected to be getValue(OpIdx, Lane).");
3171 bool OpAPO = getData(OpIdx, Lane).APO;
3172 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3173 if (Ln == Lane)
3174 continue;
3175 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3176 const OperandData &Data = getData(OpI, Ln);
3177 if (Data.APO != OpAPO || Data.IsUsed)
3178 return true;
3179 Value *OpILn = getValue(OpI, Ln);
3180 return (L && L->isLoopInvariant(OpILn)) ||
3181 (getSameOpcode({Op, OpILn}, TLI) &&
3182 allSameBlock({Op, OpILn}));
3183 }))
3184 return true;
3185 }
3186 return false;
3187 }
3188
3189 public:
3190 /// Initialize with all the operands of the instruction vector \p RootVL.
3192 const InstructionsState &S, const BoUpSLP &R)
3193 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3194 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3195 // Append all the operands of RootVL.
3196 appendOperands(RootVL, Operands, S);
3197 }
3198
3199 /// \returns a value vector with the operands across all lanes for the
3200 /// operand at \p OpIdx.
3201 ValueList getVL(unsigned OpIdx) const {
3202 ValueList OpVL(OpsVec[OpIdx].size());
3203 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3204 "Expected same num of lanes across all operands");
3205 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3206 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3207 return OpVL;
3208 }
3209
3210 // Performs operand reordering for 2 or more operands.
3211 // The original operands are in OpsVec[OpIdx][Lane] and are reordered
3212 // in place.
3213 void reorder() {
3214 unsigned NumOperands = getNumOperands();
3215 unsigned NumLanes = getNumLanes();
3216 // Each operand has its own mode. We are using this mode to help us select
3217 // the instructions for each lane, so that they match best with the ones
3218 // we have selected so far.
3219 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3220
3221 // This is a greedy single-pass algorithm. We are going over each lane
3222 // once and deciding on the best order right away with no back-tracking.
3223 // However, in order to increase its effectiveness, we start with the lane
3224 // that has operands that can move the least. For example, given the
3225 // following lanes:
3226 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3227 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3228 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3229 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3230 // we will start at Lane 1, since the operands of the subtraction cannot
3231 // be reordered. Then we will visit the rest of the lanes in a circular
3232 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3233
3234 // Find the first lane that we will start our search from.
3235 unsigned FirstLane = getBestLaneToStartReordering();
3236
3237 // Initialize the modes.
3238 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3239 Value *OpLane0 = getValue(OpIdx, FirstLane);
3240 // Keep track if we have instructions with all the same opcode on one
3241 // side.
3242 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3243 // Check if OpLane0 should be broadcast.
3244 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3245 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3246 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3247 else if (isa<LoadInst>(OpILane0))
3248 ReorderingModes[OpIdx] = ReorderingMode::Load;
3249 else
3250 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3251 } else if (isa<Constant>(OpLane0)) {
3252 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3253 } else if (isa<Argument>(OpLane0)) {
3254 // Our best hope is a Splat. It may save some cost in some cases.
3255 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3256 } else {
3257 llvm_unreachable("Unexpected value kind.");
3258 }
3259 }
3260
3261 // Check that we don't have the same operands. There is no need to reorder
3262 // if the operands are just a perfect diamond or a shuffled diamond match.
3263 // Skip this only for possible broadcasts or a non-power-of-2 number of
3264 // scalars (just for now).
3265 auto &&SkipReordering = [this]() {
3266 SmallPtrSet<Value *, 4> UniqueValues;
3267 ArrayRef<OperandData> Op0 = OpsVec.front();
3268 for (const OperandData &Data : Op0)
3269 UniqueValues.insert(Data.V);
3271 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3272 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3273 return !UniqueValues.contains(Data.V);
3274 }))
3275 return false;
3276 }
3277 // TODO: Check if we can remove the check for a non-power-of-2 number of
3278 // scalars after full support of non-power-of-2 vectorization.
3279 return UniqueValues.size() != 2 &&
3280 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3281 UniqueValues.size());
3282 };
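      // Illustrative example: if Operand 0 across the lanes is {a, b, c, d}
      // and Operand 1 is {b, a, d, c}, both use the same four values, so the
      // node is a (shuffled) diamond match; with 4 unique power-of-2 scalars
      // the lambda returns true and reordering is skipped.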
3283
3284 // If the initial strategy fails for any of the operand indexes, then we
3285 // perform reordering again in a second pass. This helps avoid assigning
3286 // high priority to the failed strategy, and should improve reordering for
3287 // the non-failed operand indexes.
3288 for (int Pass = 0; Pass != 2; ++Pass) {
3289 // Check if there is no need to reorder the operands: they already form a
3290 // perfect or shuffled diamond match.
3291 // Need to do it to avoid extra external use cost counting for
3292 // shuffled matches, which may cause regressions.
3293 if (SkipReordering())
3294 break;
3295 // Skip the second pass if the first pass did not fail.
3296 bool StrategyFailed = false;
3297 // Mark all operand data as free to use.
3298 clearUsed();
3299 // We keep the original operand order for the FirstLane, so reorder the
3300 // rest of the lanes. We are visiting the nodes in a circular fashion,
3301 // using FirstLane as the center point and increasing the radius
3302 // distance.
3303 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3304 for (unsigned I = 0; I < NumOperands; ++I)
3305 MainAltOps[I].push_back(getData(I, FirstLane).V);
3306
3307 SmallBitVector UsedLanes(NumLanes);
3308 UsedLanes.set(FirstLane);
3309 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3310 // Visit the lane on the right and then the lane on the left.
3311 for (int Direction : {+1, -1}) {
3312 int Lane = FirstLane + Direction * Distance;
3313 if (Lane < 0 || Lane >= (int)NumLanes)
3314 continue;
3315 UsedLanes.set(Lane);
3316 int LastLane = Lane - Direction;
3317 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3318 "Out of bounds");
3319 // Look for a good match for each operand.
3320 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3321 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3322 std::optional<unsigned> BestIdx =
3323 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3324 MainAltOps[OpIdx], UsedLanes);
3325 // By not selecting a value, we allow the operands that follow to
3326 // select a better matching value. We will get a non-null value in
3327 // the next run of getBestOperand().
3328 if (BestIdx) {
3329 // Swap the current operand with the one returned by
3330 // getBestOperand().
3331 swap(OpIdx, *BestIdx, Lane);
3332 } else {
3333 // Enable the second pass.
3334 StrategyFailed = true;
3335 }
3336 // Try to get the alternate opcode and follow it during analysis.
3337 if (MainAltOps[OpIdx].size() != 2) {
3338 OperandData &AltOp = getData(OpIdx, Lane);
3339 InstructionsState OpS =
3340 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3341 if (OpS && OpS.isAltShuffle())
3342 MainAltOps[OpIdx].push_back(AltOp.V);
3343 }
3344 }
3345 }
3346 }
3347 // Skip second pass if the strategy did not fail.
3348 if (!StrategyFailed)
3349 break;
3350 }
3351 }
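  // A minimal standalone sketch (not part of the pass) of the circular lane
  // visitation order used above; the helper name is hypothetical:
  // \code
  //   SmallVector<int> circularOrder(int FirstLane, int NumLanes) {
  //     SmallVector<int> Order{FirstLane};
  //     for (int Distance = 1; Distance != NumLanes; ++Distance)
  //       for (int Direction : {+1, -1}) {
  //         int Lane = FirstLane + Direction * Distance;
  //         if (Lane >= 0 && Lane < NumLanes)
  //           Order.push_back(Lane);
  //       }
  //     return Order;
  //   }
  //   // circularOrder(1, 4) == {1, 2, 0, 3}, matching the Lane 0-3 example
  //   // in the comment above.
  // \endcode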
3352
3353#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3354 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3355 switch (RMode) {
3356 case ReorderingMode::Load:
3357 return "Load";
3358 case ReorderingMode::Opcode:
3359 return "Opcode";
3360 case ReorderingMode::Constant:
3361 return "Constant";
3362 case ReorderingMode::Splat:
3363 return "Splat";
3364 case ReorderingMode::Failed:
3365 return "Failed";
3366 }
3367 llvm_unreachable("Unimplemented Reordering Type");
3368 }
3369
3370 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3371 raw_ostream &OS) {
3372 return OS << getModeStr(RMode);
3373 }
3374
3375 /// Debug print.
3376 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3377 printMode(RMode, dbgs());
3378 }
3379
3380 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3381 return printMode(RMode, OS);
3382 }
3383
3385 const unsigned Indent = 2;
3386 unsigned Cnt = 0;
3387 for (const OperandDataVec &OpDataVec : OpsVec) {
3388 OS << "Operand " << Cnt++ << "\n";
3389 for (const OperandData &OpData : OpDataVec) {
3390 OS.indent(Indent) << "{";
3391 if (Value *V = OpData.V)
3392 OS << *V;
3393 else
3394 OS << "null";
3395 OS << ", APO:" << OpData.APO << "}\n";
3396 }
3397 OS << "\n";
3398 }
3399 return OS;
3400 }
3401
3402 /// Debug print.
3403 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3404#endif
3405 };
3406
3407 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3408 /// of the pair with the highest score, deemed to have the best chance of
3409 /// forming the root of a profitable tree to vectorize. Return std::nullopt if
3410 /// no candidate scored above LookAheadHeuristics::ScoreFail.
3411 /// \param Limit Lower limit of the cost, considered to be a good enough score.
3412 std::optional<int>
3413 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3414 int Limit = LookAheadHeuristics::ScoreFail) const {
3415 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3417 int BestScore = Limit;
3418 std::optional<int> Index;
3419 for (int I : seq<int>(0, Candidates.size())) {
3420 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3421 Candidates[I].second,
3422 /*U1=*/nullptr, /*U2=*/nullptr,
3423 /*CurrLevel=*/1, {});
3424 if (Score > BestScore) {
3425 BestScore = Score;
3426 Index = I;
3427 }
3428 }
3429 return Index;
3430 }
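  // Hypothetical usage sketch (names below are illustrative, not from this
  // file):
  // \code
  //   SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  //   // ... fill Candidates with potential root pairs ...
  //   if (std::optional<int> BestIdx = R.findBestRootPair(Candidates))
  //     tryToBuildTreeFor(Candidates[*BestIdx]); // hypothetical follow-up
  // \endcode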
3431
3432 /// Checks if the instruction is marked for deletion.
3433 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3434
3435 /// Removes an instruction from its block and eventually deletes it.
3436 /// It's like Instruction::eraseFromParent() except that the actual deletion
3437 /// is delayed until BoUpSLP is destructed.
3439 DeletedInstructions.insert(I);
3440 }
3441
3442 /// Remove instructions from the parent function and clear the operands of \p
3443 /// DeadVals instructions, marking trivially dead operands for deletion.
3444 template <typename T>
3446 ArrayRef<T *> DeadVals,
3447 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3449 for (T *V : DeadVals) {
3450 auto *I = cast<Instruction>(V);
3452 }
3453 DenseSet<Value *> Processed;
3454 for (T *V : DeadVals) {
3455 if (!V || !Processed.insert(V).second)
3456 continue;
3457 auto *I = cast<Instruction>(V);
3459 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3460 for (Use &U : I->operands()) {
3461 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3462 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3464 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3465 return Entry->VectorizedValue == OpI;
3466 })))
3467 DeadInsts.push_back(OpI);
3468 }
3469 I->dropAllReferences();
3470 }
3471 for (T *V : DeadVals) {
3472 auto *I = cast<Instruction>(V);
3473 if (!I->getParent())
3474 continue;
3475 assert((I->use_empty() || all_of(I->uses(),
3476 [&](Use &U) {
3477 return isDeleted(
3478 cast<Instruction>(U.getUser()));
3479 })) &&
3480 "trying to erase instruction with users.");
3481 I->removeFromParent();
3482 SE->forgetValue(I);
3483 }
3484 // Process the dead instruction list until empty.
3485 while (!DeadInsts.empty()) {
3486 Value *V = DeadInsts.pop_back_val();
3487 Instruction *VI = cast_or_null<Instruction>(V);
3488 if (!VI || !VI->getParent())
3489 continue;
3491 "Live instruction found in dead worklist!");
3492 assert(VI->use_empty() && "Instructions with uses are not dead.");
3493
3494 // Don't lose the debug info while deleting the instructions.
3495 salvageDebugInfo(*VI);
3496
3497 // Null out all of the instruction's operands to see if any operand
3498 // becomes dead as we go.
3499 for (Use &OpU : VI->operands()) {
3500 Value *OpV = OpU.get();
3501 if (!OpV)
3502 continue;
3503 OpU.set(nullptr);
3504
3505 if (!OpV->use_empty())
3506 continue;
3507
3508 // If the operand is an instruction that became dead as we nulled out
3509 // the operand, and if it is 'trivially' dead, delete it in a future
3510 // loop iteration.
3511 if (auto *OpI = dyn_cast<Instruction>(OpV))
3512 if (!DeletedInstructions.contains(OpI) &&
3513 (!OpI->getType()->isVectorTy() ||
3514 none_of(VectorValuesAndScales,
3515 [&](const std::tuple<Value *, unsigned, bool> &V) {
3516 return std::get<0>(V) == OpI;
3517 })) &&
3519 DeadInsts.push_back(OpI);
3520 }
3521
3522 VI->removeFromParent();
3523 eraseInstruction(VI);
3524 SE->forgetValue(VI);
3525 }
3526 }
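  // The cascade above is a deletion-delayed variant of the usual
  // trivially-dead worklist pattern (cf. RecursivelyDeleteTriviallyDeadInstructions
  // in llvm/Transforms/Utils/Local.h). A simplified sketch of that pattern:
  // \code
  //   SmallVector<Instruction *> Worklist; // seeded with known-dead roots
  //   while (!Worklist.empty()) {
  //     Instruction *I = Worklist.pop_back_val();
  //     for (Use &U : I->operands()) {
  //       auto *Op = dyn_cast_or_null<Instruction>(U.get());
  //       U.set(nullptr);
  //       if (Op && Op->use_empty() && isInstructionTriviallyDead(Op, TLI))
  //         Worklist.push_back(Op);
  //     }
  //     I->eraseFromParent();
  //   }
  // \endcode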
3527
3528 /// Checks if the instruction was already analyzed for being a possible
3529 /// reduction root.
3531 return AnalyzedReductionsRoots.count(I);
3532 }
3533 /// Register the given instruction as already analyzed for being a possible
3534 /// reduction root.
3536 AnalyzedReductionsRoots.insert(I);
3537 }
3538 /// Checks if the provided list of reduced values was checked already for
3539 /// vectorization.
3541 return AnalyzedReductionVals.contains(hash_value(VL));
3542 }
3543 /// Adds the list of reduced values to the list of values already checked for
3544 /// vectorization.
3546 AnalyzedReductionVals.insert(hash_value(VL));
3547 }
3548 /// Clear the list of the analyzed reduction root instructions.
3550 AnalyzedReductionsRoots.clear();
3551 AnalyzedReductionVals.clear();
3552 AnalyzedMinBWVals.clear();
3553 }
3554 /// Checks if the given value is gathered in one of the nodes.
3555 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3556 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3557 }
3558 /// Checks if the given value is gathered in one of the nodes.
3559 bool isGathered(const Value *V) const {
3560 return MustGather.contains(V);
3561 }
3562 /// Checks if the specified value was not scheduled.
3563 bool isNotScheduled(const Value *V) const {
3564 return NonScheduledFirst.contains(V);
3565 }
3566
3567 /// Check if the value is vectorized in the tree.
3568 bool isVectorized(const Value *V) const {
3569 assert(V && "V cannot be nullptr.");
3570 return ScalarToTreeEntries.contains(V);
3571 }
3572
3573 ~BoUpSLP();
3574
3575private:
3576 /// Determine if a node \p E can be demoted to a smaller type with a
3577 /// truncation. We collect the entries that will be demoted in ToDemote.
3578 /// \param E Node for analysis
3579 /// \param ToDemote indices of the nodes to be demoted.
3580 bool collectValuesToDemote(
3581 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3583 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3584 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3585
3586 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3587 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3588 /// they have only one user and are reorderable).
3589 /// \param ReorderableGathers List of all gather nodes that require reordering
3590 /// (e.g., gather of extractelements or partially vectorizable loads).
3591 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3592 /// reordering, subset of \p NonVectorized.
3593 void buildReorderableOperands(
3594 TreeEntry *UserTE,
3595 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3596 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3597 SmallVectorImpl<TreeEntry *> &GatherOps);
3598
3599 /// Checks if the given \p TE is a gather node with clustered reused scalars
3600 /// and reorders it per given \p Mask.
3601 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3602
3603 /// Checks if all users of \p I are the part of the vectorization tree.
3604 bool areAllUsersVectorized(
3605 Instruction *I,
3606 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3607
3608 /// Return information about the vector formed for the specified index
3609 /// of a vector of (the same) instruction.
3611
3612 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3613 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3614 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3615 return const_cast<TreeEntry *>(
3616 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3617 }
3618
3619 /// Gets the root instruction for the given node. If the node is a strided
3620 /// load/store node with the reverse order, the root instruction is the last
3621 /// one.
3622 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3623
3624 /// \returns Cast context for the given graph node.
3626 getCastContextHint(const TreeEntry &TE) const;
3627
3628 /// \returns the cost of the vectorizable entry.
3629 InstructionCost getEntryCost(const TreeEntry *E,
3630 ArrayRef<Value *> VectorizedVals,
3631 SmallPtrSetImpl<Value *> &CheckedExtracts);
3632
3633 /// Checks if it is legal and profitable to build a SplitVectorize node for the
3634 /// given \p VL.
3635 /// \param Op1 first homogeneous scalars.
3636 /// \param Op2 second homogeneous scalars.
3637 /// \param ReorderIndices indices to reorder the scalars.
3638 /// \returns true if the node was successfully built.
3639 bool canBuildSplitNode(ArrayRef<Value *> VL,
3640 const InstructionsState &LocalState,
3643 OrdersType &ReorderIndices) const;
3644
3645 /// This is the recursive part of buildTree.
3646 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3647 unsigned InterleaveFactor = 0);
3648
3649 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3650 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3651 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3652 /// returns false, setting \p CurrentOrder to either an empty vector or a
3653 /// non-identity permutation that allows reusing the extract instructions.
3654 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3655 /// extract order.
3656 bool canReuseExtract(ArrayRef<Value *> VL,
3657 SmallVectorImpl<unsigned> &CurrentOrder,
3658 bool ResizeAllowed = false) const;
3659
3660 /// Vectorize a single entry in the tree.
3661 Value *vectorizeTree(TreeEntry *E);
3662
3663 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3664 /// \p E.
3665 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3666
3667 /// Create a new vector from a list of scalar values. Produces a sequence
3668 /// which exploits values reused across lanes, and arranges the inserts
3669 /// for ease of later optimization.
3670 template <typename BVTy, typename ResTy, typename... Args>
3671 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3672
3673 /// Create a new vector from a list of scalar values. Produces a sequence
3674 /// which exploits values reused across lanes, and arranges the inserts
3675 /// for ease of later optimization.
3676 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3677
3678 /// Returns the instruction in the bundle, which can be used as a base point
3679 /// for scheduling. Usually it is the last instruction in the bundle, except
3680 /// for the case when all operands are external (in this case, it is the first
3681 /// instruction in the list).
3682 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3683
3684 /// Tries to find extractelement instructions with constant indices from fixed
3685 /// vector type and gather such instructions into a bunch, which most likely
3686 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt
3687 /// was successful, the matched scalars are replaced by poison values in \p VL
3688 /// for future analysis.
3689 std::optional<TargetTransformInfo::ShuffleKind>
3690 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3691 SmallVectorImpl<int> &Mask) const;
3692
3693 /// Tries to find extractelement instructions with constant indices from fixed
3694 /// vector type and gather such instructions into a bunch, which most likely
3695 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt
3696 /// was successful, the matched scalars are replaced by poison values in \p VL
3697 /// for future analysis.
3699 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3701 unsigned NumParts) const;
3702
3703 /// Checks if the gathered \p VL can be represented as a single register
3704 /// shuffle(s) of previous tree entries.
3705 /// \param TE Tree entry checked for permutation.
3706 /// \param VL List of scalars (a subset of the TE scalars), checked for
3707 /// permutations. Must form single-register vector.
3708 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3709 /// commands to build the mask using the original vector value, without
3710 /// relying on the potential reordering.
3711 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3712 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3713 std::optional<TargetTransformInfo::ShuffleKind>
3714 isGatherShuffledSingleRegisterEntry(
3715 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3716 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3717 bool ForOrder);
3718
3719 /// Checks if the gathered \p VL can be represented as multi-register
3720 /// shuffle(s) of previous tree entries.
3721 /// \param TE Tree entry checked for permutation.
3722 /// \param VL List of scalars (a subset of the TE scalars), checked for
3723 /// permutations.
3724 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3725 /// commands to build the mask using the original vector value, without
3726 /// relying on the potential reordering.
3727 /// \returns per-register series of ShuffleKind, if gathered values can be
3728 /// represented as shuffles of previous tree entries. \p Mask is filled with
3729 /// the shuffle mask (also on per-register base).
3731 isGatherShuffledEntry(
3732 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3734 unsigned NumParts, bool ForOrder = false);
3735
3736 /// \returns the cost of gathering (inserting) the values in \p VL into a
3737 /// vector.
3738 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3739 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3740 Type *ScalarTy) const;
3741
3742 /// Set the Builder insert point to one after the last instruction in
3743 /// the bundle
3744 void setInsertPointAfterBundle(const TreeEntry *E);
3745
3746 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3747 /// specified, the starting vector value is poison.
3748 Value *
3749 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3750 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3751
3752 /// \returns whether the VectorizableTree is fully vectorizable and will
3753 /// be beneficial even if the tree height is tiny.
3754 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3755
3756 /// Run through the list of all gathered loads in the graph and try to find
3757 /// vector loads/masked gathers instead of regular gathers. Later these loads
3758 /// are reshuffled to build the final gathered nodes.
3759 void tryToVectorizeGatheredLoads(
3760 const SmallMapVector<
3761 std::tuple<BasicBlock *, Value *, Type *>,
3762 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3763 &GatheredLoads);
3764
3765 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3766 /// users of \p TE and collects the stores. It returns the map from the store
3767 /// pointers to the collected stores.
3769 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3770
3771 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3772 /// stores in \p StoresVec can form a vector instruction. If so it returns
3773 /// true and populates \p ReorderIndices with the shuffle indices of the
3774 /// stores when compared to the sorted vector.
3775 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3776 OrdersType &ReorderIndices) const;
3777
3778 /// Iterates through the users of \p TE, looking for scalar stores that can be
3779 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3780 /// their order and builds an order index vector for each store bundle. It
3781 /// returns all these order vectors found.
3782 /// We run this after the tree has formed, otherwise we may come across user
3783 /// instructions that are not yet in the tree.
3785 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3786
3787 /// Tries to reorder the gathering node for better vectorization
3788 /// opportunities.
3789 void reorderGatherNode(TreeEntry &TE);
3790
3791 class TreeEntry {
3792 public:
3793 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3794 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3795
3796 /// \returns Common mask for reorder indices and reused scalars.
3797 SmallVector<int> getCommonMask() const {
3798 if (State == TreeEntry::SplitVectorize)
3799 return {};
3801 inversePermutation(ReorderIndices, Mask);
3802 ::addMask(Mask, ReuseShuffleIndices);
3803 return Mask;
3804 }
3805
3806 /// \returns The mask for split nodes.
3807 SmallVector<int> getSplitMask() const {
3808 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3809 "Expected only split vectorize node.");
3810 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3811 unsigned CommonVF = std::max<unsigned>(
3812 CombinedEntriesWithIndices.back().second,
3813 Scalars.size() - CombinedEntriesWithIndices.back().second);
3814 for (auto [Idx, I] : enumerate(ReorderIndices))
3815 Mask[I] =
3816 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3817 ? CommonVF - CombinedEntriesWithIndices.back().second
3818 : 0);
3819 return Mask;
3820 }
3821
3822 /// Updates (reorders) SplitVectorize node according to the given mask \p
3823 /// Mask and order \p MaskOrder.
3824 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3825 ArrayRef<int> MaskOrder);
3826
3827 /// \returns true if the scalars in VL are equal to this entry.
3828 bool isSame(ArrayRef<Value *> VL) const {
3829 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3830 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3831 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3832 return VL.size() == Mask.size() &&
3833 std::equal(VL.begin(), VL.end(), Mask.begin(),
3834 [Scalars](Value *V, int Idx) {
3835 return (isa<UndefValue>(V) &&
3836 Idx == PoisonMaskElem) ||
3837 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3838 });
3839 };
3840 if (!ReorderIndices.empty()) {
3841 // TODO: implement matching if the nodes are just reordered, still can
3842 // treat the vector as the same if the list of scalars matches VL
3843 // directly, without reordering.
3845 inversePermutation(ReorderIndices, Mask);
3846 if (VL.size() == Scalars.size())
3847 return IsSame(Scalars, Mask);
3848 if (VL.size() == ReuseShuffleIndices.size()) {
3849 ::addMask(Mask, ReuseShuffleIndices);
3850 return IsSame(Scalars, Mask);
3851 }
3852 return false;
3853 }
3854 return IsSame(Scalars, ReuseShuffleIndices);
3855 }
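    // Illustrative example: with Scalars = {a, b, c} and ReorderIndices =
    // {2, 0, 1}, inversePermutation (assumed to build Mask[ReorderIndices[I]]
    // = I, as elsewhere in this file) yields Mask = {1, 2, 0}, so
    // isSame({b, c, a}) compares each VL[I] against Scalars[Mask[I]] and
    // returns true.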
3856
3857 /// \returns true if current entry has same operands as \p TE.
3858 bool hasEqualOperands(const TreeEntry &TE) const {
3859 if (TE.getNumOperands() != getNumOperands())
3860 return false;
3861 SmallBitVector Used(getNumOperands());
3862 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3863 unsigned PrevCount = Used.count();
3864 for (unsigned K = 0; K < E; ++K) {
3865 if (Used.test(K))
3866 continue;
3867 if (getOperand(K) == TE.getOperand(I)) {
3868 Used.set(K);
3869 break;
3870 }
3871 }
3872 // Check if we actually found the matching operand.
3873 if (PrevCount == Used.count())
3874 return false;
3875 }
3876 return true;
3877 }
3878
3879 /// \return Final vectorization factor for the node. Defined by the total
3880 /// number of vectorized scalars, including those used several times in the
3881 /// entry and counted in the \a ReuseShuffleIndices, if any.
3882 unsigned getVectorFactor() const {
3883 if (!ReuseShuffleIndices.empty())
3884 return ReuseShuffleIndices.size();
3885 return Scalars.size();
3886 };
3887
3888 /// Checks if the current node is a gather node.
3889 bool isGather() const { return State == NeedToGather; }
3890
3891 /// A vector of scalars.
3892 ValueList Scalars;
3893
3894 /// The Scalars are vectorized into this value. It is initialized to Null.
3895 WeakTrackingVH VectorizedValue = nullptr;
3896
3897 /// Do we need to gather this sequence or vectorize it
3898 /// (either with vector instruction or with scatter/gather
3899 /// intrinsics for store/load)?
3900 enum EntryState {
3901 Vectorize, ///< The node is regularly vectorized.
3902 ScatterVectorize, ///< Masked scatter/gather node.
3903 StridedVectorize, ///< Strided loads (and stores)
3904 CompressVectorize, ///< (Masked) load with compress.
3905 NeedToGather, ///< Gather/buildvector node.
3906 CombinedVectorize, ///< Vectorized node, combined with its user into more
3907 ///< complex node like select/cmp to minmax, mul/add to
3908 ///< fma, etc. Must be used for the following nodes in
3909 ///< the pattern, not the very first one.
3910 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3911 ///< independently and then combines back.
3912 };
3913 EntryState State;
3914
3915 /// List of combined opcodes supported by the vectorizer.
3916 enum CombinedOpcode {
3917 NotCombinedOp = -1,
3918 MinMax = Instruction::OtherOpsEnd + 1,
3919 FMulAdd,
3920 };
3921 CombinedOpcode CombinedOp = NotCombinedOp;
3922
3923 /// Does this sequence require some shuffling?
3924 SmallVector<int, 4> ReuseShuffleIndices;
3925
3926 /// Does this entry require reordering?
3927 SmallVector<unsigned, 4> ReorderIndices;
3928
3929 /// Points back to the VectorizableTree.
3930 ///
3931 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3932 /// to be a pointer and needs to be able to initialize the child iterator.
3933 /// Thus we need a reference back to the container to translate the indices
3934 /// to entries.
3935 VecTreeTy &Container;
3936
3937 /// The TreeEntry index containing the user of this entry.
3938 EdgeInfo UserTreeIndex;
3939
3940 /// The index of this treeEntry in VectorizableTree.
3941 unsigned Idx = 0;
3942
3943 /// For gather/buildvector/alt opcode nodes, which are combined from
3944 /// other nodes as a series of insertvector instructions.
3945 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3946
3947 private:
3948 /// The operands of each instruction in each lane Operands[op_index][lane].
3949 /// Note: This helps avoid the replication of the code that performs the
3950 /// reordering of operands during buildTreeRec() and vectorizeTree().
3952
3953 /// Copyable elements of the entry node.
3954 SmallPtrSet<const Value *, 4> CopyableElements;
3955
3956 /// MainOp and AltOp are recorded inside. S should be obtained from
3957 /// newTreeEntry.
3958 InstructionsState S = InstructionsState::invalid();
3959
3960 /// Interleaving factor for interleaved loads Vectorize nodes.
3961 unsigned InterleaveFactor = 0;
3962
3963 /// True if the node does not require scheduling.
3964 bool DoesNotNeedToSchedule = false;
3965
3966 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3967 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3968 if (Operands.size() < OpIdx + 1)
3969 Operands.resize(OpIdx + 1);
3970 assert(Operands[OpIdx].empty() && "Already resized?");
3971 assert(OpVL.size() <= Scalars.size() &&
3972 "Number of operands is greater than the number of scalars.");
3973 Operands[OpIdx].resize(OpVL.size());
3974 copy(OpVL, Operands[OpIdx].begin());
3975 }
3976
3977 public:
3978 /// Returns interleave factor for interleave nodes.
3979 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3980 /// Sets interleaving factor for the interleaving nodes.
3981 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3982
3983 /// Marks the node as one that does not require scheduling.
3984 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
3985 /// Returns true if the node is marked as one that does not require
3986 /// scheduling.
3987 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
3988
3989 /// Set this bundle's operands from \p Operands.
3990 void setOperands(ArrayRef<ValueList> Operands) {
3991 for (unsigned I : seq<unsigned>(Operands.size()))
3992 setOperand(I, Operands[I]);
3993 }
3994
3995 /// Reorders operands of the node to the given mask \p Mask.
3996 void reorderOperands(ArrayRef<int> Mask) {
3997 for (ValueList &Operand : Operands)
3998 reorderScalars(Operand, Mask);
3999 }
4000
4001 /// \returns the \p OpIdx operand of this TreeEntry.
4002 ValueList &getOperand(unsigned OpIdx) {
4003 assert(OpIdx < Operands.size() && "Off bounds");
4004 return Operands[OpIdx];
4005 }
4006
4007 /// \returns the \p OpIdx operand of this TreeEntry.
4008 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4009 assert(OpIdx < Operands.size() && "Off bounds");
4010 return Operands[OpIdx];
4011 }
4012
4013 /// \returns the number of operands.
4014 unsigned getNumOperands() const { return Operands.size(); }
4015
4016 /// \return the single \p OpIdx operand.
4017 Value *getSingleOperand(unsigned OpIdx) const {
4018 assert(OpIdx < Operands.size() && "Off bounds");
4019 assert(!Operands[OpIdx].empty() && "No operand available");
4020 return Operands[OpIdx][0];
4021 }
4022
4023 /// Some of the instructions in the list have alternate opcodes.
4024 bool isAltShuffle() const { return S.isAltShuffle(); }
4025
4026 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4027 return S.getMatchingMainOpOrAltOp(I);
4028 }
4029
4030 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4031 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
4032 /// key is the main operation of this entry.
4033 Value *isOneOf(Value *Op) const {
4034 auto *I = dyn_cast<Instruction>(Op);
4035 if (I && getMatchingMainOpOrAltOp(I))
4036 return Op;
4037 return S.getMainOp();
4038 }
4039
4040 void setOperations(const InstructionsState &S) {
4041 assert(S && "InstructionsState is invalid.");
4042 this->S = S;
4043 }
4044
4045 Instruction *getMainOp() const { return S.getMainOp(); }
4046
4047 Instruction *getAltOp() const { return S.getAltOp(); }
4048
4049 /// The main/alternate opcodes for the list of instructions.
4050 unsigned getOpcode() const { return S.getOpcode(); }
4051
4052 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4053
4054 bool hasState() const { return S.valid(); }
4055
4056 /// Add \p V to the list of copyable elements.
4057 void addCopyableElement(Value *V) {
4058 assert(S.isCopyableElement(V) && "Not a copyable element.");
4059 CopyableElements.insert(V);
4060 }
4061
4062 /// Returns true if \p V is a copyable element.
4063 bool isCopyableElement(Value *V) const {
4064 return CopyableElements.contains(V);
4065 }
4066
4067 /// Returns true if any scalar in the list is a copyable element.
4068 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4069
4070 /// Returns the state of the operations.
4071 const InstructionsState &getOperations() const { return S; }
4072
4073 /// When ReuseShuffleIndices is empty it just returns the position of \p V
4074 /// within the vector of Scalars. Otherwise, tries to remap to its reuse index.
4075 unsigned findLaneForValue(Value *V) const {
4076 unsigned FoundLane = getVectorFactor();
4077 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4078 std::advance(It, 1)) {
4079 if (*It != V)
4080 continue;
4081 FoundLane = std::distance(Scalars.begin(), It);
4082 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4083 if (!ReorderIndices.empty())
4084 FoundLane = ReorderIndices[FoundLane];
4085 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4086 if (ReuseShuffleIndices.empty())
4087 break;
4088 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4089 RIt != ReuseShuffleIndices.end()) {
4090 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4091 break;
4092 }
4093 }
4094 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4095 return FoundLane;
4096 }
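    // Illustrative example: with Scalars = {a, b}, ReorderIndices = {1, 0} and
    // ReuseShuffleIndices = {1, 0, 1, 0}, findLaneForValue(b) first finds b at
    // position 1, remaps it through ReorderIndices to 0, and finally returns 1,
    // the first position where 0 appears in ReuseShuffleIndices.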
4097
4098 /// Build a shuffle mask for graph entry which represents a merge of main
4099 /// and alternate operations.
4100 void
4101 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4103 SmallVectorImpl<Value *> *OpScalars = nullptr,
4104 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4105
4106 /// Return true if this is a non-power-of-2 node.
4107 bool isNonPowOf2Vec() const {
4108 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4109 return IsNonPowerOf2;
4110 }
4111
4112 /// Return true if the number of elements in this node neither forms whole
4113 /// vector registers nor is a power of 2.
4114 bool
4115 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4116 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4117 TTI, getValueType(Scalars.front()), Scalars.size());
4118 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4119 "Reshuffling not supported with non-power-of-2 vectors yet.");
4120 return IsNonPowerOf2;
4121 }
4122
4123 Value *getOrdered(unsigned Idx) const {
4124 assert(isGather() && "Must be used only for buildvectors/gathers.");
4125 if (ReorderIndices.empty())
4126 return Scalars[Idx];
4128 inversePermutation(ReorderIndices, Mask);
4129 return Scalars[Mask[Idx]];
4130 }
4131
4132#ifndef NDEBUG
4133 /// Debug printer.
4134 LLVM_DUMP_METHOD void dump() const {
4135 dbgs() << Idx << ".\n";
4136 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4137 dbgs() << "Operand " << OpI << ":\n";
4138 for (const Value *V : Operands[OpI])
4139 dbgs().indent(2) << *V << "\n";
4140 }
4141 dbgs() << "Scalars: \n";
4142 for (Value *V : Scalars)
4143 dbgs().indent(2) << *V << "\n";
4144 dbgs() << "State: ";
4145 if (S && hasCopyableElements())
4146 dbgs() << "[[Copyable]] ";
4147 switch (State) {
4148 case Vectorize:
4149 if (InterleaveFactor > 0) {
4150 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4151 << "\n";
4152 } else {
4153 dbgs() << "Vectorize\n";
4154 }
4155 break;
4156 case ScatterVectorize:
4157 dbgs() << "ScatterVectorize\n";
4158 break;
4159 case StridedVectorize:
4160 dbgs() << "StridedVectorize\n";
4161 break;
4162 case CompressVectorize:
4163 dbgs() << "CompressVectorize\n";
4164 break;
4165 case NeedToGather:
4166 dbgs() << "NeedToGather\n";
4167 break;
4168 case CombinedVectorize:
4169 dbgs() << "CombinedVectorize\n";
4170 break;
4171 case SplitVectorize:
4172 dbgs() << "SplitVectorize\n";
4173 break;
4174 }
4175 if (S) {
4176 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4177 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4178 } else {
4179 dbgs() << "MainOp: NULL\n";
4180 dbgs() << "AltOp: NULL\n";
4181 }
4182 dbgs() << "VectorizedValue: ";
4183 if (VectorizedValue)
4184 dbgs() << *VectorizedValue << "\n";
4185 else
4186 dbgs() << "NULL\n";
4187 dbgs() << "ReuseShuffleIndices: ";
4188 if (ReuseShuffleIndices.empty())
4189 dbgs() << "Empty";
4190 else
4191 for (int ReuseIdx : ReuseShuffleIndices)
4192 dbgs() << ReuseIdx << ", ";
4193 dbgs() << "\n";
4194 dbgs() << "ReorderIndices: ";
4195 for (unsigned ReorderIdx : ReorderIndices)
4196 dbgs() << ReorderIdx << ", ";
4197 dbgs() << "\n";
4198 dbgs() << "UserTreeIndex: ";
4199 if (UserTreeIndex)
4200 dbgs() << UserTreeIndex;
4201 else
4202 dbgs() << "<invalid>";
4203 dbgs() << "\n";
4204 if (!CombinedEntriesWithIndices.empty()) {
4205 dbgs() << "Combined entries: ";
4206 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4207 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4208 });
4209 dbgs() << "\n";
4210 }
4211 }
4212#endif
4213 };
4214
4215#ifndef NDEBUG
4216 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4217 InstructionCost VecCost, InstructionCost ScalarCost,
4218 StringRef Banner) const {
4219 dbgs() << "SLP: " << Banner << ":\n";
4220 E->dump();
4221 dbgs() << "SLP: Costs:\n";
4222 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4223 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4224 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4225 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4226 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4227 }
4228#endif
4229
4230 /// Create a new gather TreeEntry
4231 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4232 const InstructionsState &S,
4233 const EdgeInfo &UserTreeIdx,
4234 ArrayRef<int> ReuseShuffleIndices = {}) {
4235 auto Invalid = ScheduleBundle::invalid();
4236 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4237 }
4238
4239 /// Create a new VectorizableTree entry.
4240 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4241 const InstructionsState &S,
4242 const EdgeInfo &UserTreeIdx,
4243 ArrayRef<int> ReuseShuffleIndices = {},
4244 ArrayRef<unsigned> ReorderIndices = {},
4245 unsigned InterleaveFactor = 0) {
4246 TreeEntry::EntryState EntryState =
4247 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4248 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4249 ReuseShuffleIndices, ReorderIndices);
4250 if (E && InterleaveFactor > 0)
4251 E->setInterleave(InterleaveFactor);
4252 return E;
4253 }
4254
4255 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4256 TreeEntry::EntryState EntryState,
4257 ScheduleBundle &Bundle, const InstructionsState &S,
4258 const EdgeInfo &UserTreeIdx,
4259 ArrayRef<int> ReuseShuffleIndices = {},
4260 ArrayRef<unsigned> ReorderIndices = {}) {
4261 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4262 EntryState == TreeEntry::SplitVectorize)) ||
4263 (Bundle && EntryState != TreeEntry::NeedToGather &&
4264 EntryState != TreeEntry::SplitVectorize)) &&
4265 "Need to vectorize gather entry?");
4266 // Gathered loads still gathered? Do not create entry, use the original one.
4267 if (GatheredLoadsEntriesFirst.has_value() &&
4268 EntryState == TreeEntry::NeedToGather && S &&
4269 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4270 !UserTreeIdx.UserTE)
4271 return nullptr;
4272 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4273 TreeEntry *Last = VectorizableTree.back().get();
4274 Last->Idx = VectorizableTree.size() - 1;
4275 Last->State = EntryState;
4276 if (UserTreeIdx.UserTE)
4277 OperandsToTreeEntry.try_emplace(
4278 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4279 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4280 // for non-power-of-two vectors.
4281 assert(
4283 ReuseShuffleIndices.empty()) &&
4284 "Reshuffling scalars not yet supported for nodes with padding");
4285 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4286 ReuseShuffleIndices.end());
4287 if (ReorderIndices.empty()) {
4288 Last->Scalars.assign(VL.begin(), VL.end());
4289 if (S)
4290 Last->setOperations(S);
4291 } else {
4292 // Reorder scalars and build final mask.
4293 Last->Scalars.assign(VL.size(), nullptr);
4294 transform(ReorderIndices, Last->Scalars.begin(),
4295 [VL](unsigned Idx) -> Value * {
4296 if (Idx >= VL.size())
4297 return UndefValue::get(VL.front()->getType());
4298 return VL[Idx];
4299 });
4300 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4301 if (S)
4302 Last->setOperations(S);
4303 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4304 }
4305 if (EntryState == TreeEntry::SplitVectorize) {
4306 assert(S && "Split nodes must have operations.");
4307 Last->setOperations(S);
4308 SmallPtrSet<Value *, 4> Processed;
4309 for (Value *V : VL) {
4310 auto *I = dyn_cast<Instruction>(V);
4311 if (!I)
4312 continue;
4313 auto It = ScalarsInSplitNodes.find(V);
4314 if (It == ScalarsInSplitNodes.end()) {
4315 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4316 (void)Processed.insert(V);
4317 } else if (Processed.insert(V).second) {
4318 assert(!is_contained(It->getSecond(), Last) &&
4319 "Value already associated with the node.");
4320 It->getSecond().push_back(Last);
4321 }
4322 }
4323 } else if (!Last->isGather()) {
4324 if (isa<PHINode>(S.getMainOp()) ||
4325 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4326 (!S.areInstructionsWithCopyableElements() &&
4327 doesNotNeedToSchedule(VL)) ||
4328 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4329 Last->setDoesNotNeedToSchedule();
4330 SmallPtrSet<Value *, 4> Processed;
4331 for (Value *V : VL) {
4332 if (isa<PoisonValue>(V))
4333 continue;
4334 if (S.isCopyableElement(V)) {
4335 Last->addCopyableElement(V);
4336 continue;
4337 }
4338 auto It = ScalarToTreeEntries.find(V);
4339 if (It == ScalarToTreeEntries.end()) {
4340 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4341 (void)Processed.insert(V);
4342 } else if (Processed.insert(V).second) {
4343 assert(!is_contained(It->getSecond(), Last) &&
4344 "Value already associated with the node.");
4345 It->getSecond().push_back(Last);
4346 }
4347 }
4348 // Update the scheduler bundle to point to this TreeEntry.
4349 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4350 "Bundle and VL out of sync");
4351 if (!Bundle.getBundle().empty()) {
4352#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4353 auto *BundleMember = Bundle.getBundle().begin();
4354 SmallPtrSet<Value *, 4> Processed;
4355 for (Value *V : VL) {
4356 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4357 continue;
4358 ++BundleMember;
4359 }
4360 assert(BundleMember == Bundle.getBundle().end() &&
4361 "Bundle and VL out of sync");
4362#endif
4363 Bundle.setTreeEntry(Last);
4364 }
4365 } else {
4366 // Build a map for gathered scalars to the nodes where they are used.
4367 bool AllConstsOrCasts = true;
4368 for (Value *V : VL) {
4369 if (S && S.areInstructionsWithCopyableElements() &&
4370 S.isCopyableElement(V))
4371 Last->addCopyableElement(V);
4372 if (!isConstant(V)) {
4373 auto *I = dyn_cast<CastInst>(V);
4374 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4375 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4376 !UserTreeIdx.UserTE->isGather())
4377 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4378 }
4379 }
4380 if (AllConstsOrCasts)
4381 CastMaxMinBWSizes =
4382 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4383 MustGather.insert_range(VL);
4384 }
4385
4386 if (UserTreeIdx.UserTE)
4387 Last->UserTreeIndex = UserTreeIdx;
4388 return Last;
4389 }
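  // Illustrative example of the ReorderIndices handling above: for
  // VL = {a, b, c, d} and ReorderIndices = {1, 0, 3, 2}, the new entry gets
  // Scalars = {b, a, d, c} (with UndefValue padding for any index >= VL.size()),
  // and the InstructionsState is recomputed for the reordered scalars.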
4390
4391 /// -- Vectorization State --
4392 /// Holds all of the tree entries.
4393 TreeEntry::VecTreeTy VectorizableTree;
4394
4395#ifndef NDEBUG
4396 /// Debug printer.
4397 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4398 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4399 VectorizableTree[Id]->dump();
4400 dbgs() << "\n";
4401 }
4402 }
4403#endif
4404
4405 /// Get list of vector entries, associated with the value \p V.
4406 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4407 assert(V && "V cannot be nullptr.");
4408 auto It = ScalarToTreeEntries.find(V);
4409 if (It == ScalarToTreeEntries.end())
4410 return {};
4411 return It->getSecond();
4412 }
4413
4414 /// Get list of split vector entries, associated with the value \p V.
4415 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4416 assert(V && "V cannot be nullptr.");
4417 auto It = ScalarsInSplitNodes.find(V);
4418 if (It == ScalarsInSplitNodes.end())
4419 return {};
4420 return It->getSecond();
4421 }
4422
4423 /// Returns first vector node for value \p V, matching values \p VL.
4424 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4425 bool SameVF = false) const {
4426 assert(V && "V cannot be nullptr.");
4427 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4428 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4429 return TE;
4430 return nullptr;
4431 }
4432
4433 /// Check that the operand node of the alternate node does not generate a
4434 /// buildvector sequence. If it does, it is probably not worth building an
4435 /// alternate shuffle when the number of buildvector operands plus the
4436 /// alternate instruction exceeds the number of buildvector instructions.
4437 /// \param S the instructions state of the analyzed values.
4438 /// \param VL list of the instructions with alternate opcodes.
4439 bool areAltOperandsProfitable(const InstructionsState &S,
4440 ArrayRef<Value *> VL) const;
4441
4442 /// Contains all the outputs of legality analysis for a list of values to
4443 /// vectorize.
4444 class ScalarsVectorizationLegality {
4445 InstructionsState S;
4446 bool IsLegal;
4447 bool TryToFindDuplicates;
4448 bool TrySplitVectorize;
4449
4450 public:
4451 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4452 bool TryToFindDuplicates = true,
4453 bool TrySplitVectorize = false)
4454 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4455 TrySplitVectorize(TrySplitVectorize) {
4456 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4457 "Inconsistent state");
4458 }
4459 const InstructionsState &getInstructionsState() const { return S; };
4460 bool isLegal() const { return IsLegal; }
4461 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4462 bool trySplitVectorize() const { return TrySplitVectorize; }
4463 };
4464
4465 /// Checks if the specified list of the instructions/values can be vectorized
4466 /// in general.
4467 ScalarsVectorizationLegality
4468 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4469 const EdgeInfo &UserTreeIdx,
4470 bool TryCopyableElementsVectorization) const;
4471
4472 /// Checks if the specified list of the instructions/values can be vectorized
4473 /// and fills required data before actual scheduling of the instructions.
4474 TreeEntry::EntryState
4475 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
4476 bool IsScatterVectorizeUserTE,
4477 OrdersType &CurrentOrder,
4478 SmallVectorImpl<Value *> &PointerOps);
4479
4480 /// Maps a specific scalar to its tree entry(ies).
4482
4483 /// Maps the operand index and entry to the corresponding tree entry.
4485 OperandsToTreeEntry;
4486
4487 /// Scalars, used in split vectorize nodes.
4489
4490 /// Maps a value to the proposed vectorizable size.
4491 SmallDenseMap<Value *, unsigned> InstrElementSize;
4492
4493 /// A list of scalars that we found that we need to keep as scalars.
4494 ValueSet MustGather;
4495
4496 /// A set of first non-schedulable values.
4497 ValueSet NonScheduledFirst;
4498
4499 /// A map between the vectorized entries and the last instructions in the
4500 /// bundles. The bundles are built in use order, not in the def order of the
4501 /// instructions. So, we cannot rely directly on the last instruction in the
4502 /// bundle being the last instruction in program order during the
4503 /// vectorization process, since the basic blocks are modified; the last
4504 /// instructions need to be pre-gathered beforehand.
4506
4507 /// List of gather nodes that depend on other gather/vector nodes and should
4508 /// be emitted after the vector instruction emission process to correctly
4509 /// handle the order of the vector instructions and shuffles.
4510 SetVector<const TreeEntry *> PostponedGathers;
4511
4512 using ValueToGatherNodesMap =
4514 ValueToGatherNodesMap ValueToGatherNodes;
4515
4516 /// A list of the load entries (node indices) that can be vectorized using a
4517 /// strided or masked gather approach, but which we first attempt to represent
4518 /// as contiguous loads.
4519 SetVector<unsigned> LoadEntriesToVectorize;
4520
4521 /// true if graph nodes transforming mode is on.
4522 bool IsGraphTransformMode = false;
4523
4524 /// The index of the first gathered load entry in the VectorizeTree.
4525 std::optional<unsigned> GatheredLoadsEntriesFirst;
4526
4527 /// Maps compress entries to their mask data for the final codegen.
4528 SmallDenseMap<const TreeEntry *,
4529 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4530 CompressEntryToData;
4531
4532 /// This POD struct describes one external user in the vectorized tree.
4533 struct ExternalUser {
4534 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4535 : Scalar(S), User(U), E(E), Lane(L) {}
4536
4537 /// Which scalar in our function.
4538 Value *Scalar = nullptr;
4539
4540 /// Which user that uses the scalar.
4541 llvm::User *User = nullptr;
4542
4543 /// Vector node, the value is part of.
4544 const TreeEntry &E;
4545
4546 /// Which lane does the scalar belong to.
4547 unsigned Lane;
4548 };
4549 using UserList = SmallVector<ExternalUser, 16>;
4550
4551 /// Checks if two instructions may access the same memory.
4552 ///
4553 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4554 /// is invariant in the calling loop.
4555 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4556 Instruction *Inst2) {
4557 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4558 // First check if the result is already in the cache.
4559 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4560 auto Res = AliasCache.try_emplace(Key);
4561 if (!Res.second)
4562 return Res.first->second;
4563 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4564 // Store the result in the cache.
4565 Res.first->getSecond() = Aliased;
4566 return Aliased;
4567 }
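  // Illustrative note (added, not in the original source): because
  // AliasCacheKey is an ordered pair, a repeated query for the same
  // (Inst1, Inst2) pair returns the cached bool without invoking BatchAA
  // again, while the reversed pair (Inst2, Inst1) is cached as a separate
  // entry.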
4568
4569 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4570
4571 /// Cache for alias results.
4572 /// TODO: consider moving this to the AliasAnalysis itself.
4574
4575 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4576 // globally through SLP because we don't perform any action which
4577 // invalidates capture results.
4578 BatchAAResults BatchAA;
4579
4580 /// Temporary store for deleted instructions. Instructions will be deleted
4581 /// eventually when the BoUpSLP is destructed. The deferral is required to
4582 /// ensure that there are no incorrect collisions in the AliasCache, which
4583 /// can happen if a new instruction is allocated at the same address as a
4584 /// previously deleted instruction.
4585 DenseSet<Instruction *> DeletedInstructions;
4586
4587 /// Set of the instructions already analyzed for reductions.
4588 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4589
4590 /// Set of hashes for the list of reduction values already being analyzed.
4591 DenseSet<size_t> AnalyzedReductionVals;
4592
4593 /// Values that have already been analyzed for minimal bitwidth and found to
4594 /// be non-profitable.
4595 DenseSet<Value *> AnalyzedMinBWVals;
4596
4597 /// A list of values that need to be extracted out of the tree.
4598 /// This list holds pairs of (Internal Scalar : External User). External User
4599 /// can be nullptr, which means that this Internal Scalar will be used later,
4600 /// after vectorization.
4601 UserList ExternalUses;
4602
4603 /// A list of GEPs which can be replaced by scalar GEPs instead of
4604 /// extractelement instructions.
4605 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4606
4607 /// A list of scalars to be extracted without a specific user because of too
4608 /// many uses.
4609 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4610
4611 /// Values used only by @llvm.assume calls.
4613
4614 /// Holds all of the instructions that we gathered, shuffle instructions and
4615 /// extractelements.
4616 SetVector<Instruction *> GatherShuffleExtractSeq;
4617
4618 /// A list of blocks that we are going to CSE.
4619 DenseSet<BasicBlock *> CSEBlocks;
4620
4621 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4622 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4623
4624 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4625 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a single
4626 /// instruction, while ScheduleBundle represents a batch of instructions that
4627 /// are going to be grouped together. ScheduleCopyableData models an extra user
4628 /// for "copyable" instructions.
4629 class ScheduleEntity {
4630 friend class ScheduleBundle;
4631 friend class ScheduleData;
4632 friend class ScheduleCopyableData;
4633
4634 protected:
4635 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4636 Kind getKind() const { return K; }
4637 ScheduleEntity(Kind K) : K(K) {}
4638
4639 private:
4640 /// Used for getting a "good" final ordering of instructions.
4641 int SchedulingPriority = 0;
4642 /// True if this instruction (or bundle) is scheduled (or considered as
4643 /// scheduled in the dry-run).
4644 bool IsScheduled = false;
4645 /// The kind of the ScheduleEntity.
4646 const Kind K = Kind::ScheduleData;
4647
4648 public:
4649 ScheduleEntity() = delete;
4650 /// Gets/sets the scheduling priority.
4651 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4652 int getSchedulingPriority() const { return SchedulingPriority; }
4653 bool isReady() const {
4654 if (const auto *SD = dyn_cast<ScheduleData>(this))
4655 return SD->isReady();
4656 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4657 return CD->isReady();
4658 return cast<ScheduleBundle>(this)->isReady();
4659 }
4660 /// Returns true if the dependency information has been calculated.
4661 /// Note that dependency validity can vary between instructions within
4662 /// a single bundle.
4663 bool hasValidDependencies() const {
4664 if (const auto *SD = dyn_cast<ScheduleData>(this))
4665 return SD->hasValidDependencies();
4666 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4667 return CD->hasValidDependencies();
4668 return cast<ScheduleBundle>(this)->hasValidDependencies();
4669 }
4670 /// Gets the number of unscheduled dependencies.
4671 int getUnscheduledDeps() const {
4672 if (const auto *SD = dyn_cast<ScheduleData>(this))
4673 return SD->getUnscheduledDeps();
4674 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4675 return CD->getUnscheduledDeps();
4676 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4677 }
4678 /// Increments the number of unscheduled dependencies.
4679 int incrementUnscheduledDeps(int Incr) {
4680 if (auto *SD = dyn_cast<ScheduleData>(this))
4681 return SD->incrementUnscheduledDeps(Incr);
4682 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4683 }
4684 /// Gets the number of dependencies.
4685 int getDependencies() const {
4686 if (const auto *SD = dyn_cast<ScheduleData>(this))
4687 return SD->getDependencies();
4688 return cast<ScheduleCopyableData>(this)->getDependencies();
4689 }
4690 /// Gets the instruction.
4691 Instruction *getInst() const {
4692 if (const auto *SD = dyn_cast<ScheduleData>(this))
4693 return SD->getInst();
4694 return cast<ScheduleCopyableData>(this)->getInst();
4695 }
4696
4697 /// Gets/sets if the bundle is scheduled.
4698 bool isScheduled() const { return IsScheduled; }
4699 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4700
4701 static bool classof(const ScheduleEntity *) { return true; }
4702
4703#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4704 void dump(raw_ostream &OS) const {
4705 if (const auto *SD = dyn_cast<ScheduleData>(this))
4706 return SD->dump(OS);
4707 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4708 return CD->dump(OS);
4709 return cast<ScheduleBundle>(this)->dump(OS);
4710 }
4711
4712 LLVM_DUMP_METHOD void dump() const {
4713 dump(dbgs());
4714 dbgs() << '\n';
4715 }
4716#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4717 };
4718
4719#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4721 const BoUpSLP::ScheduleEntity &SE) {
4722 SE.dump(OS);
4723 return OS;
4724 }
4725#endif
4726
4727 /// Contains all scheduling relevant data for an instruction.
4728 /// A ScheduleData either represents a single instruction or a member of an
4729 /// instruction bundle (= a group of instructions which is combined into a
4730 /// vector instruction).
4731 class ScheduleData final : public ScheduleEntity {
4732 public:
4733 // The initial value for the dependency counters. It means that the
4734 // dependencies are not calculated yet.
4735 enum { InvalidDeps = -1 };
4736
4737 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4738 static bool classof(const ScheduleEntity *Entity) {
4739 return Entity->getKind() == Kind::ScheduleData;
4740 }
4741
4742 void init(int BlockSchedulingRegionID, Instruction *I) {
4743 NextLoadStore = nullptr;
4744 IsScheduled = false;
4745 SchedulingRegionID = BlockSchedulingRegionID;
4746 clearDependencies();
4747 Inst = I;
4748 }
4749
4750 /// Verify basic self consistency properties
4751 void verify() {
4752 if (hasValidDependencies()) {
4753 assert(UnscheduledDeps <= Dependencies && "invariant");
4754 } else {
4755 assert(UnscheduledDeps == Dependencies && "invariant");
4756 }
4757
4758 if (IsScheduled) {
4759 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4760 "unexpected scheduled state");
4761 }
4762 }
4763
4764 /// Returns true if the dependency information has been calculated.
4765 /// Note that dependency validity can vary between instructions within
4766 /// a single bundle.
4767 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4768
4769 /// Returns true if it is ready for scheduling, i.e. it has no more
4770 /// unscheduled depending instructions/bundles.
4771 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4772
4773 /// Modifies the number of unscheduled dependencies for this instruction,
4774 /// and returns the number of remaining dependencies for the containing
4775 /// bundle.
4776 int incrementUnscheduledDeps(int Incr) {
4777 assert(hasValidDependencies() &&
4778 "increment of unscheduled deps would be meaningless");
4779 UnscheduledDeps += Incr;
4780 return UnscheduledDeps;
4781 }
4782
4783 /// Sets the number of unscheduled dependencies to the number of
4784 /// dependencies.
4785 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4786
4787 /// Clears all dependency information.
4788 void clearDependencies() {
4789 clearDirectDependencies();
4790 MemoryDependencies.clear();
4791 ControlDependencies.clear();
4792 }
4793
4794 /// Clears only the direct dependencies, keeping the control and memory
4795 /// dependencies.
4796 /// Required for copyable elements to correctly handle control/memory deps
4797 /// and avoid extra recalculation of such deps.
4798 void clearDirectDependencies() {
4799 Dependencies = InvalidDeps;
4800 resetUnscheduledDeps();
4801 IsScheduled = false;
4802 }
4803
4804 /// Gets the number of unscheduled dependencies.
4805 int getUnscheduledDeps() const { return UnscheduledDeps; }
4806 /// Gets the number of dependencies.
4807 int getDependencies() const { return Dependencies; }
4808 /// Initializes the number of dependencies.
4809 void initDependencies() { Dependencies = 0; }
4810 /// Increments the number of dependencies.
4811 void incDependencies() { Dependencies++; }
4812
4813 /// Gets scheduling region ID.
4814 int getSchedulingRegionID() const { return SchedulingRegionID; }
4815
4816 /// Gets the instruction.
4817 Instruction *getInst() const { return Inst; }
4818
4819 /// Gets the list of memory dependencies.
4820 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4821 return MemoryDependencies;
4822 }
4823 /// Adds a memory dependency.
4824 void addMemoryDependency(ScheduleData *Dep) {
4825 MemoryDependencies.push_back(Dep);
4826 }
4827 /// Gets the list of control dependencies.
4828 ArrayRef<ScheduleData *> getControlDependencies() const {
4829 return ControlDependencies;
4830 }
4831 /// Adds a control dependency.
4832 void addControlDependency(ScheduleData *Dep) {
4833 ControlDependencies.push_back(Dep);
4834 }
4835 /// Gets/sets the next load/store instruction in the block.
4836 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4837 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4838
4839 void dump(raw_ostream &OS) const { OS << *Inst; }
4840
4841 LLVM_DUMP_METHOD void dump() const {
4842 dump(dbgs());
4843 dbgs() << '\n';
4844 }
4845
4846 private:
4847 Instruction *Inst = nullptr;
4848
4849 /// Singly linked list of all memory instructions (e.g. load, store, call)
4850 /// in the block - until the end of the scheduling region.
4851 ScheduleData *NextLoadStore = nullptr;
4852
4853 /// The dependent memory instructions.
4854 /// This list is derived on demand in calculateDependencies().
4855 SmallVector<ScheduleData *> MemoryDependencies;
4856
4857 /// List of instructions which this instruction could be control dependent
4858 /// on. Allowing such nodes to be scheduled below this one could introduce
4859 /// a runtime fault which didn't exist in the original program.
4860 /// ex: this is a load or udiv following a readonly call which infinitely loops
4861 SmallVector<ScheduleData *> ControlDependencies;
4862
4863 /// This ScheduleData is in the current scheduling region if this matches
4864 /// the current SchedulingRegionID of BlockScheduling.
4865 int SchedulingRegionID = 0;
4866
4867 /// The number of dependencies. Consists of the number of users of the
4868 /// instruction plus the number of dependent memory instructions (if any).
4869 /// This value is calculated on demand.
4870 /// If InvalidDeps, the number of dependencies is not calculated yet.
4871 int Dependencies = InvalidDeps;
4872
4873 /// The number of dependencies minus the number of dependencies of scheduled
4874 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4875 /// for scheduling.
4876 /// Note that this is negative as long as Dependencies is not calculated.
4877 int UnscheduledDeps = InvalidDeps;
4878 };
4879
4880#ifndef NDEBUG
4882 const BoUpSLP::ScheduleData &SD) {
4883 SD.dump(OS);
4884 return OS;
4885 }
4886#endif
4887
4888 class ScheduleBundle final : public ScheduleEntity {
4889 /// The schedule data for the instructions in the bundle.
4891 /// True if this bundle is valid.
4892 bool IsValid = true;
4893 /// The TreeEntry that this instruction corresponds to.
4894 TreeEntry *TE = nullptr;
4895 ScheduleBundle(bool IsValid)
4896 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4897
4898 public:
4899 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4900 static bool classof(const ScheduleEntity *Entity) {
4901 return Entity->getKind() == Kind::ScheduleBundle;
4902 }
4903
4904 /// Verify basic self consistency properties
4905 void verify() const {
4906 for (const ScheduleEntity *SD : Bundle) {
4907 if (SD->hasValidDependencies()) {
4908 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4909 "invariant");
4910 } else {
4911 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4912 "invariant");
4913 }
4914
4915 if (isScheduled()) {
4916 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4917 "unexpected scheduled state");
4918 }
4919 }
4920 }
4921
4922 /// Returns the number of unscheduled dependencies in the bundle.
4923 int unscheduledDepsInBundle() const {
4924 assert(*this && "bundle must not be empty");
4925 int Sum = 0;
4926 for (const ScheduleEntity *BundleMember : Bundle) {
4927 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4928 return ScheduleData::InvalidDeps;
4929 Sum += BundleMember->getUnscheduledDeps();
4930 }
4931 return Sum;
4932 }
4933
4934 /// Returns true if the dependency information has been calculated.
4935 /// Note that dependency validity can vary between instructions within
4936 /// a single bundle.
4937 bool hasValidDependencies() const {
4938 return all_of(Bundle, [](const ScheduleEntity *SD) {
4939 return SD->hasValidDependencies();
4940 });
4941 }
4942
4943 /// Returns true if it is ready for scheduling, i.e. it has no more
4944 /// unscheduled depending instructions/bundles.
4945 bool isReady() const {
4946 assert(*this && "bundle must not be empty");
4947 return unscheduledDepsInBundle() == 0 && !isScheduled();
4948 }
4949
4950 /// Returns the bundle of scheduling data, associated with the current
4951 /// instruction.
4952 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
4953 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
4954 /// Adds an instruction to the bundle.
4955 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4956
4957 /// Gets/sets the associated tree entry.
4958 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4959 TreeEntry *getTreeEntry() const { return TE; }
4960
4961 static ScheduleBundle invalid() { return {false}; }
4962
4963 operator bool() const { return IsValid; }
4964
4965#ifndef NDEBUG
4966 void dump(raw_ostream &OS) const {
4967 if (!*this) {
4968 OS << "[]";
4969 return;
4970 }
4971 OS << '[';
4972 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
4973 if (isa<ScheduleCopyableData>(SD))
4974 OS << "<Copyable>";
4975 OS << *SD->getInst();
4976 });
4977 OS << ']';
4978 }
4979
4980 LLVM_DUMP_METHOD void dump() const {
4981 dump(dbgs());
4982 dbgs() << '\n';
4983 }
4984#endif // NDEBUG
4985 };
4986
4987#ifndef NDEBUG
4989 const BoUpSLP::ScheduleBundle &Bundle) {
4990 Bundle.dump(OS);
4991 return OS;
4992 }
4993#endif
4994
4995 /// Contains all scheduling relevant data for the copyable instruction.
4996 /// It models the virtual instructions that are supposed to replace the original
4997 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
4998 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
4999 /// instruction %virt = add %0, 0.
5000 class ScheduleCopyableData final : public ScheduleEntity {
5001 /// The source schedule data for the instruction.
5002 Instruction *Inst = nullptr;
5003 /// The edge information for the instruction.
5004 const EdgeInfo EI;
5005 /// This ScheduleData is in the current scheduling region if this matches
5006 /// the current SchedulingRegionID of BlockScheduling.
5007 int SchedulingRegionID = 0;
5008 /// Bundle, this data is part of.
5009 ScheduleBundle &Bundle;
5010
5011 public:
5012 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5013 const EdgeInfo &EI, ScheduleBundle &Bundle)
5014 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5015 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5016 static bool classof(const ScheduleEntity *Entity) {
5017 return Entity->getKind() == Kind::ScheduleCopyableData;
5018 }
5019
5020 /// Verify basic self consistency properties
5021 void verify() {
5022 if (hasValidDependencies()) {
5023 assert(UnscheduledDeps <= Dependencies && "invariant");
5024 } else {
5025 assert(UnscheduledDeps == Dependencies && "invariant");
5026 }
5027
5028 if (IsScheduled) {
5029 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5030 "unexpected scheduled state");
5031 }
5032 }
5033
5034 /// Returns true if the dependency information has been calculated.
5035 /// Note that dependency validity can vary between instructions within
5036 /// a single bundle.
5037 bool hasValidDependencies() const {
5038 return Dependencies != ScheduleData::InvalidDeps;
5039 }
5040
5041 /// Returns true if it is ready for scheduling, i.e. it has no more
5042 /// unscheduled depending instructions/bundles.
5043 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5044
5045 /// Modifies the number of unscheduled dependencies for this instruction,
5046 /// and returns the number of remaining dependencies for the containing
5047 /// bundle.
5048 int incrementUnscheduledDeps(int Incr) {
5049 assert(hasValidDependencies() &&
5050 "increment of unscheduled deps would be meaningless");
5051 UnscheduledDeps += Incr;
5052 assert(UnscheduledDeps >= 0 && "invariant");
5053 return UnscheduledDeps;
5054 }
5055
5056 /// Sets the number of unscheduled dependencies to the number of
5057 /// dependencies.
5058 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5059
5060 /// Gets the number of unscheduled dependencies.
5061 int getUnscheduledDeps() const { return UnscheduledDeps; }
5062 /// Gets the number of dependencies.
5063 int getDependencies() const { return Dependencies; }
5064 /// Initializes the number of dependencies.
5065 void initDependencies() { Dependencies = 0; }
5066 /// Increments the number of dependencies.
5067 void incDependencies() { Dependencies++; }
5068
5069 /// Gets scheduling region ID.
5070 int getSchedulingRegionID() const { return SchedulingRegionID; }
5071
5072 /// Gets the instruction.
5073 Instruction *getInst() const { return Inst; }
5074
5075 /// Clears all dependency information.
5076 void clearDependencies() {
5077 Dependencies = ScheduleData::InvalidDeps;
5078 UnscheduledDeps = ScheduleData::InvalidDeps;
5079 IsScheduled = false;
5080 }
5081
5082 /// Gets the edge information.
5083 const EdgeInfo &getEdgeInfo() const { return EI; }
5084
5085 /// Gets the bundle.
5086 ScheduleBundle &getBundle() { return Bundle; }
5087 const ScheduleBundle &getBundle() const { return Bundle; }
5088
5089#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5090 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5091
5092 LLVM_DUMP_METHOD void dump() const {
5093 dump(dbgs());
5094 dbgs() << '\n';
5095 }
5096#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5097
5098 private:
5099 /// The number of dependencies; InvalidDeps if not yet calculated. These nodes
5100 /// always have only a single dependency.
5101 int Dependencies = ScheduleData::InvalidDeps;
5102
5103 /// The number of dependencies minus the number of dependencies of scheduled
5104 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5105 /// for scheduling.
5106 /// Note that this is negative as long as Dependencies is not calculated.
5107 int UnscheduledDeps = ScheduleData::InvalidDeps;
5108 };
5109
5110#ifndef NDEBUG
5111 friend inline raw_ostream &
5112 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5113 SD.dump(OS);
5114 return OS;
5115 }
5116#endif
5117
5118 friend struct GraphTraits<BoUpSLP *>;
5119 friend struct DOTGraphTraits<BoUpSLP *>;
5120
5121 /// Contains all scheduling data for a basic block.
5122 /// It does not schedule instructions that are not memory read/write
5123 /// instructions and whose operands are either constants, or arguments, or
5124 /// phis, or instructions from other blocks, or whose users are phis or are in
5125 /// other blocks. The resulting vector instructions can be placed at the
5126 /// beginning of the basic block without scheduling (if the operands do not
5127 /// need to be scheduled) or at the end of the block (if the users are outside
5128 /// of the block). This saves some compile time and memory used by the
5129 /// compiler.
5130 /// ScheduleData is assigned to each instruction between the boundaries of
5131 /// the tree entry, even to those that are not part of the graph. It is
5132 /// required to correctly follow the dependencies between the instructions and
5133 /// to schedule them correctly. ScheduleData is not allocated for
5134 /// instructions that do not require scheduling, such as phis, nodes with
5135 /// extractelements/insertelements only, or nodes with instructions whose
5136 /// uses/operands are outside of the block.
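  ///
  /// Illustrative sketch (hypothetical IR, added for exposition and following
  /// the rules stated above): in a block such as
  ///   %p = phi i32 [ %a, %bb0 ], [ %b, %bb1 ]
  ///   %x = add i32 %arg0, %arg1
  ///   %l = load i32, ptr %q
  /// the phi %p and the add %x on function arguments would not need scheduling,
  /// while the load %l would, because it is a memory-reading instruction.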
5137 struct BlockScheduling {
5138 BlockScheduling(BasicBlock *BB)
5139 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5140
5141 void clear() {
5142 ScheduledBundles.clear();
5143 ScheduledBundlesList.clear();
5144 ScheduleCopyableDataMap.clear();
5145 ScheduleCopyableDataMapByInst.clear();
5146 ScheduleCopyableDataMapByInstUser.clear();
5147 ScheduleCopyableDataMapByUsers.clear();
5148 ReadyInsts.clear();
5149 ScheduleStart = nullptr;
5150 ScheduleEnd = nullptr;
5151 FirstLoadStoreInRegion = nullptr;
5152 LastLoadStoreInRegion = nullptr;
5153 RegionHasStackSave = false;
5154
5155 // Reduce the maximum schedule region size by the size of the
5156 // previous scheduling run.
5157 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5158 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5159 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5160 ScheduleRegionSize = 0;
5161
5162 // Make a new scheduling region, i.e. all existing ScheduleData is not
5163 // in the new region yet.
5164 ++SchedulingRegionID;
5165 }
5166
5167 ScheduleData *getScheduleData(Instruction *I) {
5168 if (!I)
5169 return nullptr;
5170 if (BB != I->getParent())
5171 // Avoid lookup if can't possibly be in map.
5172 return nullptr;
5173 ScheduleData *SD = ScheduleDataMap.lookup(I);
5174 if (SD && isInSchedulingRegion(*SD))
5175 return SD;
5176 return nullptr;
5177 }
5178
5179 ScheduleData *getScheduleData(Value *V) {
5180 return getScheduleData(dyn_cast<Instruction>(V));
5181 }
5182
5183 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5184 /// operand number) and value.
5185 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5186 const Value *V) const {
5187 if (ScheduleCopyableDataMap.empty())
5188 return nullptr;
5189 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5190 if (It == ScheduleCopyableDataMap.end())
5191 return nullptr;
5192 ScheduleCopyableData *SD = It->getSecond().get();
5193 if (!isInSchedulingRegion(*SD))
5194 return nullptr;
5195 return SD;
5196 }
5197
5198 /// Returns the ScheduleCopyableData for the given user \p User, operand
5199 /// number and operand \p V.
5201 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5202 const Value *V) {
5203 if (ScheduleCopyableDataMapByInstUser.empty())
5204 return {};
5205 const auto It = ScheduleCopyableDataMapByInstUser.find(
5206 std::make_pair(std::make_pair(User, OperandIdx), V));
5207 if (It == ScheduleCopyableDataMapByInstUser.end())
5208 return {};
5210 for (ScheduleCopyableData *SD : It->getSecond()) {
5211 if (isInSchedulingRegion(*SD))
5212 Res.push_back(SD);
5213 }
5214 return Res;
5215 }
5216
5217 /// Returns true if all operands of the given instruction \p User are
5218 /// replaced by copyable data.
5219 /// \param User The user instruction.
5220 /// \param Op The operand, which might be replaced by the copyable data.
5221 /// \param SLP The SLP tree.
5222 /// \param NumOps The number of operands used. If the instruction uses the
5223 /// same operand several times, check for the first use, then the second,
5224 /// etc.
5225 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5226 Instruction *Op, BoUpSLP &SLP,
5227 unsigned NumOps) const {
5228 assert(NumOps > 0 && "No operands");
5229 if (ScheduleCopyableDataMap.empty())
5230 return false;
5231 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5233 for (const Use &U : User->operands()) {
5234 if (U.get() != Op)
5235 continue;
5236 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5237 if (Entries.empty())
5238 return false;
5239 // Check all tree entries, if they have operands replaced by copyable
5240 // data.
5241 for (TreeEntry *TE : Entries) {
5242 // Check if the user is commutative.
5243 // The commutatives are handled later, as their operands can be
5244 // reordered.
5245 // Same applies even for non-commutative cmps, because we can invert
5246 // their predicate potentially and, thus, reorder the operands.
5247 bool IsCommutativeUser =
5248 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5249 EdgeInfo EI(TE, U.getOperandNo());
5250 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5251 unsigned &OpCnt =
5252 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5253 if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
5254 return false;
5255 // Found copyable operand - continue.
5256 ++OpCnt;
5257 continue;
5258 }
5259 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5260 .first->getSecond();
5261 }
5262 }
5263 // Check the commutative/cmp entries.
5264 if (!PotentiallyReorderedEntriesCount.empty()) {
5265 for (auto &P : PotentiallyReorderedEntriesCount) {
5266 auto *It = find(P.first->Scalars, User);
5267 assert(It != P.first->Scalars.end() &&
5268 "User is not in the tree entry");
5269 int Lane = std::distance(P.first->Scalars.begin(), It);
5270 assert(Lane >= 0 && "Lane is not found");
5271 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5272 Lane = P.first->ReorderIndices[Lane];
5273 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5274 "Couldn't find extract lane");
5275 SmallVector<unsigned> OpIndices;
5276 for (unsigned OpIdx :
5278 P.first->getMainOp()))) {
5279 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5280 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5281 --P.getSecond();
5282 }
5283 }
5284 return all_of(PotentiallyReorderedEntriesCount,
5285 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5286 return P.second == NumOps - 1;
5287 });
5288 }
5289 return true;
5290 }
5291
5293 getScheduleCopyableData(const Instruction *I) const {
5294 if (ScheduleCopyableDataMapByInst.empty())
5295 return {};
5296 const auto It = ScheduleCopyableDataMapByInst.find(I);
5297 if (It == ScheduleCopyableDataMapByInst.end())
5298 return {};
5300 for (ScheduleCopyableData *SD : It->getSecond()) {
5301 if (isInSchedulingRegion(*SD))
5302 Res.push_back(SD);
5303 }
5304 return Res;
5305 }
5306
5308 getScheduleCopyableDataUsers(const Instruction *User) const {
5309 if (ScheduleCopyableDataMapByUsers.empty())
5310 return {};
5311 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5312 if (It == ScheduleCopyableDataMapByUsers.end())
5313 return {};
5315 for (ScheduleCopyableData *SD : It->getSecond()) {
5316 if (isInSchedulingRegion(*SD))
5317 Res.push_back(SD);
5318 }
5319 return Res;
5320 }
5321
5322 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5323 Instruction *I,
5324 int SchedulingRegionID,
5325 ScheduleBundle &Bundle) {
5326 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5327 ScheduleCopyableData *CD =
5328 ScheduleCopyableDataMap
5329 .try_emplace(std::make_pair(EI, I),
5330 std::make_unique<ScheduleCopyableData>(
5331 SchedulingRegionID, I, EI, Bundle))
5332 .first->getSecond()
5333 .get();
5334 ScheduleCopyableDataMapByInst[I].push_back(CD);
5335 if (EI.UserTE) {
5336 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5337 const auto *It = find(Op, I);
5338 assert(It != Op.end() && "Lane not set");
5340 do {
5341 int Lane = std::distance(Op.begin(), It);
5342 assert(Lane >= 0 && "Lane not set");
5343 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5344 !EI.UserTE->ReorderIndices.empty())
5345 Lane = EI.UserTE->ReorderIndices[Lane];
5346 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5347 "Couldn't find extract lane");
5348 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5349 if (!Visited.insert(In).second) {
5350 It = find(make_range(std::next(It), Op.end()), I);
5351 continue;
5352 }
5353 ScheduleCopyableDataMapByInstUser
5354 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5355 .first->getSecond()
5356 .push_back(CD);
5357 ScheduleCopyableDataMapByUsers.try_emplace(I)
5358 .first->getSecond()
5359 .insert(CD);
5360 // Remove extra deps for users that become non-immediate users of the
5361 // instruction. This may happen if a chain of the same copyable elements
5362 // appears in the tree.
5363 if (In == I) {
5364 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5365 if (ScheduleCopyableData *UserCD =
5366 getScheduleCopyableData(UserEI, In))
5367 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5368 }
5369 It = find(make_range(std::next(It), Op.end()), I);
5370 } while (It != Op.end());
5371 } else {
5372 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5373 CD);
5374 }
5375 return *CD;
5376 }
5377
5378 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5379 auto *I = dyn_cast<Instruction>(V);
5380 if (!I)
5381 return {};
5382 auto It = ScheduledBundles.find(I);
5383 if (It == ScheduledBundles.end())
5384 return {};
5385 return It->getSecond();
5386 }
5387
5388 /// Returns true if the entity is in the scheduling region.
5389 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5390 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5391 return Data->getSchedulingRegionID() == SchedulingRegionID;
5392 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5393 return CD->getSchedulingRegionID() == SchedulingRegionID;
5394 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5395 [&](const ScheduleEntity *BundleMember) {
5396 return isInSchedulingRegion(*BundleMember);
5397 });
5398 }
5399
5400 /// Marks an instruction as scheduled and puts all dependent ready
5401 /// instructions into the ready-list.
5402 template <typename ReadyListType>
5403 void schedule(const BoUpSLP &R, const InstructionsState &S,
5404 const EdgeInfo &EI, ScheduleEntity *Data,
5405 ReadyListType &ReadyList) {
5406 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5408 // Handle the def-use chain dependencies.
5409
5410 // Decrement the unscheduled counter and insert to ready list if ready.
5411 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5412 if ((IsControl || Data->hasValidDependencies()) &&
5413 Data->incrementUnscheduledDeps(-1) == 0) {
5414 // There are no more unscheduled dependencies after
5415 // decrementing, so we can put the dependent instruction
5416 // into the ready list.
5417 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5419 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5420 CopyableBundle.push_back(&CD->getBundle());
5421 Bundles = CopyableBundle;
5422 } else {
5423 Bundles = getScheduleBundles(Data->getInst());
5424 }
5425 if (!Bundles.empty()) {
5426 for (ScheduleBundle *Bundle : Bundles) {
5427 if (Bundle->unscheduledDepsInBundle() == 0) {
5428 assert(!Bundle->isScheduled() &&
5429 "already scheduled bundle gets ready");
5430 ReadyList.insert(Bundle);
5432 << "SLP: gets ready: " << *Bundle << "\n");
5433 }
5434 }
5435 return;
5436 }
5437 assert(!Data->isScheduled() &&
5438 "already scheduled bundle gets ready");
5439 assert(!isa<ScheduleCopyableData>(Data) &&
5440 "Expected non-copyable data");
5441 ReadyList.insert(Data);
5442 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5443 }
5444 };
5445
5446 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5447 Instruction *I) {
5448 if (!ScheduleCopyableDataMap.empty()) {
5450 getScheduleCopyableData(User, OpIdx, I);
5451 for (ScheduleCopyableData *CD : CopyableData)
5452 DecrUnsched(CD, /*IsControl=*/false);
5453 if (!CopyableData.empty())
5454 return;
5455 }
5456 if (ScheduleData *OpSD = getScheduleData(I))
5457 DecrUnsched(OpSD, /*IsControl=*/false);
5458 };
5459
5460 // If BundleMember is a vector bundle, its operands may have been
5461 // reordered during buildTree(). We therefore need to get its operands
5462 // through the TreeEntry.
5463 if (!Bundles.empty()) {
5464 auto *In = BundleMember->getInst();
5465 // Count uses of each instruction operand.
5467 unsigned TotalOpCount = 0;
5468 if (isa<ScheduleCopyableData>(BundleMember)) {
5469 // Copyable data is used only once (uses itself).
5470 TotalOpCount = OperandsUses[In] = 1;
5471 } else {
5472 for (const Use &U : In->operands()) {
5473 if (auto *I = dyn_cast<Instruction>(U.get())) {
5474 auto Res = OperandsUses.try_emplace(I, 0);
5475 ++Res.first->getSecond();
5476 ++TotalOpCount;
5477 }
5478 }
5479 }
5480 // Decrement the unscheduled counter and insert to ready list if
5481 // ready.
5482 auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
5483 unsigned OpIdx) {
5484 if (!ScheduleCopyableDataMap.empty()) {
5485 const EdgeInfo EI = {UserTE, OpIdx};
5486 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
5487 DecrUnsched(CD, /*IsControl=*/false);
5488 return;
5489 }
5490 }
5491 auto It = OperandsUses.find(I);
5492 assert(It != OperandsUses.end() && "Operand not found");
5493 if (It->second > 0) {
5494 --It->getSecond();
5495 assert(TotalOpCount > 0 && "No more operands to decrement");
5496 --TotalOpCount;
5497 if (ScheduleData *OpSD = getScheduleData(I))
5498 DecrUnsched(OpSD, /*IsControl=*/false);
5499 }
5500 };
5501
5502 for (ScheduleBundle *Bundle : Bundles) {
5503 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5504 break;
5505 // Need to search for the lane since the tree entry can be
5506 // reordered.
5507 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5508 find(Bundle->getTreeEntry()->Scalars, In));
5509 assert(Lane >= 0 && "Lane not set");
5510 if (isa<StoreInst>(In) &&
5511 !Bundle->getTreeEntry()->ReorderIndices.empty())
5512 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5513 assert(Lane < static_cast<int>(
5514 Bundle->getTreeEntry()->Scalars.size()) &&
5515 "Couldn't find extract lane");
5516
5517 // Since vectorization tree is being built recursively this
5518 // assertion ensures that the tree entry has all operands set before
5519 // reaching this code. Couple of exceptions known at the moment are
5520 // extracts where their second (immediate) operand is not added.
5521 // Since immediates do not affect scheduler behavior this is
5522 // considered okay.
5523 assert(In &&
5524 (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
5525 In->getNumOperands() ==
5526 Bundle->getTreeEntry()->getNumOperands() ||
5527 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5528 "Missed TreeEntry operands?");
5529
5530 for (unsigned OpIdx :
5531 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5532 if (auto *I = dyn_cast<Instruction>(
5533 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5534 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
5535 << "\n");
5536 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
5537 }
5538 }
5539 } else {
5540 // If BundleMember is a stand-alone instruction, no operand reordering
5541 // has taken place, so we directly access its operands.
5542 for (Use &U : BundleMember->getInst()->operands()) {
5543 if (auto *I = dyn_cast<Instruction>(U.get())) {
5545 << "SLP: check for readiness (def): " << *I << "\n");
5546 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5547 }
5548 }
5549 }
5550 // Handle the memory dependencies.
5551 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5552 if (!SD)
5553 return;
5555 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5556 if (!VisitedMemory.insert(MemoryDep).second)
5557 continue;
5558 // There are no more unscheduled dependencies after decrementing,
5559 // so we can put the dependent instruction into the ready list.
5560 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5561 << *MemoryDep << "\n");
5562 DecrUnsched(MemoryDep);
5563 }
5564 // Handle the control dependencies.
5566 for (ScheduleData *Dep : SD->getControlDependencies()) {
5567 if (!VisitedControl.insert(Dep).second)
5568 continue;
5569 // There are no more unscheduled dependencies after decrementing,
5570 // so we can put the dependent instruction into the ready list.
5572 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5573 DecrUnsched(Dep, /*IsControl=*/true);
5574 }
5575 };
5576 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5577 SD->setScheduled(/*Scheduled=*/true);
5578 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5579 ProcessBundleMember(SD, {});
5580 } else {
5581 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5582 Bundle.setScheduled(/*Scheduled=*/true);
5583 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5584 auto AreAllBundlesScheduled =
5585 [&](const ScheduleEntity *SD,
5586 ArrayRef<ScheduleBundle *> SDBundles) {
5587 if (isa<ScheduleCopyableData>(SD))
5588 return true;
5589 return !SDBundles.empty() &&
5590 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5591 return SDBundle->isScheduled();
5592 });
5593 };
5594 for (ScheduleEntity *SD : Bundle.getBundle()) {
5596 if (!isa<ScheduleCopyableData>(SD))
5597 SDBundles = getScheduleBundles(SD->getInst());
5598 if (AreAllBundlesScheduled(SD, SDBundles)) {
5599 SD->setScheduled(/*Scheduled=*/true);
5600 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5601 : SDBundles);
5602 }
5603 }
5604 }
5605 }
5606
5607 /// Verify basic self consistency properties of the data structure.
5608 void verify() {
5609 if (!ScheduleStart)
5610 return;
5611
5612 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5613 ScheduleStart->comesBefore(ScheduleEnd) &&
5614 "Not a valid scheduling region?");
5615
5616 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5617 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5618 if (!Bundles.empty()) {
5619 for (ScheduleBundle *Bundle : Bundles) {
5620 assert(isInSchedulingRegion(*Bundle) &&
5621 "primary schedule data not in window?");
5622 Bundle->verify();
5623 }
5624 continue;
5625 }
5626 auto *SD = getScheduleData(I);
5627 if (!SD)
5628 continue;
5629 assert(isInSchedulingRegion(*SD) &&
5630 "primary schedule data not in window?");
5631 SD->verify();
5632 }
5633
5634 assert(all_of(ReadyInsts,
5635 [](const ScheduleEntity *Bundle) {
5636 return Bundle->isReady();
5637 }) &&
5638 "item in ready list not ready?");
5639 }
5640
5641 /// Put all instructions into the ReadyList which are ready for scheduling.
5642 template <typename ReadyListType>
5643 void initialFillReadyList(ReadyListType &ReadyList) {
5645 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5646 ScheduleData *SD = getScheduleData(I);
5647 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5648 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5649 !Bundles.empty()) {
5650 for (ScheduleBundle *Bundle : Bundles) {
5651 if (!Visited.insert(Bundle).second)
5652 continue;
5653 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5654 ReadyList.insert(Bundle);
5655 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5656 << *Bundle << "\n");
5657 }
5658 }
5659 continue;
5660 }
5661 ReadyList.insert(SD);
5663 << "SLP: initially in ready list: " << *SD << "\n");
5664 }
5665 }
5666 }
5667
5668 /// Build a bundle from the ScheduleData nodes corresponding to the
5669 /// scalar instruction for each lane.
5670 /// \param VL The list of scalar instructions.
5671 /// \param S The state of the instructions.
5672 /// \param EI The edge in the SLP graph or the user node/operand number.
5673 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5674 const InstructionsState &S, const EdgeInfo &EI);
5675
5676 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5677 /// cyclic dependencies. This is only a dry-run, no instructions are
5678 /// actually moved at this stage.
5679 /// \returns the scheduling bundle. The returned Optional value is not
5680 /// std::nullopt if \p VL is allowed to be scheduled.
5681 std::optional<ScheduleBundle *>
5682 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5683 const InstructionsState &S, const EdgeInfo &EI);
5684
5685 /// Allocates schedule data chunk.
5686 ScheduleData *allocateScheduleDataChunks();
5687
5688 /// Extends the scheduling region so that V is inside the region.
5689 /// \returns true if the region size is within the limit.
5690 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5691
5692 /// Initialize the ScheduleData structures for new instructions in the
5693 /// scheduling region.
5694 void initScheduleData(Instruction *FromI, Instruction *ToI,
5695 ScheduleData *PrevLoadStore,
5696 ScheduleData *NextLoadStore);
5697
5698 /// Updates the dependency information of a bundle and of all instructions/
5699 /// bundles which depend on the original bundle.
5700 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5701 BoUpSLP *SLP,
5702 ArrayRef<ScheduleData *> ControlDeps = {});
5703
5704 /// Sets all instructions in the scheduling region to un-scheduled.
5705 void resetSchedule();
5706
5707 BasicBlock *BB;
5708
5709 /// Simple memory allocation for ScheduleData.
5711
5712 /// The size of a ScheduleData array in ScheduleDataChunks.
5713 int ChunkSize;
5714
5715 /// The allocator position in the current chunk, which is the last entry
5716 /// of ScheduleDataChunks.
5717 int ChunkPos;
5718
5719 /// Attaches ScheduleData to Instruction.
5720 /// Note that the mapping survives during all vectorization iterations, i.e.
5721 /// ScheduleData structures are recycled.
5723
5724 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5725 /// number) and the operand instruction, represented as copyable element.
5727 std::unique_ptr<ScheduleCopyableData>>
5728 ScheduleCopyableDataMap;
5729
5730 /// Represents the mapping between an instruction and all related
5731 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5732 /// element). The SLP tree may contain several representations of the same
5733 /// instruction.
5735 ScheduleCopyableDataMapByInst;
5736
5737 /// Represents the mapping between a user value and operand number, the operand
5738 /// value, and all related ScheduleCopyableData. The relation is 1:n, because
5739 /// the same user may reference the same operand in different tree entries
5740 /// and the operand may be modeled by different copyable data elements.
5743 ScheduleCopyableDataMapByInstUser;
5744
5745 /// Represents the mapping between an instruction and all related
5746 /// ScheduleCopyableData. It represents the mapping between the actual
5747 /// instruction and the last copyable data element in the chain. E.g., if
5748 /// the graph models the following instructions:
5749 /// %0 = non-add instruction ...
5750 /// ...
5751 /// %4 = add %3, 1
5752 /// %5 = add %4, 1
5753 /// %6 = insertelement poison, %0, 0
5754 /// %7 = insertelement %6, %5, 1
5755 /// And the graph is modeled as:
5756 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5757 /// -> [1, 0] -> [%1, 0]
5758 ///
5759 /// this map will map %0 only to the copyable element <1>, which is the last
5760 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5761 /// keep the map to <0>, not the %0.
5762 SmallDenseMap<const Instruction *,
5764 ScheduleCopyableDataMapByUsers;
5765
5766 /// Attaches ScheduleBundle to Instruction.
5768 ScheduledBundles;
5769 /// The list of ScheduleBundles.
5770 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5771
5772 /// The ready-list for scheduling (only used for the dry-run).
5773 SetVector<ScheduleEntity *> ReadyInsts;
5774
5775 /// The first instruction of the scheduling region.
5776 Instruction *ScheduleStart = nullptr;
5777
5778 /// The first instruction _after_ the scheduling region.
5779 Instruction *ScheduleEnd = nullptr;
5780
5781 /// The first memory accessing instruction in the scheduling region
5782 /// (can be null).
5783 ScheduleData *FirstLoadStoreInRegion = nullptr;
5784
5785 /// The last memory accessing instruction in the scheduling region
5786 /// (can be null).
5787 ScheduleData *LastLoadStoreInRegion = nullptr;
5788
5789 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5790 /// region? Used to optimize the dependence calculation for the
5791 /// common case where there isn't.
5792 bool RegionHasStackSave = false;
5793
5794 /// The current size of the scheduling region.
5795 int ScheduleRegionSize = 0;
5796
5797 /// The maximum size allowed for the scheduling region.
5798 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5799
5800 /// The ID of the scheduling region. For a new vectorization iteration this
5801 /// is incremented which "removes" all ScheduleData from the region.
5802 /// Make sure that the initial SchedulingRegionID is greater than the
5803 /// initial SchedulingRegionID in ScheduleData (which is 0).
5804 int SchedulingRegionID = 1;
5805 };
5806
5807 /// Attaches the BlockScheduling structures to basic blocks.
5809
5810 /// Performs the "real" scheduling. Done before vectorization is actually
5811 /// performed in a basic block.
5812 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5813
5814 /// List of users to ignore during scheduling and that don't need extracting.
5815 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5816
5817 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5818 /// sorted SmallVectors of unsigned.
5819 struct OrdersTypeDenseMapInfo {
5820 static OrdersType getEmptyKey() {
5821 OrdersType V;
5822 V.push_back(~1U);
5823 return V;
5824 }
5825
5826 static OrdersType getTombstoneKey() {
5827 OrdersType V;
5828 V.push_back(~2U);
5829 return V;
5830 }
5831
5832 static unsigned getHashValue(const OrdersType &V) {
5833 return static_cast<unsigned>(hash_combine_range(V));
5834 }
5835
5836 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5837 return LHS == RHS;
5838 }
5839 };
5840
5841 // Analysis and block reference.
5842 Function *F;
5843 ScalarEvolution *SE;
5845 TargetLibraryInfo *TLI;
5846 LoopInfo *LI;
5847 DominatorTree *DT;
5848 AssumptionCache *AC;
5849 DemandedBits *DB;
5850 const DataLayout *DL;
5852
5853 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5854 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5855
5856 /// Instruction builder to construct the vectorized tree.
5858
5859 /// A map of scalar integer values to the smallest bit width with which they
5860 /// can legally be represented. The values map to (width, signed) pairs,
5861 /// where "width" indicates the minimum bit width and "signed" is True if the
5862 /// value must be signed-extended, rather than zero-extended, back to its
5863 /// original width.
5865
5866 /// Final size of the reduced vector, if the current graph represents the
5867 /// input for the reduction and it was possible to narrow the size of the
5868 /// reduction.
5869 unsigned ReductionBitWidth = 0;
5870
5871 /// Canonical graph size before the transformations.
5872 unsigned BaseGraphSize = 1;
5873
5874 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5875 /// type sizes, used in the tree.
5876 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5877
5878 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
5879 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
5880 DenseSet<unsigned> ExtraBitWidthNodes;
5881};
5882
5883} // end namespace slpvectorizer
5884
5885template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
5889 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
5890 SecondInfo::getEmptyKey());
5891 }
5892
5894 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
5895 SecondInfo::getTombstoneKey());
5896 }
5897
5898 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
5899 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
5900 SecondInfo::getHashValue(Val.EdgeIdx));
5901 }
5902
5903 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
5904 const BoUpSLP::EdgeInfo &RHS) {
5905 return LHS == RHS;
5906 }
5907};
5908
5909template <> struct GraphTraits<BoUpSLP *> {
5910 using TreeEntry = BoUpSLP::TreeEntry;
5911
5912 /// NodeRef has to be a pointer per the GraphWriter.
5914
5916
5917 /// Add the VectorizableTree to the index iterator to be able to return
5918 /// TreeEntry pointers.
5919 struct ChildIteratorType
5920 : public iterator_adaptor_base<
5921 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5923
5925 ContainerTy &VT)
5926 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
5927
5928 NodeRef operator*() { return I->UserTE; }
5929 };
5930
5932 return R.VectorizableTree[0].get();
5933 }
5934
5935 static ChildIteratorType child_begin(NodeRef N) {
5936 return {&N->UserTreeIndex, N->Container};
5937 }
5938
5939 static ChildIteratorType child_end(NodeRef N) {
5940 return {&N->UserTreeIndex + 1, N->Container};
5941 }
5942
5943 /// For the node iterator we just need to turn the TreeEntry iterator into a
5944 /// TreeEntry* iterator so that it dereferences to NodeRef.
5945 class nodes_iterator {
5947 ItTy It;
5948
5949 public:
5950 nodes_iterator(const ItTy &It2) : It(It2) {}
5951 NodeRef operator*() { return It->get(); }
5952 nodes_iterator operator++() {
5953 ++It;
5954 return *this;
5955 }
5956 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
5957 };
5958
5959 static nodes_iterator nodes_begin(BoUpSLP *R) {
5960 return nodes_iterator(R->VectorizableTree.begin());
5961 }
5962
5963 static nodes_iterator nodes_end(BoUpSLP *R) {
5964 return nodes_iterator(R->VectorizableTree.end());
5965 }
5966
5967 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
5968};
5969
5970template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
5971 using TreeEntry = BoUpSLP::TreeEntry;
5972
5973 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
5974
5975 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
5976 std::string Str;
5978 OS << Entry->Idx << ".\n";
5979 if (isSplat(Entry->Scalars))
5980 OS << "<splat> ";
5981 for (auto *V : Entry->Scalars) {
5982 OS << *V;
5983 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
5984 return EU.Scalar == V;
5985 }))
5986 OS << " <extract>";
5987 OS << "\n";
5988 }
5989 return Str;
5990 }
5991
5992 static std::string getNodeAttributes(const TreeEntry *Entry,
5993 const BoUpSLP *) {
5994 if (Entry->isGather())
5995 return "color=red";
5996 if (Entry->State == TreeEntry::ScatterVectorize ||
5997 Entry->State == TreeEntry::StridedVectorize ||
5998 Entry->State == TreeEntry::CompressVectorize)
5999 return "color=blue";
6000 return "";
6001 }
6002};
6003
6004} // end namespace llvm
6005
6008 for (auto *I : DeletedInstructions) {
6009 if (!I->getParent()) {
6010 // Temporarily insert the instruction back to erase it from its parent and
6011 // from memory later.
6012 if (isa<PHINode>(I))
6013 // Phi nodes must be the very first instructions in the block.
6014 I->insertBefore(F->getEntryBlock(),
6015 F->getEntryBlock().getFirstNonPHIIt());
6016 else
6017 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6018 continue;
6019 }
6020 for (Use &U : I->operands()) {
6021 auto *Op = dyn_cast<Instruction>(U.get());
6022 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6023 wouldInstructionBeTriviallyDead(Op, TLI))
6024 DeadInsts.emplace_back(Op);
6025 }
6026 I->dropAllReferences();
6027 }
6028 for (auto *I : DeletedInstructions) {
6029 assert(I->use_empty() &&
6030 "trying to erase instruction with users.");
6031 I->eraseFromParent();
6032 }
6033
6034 // Cleanup any dead scalar code feeding the vectorized instructions
6035 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6036
6037#ifdef EXPENSIVE_CHECKS
6038 // If we could guarantee that this call is not extremely slow, we could
6039 // remove the ifdef limitation (see PR47712).
6040 assert(!verifyFunction(*F, &dbgs()));
6041#endif
6042}
6043
6044/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6045/// contains the original mask for the scalars reused in the node. The
6046/// procedure transforms this mask in accordance with the given \p Mask.
6047static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6048 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6049 "Expected non-empty mask.");
6050 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6051 Prev.swap(Reuses);
6052 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6053 if (Mask[I] != PoisonMaskElem)
6054 Reuses[Mask[I]] = Prev[I];
6055}
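// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// A standalone analog of reorderReuses above, using plain std::vector<int> and
// -1 in place of PoisonMaskElem, to show how the reuse mask is permuted: the
// old element at position I lands at position Mask[I].
#include <vector>

static void reorderReusesSketch(std::vector<int> &Reuses,
                                const std::vector<int> &Mask) {
  const int Poison = -1; // stands in for PoisonMaskElem
  std::vector<int> Prev = Reuses;
  for (size_t I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != Poison)
      Reuses[Mask[I]] = Prev[I];
}
// Example: Reuses = {3, 2, 1, 0}, Mask = {1, 0, 3, 2} -> Reuses = {2, 3, 0, 1}.
// --- end of sketch ---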
6056
6057/// Reorders the given \p Order according to the given \p Mask. \p Order - is
6058/// the original order of the scalars. Procedure transforms the provided order
6059/// in accordance with the given \p Mask. If the resulting \p Order is just an
6060/// identity order, \p Order is cleared.
6061static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6062 bool BottomOrder = false) {
6063 assert(!Mask.empty() && "Expected non-empty mask.");
6064 unsigned Sz = Mask.size();
6065 if (BottomOrder) {
6066 SmallVector<unsigned> PrevOrder;
6067 if (Order.empty()) {
6068 PrevOrder.resize(Sz);
6069 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6070 } else {
6071 PrevOrder.swap(Order);
6072 }
6073 Order.assign(Sz, Sz);
6074 for (unsigned I = 0; I < Sz; ++I)
6075 if (Mask[I] != PoisonMaskElem)
6076 Order[I] = PrevOrder[Mask[I]];
6077 if (all_of(enumerate(Order), [&](const auto &Data) {
6078 return Data.value() == Sz || Data.index() == Data.value();
6079 })) {
6080 Order.clear();
6081 return;
6082 }
6083 fixupOrderingIndices(Order);
6084 return;
6085 }
6086 SmallVector<int> MaskOrder;
6087 if (Order.empty()) {
6088 MaskOrder.resize(Sz);
6089 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6090 } else {
6091 inversePermutation(Order, MaskOrder);
6092 }
6093 reorderReuses(MaskOrder, Mask);
6094 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6095 Order.clear();
6096 return;
6097 }
6098 Order.assign(Sz, Sz);
6099 for (unsigned I = 0; I < Sz; ++I)
6100 if (MaskOrder[I] != PoisonMaskElem)
6101 Order[MaskOrder[I]] = I;
6102 fixupOrderingIndices(Order);
6103}
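// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The BottomOrder path of reorderOrder above, on plain std::vector types with
// -1 for PoisonMaskElem. Unset positions keep the sentinel value Sz, and an
// identity result clears the order; fixupOrderingIndices is omitted here.
#include <numeric>
#include <vector>

static void reorderOrderBottomUpSketch(std::vector<unsigned> &Order,
                                       const std::vector<int> &Mask) {
  const unsigned Sz = Mask.size();
  std::vector<unsigned> PrevOrder;
  if (Order.empty()) {
    PrevOrder.resize(Sz);
    std::iota(PrevOrder.begin(), PrevOrder.end(), 0); // identity previous order
  } else {
    PrevOrder.swap(Order);
  }
  Order.assign(Sz, Sz); // Sz marks an unset slot
  for (unsigned I = 0; I < Sz; ++I)
    if (Mask[I] != -1)
      Order[I] = PrevOrder[Mask[I]];
  bool IsIdentity = true;
  for (unsigned I = 0; I < Sz; ++I)
    IsIdentity = IsIdentity && (Order[I] == Sz || Order[I] == I);
  if (IsIdentity)
    Order.clear(); // an identity order is represented by an empty vector
}
// --- end of sketch ---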
6104
6105std::optional<BoUpSLP::OrdersType>
6106BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6107 bool TopToBottom, bool IgnoreReorder) {
6108 assert(TE.isGather() && "Expected gather node only.");
6109 // Try to find subvector extract/insert patterns and reorder only such
6110 // patterns.
6111 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6112 Type *ScalarTy = GatheredScalars.front()->getType();
6113 size_t NumScalars = GatheredScalars.size();
6114 if (!isValidElementType(ScalarTy))
6115 return std::nullopt;
6116 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6117 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6118 SmallVector<int> ExtractMask;
6119 SmallVector<int> Mask;
6120 SmallVector<SmallVector<const TreeEntry *>> Entries;
6121 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6122 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6123 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6124 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6125 /*ForOrder=*/true);
6126 // No shuffled operands - ignore.
6127 if (GatherShuffles.empty() && ExtractShuffles.empty())
6128 return std::nullopt;
6129 OrdersType CurrentOrder(NumScalars, NumScalars);
6130 if (GatherShuffles.size() == 1 &&
6131 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6132 Entries.front().front()->isSame(TE.Scalars)) {
6133 // If the node is fully matched while rotating the whole tree - no need to
6134 // consider the matching order, just rotate the whole tree.
6135 if (TopToBottom)
6136 return std::nullopt;
6137 // No need to keep the order for the same user node.
6138 if (Entries.front().front()->UserTreeIndex.UserTE ==
6139 TE.UserTreeIndex.UserTE)
6140 return std::nullopt;
6141 // No need to keep the order for the matched root node, if it can be freely
6142 // reordered.
6143 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6144 return std::nullopt;
6145 // If shuffling 2 elements only and the matching node has reverse reuses -
6146 // no need to count order, both work fine.
6147 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6148 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6149 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6150 [](const auto &P) {
6151 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6152 }))
6153 return std::nullopt;
6154
6155 // Perfect match in the graph, will reuse the previously vectorized
6156 // node. Cost is 0.
6157 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6158 return CurrentOrder;
6159 }
6160 auto IsSplatMask = [](ArrayRef<int> Mask) {
6161 int SingleElt = PoisonMaskElem;
6162 return all_of(Mask, [&](int I) {
6163 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6164 SingleElt = I;
6165 return I == PoisonMaskElem || I == SingleElt;
6166 });
6167 };
6168 // Exclusive broadcast mask - ignore.
6169 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6170 (Entries.size() != 1 ||
6171 Entries.front().front()->ReorderIndices.empty())) ||
6172 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6173 return std::nullopt;
6174 SmallBitVector ShuffledSubMasks(NumParts);
6175 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6176 ArrayRef<int> Mask, int PartSz, int NumParts,
6177 function_ref<unsigned(unsigned)> GetVF) {
6178 for (int I : seq<int>(0, NumParts)) {
6179 if (ShuffledSubMasks.test(I))
6180 continue;
6181 const int VF = GetVF(I);
6182 if (VF == 0)
6183 continue;
6184 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6185 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6186 // Shuffle of at least 2 vectors - ignore.
6187 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6188 llvm::fill(Slice, NumScalars);
6189 ShuffledSubMasks.set(I);
6190 continue;
6191 }
6192 // Try to include as many elements from the mask as possible.
6193 int FirstMin = INT_MAX;
6194 int SecondVecFound = false;
6195 for (int K : seq<int>(Limit)) {
6196 int Idx = Mask[I * PartSz + K];
6197 if (Idx == PoisonMaskElem) {
6198 Value *V = GatheredScalars[I * PartSz + K];
6199 if (isConstant(V) && !isa<PoisonValue>(V)) {
6200 SecondVecFound = true;
6201 break;
6202 }
6203 continue;
6204 }
6205 if (Idx < VF) {
6206 if (FirstMin > Idx)
6207 FirstMin = Idx;
6208 } else {
6209 SecondVecFound = true;
6210 break;
6211 }
6212 }
6213 FirstMin = (FirstMin / PartSz) * PartSz;
6214 // Shuffle of at least 2 vectors - ignore.
6215 if (SecondVecFound) {
6216 llvm::fill(Slice, NumScalars);
6217 ShuffledSubMasks.set(I);
6218 continue;
6219 }
6220 for (int K : seq<int>(Limit)) {
6221 int Idx = Mask[I * PartSz + K];
6222 if (Idx == PoisonMaskElem)
6223 continue;
6224 Idx -= FirstMin;
6225 if (Idx >= PartSz) {
6226 SecondVecFound = true;
6227 break;
6228 }
6229 if (CurrentOrder[I * PartSz + Idx] >
6230 static_cast<unsigned>(I * PartSz + K) &&
6231 CurrentOrder[I * PartSz + Idx] !=
6232 static_cast<unsigned>(I * PartSz + Idx))
6233 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6234 }
6235 // Shuffle of at least 2 vectors - ignore.
6236 if (SecondVecFound) {
6237 llvm::fill(Slice, NumScalars);
6238 ShuffledSubMasks.set(I);
6239 continue;
6240 }
6241 }
6242 };
6243 int PartSz = getPartNumElems(NumScalars, NumParts);
6244 if (!ExtractShuffles.empty())
6245 TransformMaskToOrder(
6246 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6247 if (!ExtractShuffles[I])
6248 return 0U;
6249 unsigned VF = 0;
6250 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6251 for (unsigned Idx : seq<unsigned>(Sz)) {
6252 int K = I * PartSz + Idx;
6253 if (ExtractMask[K] == PoisonMaskElem)
6254 continue;
6255 if (!TE.ReuseShuffleIndices.empty())
6256 K = TE.ReuseShuffleIndices[K];
6257 if (K == PoisonMaskElem)
6258 continue;
6259 if (!TE.ReorderIndices.empty())
6260 K = std::distance(TE.ReorderIndices.begin(),
6261 find(TE.ReorderIndices, K));
6262 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6263 if (!EI)
6264 continue;
6265 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6266 ->getElementCount()
6267 .getKnownMinValue());
6268 }
6269 return VF;
6270 });
6271 // Check special corner case - single shuffle of the same entry.
6272 if (GatherShuffles.size() == 1 && NumParts != 1) {
6273 if (ShuffledSubMasks.any())
6274 return std::nullopt;
6275 PartSz = NumScalars;
6276 NumParts = 1;
6277 }
6278 if (!Entries.empty())
6279 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6280 if (!GatherShuffles[I])
6281 return 0U;
6282 return std::max(Entries[I].front()->getVectorFactor(),
6283 Entries[I].back()->getVectorFactor());
6284 });
6285 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6286 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6287 return std::nullopt;
6288 return std::move(CurrentOrder);
6289}
6290
6291static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6292 const TargetLibraryInfo &TLI,
6293 bool CompareOpcodes = true) {
6294 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6295 getUnderlyingObject(Ptr2, RecursionMaxDepth))
6296 return false;
6297 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6298 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6299 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6300 (!GEP2 || GEP2->getNumOperands() == 2) &&
6301 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6302 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6303 !CompareOpcodes ||
6304 (GEP1 && GEP2 &&
6305 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6306}
6307
6308/// Calculates minimal alignment as a common alignment.
6309template <typename T>
6310static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6311 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6312 for (Value *V : VL)
6313 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6314 return CommonAlignment;
6315}
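// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The same "common alignment is the minimum alignment" fold as
// computeCommonAlignment above, written over a plain list of byte alignments
// instead of load/store instructions.
#include <algorithm>
#include <cstdint>
#include <vector>

static uint64_t commonAlignmentSketch(const std::vector<uint64_t> &Aligns) {
  uint64_t Common = Aligns.front();
  for (uint64_t A : Aligns)
    Common = std::min(Common, A); // the narrowest alignment is valid for all
  return Common;
}
// Example: commonAlignmentSketch({16, 8, 4, 8}) == 4.
// --- end of sketch ---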
6316
6317/// Check if \p Order represents reverse order.
6318static bool isReverseOrder(ArrayRef<unsigned> Order) {
6319 assert(!Order.empty() &&
6320 "Order is empty. Please check it before using isReverseOrder.");
6321 unsigned Sz = Order.size();
6322 return all_of(enumerate(Order), [&](const auto &Pair) {
6323 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6324 });
6325}
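// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// A standalone check mirroring isReverseOrder above. Sz is the "unset"
// sentinel, so an unset slot is accepted as well, exactly like the
// enumerate-based version.
#include <vector>

static bool isReverseOrderSketch(const std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I)
    if (Order[I] != Sz && Order[I] != Sz - I - 1)
      return false;
  return true;
}
// Example: {3, 2, 1, 0} -> true; {3, 4, 1, 0} -> true (4 is the unset
// sentinel); {0, 1, 2, 3} -> false.
// --- end of sketch ---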
6326
6327/// Checks if the provided list of pointers \p PointerOps represents strided
6328/// pointers for the type \p ElemTy. If they are not, std::nullopt is returned.
6329/// Otherwise, if \p Inst is not specified, a non-empty optional (nullptr) is
6330/// returned to show that the pointers represent strided pointers. If \p Inst
6331/// is specified, the runtime stride is materialized before the given \p Inst.
6332/// \returns std::nullopt if the pointers are not strided with a runtime
6333/// stride; nullptr or the actual stride value otherwise.
6334static std::optional<Value *>
6335calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6336 const DataLayout &DL, ScalarEvolution &SE,
6337 SmallVectorImpl<unsigned> &SortedIndices,
6338 Instruction *Inst = nullptr) {
6339 SmallVector<const SCEV *> SCEVs;
6340 const SCEV *PtrSCEVLowest = nullptr;
6341 const SCEV *PtrSCEVHighest = nullptr;
6342 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6343 // addresses).
6344 for (Value *Ptr : PointerOps) {
6345 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6346 if (!PtrSCEV)
6347 return std::nullopt;
6348 SCEVs.push_back(PtrSCEV);
6349 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6350 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6351 continue;
6352 }
6353 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6354 if (isa<SCEVCouldNotCompute>(Diff))
6355 return std::nullopt;
6356 if (Diff->isNonConstantNegative()) {
6357 PtrSCEVLowest = PtrSCEV;
6358 continue;
6359 }
6360 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6361 if (isa<SCEVCouldNotCompute>(Diff1))
6362 return std::nullopt;
6363 if (Diff1->isNonConstantNegative()) {
6364 PtrSCEVHighest = PtrSCEV;
6365 continue;
6366 }
6367 }
6368 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6369 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6370 if (isa<SCEVCouldNotCompute>(Dist))
6371 return std::nullopt;
6372 int Size = DL.getTypeStoreSize(ElemTy);
6373 auto TryGetStride = [&](const SCEV *Dist,
6374 const SCEV *Multiplier) -> const SCEV * {
6375 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6376 if (M->getOperand(0) == Multiplier)
6377 return M->getOperand(1);
6378 if (M->getOperand(1) == Multiplier)
6379 return M->getOperand(0);
6380 return nullptr;
6381 }
6382 if (Multiplier == Dist)
6383 return SE.getConstant(Dist->getType(), 1);
6384 return SE.getUDivExactExpr(Dist, Multiplier);
6385 };
6386 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6387 const SCEV *Stride = nullptr;
6388 if (Size != 1 || SCEVs.size() > 2) {
6389 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6390 Stride = TryGetStride(Dist, Sz);
6391 if (!Stride)
6392 return std::nullopt;
6393 }
6394 if (!Stride || isa<SCEVConstant>(Stride))
6395 return std::nullopt;
6396 // Iterate through all pointers and check if all distances are
6397 // unique multiple of Stride.
6398 using DistOrdPair = std::pair<int64_t, int>;
6399 auto Compare = llvm::less_first();
6400 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6401 int Cnt = 0;
6402 bool IsConsecutive = true;
6403 for (const SCEV *PtrSCEV : SCEVs) {
6404 unsigned Dist = 0;
6405 if (PtrSCEV != PtrSCEVLowest) {
6406 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6407 const SCEV *Coeff = TryGetStride(Diff, Stride);
6408 if (!Coeff)
6409 return std::nullopt;
6410 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6411 if (!SC || isa<SCEVCouldNotCompute>(SC))
6412 return std::nullopt;
6413 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6414 SE.getMulExpr(Stride, SC)))
6415 ->isZero())
6416 return std::nullopt;
6417 Dist = SC->getAPInt().getZExtValue();
6418 }
6419 // If the strides are not the same or repeated, we can't vectorize.
6420 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6421 return std::nullopt;
6422 auto Res = Offsets.emplace(Dist, Cnt);
6423 if (!Res.second)
6424 return std::nullopt;
6425 // Consecutive order if the inserted element is the last one.
6426 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6427 ++Cnt;
6428 }
6429 if (Offsets.size() != SCEVs.size())
6430 return std::nullopt;
6431 SortedIndices.clear();
6432 if (!IsConsecutive) {
6433 // Fill SortedIndices array only if it is non-consecutive.
6434 SortedIndices.resize(PointerOps.size());
6435 Cnt = 0;
6436 for (const std::pair<int64_t, int> &Pair : Offsets) {
6437 SortedIndices[Cnt] = Pair.second;
6438 ++Cnt;
6439 }
6440 }
6441 if (!Inst)
6442 return nullptr;
6443 SCEVExpander Expander(SE, DL, "strided-load-vec");
6444 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
6445}
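// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The ordering/uniqueness bookkeeping from calculateRtStride above, reduced to
// plain integer offsets. The real routine works on SCEVs and only succeeds for
// a runtime (non-constant) stride; here a non-zero stride is passed in
// directly just to show how the sorted indices and the consecutiveness flag
// are derived.
#include <cstdint>
#include <optional>
#include <set>
#include <utility>
#include <vector>

static std::optional<std::vector<int>>
sortByStrideSketch(const std::vector<int64_t> &OffsetsFromLowest,
                   int64_t Stride) {
  if (Stride == 0)
    return std::nullopt;
  std::set<std::pair<int64_t, int>> Offsets; // (stride multiple, orig index)
  bool IsConsecutive = true;
  for (int Cnt = 0, E = OffsetsFromLowest.size(); Cnt < E; ++Cnt) {
    int64_t Off = OffsetsFromLowest[Cnt];
    if (Off % Stride != 0)
      return std::nullopt; // not a multiple of the stride
    auto Res = Offsets.emplace(Off / Stride, Cnt);
    if (!Res.second)
      return std::nullopt; // repeated offset
    // Still consecutive only if every element was inserted at the end.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
  }
  std::vector<int> SortedIndices;
  if (!IsConsecutive) // only filled when a reorder is actually needed
    for (const auto &P : Offsets)
      SortedIndices.push_back(P.second);
  return SortedIndices;
}
// --- end of sketch ---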
6446
6447static std::pair<InstructionCost, InstructionCost>
6448getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6449 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6450 Type *ScalarTy, VectorType *VecTy);
6451
6452/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6453/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6454/// subvector pattern.
6455static InstructionCost
6456getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6457 VectorType *Tp, ArrayRef<int> Mask = {},
6458 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6459 int Index = 0, VectorType *SubTp = nullptr,
6460 ArrayRef<const Value *> Args = {}) {
6461 VectorType *DstTy = Tp;
6462 if (!Mask.empty())
6463 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6464
6465 if (Kind != TTI::SK_PermuteTwoSrc)
6466 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6467 Args);
6468 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6469 int NumSubElts;
6470 if (ShuffleVectorInst::isInsertSubvectorMask(
6471 Mask, NumSrcElts, NumSubElts, Index)) {
6472 if (Index + NumSubElts > NumSrcElts &&
6473 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6474 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6475 CostKind, Index, SubTp);
6476 }
6477 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6478 Args);
6479}
6480
6481/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6482/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6483/// instead of a scalar.
6484static InstructionCost
6485getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6486 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6487 bool Extract, TTI::TargetCostKind CostKind,
6488 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6489 assert(!isa<ScalableVectorType>(Ty) &&
6490 "ScalableVectorType is not supported.");
6491 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6492 getNumElements(Ty) &&
6493 "Incorrect usage.");
6494 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6495 assert(SLPReVec && "Only supported by REVEC.");
6496 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6497 // of CreateInsertElement.
6498 unsigned ScalarTyNumElements = VecTy->getNumElements();
6499 InstructionCost Cost = 0;
6500 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6501 if (!DemandedElts[I])
6502 continue;
6503 if (Insert)
6504 Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
6505 I * ScalarTyNumElements, VecTy);
6506 if (Extract)
6507 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
6508 I * ScalarTyNumElements, VecTy);
6509 }
6510 return Cost;
6511 }
6512 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6513 CostKind, ForPoisonSrc, VL);
6514}
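// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The REVEC branch of getScalarizationOverhead above, with the TTI shuffle
// cost queries replaced by a caller-supplied callback (SubVecCost is a
// hypothetical stand-in, not an LLVM API). Each demanded "scalar" that is
// really a subvector contributes an insert and/or an extract of a whole
// subvector at offset I * SubVecNumElts.
#include <cstdint>
#include <functional>
#include <vector>

static int64_t scalarizationOverheadSketch(
    const std::vector<bool> &DemandedElts, unsigned SubVecNumElts, bool Insert,
    bool Extract, const std::function<int64_t(unsigned)> &SubVecCost) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = DemandedElts.size(); I < E; ++I) {
    if (!DemandedElts[I])
      continue;
    if (Insert)
      Cost += SubVecCost(I * SubVecNumElts); // insert-subvector at this offset
    if (Extract)
      Cost += SubVecCost(I * SubVecNumElts); // extract-subvector at this offset
  }
  return Cost;
}
// --- end of sketch ---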
6515
6516/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6517/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6518static InstructionCost getVectorInstrCost(
6519 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6520 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6521 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6522 if (Opcode == Instruction::ExtractElement) {
6523 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6524 assert(SLPReVec && "Only supported by REVEC.");
6525 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6526 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
6527 cast<VectorType>(Val), {}, CostKind,
6528 Index * VecTy->getNumElements(), VecTy);
6529 }
6530 }
6531 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6532 ScalarUserAndIdx);
6533}
6534
6535/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6536/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6537static InstructionCost getExtractWithExtendCost(
6538 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6539 VectorType *VecTy, unsigned Index,
6540 TTI::TargetCostKind CostKind) {
6541 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6542 assert(SLPReVec && "Only supported by REVEC.");
6543 auto *SubTp =
6544 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6545 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
6546 Index * ScalarTy->getNumElements(), SubTp) +
6547 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6548 CostKind);
6549 }
6550 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6551}
6552
6553/// Creates subvector insert. Generates shuffle using \p Generator or
6554/// using default shuffle.
6555static Value *createInsertVector(
6556 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6557 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6558 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6559 return Vec;
6560 const unsigned SubVecVF = getNumElements(V->getType());
6561 // Create shuffle, insertvector requires that index is multiple of
6562 // the subvector length.
6563 const unsigned VecVF = getNumElements(Vec->getType());
6564 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6565 if (isa<PoisonValue>(Vec)) {
6566 auto *Begin = std::next(Mask.begin(), Index);
6567 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6568 Vec = Builder.CreateShuffleVector(V, Mask);
6569 return Vec;
6570 }
6571 std::iota(Mask.begin(), Mask.end(), 0);
6572 std::iota(std::next(Mask.begin(), Index),
6573 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6574 if (Generator)
6575 return Generator(Vec, V, Mask);
6576 // 1. Resize V to the size of Vec.
6577 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6578 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6579 V = Builder.CreateShuffleVector(V, ResizeMask);
6580 // 2. Insert V into Vec.
6581 return Builder.CreateShuffleVector(Vec, V, Mask);
6582}
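// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The two shuffle masks computed by createInsertVector above for the generic
// (non-poison, no custom generator) path, shown as plain index vectors with
// -1 in place of PoisonMaskElem.
#include <numeric>
#include <vector>

struct InsertSubvectorMasks {
  std::vector<int> ResizeMask; // widens V to the width of Vec
  std::vector<int> InsertMask; // takes Vec, except lanes [Index, Index+SubVecVF)
};

static InsertSubvectorMasks makeInsertMasksSketch(unsigned VecVF,
                                                  unsigned SubVecVF,
                                                  unsigned Index) {
  InsertSubvectorMasks M;
  M.ResizeMask.assign(VecVF, -1);
  std::iota(M.ResizeMask.begin(), M.ResizeMask.begin() + SubVecVF, 0);
  M.InsertMask.resize(VecVF);
  std::iota(M.InsertMask.begin(), M.InsertMask.end(), 0); // keep Vec lanes
  // Lanes of the resized V appear as second-source indices VecVF, VecVF+1, ...
  std::iota(M.InsertMask.begin() + Index,
            M.InsertMask.begin() + Index + SubVecVF, VecVF);
  return M;
}
// Example: makeInsertMasksSketch(8, 2, 4) gives
//   ResizeMask = {0, 1, -1, -1, -1, -1, -1, -1}
//   InsertMask = {0, 1, 2, 3, 8, 9, 6, 7}
// --- end of sketch ---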
6583
6584/// Generates subvector extract using \p Generator or using default shuffle.
6585static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6586 unsigned SubVecVF, unsigned Index) {
6587 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6588 std::iota(Mask.begin(), Mask.end(), Index);
6589 return Builder.CreateShuffleVector(Vec, Mask);
6590}
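// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The extract mask built by createExtractVector above is simply SubVecVF
// consecutive lane indices starting at Index.
#include <numeric>
#include <vector>

static std::vector<int> makeExtractMaskSketch(unsigned SubVecVF,
                                              unsigned Index) {
  std::vector<int> Mask(SubVecVF);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Mask; // e.g. makeExtractMaskSketch(4, 8) == {8, 9, 10, 11}
}
// --- end of sketch ---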
6591
6592/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6593/// with \p Order.
6594/// \return true if the mask represents strided access, false otherwise.
6595static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6596 ArrayRef<unsigned> Order, Type *ScalarTy,
6597 const DataLayout &DL, ScalarEvolution &SE,
6598 SmallVectorImpl<int> &CompressMask) {
6599 const unsigned Sz = PointerOps.size();
6600 CompressMask.assign(Sz, PoisonMaskElem);
6601 // The first element is always set.
6602 CompressMask[0] = 0;
6603 // Check if the mask represents strided access.
6604 std::optional<unsigned> Stride = 0;
6605 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6606 for (unsigned I : seq<unsigned>(1, Sz)) {
6607 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6608 std::optional<int64_t> OptPos =
6609 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6610 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6611 return false;
6612 unsigned Pos = static_cast<unsigned>(*OptPos);
6613 CompressMask[I] = Pos;
6614 if (!Stride)
6615 continue;
6616 if (*Stride == 0) {
6617 *Stride = Pos;
6618 continue;
6619 }
6620 if (Pos != *Stride * I)
6621 Stride.reset();
6622 }
6623 return Stride.has_value();
6624}
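// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// buildCompressMask above, with the getPointersDiff queries replaced by
// precomputed element distances from the first (ordered) pointer; at least two
// positions are assumed. It fills the compress mask and reports whether the
// gaps form a constant stride.
#include <optional>
#include <vector>

static bool buildCompressMaskSketch(const std::vector<unsigned> &Positions,
                                    std::vector<int> &CompressMask) {
  const unsigned Sz = Positions.size();
  CompressMask.assign(Sz, -1);
  CompressMask[0] = 0;                // the first element is always lane 0
  std::optional<unsigned> Stride = 0; // 0 means "stride not fixed yet"
  for (unsigned I = 1; I < Sz; ++I) {
    unsigned Pos = Positions[I];
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (*Stride == 0) {
      *Stride = Pos;                  // the first non-zero gap fixes the stride
      continue;
    }
    if (Pos != *Stride * I)
      Stride.reset();                 // irregular gaps - not a strided access
  }
  return Stride.has_value();
}
// Example: Positions {0, 2, 4, 6} -> mask {0, 2, 4, 6}, strided (stride 2);
//          Positions {0, 1, 3, 7} -> mask {0, 1, 3, 7}, not strided.
// --- end of sketch ---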
6625
6626/// Checks if the \p VL can be transformed to a (masked)load + compress or
6627/// (masked) interleaved load.
6628static bool isMaskedLoadCompress(
6629 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6630 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6631 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6632 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6633 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6634 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6635 VectorType *&LoadVecTy) {
6636 InterleaveFactor = 0;
6637 Type *ScalarTy = VL.front()->getType();
6638 const size_t Sz = VL.size();
6639 auto *VecTy = getWidenedType(ScalarTy, Sz);
6640 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6641 SmallVector<int> Mask;
6642 if (!Order.empty())
6643 inversePermutation(Order, Mask);
6644 // Check external uses.
6645 for (const auto [I, V] : enumerate(VL)) {
6646 if (AreAllUsersVectorized(V))
6647 continue;
6648 InstructionCost ExtractCost =
6649 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6650 Mask.empty() ? I : Mask[I]);
6651 InstructionCost ScalarCost =
6652 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6653 if (ExtractCost <= ScalarCost)
6654 return false;
6655 }
6656 Value *Ptr0;
6657 Value *PtrN;
6658 if (Order.empty()) {
6659 Ptr0 = PointerOps.front();
6660 PtrN = PointerOps.back();
6661 } else {
6662 Ptr0 = PointerOps[Order.front()];
6663 PtrN = PointerOps[Order.back()];
6664 }
6665 std::optional<int64_t> Diff =
6666 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6667 if (!Diff)
6668 return false;
6669 const size_t MaxRegSize =
6670 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6671 .getFixedValue();
6672 // Check for very large distances between elements.
6673 if (*Diff / Sz >= MaxRegSize / 8)
6674 return false;
6675 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6676 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6677 Align CommonAlignment = LI->getAlign();
6678 IsMasked = !isSafeToLoadUnconditionally(
6679 Ptr0, LoadVecTy, CommonAlignment, DL,
6680 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6681 &TLI);
6682 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6683 LI->getPointerAddressSpace()))
6684 return false;
6685 // TODO: perform the analysis of each scalar load for better
6686 // safe-load-unconditionally analysis.
6687 bool IsStrided =
6688 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6689 assert(CompressMask.size() >= 2 && "At least two elements are required");
6690 SmallVector<Value *> OrderedPointerOps(PointerOps);
6691 if (!Order.empty())
6692 reorderScalars(OrderedPointerOps, Mask);
6693 auto [ScalarGEPCost, VectorGEPCost] =
6694 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6695 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6696 // The cost of scalar loads.
6697 InstructionCost ScalarLoadsCost =
6698 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6699 [&](InstructionCost C, Value *V) {
6700 return C + TTI.getInstructionCost(cast<Instruction>(V),
6701 CostKind);
6702 }) +
6703 ScalarGEPCost;
6704 APInt DemandedElts = APInt::getAllOnes(Sz);
6705 InstructionCost GatherCost =
6706 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6707 /*Insert=*/true,
6708 /*Extract=*/false, CostKind) +
6709 ScalarLoadsCost;
6710 InstructionCost LoadCost = 0;
6711 if (IsMasked) {
6712 LoadCost =
6713 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6714 LI->getPointerAddressSpace(), CostKind);
6715 } else {
6716 LoadCost =
6717 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6718 LI->getPointerAddressSpace(), CostKind);
6719 }
6720 if (IsStrided && !IsMasked && Order.empty()) {
6721 // Check for potential segmented(interleaved) loads.
6722 VectorType *AlignedLoadVecTy = getWidenedType(
6723 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6724 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6725 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6726 &TLI))
6727 AlignedLoadVecTy = LoadVecTy;
6728 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6729 CommonAlignment,
6730 LI->getPointerAddressSpace())) {
6731 InstructionCost InterleavedCost =
6732 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6733 Instruction::Load, AlignedLoadVecTy,
6734 CompressMask[1], {}, CommonAlignment,
6735 LI->getPointerAddressSpace(), CostKind, IsMasked);
6736 if (InterleavedCost < GatherCost) {
6737 InterleaveFactor = CompressMask[1];
6738 LoadVecTy = AlignedLoadVecTy;
6739 return true;
6740 }
6741 }
6742 }
6743 InstructionCost CompressCost = ::getShuffleCost(
6744 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6745 if (!Order.empty()) {
6746 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6747 for (unsigned I : seq<unsigned>(Sz)) {
6748 NewMask[I] = CompressMask[Mask[I]];
6749 }
6750 CompressMask.swap(NewMask);
6751 }
6752 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6753 return TotalVecCost < GatherCost;
6754}
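// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The final comparison made by isMaskedLoadCompress above, with all of the TTI
// cost queries folded into plain integer inputs (the field names below are
// hypothetical stand-ins, not LLVM APIs). Vectorization wins when one wide
// (possibly masked) load plus a single compress shuffle is cheaper than the
// scalar loads plus the build-vector that a gather would need.
#include <cstdint>

struct CompressLoadCostsSketch {
  int64_t VectorGEPCost;       // cost of forming the vector/base pointer
  int64_t WideLoadCost;        // masked or plain wide load
  int64_t CompressShuffleCost; // single-source compress shuffle
  int64_t ScalarLoadsCost;     // sum of the scalar loads (+ scalar GEPs)
  int64_t BuildVectorCost;     // inserting the loaded scalars into a vector
};

static bool preferCompressLoadSketch(const CompressLoadCostsSketch &C) {
  int64_t TotalVecCost =
      C.VectorGEPCost + C.WideLoadCost + C.CompressShuffleCost;
  int64_t GatherCost = C.ScalarLoadsCost + C.BuildVectorCost;
  return TotalVecCost < GatherCost;
}
// --- end of sketch ---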
6755
6756/// Checks if the \p VL can be transformed to a (masked)load + compress or
6757/// (masked) interleaved load.
6758static bool
6759isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6760 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6761 const DataLayout &DL, ScalarEvolution &SE,
6762 AssumptionCache &AC, const DominatorTree &DT,
6763 const TargetLibraryInfo &TLI,
6764 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6765 bool IsMasked;
6766 unsigned InterleaveFactor;
6767 SmallVector<int> CompressMask;
6768 VectorType *LoadVecTy;
6769 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6770 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6771 CompressMask, LoadVecTy);
6772}
6773
6774/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6775/// PointerOps:
6776/// 1. Target with strided load support is detected.
6777/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6778/// potential stride <= MaxProfitableLoadStride and the potential stride is
6779/// power-of-2 (to avoid perf regressions for the very small number of loads)
6780/// and max distance > number of loads, or potential stride is -1.
6781/// 3. The loads are ordered, or number of unordered loads <=
6782/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6783/// to avoid extra costs for very expensive shuffles).
6784/// 4. Any pointer operand is an instruction with the users outside of the
6785/// current graph (for masked gathers extra extractelement instructions
6786/// might be required).
6787static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6788 ArrayRef<unsigned> Order,
6789 const TargetTransformInfo &TTI, const DataLayout &DL,
6790 ScalarEvolution &SE,
6791 const bool IsAnyPointerUsedOutGraph,
6792 const int64_t Diff) {
6793 const size_t Sz = VL.size();
6794 const uint64_t AbsoluteDiff = std::abs(Diff);
6795 Type *ScalarTy = VL.front()->getType();
6796 auto *VecTy = getWidenedType(ScalarTy, Sz);
6797 if (IsAnyPointerUsedOutGraph ||
6798 (AbsoluteDiff > Sz &&
6799 (Sz > MinProfitableStridedLoads ||
6800 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6801 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6802 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6803 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6804 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6805 return false;
6806 Align Alignment =
6807 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
6808 ->getAlign();
6809 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6810 return false;
6811 Value *Ptr0;
6812 Value *PtrN;
6813 if (Order.empty()) {
6814 Ptr0 = PointerOps.front();
6815 PtrN = PointerOps.back();
6816 } else {
6817 Ptr0 = PointerOps[Order.front()];
6818 PtrN = PointerOps[Order.back()];
6819 }
6820 // Iterate through all pointers and check if all distances are
6821 // unique multiple of Dist.
6822 SmallSet<int64_t, 4> Dists;
6823 for (Value *Ptr : PointerOps) {
6824 int64_t Dist = 0;
6825 if (Ptr == PtrN)
6826 Dist = Diff;
6827 else if (Ptr != Ptr0)
6828 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6829 // If the strides are not the same or repeated, we can't
6830 // vectorize.
6831 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6832 break;
6833 }
6834 if (Dists.size() == Sz)
6835 return true;
6836 }
6837 return false;
6838}
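// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The stride sanity checks from isStridedLoad above, reduced to the distance
// arithmetic. Diff is the (element) distance between the first and the last
// pointer, Dists holds the distance of every pointer from the first one.
// Target legality and alignment checks are omitted.
#include <cstdint>
#include <set>
#include <vector>

static bool stridedDistancesSketch(int64_t Diff,
                                   const std::vector<int64_t> &Dists) {
  const int64_t Sz = static_cast<int64_t>(Dists.size());
  if (Sz < 2 || Diff == 0)
    return false;
  const int64_t Stride = Diff / (Sz - 1);
  if (Diff != Stride * (Sz - 1))
    return false; // the total distance is not evenly divisible
  std::set<int64_t> Seen;
  for (int64_t Dist : Dists)
    // Every pointer must sit at a unique multiple of the stride.
    if ((Dist / Stride) * Stride != Dist || !Seen.insert(Dist).second)
      return false;
  return Seen.size() == static_cast<size_t>(Sz);
}
// Example: Diff = 9, Dists = {0, 3, 6, 9} -> true (stride 3);
//          Diff = 9, Dists = {0, 3, 5, 9} -> false.
// --- end of sketch ---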
6839
6843 SmallVectorImpl<Value *> &PointerOps,
6844 unsigned *BestVF, bool TryRecursiveCheck) const {
6845 // Check that a vectorized load would load the same memory as a scalar
6846 // load. For example, we don't want to vectorize loads that are smaller
6847 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6848 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6849 // from such a struct, we read/write packed bits disagreeing with the
6850 // unvectorized version.
6851 if (BestVF)
6852 *BestVF = 0;
6854 return LoadsState::Gather;
6855 Type *ScalarTy = VL0->getType();
6856
6857 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6858 return LoadsState::Gather;
6859
6860 // Make sure all loads in the bundle are simple - we can't vectorize
6861 // atomic or volatile loads.
6862 PointerOps.clear();
6863 const size_t Sz = VL.size();
6864 PointerOps.resize(Sz);
6865 auto *POIter = PointerOps.begin();
6866 for (Value *V : VL) {
6867 auto *L = dyn_cast<LoadInst>(V);
6868 if (!L || !L->isSimple())
6869 return LoadsState::Gather;
6870 *POIter = L->getPointerOperand();
6871 ++POIter;
6872 }
6873
6874 Order.clear();
6875 // Check the order of pointer operands or that all pointers are the same.
6876 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6877
6878 auto *VecTy = getWidenedType(ScalarTy, Sz);
6879 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6880 if (!IsSorted) {
6881 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
6882 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
6883 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
6884 return LoadsState::StridedVectorize;
6885 }
6886
6887 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6888 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6889 return LoadsState::Gather;
6890
6891 if (!all_of(PointerOps, [&](Value *P) {
6892 return arePointersCompatible(P, PointerOps.front(), *TLI);
6893 }))
6894 return LoadsState::Gather;
6895
6896 } else {
6897 Value *Ptr0;
6898 Value *PtrN;
6899 if (Order.empty()) {
6900 Ptr0 = PointerOps.front();
6901 PtrN = PointerOps.back();
6902 } else {
6903 Ptr0 = PointerOps[Order.front()];
6904 PtrN = PointerOps[Order.back()];
6905 }
6906 std::optional<int64_t> Diff =
6907 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6908 // Check that the sorted loads are consecutive.
6909 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6910 return LoadsState::Vectorize;
6911 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
6912 *TLI, [&](Value *V) {
6913 return areAllUsersVectorized(
6914 cast<Instruction>(V), UserIgnoreList);
6915 }))
6916 return LoadsState::CompressVectorize;
6917 // Simple check if not a strided access - clear order.
6918 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
6919 // Try to generate strided load node.
6920 auto IsAnyPointerUsedOutGraph =
6921 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
6922 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6923 return !isVectorized(U) && !MustGather.contains(U);
6924 });
6925 });
6926 if (IsPossibleStrided &&
6927 isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
6928 IsAnyPointerUsedOutGraph, *Diff))
6929 return LoadsState::StridedVectorize;
6930 }
6931 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6932 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6933 return LoadsState::Gather;
6934 // Correctly compare the cost of loads + shuffles against strided/masked
6935 // gather loads. Returns true if the vectorized + shuffles representation
6936 // is better than just gather.
6937 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6938 unsigned *BestVF,
6939 bool ProfitableGatherPointers) {
6940 if (BestVF)
6941 *BestVF = 0;
6942 // Compare masked gather cost and loads + insert subvector costs.
6944 auto [ScalarGEPCost, VectorGEPCost] =
6945 getGEPCosts(TTI, PointerOps, PointerOps.front(),
6946 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6947 // Estimate the cost of masked gather GEP. If not a splat, roughly
6948 // estimate as a buildvector, otherwise estimate as splat.
6949 APInt DemandedElts = APInt::getAllOnes(Sz);
6950 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6951 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
6952 if (static_cast<unsigned>(count_if(
6953 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6954 any_of(PointerOps, [&](Value *V) {
6955 return getUnderlyingObject(V) !=
6956 getUnderlyingObject(PointerOps.front());
6957 }))
6958 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
6959 DemandedElts, /*Insert=*/true,
6960 /*Extract=*/false, CostKind);
6961 else
6962 VectorGEPCost +=
6963 getScalarizationOverhead(
6964 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
6965 /*Insert=*/true, /*Extract=*/false, CostKind) +
6966 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
6967 // The cost of scalar loads.
6968 InstructionCost ScalarLoadsCost =
6969 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6970 [&](InstructionCost C, Value *V) {
6971 return C + TTI.getInstructionCost(
6972 cast<Instruction>(V), CostKind);
6973 }) +
6974 ScalarGEPCost;
6975 // The cost of masked gather.
6976 InstructionCost MaskedGatherCost =
6977 TTI.getGatherScatterOpCost(
6978 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
6979 /*VariableMask=*/false, CommonAlignment, CostKind) +
6980 (ProfitableGatherPointers ? 0 : VectorGEPCost);
6981 InstructionCost GatherCost =
6982 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6983 /*Insert=*/true,
6984 /*Extract=*/false, CostKind) +
6985 ScalarLoadsCost;
6986 // The list of loads is small, or a partial check was already performed -
6987 // directly compare the masked gather cost and the gather cost.
6988 constexpr unsigned ListLimit = 4;
6989 if (!TryRecursiveCheck || VL.size() < ListLimit)
6990 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
6991
6992 // FIXME: The following code has not been updated for non-power-of-2
6993 // vectors (and not whole registers). The splitting logic here does not
6994 // cover the original vector if the vector factor is not a power of two.
6995 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
6996 return false;
6997
6998 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
6999 unsigned MinVF = getMinVF(2 * Sz);
7000 DemandedElts.clearAllBits();
7001 // Iterate through possible vectorization factors and check if vectorized +
7002 // shuffles is better than just gather.
7003 for (unsigned VF =
7004 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7005 VF >= MinVF;
7006 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7007 SmallVector<LoadsState> States;
7008 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7009 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7010 OrdersType Order;
7011 SmallVector<Value *> PointerOps;
7012 LoadsState LS =
7013 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
7014 /*TryRecursiveCheck=*/false);
7015 // Check that the sorted loads are consecutive.
7016 if (LS == LoadsState::Gather) {
7017 if (BestVF) {
7018 DemandedElts.setAllBits();
7019 break;
7020 }
7021 DemandedElts.setBits(Cnt, Cnt + VF);
7022 continue;
7023 }
7024 // If a reorder is needed - consider it a high-cost masked gather for now.
7025 if ((LS == LoadsState::Vectorize ||
7026 LS == LoadsState::StridedVectorize ||
7027 LS == LoadsState::CompressVectorize) &&
7028 !Order.empty() && !isReverseOrder(Order))
7029 LS = LoadsState::ScatterVectorize;
7030 States.push_back(LS);
7031 }
7032 if (DemandedElts.isAllOnes())
7033 // All loads gathered - try smaller VF.
7034 continue;
7035 // Can be vectorized later as a series of loads/insertelements.
7036 InstructionCost VecLdCost = 0;
7037 if (!DemandedElts.isZero()) {
7038 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7039 /*Insert=*/true,
7040 /*Extract=*/false, CostKind) +
7041 ScalarGEPCost;
7042 for (unsigned Idx : seq<unsigned>(VL.size()))
7043 if (DemandedElts[Idx])
7044 VecLdCost +=
7045 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7046 }
7047 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7048 for (auto [I, LS] : enumerate(States)) {
7049 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7050 InstructionCost VectorGEPCost =
7051 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7052 ? 0
7053 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7054 LI0->getPointerOperand(),
7055 Instruction::GetElementPtr, CostKind, ScalarTy,
7056 SubVecTy)
7057 .second;
7058 if (LS == LoadsState::ScatterVectorize) {
7059 if (static_cast<unsigned>(
7060 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7061 PointerOps.size() - 1 ||
7062 any_of(PointerOps, [&](Value *V) {
7063 return getUnderlyingObject(V) !=
7064 getUnderlyingObject(PointerOps.front());
7065 }))
7066 VectorGEPCost += getScalarizationOverhead(
7067 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7068 /*Insert=*/true, /*Extract=*/false, CostKind);
7069 else
7070 VectorGEPCost +=
7072 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7073 /*Insert=*/true, /*Extract=*/false, CostKind) +
7074 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7075 CostKind);
7076 }
7077 switch (LS) {
7078 case LoadsState::Vectorize:
7079 VecLdCost +=
7080 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7081 LI0->getPointerAddressSpace(), CostKind,
7082 TTI::OperandValueInfo()) +
7083 VectorGEPCost;
7084 break;
7085 case LoadsState::StridedVectorize:
7086 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7087 LI0->getPointerOperand(),
7088 /*VariableMask=*/false,
7089 CommonAlignment, CostKind) +
7090 VectorGEPCost;
7091 break;
7092 case LoadsState::CompressVectorize:
7093 VecLdCost += TTI.getMaskedMemoryOpCost(
7094 Instruction::Load, SubVecTy, CommonAlignment,
7095 LI0->getPointerAddressSpace(), CostKind) +
7096 VectorGEPCost +
7097 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7098 {}, CostKind);
7099 break;
7100 case LoadsState::ScatterVectorize:
7101 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7102 LI0->getPointerOperand(),
7103 /*VariableMask=*/false,
7104 CommonAlignment, CostKind) +
7105 VectorGEPCost;
7106 break;
7107 case LoadsState::Gather:
7108 // Gathers are already calculated - ignore.
7109 continue;
7110 }
7111 SmallVector<int> ShuffleMask(VL.size());
7112 for (int Idx : seq<int>(0, VL.size()))
7113 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7114 if (I > 0)
7115 VecLdCost +=
7116 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7117 CostKind, I * VF, SubVecTy);
7118 }
7119 // If masked gather cost is higher - better to vectorize, so
7120 // consider it as a gather node. It will be better estimated
7121 // later.
7122 if (MaskedGatherCost >= VecLdCost &&
7123 VecLdCost - GatherCost < -SLPCostThreshold) {
7124 if (BestVF)
7125 *BestVF = VF;
7126 return true;
7127 }
7128 }
7129 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7130 };
7131 // TODO: need to improve analysis of the pointers, if not all of them are
7132 // GEPs or have > 2 operands, we end up with a gather node, which just
7133 // increases the cost.
7134 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7135 bool ProfitableGatherPointers =
7136 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7137 return L->isLoopInvariant(V);
7138 })) <= Sz / 2;
7139 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7140 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7141 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7142 (GEP && GEP->getNumOperands() == 2 &&
7143 isa<Constant, Instruction>(GEP->getOperand(1)));
7144 })) {
7145 // Check if potential masked gather can be represented as series
7146 // of loads + insertsubvectors.
7147 // If masked gather cost is higher - better to vectorize, so
7148 // consider it as a gather node. It will be better estimated
7149 // later.
7150 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7151 ProfitableGatherPointers))
7152 return LoadsState::ScatterVectorize;
7153 }
7154
7155 return LoadsState::Gather;
7156}
7157
7158static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7159 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7160 const DataLayout &DL, ScalarEvolution &SE,
7161 SmallVectorImpl<unsigned> &SortedIndices) {
7162 assert(
7163 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7164 "Expected list of pointer operands.");
7165 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7166 // Ptr into, sort and return the sorted indices with values next to one
7167 // another.
7169 std::pair<BasicBlock *, Value *>,
7171 Bases;
7172 Bases
7173 .try_emplace(std::make_pair(
7175 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7176
7177 SortedIndices.clear();
7178 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7179 auto Key = std::make_pair(BBs[Cnt + 1],
7181 bool Found = any_of(Bases.try_emplace(Key).first->second,
7182 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7183 std::optional<int64_t> Diff =
7184 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7185 ElemTy, Ptr, DL, SE,
7186 /*StrictCheck=*/true);
7187 if (!Diff)
7188 return false;
7189
7190 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7191 return true;
7192 });
7193
7194 if (!Found) {
7195 // If we haven't found enough to usefully cluster, return early.
7196 if (Bases.size() > VL.size() / 2 - 1)
7197 return false;
7198
7199 // Not found already - add a new Base
7200 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7201 }
7202 }
7203
7204 if (Bases.size() == VL.size())
7205 return false;
7206
7207 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7208 Bases.front().second.size() == VL.size()))
7209 return false;
7210
7211 // For each of the bases sort the pointers by Offset and check if any of
7212 // the bases become consecutively allocated.
7213 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7214 SmallPtrSet<Value *, 13> FirstPointers;
7215 SmallPtrSet<Value *, 13> SecondPointers;
7216 Value *P1 = Ptr1;
7217 Value *P2 = Ptr2;
7218 unsigned Depth = 0;
7219 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7220 if (P1 == P2 || Depth > RecursionMaxDepth)
7221 return false;
7222 FirstPointers.insert(P1);
7223 SecondPointers.insert(P2);
7224 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7225 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7226 ++Depth;
7227 }
7228 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7229 "Unable to find matching root.");
7230 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7231 };
7232 for (auto &Base : Bases) {
7233 for (auto &Vec : Base.second) {
7234 if (Vec.size() > 1) {
7236 int64_t InitialOffset = std::get<1>(Vec[0]);
7237 bool AnyConsecutive =
7238 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7239 return std::get<1>(P.value()) ==
7240 int64_t(P.index()) + InitialOffset;
7241 });
7242 // Fill SortedIndices array only if it looks worth-while to sort the
7243 // ptrs.
7244 if (!AnyConsecutive)
7245 return false;
7246 }
7247 }
7248 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7249 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7250 });
7251 }
7252
7253 for (auto &T : Bases)
7254 for (const auto &Vec : T.second)
7255 for (const auto &P : Vec)
7256 SortedIndices.push_back(std::get<2>(P));
7257
7258 assert(SortedIndices.size() == VL.size() &&
7259 "Expected SortedIndices to be the size of VL");
7260 return true;
7261}
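// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The final step of clusterSortPtrAccesses above, with each base reduced to a
// list of (Offset, OrigIdx) pairs. A base is usable only if its offsets form a
// consecutive run once sorted by offset, and the sorted indices are the
// concatenation of the per-base original indices. The base-vs-base ordering
// done via ComparePointers on the underlying objects is omitted here.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

static bool emitClusteredIndicesSketch(
    std::vector<std::vector<std::pair<int64_t, int>>> Bases, // (Offset, OrigIdx)
    std::vector<int> &SortedIndices) {
  SortedIndices.clear();
  for (auto &Vec : Bases) {
    std::sort(Vec.begin(), Vec.end()); // order each base's entries by offset
    const int64_t InitialOffset = Vec.front().first;
    for (size_t I = 0, E = Vec.size(); I < E; ++I)
      if (Vec[I].first != InitialOffset + static_cast<int64_t>(I))
        return false; // offsets within a base must be consecutive
    for (const auto &P : Vec)
      SortedIndices.push_back(P.second);
  }
  return true;
}
// --- end of sketch ---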
7262
7263std::optional<BoUpSLP::OrdersType>
7264BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7265 assert(TE.isGather() && "Expected gather node only.");
7266 Type *ScalarTy = TE.Scalars[0]->getType();
7267
7268 SmallVector<Value *> Ptrs;
7269 Ptrs.reserve(TE.Scalars.size());
7270 SmallVector<BasicBlock *> BBs;
7271 BBs.reserve(TE.Scalars.size());
7272 for (Value *V : TE.Scalars) {
7273 auto *L = dyn_cast<LoadInst>(V);
7274 if (!L || !L->isSimple())
7275 return std::nullopt;
7276 Ptrs.push_back(L->getPointerOperand());
7277 BBs.push_back(L->getParent());
7278 }
7279
7280 BoUpSLP::OrdersType Order;
7281 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7282 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7283 return std::move(Order);
7284 return std::nullopt;
7285}
7286
7287/// Check if two insertelement instructions are from the same buildvector.
7288static bool areTwoInsertFromSameBuildVector(
7289 InsertElementInst *VU, InsertElementInst *V,
7290 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7291 // Instructions must be from the same basic blocks.
7292 if (VU->getParent() != V->getParent())
7293 return false;
7294 // Checks if 2 insertelements are from the same buildvector.
7295 if (VU->getType() != V->getType())
7296 return false;
7297 // Multiple used inserts are separate nodes.
7298 if (!VU->hasOneUse() && !V->hasOneUse())
7299 return false;
7300 auto *IE1 = VU;
7301 auto *IE2 = V;
7302 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7303 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7304 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7305 return false;
7306 // Go through the vector operand of insertelement instructions trying to find
7307 // either VU as the original vector for IE2 or V as the original vector for
7308 // IE1.
7309 SmallBitVector ReusedIdx(
7310 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7311 bool IsReusedIdx = false;
7312 do {
7313 if (IE2 == VU && !IE1)
7314 return VU->hasOneUse();
7315 if (IE1 == V && !IE2)
7316 return V->hasOneUse();
7317 if (IE1 && IE1 != V) {
7318 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7319 IsReusedIdx |= ReusedIdx.test(Idx1);
7320 ReusedIdx.set(Idx1);
7321 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7322 IE1 = nullptr;
7323 else
7324 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7325 }
7326 if (IE2 && IE2 != VU) {
7327 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7328 IsReusedIdx |= ReusedIdx.test(Idx2);
7329 ReusedIdx.set(Idx2);
7330 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7331 IE2 = nullptr;
7332 else
7333 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7334 }
7335 } while (!IsReusedIdx && (IE1 || IE2));
7336 return false;
7337}
7338
7339/// Checks if the specified instruction \p I is an alternate operation for
7340/// the given \p MainOp and \p AltOp instructions.
7341static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7342 Instruction *AltOp,
7343 const TargetLibraryInfo &TLI);
7344
7345std::optional<BoUpSLP::OrdersType>
7346BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7347 bool IgnoreReorder) {
7348 // No need to reorder if we need to shuffle reuses; the node still needs to
7349 // be shuffled.
7350 if (!TE.ReuseShuffleIndices.empty()) {
7351 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7352 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7353 "Reshuffling scalars not yet supported for nodes with padding");
7354
7355 if (isSplat(TE.Scalars))
7356 return std::nullopt;
7357 // Check if reuse shuffle indices can be improved by reordering.
7358 // For this, check that the reuse mask is "clustered", i.e. each scalar
7359 // value is used once in each submask of size <number_of_scalars>.
7360 // Example: 4 scalar values.
7361 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7362 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7363 // element 3 is used twice in the second submask.
7364 unsigned Sz = TE.Scalars.size();
7365 if (TE.isGather()) {
7366 if (std::optional<OrdersType> CurrentOrder =
7367 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7368 SmallVector<int> Mask;
7369 fixupOrderingIndices(*CurrentOrder);
7370 inversePermutation(*CurrentOrder, Mask);
7371 ::addMask(Mask, TE.ReuseShuffleIndices);
7372 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7373 unsigned Sz = TE.Scalars.size();
7374 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7375 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7376 if (Idx != PoisonMaskElem)
7377 Res[Idx + K * Sz] = I + K * Sz;
7378 }
7379 return std::move(Res);
7380 }
7381 }
7382 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7383 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7384 2 * TE.getVectorFactor())) == 1)
7385 return std::nullopt;
7386 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7387 return std::nullopt;
7388 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7389 Sz)) {
7390 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7391 if (TE.ReorderIndices.empty())
7392 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7393 else
7394 inversePermutation(TE.ReorderIndices, ReorderMask);
7395 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7396 unsigned VF = ReorderMask.size();
7397 OrdersType ResOrder(VF, VF);
7398 unsigned NumParts = divideCeil(VF, Sz);
7399 SmallBitVector UsedVals(NumParts);
7400 for (unsigned I = 0; I < VF; I += Sz) {
7401 int Val = PoisonMaskElem;
7402 unsigned UndefCnt = 0;
7403 unsigned Limit = std::min(Sz, VF - I);
7404 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7405 [&](int Idx) {
7406 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7407 Val = Idx;
7408 if (Idx == PoisonMaskElem)
7409 ++UndefCnt;
7410 return Idx != PoisonMaskElem && Idx != Val;
7411 }) ||
7412 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7413 UndefCnt > Sz / 2)
7414 return std::nullopt;
7415 UsedVals.set(Val);
7416 for (unsigned K = 0; K < NumParts; ++K) {
7417 unsigned Idx = Val + Sz * K;
7418 if (Idx < VF && I + K < VF)
7419 ResOrder[Idx] = I + K;
7420 }
7421 }
7422 return std::move(ResOrder);
7423 }
7424 unsigned VF = TE.getVectorFactor();
7425 // Try to build the correct order for extractelement instructions.
7426 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7427 TE.ReuseShuffleIndices.end());
7428 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7429 all_of(TE.Scalars, [Sz](Value *V) {
7430 if (isa<PoisonValue>(V))
7431 return true;
7432 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7433 return Idx && *Idx < Sz;
7434 })) {
7435 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7436 "by BinaryOperator and CastInst.");
7437 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7438 if (TE.ReorderIndices.empty())
7439 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7440 else
7441 inversePermutation(TE.ReorderIndices, ReorderMask);
7442 for (unsigned I = 0; I < VF; ++I) {
7443 int &Idx = ReusedMask[I];
7444 if (Idx == PoisonMaskElem)
7445 continue;
7446 Value *V = TE.Scalars[ReorderMask[Idx]];
7447 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7448 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7449 }
7450 }
7451 // Build the order of VF size; reuse shuffles need to be reordered, as they
7452 // are always of VF size.
7453 OrdersType ResOrder(VF);
7454 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7455 auto *It = ResOrder.begin();
7456 for (unsigned K = 0; K < VF; K += Sz) {
7457 OrdersType CurrentOrder(TE.ReorderIndices);
7458 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7459 if (SubMask.front() == PoisonMaskElem)
7460 std::iota(SubMask.begin(), SubMask.end(), 0);
7461 reorderOrder(CurrentOrder, SubMask);
7462 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7463 std::advance(It, Sz);
7464 }
7465 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7466 return Data.index() == Data.value();
7467 }))
7468 return std::nullopt; // No need to reorder.
7469 return std::move(ResOrder);
7470 }
7471 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7472 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7473 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7474 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7475 return std::nullopt;
7476 if (TE.State == TreeEntry::SplitVectorize ||
7477 ((TE.State == TreeEntry::Vectorize ||
7478 TE.State == TreeEntry::StridedVectorize ||
7479 TE.State == TreeEntry::CompressVectorize) &&
7480 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7481 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7482 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7483 "Alternate instructions are only supported by "
7484 "BinaryOperator and CastInst.");
7485 return TE.ReorderIndices;
7486 }
7487 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7488 TE.isAltShuffle()) {
7489 assert(TE.ReuseShuffleIndices.empty() &&
7490 "ReuseShuffleIndices should be "
7491 "empty for alternate instructions.");
7492 SmallVector<int> Mask;
7493 TE.buildAltOpShuffleMask(
7494 [&](Instruction *I) {
7495 assert(TE.getMatchingMainOpOrAltOp(I) &&
7496 "Unexpected main/alternate opcode");
7497 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7498 },
7499 Mask);
7500 const int VF = TE.getVectorFactor();
7501 OrdersType ResOrder(VF, VF);
7502 for (unsigned I : seq<unsigned>(VF)) {
7503 if (Mask[I] == PoisonMaskElem)
7504 continue;
7505 ResOrder[Mask[I] % VF] = I;
7506 }
7507 return std::move(ResOrder);
7508 }
7509 if (!TE.ReorderIndices.empty())
7510 return TE.ReorderIndices;
7511 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7512 if (!TE.ReorderIndices.empty())
7513 return TE.ReorderIndices;
7514
7515 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7516 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7517 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7518 continue;
7519 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7520 if (!II)
7521 continue;
7522 Instruction *BVHead = nullptr;
7523 BasicBlock *BB = II->getParent();
7524 while (II && II->hasOneUse() && II->getParent() == BB) {
7525 BVHead = II;
7526 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7527 }
7528 I = BVHead;
7529 }
7530
7531 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7532 assert(BB1 != BB2 && "Expected different basic blocks.");
7533 if (!DT->isReachableFromEntry(BB1))
7534 return false;
7535 if (!DT->isReachableFromEntry(BB2))
7536 return true;
7537 auto *NodeA = DT->getNode(BB1);
7538 auto *NodeB = DT->getNode(BB2);
7539 assert(NodeA && "Should only process reachable instructions");
7540 assert(NodeB && "Should only process reachable instructions");
7541 assert((NodeA == NodeB) ==
7542 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7543 "Different nodes should have different DFS numbers");
7544 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7545 };
7546 auto PHICompare = [&](unsigned I1, unsigned I2) {
7547 Value *V1 = TE.Scalars[I1];
7548 Value *V2 = TE.Scalars[I2];
7549 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7550 return false;
7551 if (isa<PoisonValue>(V1))
7552 return true;
7553 if (isa<PoisonValue>(V2))
7554 return false;
7555 if (V1->getNumUses() < V2->getNumUses())
7556 return true;
7557 if (V1->getNumUses() > V2->getNumUses())
7558 return false;
7559 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7560 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7561 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7562 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7563 FirstUserOfPhi2->getParent());
7564 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7565 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7566 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7567 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7568 if (IE1 && !IE2)
7569 return true;
7570 if (!IE1 && IE2)
7571 return false;
7572 if (IE1 && IE2) {
7573 if (UserBVHead[I1] && !UserBVHead[I2])
7574 return true;
7575 if (!UserBVHead[I1])
7576 return false;
7577 if (UserBVHead[I1] == UserBVHead[I2])
7578 return getElementIndex(IE1) < getElementIndex(IE2);
7579 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7580 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7581 UserBVHead[I2]->getParent());
7582 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7583 }
7584 if (EE1 && !EE2)
7585 return true;
7586 if (!EE1 && EE2)
7587 return false;
7588 if (EE1 && EE2) {
7589 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7590 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7591 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7592 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7593 if (!Inst2 && !P2)
7594 return Inst1 || P1;
7595 if (EE1->getOperand(0) == EE2->getOperand(0))
7596 return getElementIndex(EE1) < getElementIndex(EE2);
7597 if (!Inst1 && Inst2)
7598 return false;
7599 if (Inst1 && Inst2) {
7600 if (Inst1->getParent() != Inst2->getParent())
7601 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7602 return Inst1->comesBefore(Inst2);
7603 }
7604 if (!P1 && P2)
7605 return false;
7606 assert(P1 && P2 &&
7607 "Expected either instructions or arguments vector operands.");
7608 return P1->getArgNo() < P2->getArgNo();
7609 }
7610 return false;
7611 };
7612 OrdersType Phis(TE.Scalars.size());
7613 std::iota(Phis.begin(), Phis.end(), 0);
7614 stable_sort(Phis, PHICompare);
7615 if (isIdentityOrder(Phis))
7616 return std::nullopt; // No need to reorder.
7617 return std::move(Phis);
7618 }
7619 if (TE.isGather() &&
7620 (!TE.hasState() || !TE.isAltShuffle() ||
7621 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7622 allSameType(TE.Scalars)) {
7623 // TODO: add analysis of other gather nodes with extractelement
7624 // instructions and other values/instructions, not only undefs.
7625 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7626 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7627 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7628 all_of(TE.Scalars, [](Value *V) {
7629 auto *EE = dyn_cast<ExtractElementInst>(V);
7630 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7631 })) {
7632 // Check that gather of extractelements can be represented as
7633 // just a shuffle of a single vector.
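// Illustrative example (not from the original source): a gather of
// {x[1], x[0], x[3], x[2]} taken from a single source vector x is just
// shufflevector x, poison, <1, 0, 3, 2>, so CurrentOrder would be {1, 0, 3, 2}.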
7634 OrdersType CurrentOrder;
7635 bool Reuse =
7636 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7637 if (Reuse || !CurrentOrder.empty())
7638 return std::move(CurrentOrder);
7639 }
7640 // If the gather node is <undef, v, .., poison> and
7641 // insertelement poison, v, 0 [+ permute]
7642 // is cheaper than
7643 // insertelement poison, v, n - try to reorder.
7644 // If rotating the whole graph, exclude the permute cost, the whole graph
7645 // might be transformed.
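// Illustrative example (assumed values, not from the original source): for a
// gather <poison, poison, v, poison> (Idx == 2), compare the cost of
// "insertelement at lane 0 + permute" against "insertelement directly at
// lane 2"; if the former is cheaper, return the order that moves v to lane 0.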
7646 int Sz = TE.Scalars.size();
7647 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7648 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7649 const auto *It = find_if_not(TE.Scalars, isConstant);
7650 if (It == TE.Scalars.begin())
7651 return OrdersType();
7652 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7653 if (It != TE.Scalars.end()) {
7654 OrdersType Order(Sz, Sz);
7655 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7656 Order[Idx] = 0;
7657 fixupOrderingIndices(Order);
7658 SmallVector<int> Mask;
7659 inversePermutation(Order, Mask);
7660 InstructionCost PermuteCost =
7661 TopToBottom
7662 ? 0
7663 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7664 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7665 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7666 PoisonValue::get(Ty), *It);
7667 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7668 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7669 PoisonValue::get(Ty), *It);
7670 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7671 OrdersType Order(Sz, Sz);
7672 Order[Idx] = 0;
7673 return std::move(Order);
7674 }
7675 }
7676 }
7677 if (isSplat(TE.Scalars))
7678 return std::nullopt;
7679 if (TE.Scalars.size() >= 3)
7680 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7681 return Order;
7682 // Check if we can include the order of vectorized loads. For masked gathers
7683 // do extra analysis later, so include such nodes into a special list.
7684 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7685 SmallVector<Value *> PointerOps;
7686 OrdersType CurrentOrder;
7687 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7688 CurrentOrder, PointerOps);
7689 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7690 Res == LoadsState::CompressVectorize)
7691 return std::move(CurrentOrder);
7692 }
7693 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7694 // has been audited for correctness with non-power-of-two vectors.
7695 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7696 if (std::optional<OrdersType> CurrentOrder =
7697 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7698 return CurrentOrder;
7699 }
7700 return std::nullopt;
7701}
7702
7703/// Checks if the given mask is a "clustered" mask with the same clusters of
7704/// size \p Sz, which are not identity submasks.
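/// Illustrative example (not from the original source): with \p Sz == 4 the
/// mask <1,0,3,2, 1,0,3,2> is such a clustered mask (the non-identity submask
/// <1,0,3,2> repeats), while <0,1,2,3, 0,1,2,3> (identity submask) and
/// <1,0,3,2, 3,2,1,0> (different clusters) are not.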
7705 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7706 unsigned Sz) {
7707 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7708 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7709 return false;
7710 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7711 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7712 if (Cluster != FirstCluster)
7713 return false;
7714 }
7715 return true;
7716}
7717
7718void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7719 // Reorder reuses mask.
7720 reorderReuses(TE.ReuseShuffleIndices, Mask);
7721 const unsigned Sz = TE.Scalars.size();
7722 // For vectorized and non-clustered reused scalars no need to do anything else.
7723 if (!TE.isGather() ||
7724 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7725 Sz) ||
7726 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7727 return;
7728 SmallVector<int> NewMask;
7729 inversePermutation(TE.ReorderIndices, NewMask);
7730 addMask(NewMask, TE.ReuseShuffleIndices);
7731 // Clear reorder since it is going to be applied to the new mask.
7732 TE.ReorderIndices.clear();
7733 // Try to improve gathered nodes with clustered reuses, if possible.
7734 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7735 SmallVector<unsigned> NewOrder(Slice);
7736 inversePermutation(NewOrder, NewMask);
7737 reorderScalars(TE.Scalars, NewMask);
7738 // Fill the reuses mask with the identity submasks.
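// E.g. (illustrative), with Sz == 4 and 8 reused elements the mask becomes
// {0, 1, 2, 3, 0, 1, 2, 3} after this loop.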
7739 for (auto *It = TE.ReuseShuffleIndices.begin(),
7740 *End = TE.ReuseShuffleIndices.end();
7741 It != End; std::advance(It, Sz))
7742 std::iota(It, std::next(It, Sz), 0);
7743}
7744
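// Combines \p Order with \p SecondaryOrder by filling unused (== size) slots.
// Illustrative example (assumed values, not from the original source): with
// size 4, Order = {2, 4, 4, 1} and SecondaryOrder = {2, 0, 3, 1} the result is
// {2, 0, 3, 1}; without a secondary order only slots whose own index is still
// unused are set to the identity.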
7745 static void combineOrders(MutableArrayRef<unsigned> Order,
7746 ArrayRef<unsigned> SecondaryOrder) {
7747 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7748 "Expected same size of orders");
7749 size_t Sz = Order.size();
7750 SmallBitVector UsedIndices(Sz);
7751 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7752 if (Order[Idx] != Sz)
7753 UsedIndices.set(Order[Idx]);
7754 }
7755 if (SecondaryOrder.empty()) {
7756 for (unsigned Idx : seq<unsigned>(0, Sz))
7757 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7758 Order[Idx] = Idx;
7759 } else {
7760 for (unsigned Idx : seq<unsigned>(0, Sz))
7761 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7762 !UsedIndices.test(SecondaryOrder[Idx]))
7763 Order[Idx] = SecondaryOrder[Idx];
7764 }
7765}
7766
7767 bool BoUpSLP::isProfitableToReorder() const {
7768 constexpr unsigned TinyVF = 2;
7769 constexpr unsigned TinyTree = 10;
7770 constexpr unsigned PhiOpsLimit = 12;
7771 constexpr unsigned GatherLoadsLimit = 2;
7772 if (VectorizableTree.size() <= TinyTree)
7773 return true;
7774 if (VectorizableTree.front()->hasState() &&
7775 !VectorizableTree.front()->isGather() &&
7776 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7777 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7778 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7779 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7780 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7781 VectorizableTree.front()->ReorderIndices.empty()) {
7782 // Check if the tree has only a single store and a single (unordered) load
7783 // node, other nodes are phis or geps/binops combined with phis, and/or a
7784 // single gather load node.
7785 if (VectorizableTree.front()->hasState() &&
7786 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7787 VectorizableTree.front()->Scalars.size() == TinyVF &&
7788 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7789 return false;
7790 // Single node which requires reordering - skip.
7791 if (VectorizableTree.front()->hasState() &&
7792 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7793 VectorizableTree.front()->ReorderIndices.empty()) {
7794 const unsigned ReorderedSplitsCnt =
7795 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7796 return TE->State == TreeEntry::SplitVectorize &&
7797 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7798 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7799 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7800 });
7801 if (ReorderedSplitsCnt <= 1 &&
7802 static_cast<unsigned>(count_if(
7803 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7804 return ((!TE->isGather() &&
7805 (TE->ReorderIndices.empty() ||
7806 (TE->UserTreeIndex.UserTE &&
7807 TE->UserTreeIndex.UserTE->State ==
7808 TreeEntry::Vectorize &&
7809 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7810 .empty()))) ||
7811 (TE->isGather() && TE->ReorderIndices.empty() &&
7812 (!TE->hasState() || TE->isAltShuffle() ||
7813 TE->getOpcode() == Instruction::Load ||
7814 TE->getOpcode() == Instruction::ZExt ||
7815 TE->getOpcode() == Instruction::SExt))) &&
7816 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7817 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7818 return !isConstant(V) && isVectorized(V);
7819 }));
7820 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7821 return false;
7822 }
7823 bool HasPhis = false;
7824 bool HasLoad = true;
7825 unsigned GatherLoads = 0;
7826 for (const std::unique_ptr<TreeEntry> &TE :
7827 ArrayRef(VectorizableTree).drop_front()) {
7828 if (TE->State == TreeEntry::SplitVectorize)
7829 continue;
7830 if (!TE->hasState()) {
7831 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7832 all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
7833 continue;
7834 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7835 any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
7836 continue;
7837 return true;
7838 }
7839 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7840 if (!TE->isGather()) {
7841 HasLoad = false;
7842 continue;
7843 }
7844 if (HasLoad)
7845 return true;
7846 ++GatherLoads;
7847 if (GatherLoads >= GatherLoadsLimit)
7848 return true;
7849 }
7850 if (TE->getOpcode() == Instruction::GetElementPtr ||
7851 Instruction::isBinaryOp(TE->getOpcode()))
7852 continue;
7853 if (TE->getOpcode() != Instruction::PHI &&
7854 (!TE->hasCopyableElements() ||
7855 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
7856 TE->Scalars.size() / 2))
7857 return true;
7858 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7859 TE->getNumOperands() > PhiOpsLimit)
7860 return false;
7861 HasPhis = true;
7862 }
7863 return !HasPhis;
7864 }
7865 return true;
7866}
7867
7868void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7869 ArrayRef<int> MaskOrder) {
7870 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7871 SmallVector<int> NewMask(getVectorFactor());
7872 SmallVector<int> NewMaskOrder(getVectorFactor());
7873 std::iota(NewMask.begin(), NewMask.end(), 0);
7874 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7875 if (Idx == 0) {
7876 copy(Mask, NewMask.begin());
7877 copy(MaskOrder, NewMaskOrder.begin());
7878 } else {
7879 assert(Idx == 1 && "Expected either 0 or 1 index.");
7880 unsigned Offset = CombinedEntriesWithIndices.back().second;
7881 for (unsigned I : seq<unsigned>(Mask.size())) {
7882 NewMask[I + Offset] = Mask[I] + Offset;
7883 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7884 }
7885 }
7886 reorderScalars(Scalars, NewMask);
7887 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
7888 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
7889 ReorderIndices.clear();
7890}
7891
7892 void BoUpSLP::reorderTopToBottom() {
7893 // Maps VF to the graph nodes.
7894 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
7895 // ExtractElement gather nodes which can be vectorized and need to handle
7896 // their ordering.
7897 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7898
7899 // Phi nodes can have preferred ordering based on their result users.
7900 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7901
7902 // AltShuffles can also have a preferred ordering that leads to fewer
7903 // instructions, e.g., the addsub instruction in x86.
7904 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7905
7906 // Maps a TreeEntry to the reorder indices of external users.
7907 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
7908 ExternalUserReorderMap;
7909 // Find all reorderable nodes with the given VF.
7910 // Currently these are vectorized stores, loads, extracts + some gathering of
7911 // extracts.
7912 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7913 const std::unique_ptr<TreeEntry> &TE) {
7914 // Look for external users that will probably be vectorized.
7915 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7916 findExternalStoreUsersReorderIndices(TE.get());
7917 if (!ExternalUserReorderIndices.empty()) {
7918 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7919 ExternalUserReorderMap.try_emplace(TE.get(),
7920 std::move(ExternalUserReorderIndices));
7921 }
7922
7923 // Patterns like [fadd,fsub] can be combined into a single instruction in
7924 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7925 // to take into account their order when looking for the most used order.
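// E.g. (illustrative), x86's addsub family alternates subtract/add across
// lanes, so keeping the original lane order lets the backend select a single
// addsub instead of separate fadd/fsub plus an extra shuffle.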
7926 if (TE->hasState() && TE->isAltShuffle() &&
7927 TE->State != TreeEntry::SplitVectorize) {
7928 Type *ScalarTy = TE->Scalars[0]->getType();
7929 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
7930 unsigned Opcode0 = TE->getOpcode();
7931 unsigned Opcode1 = TE->getAltOpcode();
7932 SmallBitVector OpcodeMask(
7933 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
7934 // If this pattern is supported by the target then we consider the order.
7935 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7936 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7937 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
7938 }
7939 // TODO: Check the reverse order too.
7940 }
7941
7942 bool IgnoreReorder =
7943 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7944 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7945 VectorizableTree.front()->getOpcode() == Instruction::Store);
7946 if (std::optional<OrdersType> CurrentOrder =
7947 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
7948 // Do not include ordering for nodes used in the alt opcode vectorization,
7949 // better to reorder them during the bottom-to-top stage. If we follow the
7950 // order here, it causes reordering of the whole graph, though actually it is
7951 // profitable just to reorder the subgraph that starts from the alternate
7952 // opcode vectorization node. Such nodes already end up with the shuffle
7953 // instruction, and it is enough to change this shuffle rather than
7954 // rotate the scalars for the whole graph.
7955 unsigned Cnt = 0;
7956 const TreeEntry *UserTE = TE.get();
7957 while (UserTE && Cnt < RecursionMaxDepth) {
7958 if (!UserTE->UserTreeIndex)
7959 break;
7960 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7961 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
7962 UserTE->UserTreeIndex.UserTE->Idx != 0)
7963 return;
7964 UserTE = UserTE->UserTreeIndex.UserTE;
7965 ++Cnt;
7966 }
7967 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7968 if (!(TE->State == TreeEntry::Vectorize ||
7969 TE->State == TreeEntry::StridedVectorize ||
7970 TE->State == TreeEntry::SplitVectorize ||
7971 TE->State == TreeEntry::CompressVectorize) ||
7972 !TE->ReuseShuffleIndices.empty())
7973 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
7974 if (TE->State == TreeEntry::Vectorize &&
7975 TE->getOpcode() == Instruction::PHI)
7976 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
7977 }
7978 });
7979
7980 // Reorder the graph nodes according to their vectorization factor.
7981 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
7982 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
7983 auto It = VFToOrderedEntries.find(VF);
7984 if (It == VFToOrderedEntries.end())
7985 continue;
7986 // Try to find the most profitable order. We are just looking for the most
7987 // used order and reorder scalar elements in the nodes according to this
7988 // most used order.
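// Illustrative example (not from the original source): if most reorderable
// nodes with VF 4 prefer the order {1, 0, 3, 2}, that order is propagated to
// all VF-4 nodes below; nodes that cannot be physically reordered instead get
// their reuse/reorder masks adjusted to compensate.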
7989 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
7990 // Delete VF entry upon exit.
7991 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
7992
7993 // All operands are reordered and used only in this node - propagate the
7994 // most used order to the user node.
7995 MapVector<OrdersType, unsigned,
7996 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
7997 OrdersUses;
7998 for (const TreeEntry *OpTE : OrderedEntries) {
7999 // No need to reorder these nodes; still need to extend and to use a shuffle,
8000 // just need to merge the reordering shuffle and the reuse shuffle.
8001 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8002 OpTE->State != TreeEntry::SplitVectorize)
8003 continue;
8004 // Count number of orders uses.
8005 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8006 &PhisToOrders]() -> const OrdersType & {
8007 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8008 auto It = GathersToOrders.find(OpTE);
8009 if (It != GathersToOrders.end())
8010 return It->second;
8011 }
8012 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8013 auto It = AltShufflesToOrders.find(OpTE);
8014 if (It != AltShufflesToOrders.end())
8015 return It->second;
8016 }
8017 if (OpTE->State == TreeEntry::Vectorize &&
8018 OpTE->getOpcode() == Instruction::PHI) {
8019 auto It = PhisToOrders.find(OpTE);
8020 if (It != PhisToOrders.end())
8021 return It->second;
8022 }
8023 return OpTE->ReorderIndices;
8024 }();
8025 // First consider the order of the external scalar users.
8026 auto It = ExternalUserReorderMap.find(OpTE);
8027 if (It != ExternalUserReorderMap.end()) {
8028 const auto &ExternalUserReorderIndices = It->second;
8029 // If the OpTE vector factor != number of scalars - use natural order,
8030 // it is an attempt to reorder node with reused scalars but with
8031 // external uses.
8032 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8033 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8034 ExternalUserReorderIndices.size();
8035 } else {
8036 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8037 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8038 }
8039 // No other useful reorder data in this entry.
8040 if (Order.empty())
8041 continue;
8042 }
8043 // Stores actually store the mask, not the order, need to invert.
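// E.g. (illustrative), ReorderIndices {1, 2, 0} on a store node is a mask;
// inverting it yields the order {2, 0, 1}, which is what gets counted here.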
8044 if (OpTE->State == TreeEntry::Vectorize &&
8045 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8046 assert(!OpTE->isAltShuffle() &&
8047 "Alternate instructions are only supported by BinaryOperator "
8048 "and CastInst.");
8049 SmallVector<int> Mask;
8050 inversePermutation(Order, Mask);
8051 unsigned E = Order.size();
8052 OrdersType CurrentOrder(E, E);
8053 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8054 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8055 });
8056 fixupOrderingIndices(CurrentOrder);
8057 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8058 } else {
8059 ++OrdersUses.try_emplace(Order, 0).first->second;
8060 }
8061 }
8062 if (OrdersUses.empty())
8063 continue;
8064 // Choose the most used order.
8065 unsigned IdentityCnt = 0;
8066 unsigned FilledIdentityCnt = 0;
8067 OrdersType IdentityOrder(VF, VF);
8068 for (auto &Pair : OrdersUses) {
8069 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8070 if (!Pair.first.empty())
8071 FilledIdentityCnt += Pair.second;
8072 IdentityCnt += Pair.second;
8073 combineOrders(IdentityOrder, Pair.first);
8074 }
8075 }
8076 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8077 unsigned Cnt = IdentityCnt;
8078 for (auto &Pair : OrdersUses) {
8079 // Prefer the identity order. But if a filled identity (non-empty order) was
8080 // found with the same number of uses as the new candidate order, we can
8081 // choose this candidate order.
8082 if (Cnt < Pair.second ||
8083 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8084 Cnt == Pair.second && !BestOrder.empty() &&
8085 isIdentityOrder(BestOrder))) {
8086 combineOrders(Pair.first, BestOrder);
8087 BestOrder = Pair.first;
8088 Cnt = Pair.second;
8089 } else {
8090 combineOrders(BestOrder, Pair.first);
8091 }
8092 }
8093 // Set order of the user node.
8094 if (isIdentityOrder(BestOrder))
8095 continue;
8096 fixupOrderingIndices(BestOrder);
8097 SmallVector<int> Mask;
8098 inversePermutation(BestOrder, Mask);
8099 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8100 unsigned E = BestOrder.size();
8101 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8102 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8103 });
8104 // Do an actual reordering, if profitable.
8105 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8106 // Just do the reordering for the nodes with the given VF.
8107 if (TE->Scalars.size() != VF) {
8108 if (TE->ReuseShuffleIndices.size() == VF) {
8109 assert(TE->State != TreeEntry::SplitVectorize &&
8110 "Split vectorized not expected.");
8111 // Need to reorder the reuses masks of the operands with smaller VF to
8112 // be able to find the match between the graph nodes and scalar
8113 // operands of the given node during vectorization/cost estimation.
8114 assert(
8115 (!TE->UserTreeIndex ||
8116 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8117 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8118 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8119 "All users must be of VF size.");
8120 if (SLPReVec) {
8121 assert(SLPReVec && "Only supported by REVEC.");
8122 // ShuffleVectorInst does not do reorderOperands (and it should not
8123 // because ShuffleVectorInst supports only a limited set of
8124 // patterns). Only do reorderNodeWithReuses if the user is not
8125 // ShuffleVectorInst.
8126 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8127 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8128 continue;
8129 }
8130 // Update ordering of the operands with the smaller VF than the given
8131 // one.
8132 reorderNodeWithReuses(*TE, Mask);
8133 // Update orders in user split vectorize nodes.
8134 if (TE->UserTreeIndex &&
8135 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8136 TE->UserTreeIndex.UserTE->reorderSplitNode(
8137 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8138 }
8139 continue;
8140 }
8141 if ((TE->State == TreeEntry::SplitVectorize &&
8142 TE->ReuseShuffleIndices.empty()) ||
8143 ((TE->State == TreeEntry::Vectorize ||
8144 TE->State == TreeEntry::StridedVectorize ||
8145 TE->State == TreeEntry::CompressVectorize) &&
8146 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8147 InsertElementInst>(TE->getMainOp()) ||
8148 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8149 assert(
8150 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8151 TE->ReuseShuffleIndices.empty())) &&
8152 "Alternate instructions are only supported by BinaryOperator "
8153 "and CastInst.");
8154 // Build correct orders for extract{element,value}, loads,
8155 // stores and alternate (split) nodes.
8156 reorderOrder(TE->ReorderIndices, Mask);
8157 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8158 TE->reorderOperands(Mask);
8159 } else {
8160 // Reorder the node and its operands.
8161 TE->reorderOperands(Mask);
8162 assert(TE->ReorderIndices.empty() &&
8163 "Expected empty reorder sequence.");
8164 reorderScalars(TE->Scalars, Mask);
8165 }
8166 if (!TE->ReuseShuffleIndices.empty()) {
8167 // Apply reversed order to keep the original ordering of the reused
8168 // elements to avoid extra reorder indices shuffling.
8169 OrdersType CurrentOrder;
8170 reorderOrder(CurrentOrder, MaskOrder);
8171 SmallVector<int> NewReuses;
8172 inversePermutation(CurrentOrder, NewReuses);
8173 addMask(NewReuses, TE->ReuseShuffleIndices);
8174 TE->ReuseShuffleIndices.swap(NewReuses);
8175 } else if (TE->UserTreeIndex &&
8176 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8177 // Update orders in user split vectorize nodes.
8178 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8179 Mask, MaskOrder);
8180 }
8181 }
8182}
8183
8184void BoUpSLP::buildReorderableOperands(
8185 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8186 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8187 SmallVectorImpl<TreeEntry *> &GatherOps) {
8188 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8189 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8190 return OpData.first == I &&
8191 (OpData.second->State == TreeEntry::Vectorize ||
8192 OpData.second->State == TreeEntry::StridedVectorize ||
8193 OpData.second->State == TreeEntry::CompressVectorize ||
8194 OpData.second->State == TreeEntry::SplitVectorize);
8195 }))
8196 continue;
8197 // Do not request operands, if they do not exist.
8198 if (UserTE->hasState()) {
8199 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8200 UserTE->getOpcode() == Instruction::ExtractValue)
8201 continue;
8202 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8203 continue;
8204 if (UserTE->getOpcode() == Instruction::Store &&
8205 UserTE->State == TreeEntry::Vectorize && I == 1)
8206 continue;
8207 if (UserTE->getOpcode() == Instruction::Load &&
8208 (UserTE->State == TreeEntry::Vectorize ||
8209 UserTE->State == TreeEntry::StridedVectorize ||
8210 UserTE->State == TreeEntry::CompressVectorize))
8211 continue;
8212 }
8213 TreeEntry *TE = getOperandEntry(UserTE, I);
8214 assert(TE && "Expected operand entry.");
8215 if (!TE->isGather()) {
8216 // Add the node to the list of the ordered nodes with the identity
8217 // order.
8218 Edges.emplace_back(I, TE);
8219 // Add ScatterVectorize nodes to the list of operands, where just
8220 // reordering of the scalars is required. Similar to the gathers, so
8221 // simply add to the list of gathered ops.
8222 // If there are reused scalars, process this node as a regular vectorize
8223 // node, just reorder reuses mask.
8224 if (TE->State == TreeEntry::ScatterVectorize &&
8225 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8226 GatherOps.push_back(TE);
8227 continue;
8228 }
8229 if (ReorderableGathers.contains(TE))
8230 GatherOps.push_back(TE);
8231 }
8232}
8233
8234void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8235 struct TreeEntryCompare {
8236 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8237 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8238 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8239 return LHS->Idx < RHS->Idx;
8240 }
8241 };
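// Note (descriptive, not from the original source): the priority queue below
// pops the entry whose user node has the highest index first, so operands
// that share the same user node are dequeued together by the loop below.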
8242 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8243 DenseSet<const TreeEntry *> GathersToOrders;
8244 // Find all reorderable leaf nodes with the given VF.
8245 // Currently these are vectorized loads, extracts without alternate operands +
8246 // some gathering of extracts.
8247 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8248 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8249 if (TE->State != TreeEntry::Vectorize &&
8250 TE->State != TreeEntry::StridedVectorize &&
8251 TE->State != TreeEntry::CompressVectorize &&
8252 TE->State != TreeEntry::SplitVectorize)
8253 NonVectorized.insert(TE.get());
8254 if (std::optional<OrdersType> CurrentOrder =
8255 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8256 Queue.push(TE.get());
8257 if (!(TE->State == TreeEntry::Vectorize ||
8258 TE->State == TreeEntry::StridedVectorize ||
8259 TE->State == TreeEntry::CompressVectorize ||
8260 TE->State == TreeEntry::SplitVectorize) ||
8261 !TE->ReuseShuffleIndices.empty())
8262 GathersToOrders.insert(TE.get());
8263 }
8264 }
8265
8266 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8267 // I.e., if the node has operands that are reordered, try to make at least
8268 // one operand order natural and reorder the others + reorder the
8269 // user node itself.
8270 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8271 while (!Queue.empty()) {
8272 // 1. Filter out only reordered nodes.
8273 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8274 TreeEntry *TE = Queue.top();
8275 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8276 Queue.pop();
8277 SmallVector<TreeEntry *> OrderedOps(1, TE);
8278 while (!Queue.empty()) {
8279 TE = Queue.top();
8280 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8281 break;
8282 Queue.pop();
8283 OrderedOps.push_back(TE);
8284 }
8285 for (TreeEntry *TE : OrderedOps) {
8286 if (!(TE->State == TreeEntry::Vectorize ||
8287 TE->State == TreeEntry::StridedVectorize ||
8288 TE->State == TreeEntry::CompressVectorize ||
8289 TE->State == TreeEntry::SplitVectorize ||
8290 (TE->isGather() && GathersToOrders.contains(TE))) ||
8291 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8292 !Visited.insert(TE).second)
8293 continue;
8294 // Build a map between user nodes and their operands order to speed up the
8295 // search. The graph currently does not provide this dependency directly.
8296 Users.first = TE->UserTreeIndex.UserTE;
8297 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8298 }
8299 if (Users.first) {
8300 auto &Data = Users;
8301 if (Data.first->State == TreeEntry::SplitVectorize) {
8302 assert(
8303 Data.second.size() <= 2 &&
8304 "Expected not greater than 2 operands for split vectorize node.");
8305 if (any_of(Data.second,
8306 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8307 continue;
8308 // Update orders in user split vectorize nodes.
8309 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8310 "Expected exactly 2 entries.");
8311 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8312 TreeEntry &OpTE = *VectorizableTree[P.first];
8313 OrdersType Order = OpTE.ReorderIndices;
8314 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8315 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8316 continue;
8317 const auto BestOrder =
8318 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8319 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8320 continue;
8321 Order = *BestOrder;
8322 }
8323 fixupOrderingIndices(Order);
8324 SmallVector<int> Mask;
8325 inversePermutation(Order, Mask);
8326 const unsigned E = Order.size();
8327 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8328 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8329 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8330 });
8331 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8332 // Clear ordering of the operand.
8333 if (!OpTE.ReorderIndices.empty()) {
8334 OpTE.ReorderIndices.clear();
8335 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8336 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8337 } else {
8338 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8339 reorderScalars(OpTE.Scalars, Mask);
8340 }
8341 }
8342 if (Data.first->ReuseShuffleIndices.empty() &&
8343 !Data.first->ReorderIndices.empty()) {
8344 // Insert user node to the list to try to sink reordering deeper in
8345 // the graph.
8346 Queue.push(Data.first);
8347 }
8348 continue;
8349 }
8350 // Check that operands are used only in the User node.
8351 SmallVector<TreeEntry *> GatherOps;
8352 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8353 GatherOps);
8354 // All operands are reordered and used only in this node - propagate the
8355 // most used order to the user node.
8356 MapVector<OrdersType, unsigned,
8357 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8358 OrdersUses;
8359 // Do the analysis for each tree entry only once, otherwise the order of
8360 // the same node may be considered several times, though it might not be
8361 // profitable.
8362 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8363 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8364 for (const auto &Op : Data.second) {
8365 TreeEntry *OpTE = Op.second;
8366 if (!VisitedOps.insert(OpTE).second)
8367 continue;
8368 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8369 continue;
8370 const auto Order = [&]() -> const OrdersType {
8371 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8372 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8373 IgnoreReorder)
8374 .value_or(OrdersType(1));
8375 return OpTE->ReorderIndices;
8376 }();
8377 // The order is partially ordered, skip it in favor of fully non-ordered
8378 // orders.
8379 if (Order.size() == 1)
8380 continue;
8381
8382 // Check that the reordering does not increase the number of shuffles, i.e.
8383 // same-values nodes have the same parents or their parents have the same parents.
8384 if (!Order.empty() && !isIdentityOrder(Order)) {
8385 Value *Root = OpTE->hasState()
8386 ? OpTE->getMainOp()
8387 : *find_if_not(OpTE->Scalars, isConstant);
8388 auto GetSameNodesUsers = [&](Value *Root) {
8389 SmallSetVector<TreeEntry *, 4> Res;
8390 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8391 if (TE != OpTE && TE->UserTreeIndex &&
8392 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8393 TE->Scalars.size() == OpTE->Scalars.size() &&
8394 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8395 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8396 Res.insert(TE->UserTreeIndex.UserTE);
8397 }
8398 for (const TreeEntry *TE : getTreeEntries(Root)) {
8399 if (TE != OpTE && TE->UserTreeIndex &&
8400 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8401 TE->Scalars.size() == OpTE->Scalars.size() &&
8402 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8403 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8404 Res.insert(TE->UserTreeIndex.UserTE);
8405 }
8406 return Res.takeVector();
8407 };
8408 auto GetNumOperands = [](const TreeEntry *TE) {
8409 if (TE->State == TreeEntry::SplitVectorize)
8410 return TE->getNumOperands();
8411 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8412 return CI->arg_size();
8413 return TE->getNumOperands();
8414 };
8415 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8416 const TreeEntry *TE) {
8417 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8418 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8419 ID = getVectorIntrinsicIDForCall(CI, TLI);
8420 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8421 if (ID != Intrinsic::not_intrinsic &&
8422 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8423 continue;
8424 const TreeEntry *Op = getOperandEntry(TE, Idx);
8425 if (Op->isGather() && Op->hasState()) {
8426 const TreeEntry *VecOp =
8427 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8428 if (VecOp)
8429 Op = VecOp;
8430 }
8431 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8432 return false;
8433 }
8434 return true;
8435 };
8436 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8437 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8438 if (!RevisitedOps.insert(UTE).second)
8439 return false;
8440 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8441 !UTE->ReuseShuffleIndices.empty() ||
8442 (UTE->UserTreeIndex &&
8443 UTE->UserTreeIndex.UserTE == Data.first) ||
8444 (Data.first->UserTreeIndex &&
8445 Data.first->UserTreeIndex.UserTE == UTE) ||
8446 (IgnoreReorder && UTE->UserTreeIndex &&
8447 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8448 NodeShouldBeReorderedWithOperands(UTE);
8449 }))
8450 continue;
8451 for (TreeEntry *UTE : Users) {
8452 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8453 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8454 ID = getVectorIntrinsicIDForCall(CI, TLI);
8455 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8456 if (ID != Intrinsic::not_intrinsic &&
8457 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8458 continue;
8459 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8460 Visited.erase(Op);
8461 Queue.push(const_cast<TreeEntry *>(Op));
8462 }
8463 }
8464 }
8465 unsigned NumOps = count_if(
8466 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8467 return P.second == OpTE;
8468 });
8469 // Stores actually store the mask, not the order, need to invert.
8470 if (OpTE->State == TreeEntry::Vectorize &&
8471 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8472 assert(!OpTE->isAltShuffle() &&
8473 "Alternate instructions are only supported by BinaryOperator "
8474 "and CastInst.");
8475 SmallVector<int> Mask;
8476 inversePermutation(Order, Mask);
8477 unsigned E = Order.size();
8478 OrdersType CurrentOrder(E, E);
8479 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8480 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8481 });
8482 fixupOrderingIndices(CurrentOrder);
8483 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8484 } else {
8485 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8486 }
8487 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8488 const auto AllowsReordering = [&](const TreeEntry *TE) {
8489 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8490 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8491 (IgnoreReorder && TE->Idx == 0))
8492 return true;
8493 if (TE->isGather()) {
8494 if (GathersToOrders.contains(TE))
8495 return !getReorderingData(*TE, /*TopToBottom=*/false,
8496 IgnoreReorder)
8497 .value_or(OrdersType(1))
8498 .empty();
8499 return true;
8500 }
8501 return false;
8502 };
8503 if (OpTE->UserTreeIndex) {
8504 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8505 if (!VisitedUsers.insert(UserTE).second)
8506 continue;
8507 // May reorder user node if it requires reordering, has reused
8508 // scalars, is an alternate op vectorize node or its op nodes require
8509 // reordering.
8510 if (AllowsReordering(UserTE))
8511 continue;
8512 // Check if users allow reordering.
8513 // Currently look up just 1 level of operands to avoid increase of
8514 // the compile time.
8515 // Profitable to reorder if definitely more operands allow
8516 // reordering rather than those with natural order.
8517 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
8518 if (static_cast<unsigned>(count_if(
8519 Ops, [UserTE, &AllowsReordering](
8520 const std::pair<unsigned, TreeEntry *> &Op) {
8521 return AllowsReordering(Op.second) &&
8522 Op.second->UserTreeIndex.UserTE == UserTE;
8523 })) <= Ops.size() / 2)
8524 ++Res.first->second;
8525 }
8526 }
8527 if (OrdersUses.empty()) {
8528 Visited.insert_range(llvm::make_second_range(Data.second));
8529 continue;
8530 }
8531 // Choose the most used order.
8532 unsigned IdentityCnt = 0;
8533 unsigned VF = Data.second.front().second->getVectorFactor();
8534 OrdersType IdentityOrder(VF, VF);
8535 for (auto &Pair : OrdersUses) {
8536 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8537 IdentityCnt += Pair.second;
8538 combineOrders(IdentityOrder, Pair.first);
8539 }
8540 }
8541 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8542 unsigned Cnt = IdentityCnt;
8543 for (auto &Pair : OrdersUses) {
8544 // Prefer the identity order. But if a filled identity (non-empty
8545 // order) was found with the same number of uses as the new candidate order,
8546 // we can choose this candidate order.
8547 if (Cnt < Pair.second) {
8548 combineOrders(Pair.first, BestOrder);
8549 BestOrder = Pair.first;
8550 Cnt = Pair.second;
8551 } else {
8552 combineOrders(BestOrder, Pair.first);
8553 }
8554 }
8555 // Set order of the user node.
8556 if (isIdentityOrder(BestOrder)) {
8557 Visited.insert_range(llvm::make_second_range(Data.second));
8558 continue;
8559 }
8560 fixupOrderingIndices(BestOrder);
8561 // Erase operands from OrderedEntries list and adjust their orders.
8562 VisitedOps.clear();
8563 SmallVector<int> Mask;
8564 inversePermutation(BestOrder, Mask);
8565 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8566 unsigned E = BestOrder.size();
8567 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8568 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8569 });
8570 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8571 TreeEntry *TE = Op.second;
8572 if (!VisitedOps.insert(TE).second)
8573 continue;
8574 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8575 reorderNodeWithReuses(*TE, Mask);
8576 continue;
8577 }
8578 // Gathers are processed separately.
8579 if (TE->State != TreeEntry::Vectorize &&
8580 TE->State != TreeEntry::StridedVectorize &&
8581 TE->State != TreeEntry::CompressVectorize &&
8582 TE->State != TreeEntry::SplitVectorize &&
8583 (TE->State != TreeEntry::ScatterVectorize ||
8584 TE->ReorderIndices.empty()))
8585 continue;
8586 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8587 TE->ReorderIndices.empty()) &&
8588 "Non-matching sizes of user/operand entries.");
8589 reorderOrder(TE->ReorderIndices, Mask);
8590 if (IgnoreReorder && TE == VectorizableTree.front().get())
8591 IgnoreReorder = false;
8592 }
8593 // For gathers just need to reorder its scalars.
8594 for (TreeEntry *Gather : GatherOps) {
8595 assert(Gather->ReorderIndices.empty() &&
8596 "Unexpected reordering of gathers.");
8597 if (!Gather->ReuseShuffleIndices.empty()) {
8598 // Just reorder reuses indices.
8599 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8600 continue;
8601 }
8602 reorderScalars(Gather->Scalars, Mask);
8603 Visited.insert(Gather);
8604 }
8605 // Reorder operands of the user node and set the ordering for the user
8606 // node itself.
8607 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8608 return TE.isAltShuffle() &&
8609 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8610 TE.ReorderIndices.empty());
8611 };
8612 if (Data.first->State != TreeEntry::Vectorize ||
8613 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8614 Data.first->getMainOp()) ||
8615 IsNotProfitableAltCodeNode(*Data.first))
8616 Data.first->reorderOperands(Mask);
8617 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8618 IsNotProfitableAltCodeNode(*Data.first) ||
8619 Data.first->State == TreeEntry::StridedVectorize ||
8620 Data.first->State == TreeEntry::CompressVectorize) {
8621 reorderScalars(Data.first->Scalars, Mask);
8622 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8623 /*BottomOrder=*/true);
8624 if (Data.first->ReuseShuffleIndices.empty() &&
8625 !Data.first->ReorderIndices.empty() &&
8626 !IsNotProfitableAltCodeNode(*Data.first)) {
8627 // Insert user node to the list to try to sink reordering deeper in
8628 // the graph.
8629 Queue.push(Data.first);
8630 }
8631 } else {
8632 reorderOrder(Data.first->ReorderIndices, Mask);
8633 }
8634 }
8635 }
8636 // If the reordering is unnecessary, just remove the reorder.
8637 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8638 VectorizableTree.front()->ReuseShuffleIndices.empty())
8639 VectorizableTree.front()->ReorderIndices.clear();
8640}
8641
8642Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8643 if (Entry.hasState() &&
8644 (Entry.getOpcode() == Instruction::Store ||
8645 Entry.getOpcode() == Instruction::Load) &&
8646 Entry.State == TreeEntry::StridedVectorize &&
8647 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8648 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8649 return dyn_cast<Instruction>(Entry.Scalars.front());
8650}
8651
8652 void BoUpSLP::buildExternalUses(
8653 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8654 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8655 DenseMap<Value *, unsigned> ScalarToExtUses;
8656 SmallPtrSet<Value *, 4> ExternalUsers;
8657 // Collect the values that we need to extract from the tree.
8658 for (auto &TEPtr : VectorizableTree) {
8659 TreeEntry *Entry = TEPtr.get();
8660
8661 // No need to handle users of gathered values.
8662 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8663 continue;
8664
8665 // For each lane:
8666 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8667 Value *Scalar = Entry->Scalars[Lane];
8668 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8669 continue;
8670
8671 // All uses must be replaced already? No need to do it again.
8672 auto It = ScalarToExtUses.find(Scalar);
8673 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8674 continue;
8675
8676 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8677 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8678 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8679 << " from " << *Scalar << "for many users.\n");
8680 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8681 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8682 ExternalUsesWithNonUsers.insert(Scalar);
8683 continue;
8684 }
8685
8686 // Check if the scalar is externally used as an extra arg.
8687 const auto ExtI = ExternallyUsedValues.find(Scalar);
8688 if (ExtI != ExternallyUsedValues.end()) {
8689 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8690 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8691 << FoundLane << " from " << *Scalar << ".\n");
8692 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8693 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8694 continue;
8695 }
8696 for (User *U : Scalar->users()) {
8697 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8698
8699 Instruction *UserInst = dyn_cast<Instruction>(U);
8700 if (!UserInst || isDeleted(UserInst))
8701 continue;
8702
8703 // Ignore users in the user ignore list.
8704 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8705 continue;
8706
8707 // Skip in-tree scalars that become vectors
8708 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8709 !UseEntries.empty()) {
8710 // Some in-tree scalars will remain as scalar in vectorized
8711 // instructions. If that is the case, the one in FoundLane will
8712 // be used.
8713 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8714 isa<LoadInst, StoreInst>(UserInst)) ||
8715 isa<CallInst>(UserInst)) ||
8716 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8717 return UseEntry->State == TreeEntry::ScatterVectorize ||
8718 !doesInTreeUserNeedToExtract(
8719 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8720 TTI);
8721 })) {
8722 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8723 << ".\n");
8724 assert(none_of(UseEntries,
8725 [](TreeEntry *UseEntry) {
8726 return UseEntry->isGather();
8727 }) &&
8728 "Bad state");
8729 continue;
8730 }
8731 U = nullptr;
8732 if (It != ScalarToExtUses.end()) {
8733 ExternalUses[It->second].User = nullptr;
8734 break;
8735 }
8736 }
8737
8738 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8739 U = nullptr;
8740 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8741 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8742 << " from lane " << FoundLane << " from " << *Scalar
8743 << ".\n");
8744 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8745 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8746 ExternalUsesWithNonUsers.insert(Scalar);
8747 if (!U)
8748 break;
8749 }
8750 }
8751 }
8752}
8753
8754 SmallVector<SmallVector<StoreInst *>>
8755 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8756 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
8757 SmallVector<StoreInst *>, 8>
8758 PtrToStoresMap;
8759 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8760 Value *V = TE->Scalars[Lane];
8761 // Don't iterate over the users of constant data.
8762 if (!isa<Instruction>(V))
8763 continue;
8764 // To save compilation time we don't visit if we have too many users.
8765 if (V->hasNUsesOrMore(UsesLimit))
8766 break;
8767
8768 // Collect stores per pointer object.
8769 for (User *U : V->users()) {
8770 auto *SI = dyn_cast<StoreInst>(U);
8771 // Test whether we can handle the store. V might be a global, which could
8772 // be used in a different function.
8773 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8774 !isValidElementType(SI->getValueOperand()->getType()))
8775 continue;
8776 // Skip entry if already vectorized.
8777 if (isVectorized(U))
8778 continue;
8779
8780 Value *Ptr =
8781 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8782 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8783 SI->getValueOperand()->getType(), Ptr}];
8784 // For now just keep one store per pointer object per lane.
8785 // TODO: Extend this to support multiple stores per pointer per lane
8786 if (StoresVec.size() > Lane)
8787 continue;
8788 if (!StoresVec.empty()) {
8789 std::optional<int64_t> Diff = getPointersDiff(
8790 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8791 SI->getValueOperand()->getType(),
8792 StoresVec.front()->getPointerOperand(), *DL, *SE,
8793 /*StrictCheck=*/true);
8794 // We failed to compare the pointers so just abandon this store.
8795 if (!Diff)
8796 continue;
8797 }
8798 StoresVec.push_back(SI);
8799 }
8800 }
8801 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8802 unsigned I = 0;
8803 for (auto &P : PtrToStoresMap) {
8804 Res[I].swap(P.second);
8805 ++I;
8806 }
8807 return Res;
8808}
8809
8810bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8811 OrdersType &ReorderIndices) const {
8812 // We check whether the stores in StoresVec can form a vector by sorting them
8813 // and checking whether they are consecutive.
8814
8815 // To avoid calling getPointersDiff() while sorting we create a vector of
8816 // pairs {store, offset from first} and sort this instead.
8817 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8818 StoreInst *S0 = StoresVec[0];
8819 StoreOffsetVec.emplace_back(0, 0);
8820 Type *S0Ty = S0->getValueOperand()->getType();
8821 Value *S0Ptr = S0->getPointerOperand();
8822 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8823 StoreInst *SI = StoresVec[Idx];
8824 std::optional<int64_t> Diff =
8825 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8826 SI->getPointerOperand(), *DL, *SE,
8827 /*StrictCheck=*/true);
8828 StoreOffsetVec.emplace_back(*Diff, Idx);
8829 }
8830
8831 // Check if the stores are consecutive by checking if their difference is 1.
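// E.g. (illustrative), sorted offsets {0, 1, 2, 3} pass this check, while
// {0, 2, 3, 4} fail because of the gap between 0 and 2.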
8832 if (StoreOffsetVec.size() != StoresVec.size())
8833 return false;
8834 sort(StoreOffsetVec, llvm::less_first());
8835 unsigned Idx = 0;
8836 int64_t PrevDist = 0;
8837 for (const auto &P : StoreOffsetVec) {
8838 if (Idx > 0 && P.first != PrevDist + 1)
8839 return false;
8840 PrevDist = P.first;
8841 ++Idx;
8842 }
8843
8844 // Calculate the shuffle indices according to their offset against the sorted
8845 // StoreOffsetVec.
8846 ReorderIndices.assign(StoresVec.size(), 0);
8847 bool IsIdentity = true;
8848 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8849 ReorderIndices[P.second] = I;
8850 IsIdentity &= P.second == I;
8851 }
8852 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8853 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8854 // same convention here.
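// Illustrative example (not from the original source): stores whose offsets in
// StoresVec order are {1, 0, 3, 2} produce ReorderIndices {1, 0, 3, 2}, while
// offsets already in program order produce an empty (identity) ReorderIndices.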
8855 if (IsIdentity)
8856 ReorderIndices.clear();
8857
8858 return true;
8859}
8860
8861#ifndef NDEBUG
8862 static void dumpOrder(const BoUpSLP::OrdersType &Order) {
8863 for (unsigned Idx : Order)
8864 dbgs() << Idx << ", ";
8865 dbgs() << "\n";
8866}
8867#endif
8868
8869 SmallVector<BoUpSLP::OrdersType, 1>
8870 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8871 unsigned NumLanes = TE->Scalars.size();
8872
8873 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8874
8875 // Holds the reorder indices for each candidate store vector that is a user of
8876 // the current TreeEntry.
8877 SmallVector<OrdersType, 1> ExternalReorderIndices;
8878
8879 // Now inspect the stores collected per pointer and look for vectorization
8880 // candidates. For each candidate calculate the reorder index vector and push
8881 // it into `ExternalReorderIndices`
8882 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8883 // If we have fewer than NumLanes stores, then we can't form a vector.
8884 if (StoresVec.size() != NumLanes)
8885 continue;
8886
8887 // If the stores are not consecutive then abandon this StoresVec.
8888 OrdersType ReorderIndices;
8889 if (!canFormVector(StoresVec, ReorderIndices))
8890 continue;
8891
8892 // We now know that the scalars in StoresVec can form a vector instruction,
8893 // so set the reorder indices.
8894 ExternalReorderIndices.push_back(ReorderIndices);
8895 }
8896 return ExternalReorderIndices;
8897}
8898
8899 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8900 const SmallDenseSet<Value *> &UserIgnoreLst) {
8901 deleteTree();
8902 UserIgnoreList = &UserIgnoreLst;
8903 if (!allSameType(Roots))
8904 return;
8905 buildTreeRec(Roots, 0, EdgeInfo());
8906}
8907
8908 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8909 deleteTree();
8910 if (!allSameType(Roots))
8911 return;
8912 buildTreeRec(Roots, 0, EdgeInfo());
8913}
8914
8915 /// Tries to find a subvector of loads and builds a new vector of only loads if
8916 /// it can be profitable.
8917 static void gatherPossiblyVectorizableLoads(
8918 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8919 ScalarEvolution &SE, const TargetTransformInfo &TTI,
8920 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8921 bool AddNew = true) {
8922 if (VL.empty())
8923 return;
8924 Type *ScalarTy = getValueType(VL.front());
8925 if (!isValidElementType(ScalarTy))
8926 return;
8927 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
8928 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8929 for (Value *V : VL) {
8930 auto *LI = dyn_cast<LoadInst>(V);
8931 if (!LI)
8932 continue;
8933 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
8934 continue;
8935 bool IsFound = false;
8936 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
8937 assert(LI->getParent() == Data.front().first->getParent() &&
8938 LI->getType() == Data.front().first->getType() &&
8939 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8940 getUnderlyingObject(Data.front().first->getPointerOperand(),
8942 "Expected loads with the same type, same parent and same "
8943 "underlying pointer.");
8944 std::optional<int64_t> Dist = getPointersDiff(
8945 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
8946 Data.front().first->getPointerOperand(), DL, SE,
8947 /*StrictCheck=*/true);
8948 if (!Dist)
8949 continue;
8950 auto It = Map.find(*Dist);
8951 if (It != Map.end() && It->second != LI)
8952 continue;
8953 if (It == Map.end()) {
8954 Data.emplace_back(LI, *Dist);
8955 Map.try_emplace(*Dist, LI);
8956 }
8957 IsFound = true;
8958 break;
8959 }
8960 if (!IsFound) {
8961 ClusteredLoads.emplace_back().emplace_back(LI, 0);
8962 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
8963 }
8964 }
8965 auto FindMatchingLoads =
8966 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
8967 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
8968 &GatheredLoads,
8969 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
8970 int64_t &Offset, unsigned &Start) {
8971 if (Loads.empty())
8972 return GatheredLoads.end();
8973 LoadInst *LI = Loads.front().first;
8974 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
8975 if (Idx < Start)
8976 continue;
8977 ToAdd.clear();
8978 if (LI->getParent() != Data.front().first->getParent() ||
8979 LI->getType() != Data.front().first->getType())
8980 continue;
8981 std::optional<int64_t> Dist =
8982 getPointersDiff(LI->getType(), LI->getPointerOperand(),
8983 Data.front().first->getType(),
8984 Data.front().first->getPointerOperand(), DL, SE,
8985 /*StrictCheck=*/true);
8986 if (!Dist)
8987 continue;
8988 SmallSet<int64_t, 4> DataDists;
8989 SmallPtrSet<LoadInst *, 4> DataLoads;
8990 for (std::pair<LoadInst *, int64_t> P : Data) {
8991 DataDists.insert(P.second);
8992 DataLoads.insert(P.first);
8993 }
8994 // Found matching gathered loads - check if all loads are unique or
8995 // can be effectively vectorized.
8996 unsigned NumUniques = 0;
8997 for (auto [Cnt, Pair] : enumerate(Loads)) {
8998 bool Used = DataLoads.contains(Pair.first);
8999 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9000 ++NumUniques;
9001 ToAdd.insert(Cnt);
9002 } else if (Used) {
9003 Repeated.insert(Cnt);
9004 }
9005 }
9006 if (NumUniques > 0 &&
9007 (Loads.size() == NumUniques ||
9008 (Loads.size() - NumUniques >= 2 &&
9009 Loads.size() - NumUniques >= Loads.size() / 2 &&
9010 (has_single_bit(Data.size() + NumUniques) ||
9011 bit_ceil(Data.size()) <
9012 bit_ceil(Data.size() + NumUniques))))) {
9013 Offset = *Dist;
9014 Start = Idx + 1;
9015 return std::next(GatheredLoads.begin(), Idx);
9016 }
9017 }
9018 ToAdd.clear();
9019 return GatheredLoads.end();
9020 };
9021 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9022 unsigned Start = 0;
9023 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9024 int64_t Offset = 0;
9025 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9026 Offset, Start);
9027 while (It != GatheredLoads.end()) {
9028 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9029 for (unsigned Idx : LocalToAdd)
9030 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9031 ToAdd.insert_range(LocalToAdd);
9032 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9033 Start);
9034 }
9035 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9036 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9037 })) {
9038 auto AddNewLoads =
9039 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9040 for (unsigned Idx : seq<unsigned>(Data.size())) {
9041 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9042 continue;
9043 Loads.push_back(Data[Idx]);
9044 }
9045 };
9046 if (!AddNew) {
9047 LoadInst *LI = Data.front().first;
9048 It = find_if(
9049 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9050 return PD.front().first->getParent() == LI->getParent() &&
9051 PD.front().first->getType() == LI->getType();
9052 });
9053 while (It != GatheredLoads.end()) {
9054 AddNewLoads(*It);
9055 It = std::find_if(
9056 std::next(It), GatheredLoads.end(),
9057 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9058 return PD.front().first->getParent() == LI->getParent() &&
9059 PD.front().first->getType() == LI->getType();
9060 });
9061 }
9062 }
9063 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9064 AddNewLoads(GatheredLoads.emplace_back());
9065 }
9066 }
9067}
9068
9069void BoUpSLP::tryToVectorizeGatheredLoads(
9070 const SmallMapVector<
9071 std::tuple<BasicBlock *, Value *, Type *>,
9072 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9073 &GatheredLoads) {
9074 GatheredLoadsEntriesFirst = VectorizableTree.size();
9075
9076 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9077 LoadEntriesToVectorize.size());
9078 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9079 Set.insert_range(VectorizableTree[Idx]->Scalars);
9080
9081 // Sort loads by distance.
9082 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9083 const std::pair<LoadInst *, int64_t> &L2) {
9084 return L1.second > L2.second;
9085 };
9086
9087 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9088 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9089 Loads.size());
9090 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9091 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9092 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9093 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9094 };
9095
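// Attempts to vectorize slices of the given loads for decreasing candidate
// VFs, recording the vectorized ranges (including masked-gather candidates)
// and collecting the loads that remain scalar in NonVectorized.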
9096 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9097 BoUpSLP::ValueSet &VectorizedLoads,
9098 SmallVectorImpl<LoadInst *> &NonVectorized,
9099 bool Final, unsigned MaxVF) {
9100 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9101 unsigned StartIdx = 0;
9102 SmallVector<int> CandidateVFs;
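// Collect candidate VFs starting from the widest full-register factor and
// going down; optionally include non-power-of-2 sizes.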
9103 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9104 CandidateVFs.push_back(MaxVF);
9105 for (int NumElts = getFloorFullVectorNumberOfElements(
9106 *TTI, Loads.front()->getType(), MaxVF);
9107 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9108 *TTI, Loads.front()->getType(), NumElts - 1)) {
9109 CandidateVFs.push_back(NumElts);
9110 if (VectorizeNonPowerOf2 && NumElts > 2)
9111 CandidateVFs.push_back(NumElts - 1);
9112 }
9113
9114 if (Final && CandidateVFs.empty())
9115 return Results;
9116
9117 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9118 for (unsigned NumElts : CandidateVFs) {
9119 if (Final && NumElts > BestVF)
9120 continue;
9121 SmallVector<unsigned> MaskedGatherVectorized;
9122 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9123 ++Cnt) {
9124 ArrayRef<LoadInst *> Slice =
9125 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9126 if (VectorizedLoads.count(Slice.front()) ||
9127 VectorizedLoads.count(Slice.back()) ||
9128 areKnownNonVectorizableLoads(Slice))
9129 continue;
9130 // Check if it is profitable to try vectorizing gathered loads. It is
9131 // profitable if we have more than 3 consecutive loads or if we have
9132 // fewer but all users are vectorized or deleted.
9133 bool AllowToVectorize = false;
9134 // Check if it is profitable to vectorize 2-elements loads.
9135 if (NumElts == 2) {
9136 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9137 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9138 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9139 for (LoadInst *LI : Slice) {
9140 // If single use/user - allow to vectorize.
9141 if (LI->hasOneUse())
9142 continue;
9143 // 1. Check if number of uses equals number of users.
9144 // 2. All users are deleted.
9145 // 3. The load broadcasts are not allowed or the load is not
9146 // broadcasted.
9147 if (static_cast<unsigned int>(std::distance(
9148 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9149 return false;
9150 if (!IsLegalBroadcastLoad)
9151 continue;
9152 if (LI->hasNUsesOrMore(UsesLimit))
9153 return false;
9154 for (User *U : LI->users()) {
9155 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9156 continue;
9157 for (const TreeEntry *UTE : getTreeEntries(U)) {
9158 for (int I : seq<int>(UTE->getNumOperands())) {
9159 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9160 return V == LI || isa<PoisonValue>(V);
9161 }))
9162 // Found legal broadcast - do not vectorize.
9163 return false;
9164 }
9165 }
9166 }
9167 }
9168 return true;
9169 };
9170 AllowToVectorize = CheckIfAllowed(Slice);
9171 } else {
9172 AllowToVectorize =
9173 (NumElts >= 3 ||
9174 any_of(ValueToGatherNodes.at(Slice.front()),
9175 [=](const TreeEntry *TE) {
9176 return TE->Scalars.size() == 2 &&
9177 ((TE->Scalars.front() == Slice.front() &&
9178 TE->Scalars.back() == Slice.back()) ||
9179 (TE->Scalars.front() == Slice.back() &&
9180 TE->Scalars.back() == Slice.front()));
9181 })) &&
9182 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9183 Slice.size());
9184 }
9185 if (AllowToVectorize) {
9186 SmallVector<Value *> PointerOps;
9187 OrdersType CurrentOrder;
9188 // Try to build vector load.
9189 ArrayRef<Value *> Values(
9190 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9191 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9192 PointerOps, &BestVF);
9193 if (LS != LoadsState::Gather ||
9194 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9195 if (LS == LoadsState::ScatterVectorize) {
9196 if (MaskedGatherVectorized.empty() ||
9197 Cnt >= MaskedGatherVectorized.back() + NumElts)
9198 MaskedGatherVectorized.push_back(Cnt);
9199 continue;
9200 }
9201 if (LS != LoadsState::Gather) {
9202 Results.emplace_back(Values, LS);
9203 VectorizedLoads.insert_range(Slice);
9204 // If we vectorized initial block, no need to try to vectorize it
9205 // again.
9206 if (Cnt == StartIdx)
9207 StartIdx += NumElts;
9208 }
9209 // Check if the whole array was vectorized already - exit.
9210 if (StartIdx >= Loads.size())
9211 break;
9212 // Erase last masked gather candidate, if another candidate within
9213 // the range is found to be better.
9214 if (!MaskedGatherVectorized.empty() &&
9215 Cnt < MaskedGatherVectorized.back() + NumElts)
9216 MaskedGatherVectorized.pop_back();
9217 Cnt += NumElts - 1;
9218 continue;
9219 }
9220 }
9221 if (!AllowToVectorize || BestVF == 0)
9222 registerNonVectorizableLoads(Slice);
9223 }
9224 // Mark masked gathers candidates as vectorized, if any.
9225 for (unsigned Cnt : MaskedGatherVectorized) {
9226 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9227 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9228 ArrayRef<Value *> Values(
9229 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9230 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9231 VectorizedLoads.insert_range(Slice);
9232 // If we vectorized initial block, no need to try to vectorize it again.
9233 if (Cnt == StartIdx)
9234 StartIdx += NumElts;
9235 }
9236 }
9237 for (LoadInst *LI : Loads) {
9238 if (!VectorizedLoads.contains(LI))
9239 NonVectorized.push_back(LI);
9240 }
9241 return Results;
9242 };
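// Processes each group of related loads: sorts them by distance, keeps the
// longest consecutive runs, tries to vectorize them (also checking the
// original unsorted order) and detects interleaved patterns.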
9243 auto ProcessGatheredLoads =
9244 [&, &TTI = *TTI](
9245 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9246 bool Final = false) {
9247 SmallVector<LoadInst *> NonVectorized;
9248 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9249 GatheredLoads) {
9250 if (LoadsDists.size() <= 1) {
9251 NonVectorized.push_back(LoadsDists.back().first);
9252 continue;
9253 }
9254 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9255 LoadsDists);
9256 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9257 stable_sort(LocalLoadsDists, LoadSorter);
9258 SmallVector<LoadInst *> Loads;
9259 unsigned MaxConsecutiveDistance = 0;
9260 unsigned CurrentConsecutiveDist = 1;
9261 int64_t LastDist = LocalLoadsDists.front().second;
9262 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9263 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9264 if (isVectorized(L.first))
9265 continue;
9266 assert(LastDist >= L.second &&
9267 "Expected first distance always not less than second");
9268 if (static_cast<uint64_t>(LastDist - L.second) ==
9269 CurrentConsecutiveDist) {
9270 ++CurrentConsecutiveDist;
9271 MaxConsecutiveDistance =
9272 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9273 Loads.push_back(L.first);
9274 continue;
9275 }
9276 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9277 !Loads.empty())
9278 Loads.pop_back();
9279 CurrentConsecutiveDist = 1;
9280 LastDist = L.second;
9281 Loads.push_back(L.first);
9282 }
9283 if (Loads.size() <= 1)
9284 continue;
9285 if (AllowMaskedGather)
9286 MaxConsecutiveDistance = Loads.size();
9287 else if (MaxConsecutiveDistance < 2)
9288 continue;
9289 BoUpSLP::ValueSet VectorizedLoads;
9290 SmallVector<LoadInst *> SortedNonVectorized;
9291 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9292 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9293 Final, MaxConsecutiveDistance);
9294 if (!Results.empty() && !SortedNonVectorized.empty() &&
9295 OriginalLoads.size() == Loads.size() &&
9296 MaxConsecutiveDistance == Loads.size() &&
9297 any_of(Results,
9298 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9299 return P.second == LoadsState::ScatterVectorize;
9300 })) {
9301 VectorizedLoads.clear();
9302 SmallVector<LoadInst *> UnsortedNonVectorized;
9303 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9304 UnsortedResults =
9305 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9306 UnsortedNonVectorized, Final,
9307 OriginalLoads.size());
9308 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9309 SortedNonVectorized.swap(UnsortedNonVectorized);
9310 Results.swap(UnsortedResults);
9311 }
9312 }
9313 for (auto [Slice, _] : Results) {
9314 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9315 << Slice.size() << ")\n");
9316 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9317 for (Value *L : Slice)
9318 if (!isVectorized(L))
9319 SortedNonVectorized.push_back(cast<LoadInst>(L));
9320 continue;
9321 }
9322
9323 // Select maximum VF as a maximum of user gathered nodes and
9324 // distance between scalar loads in these nodes.
9325 unsigned MaxVF = Slice.size();
9326 unsigned UserMaxVF = 0;
9327 unsigned InterleaveFactor = 0;
9328 if (MaxVF == 2) {
9329 UserMaxVF = MaxVF;
9330 } else {
9331 // Find the distance between segments of the interleaved loads.
9332 std::optional<unsigned> InterleavedLoadsDistance = 0;
9333 unsigned Order = 0;
9334 std::optional<unsigned> CommonVF = 0;
9335 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9336 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9337 for (auto [Idx, V] : enumerate(Slice)) {
9338 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9339 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9340 unsigned Pos =
9341 EntryToPosition.try_emplace(E, Idx).first->second;
9342 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9343 if (CommonVF) {
9344 if (*CommonVF == 0) {
9345 CommonVF = E->Scalars.size();
9346 continue;
9347 }
9348 if (*CommonVF != E->Scalars.size())
9349 CommonVF.reset();
9350 }
9351 // Check if the load is part of an interleaved load.
9352 if (Pos != Idx && InterleavedLoadsDistance) {
9353 if (!DeinterleavedNodes.contains(E) &&
9354 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9355 if (isa<Constant>(V))
9356 return false;
9357 if (isVectorized(V))
9358 return true;
9359 const auto &Nodes = ValueToGatherNodes.at(V);
9360 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9361 !is_contained(Slice, V);
9362 })) {
9363 InterleavedLoadsDistance.reset();
9364 continue;
9365 }
9366 DeinterleavedNodes.insert(E);
9367 if (*InterleavedLoadsDistance == 0) {
9368 InterleavedLoadsDistance = Idx - Pos;
9369 continue;
9370 }
9371 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9372 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9373 InterleavedLoadsDistance.reset();
9374 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9375 }
9376 }
9377 }
9378 DeinterleavedNodes.clear();
9379 // Check if the large load represents an interleaved load operation.
9380 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9381 CommonVF.value_or(0) != 0) {
9382 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9383 unsigned VF = *CommonVF;
9384 OrdersType Order;
9385 SmallVector<Value *> PointerOps;
9386 // Segmented load detected - vectorize at maximum vector factor.
9387 if (InterleaveFactor <= Slice.size() &&
9388 TTI.isLegalInterleavedAccessType(
9389 getWidenedType(Slice.front()->getType(), VF),
9390 InterleaveFactor,
9391 cast<LoadInst>(Slice.front())->getAlign(),
9392 cast<LoadInst>(Slice.front())
9393 ->getPointerAddressSpace()) &&
9394 canVectorizeLoads(Slice, Slice.front(), Order,
9395 PointerOps) == LoadsState::Vectorize) {
9396 UserMaxVF = InterleaveFactor * VF;
9397 } else {
9398 InterleaveFactor = 0;
9399 }
9400 }
9401 // Cannot represent the loads as consecutive vectorizable nodes -
9402 // just exit.
9403 unsigned ConsecutiveNodesSize = 0;
9404 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9405 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9406 [&, Slice = Slice](const auto &P) {
9407 const auto *It = find_if(Slice, [&](Value *V) {
9408 return std::get<1>(P).contains(V);
9409 });
9410 if (It == Slice.end())
9411 return false;
9412 const TreeEntry &TE =
9413 *VectorizableTree[std::get<0>(P)];
9414 ArrayRef<Value *> VL = TE.Scalars;
9415 OrdersType Order;
9416 SmallVector<Value *> PointerOps;
9417 LoadsState State = canVectorizeLoads(
9418 VL, VL.front(), Order, PointerOps);
9419 if (State == LoadsState::ScatterVectorize ||
9420 State == LoadsState::StridedVectorize)
9421 return false;
9422 ConsecutiveNodesSize += VL.size();
9423 size_t Start = std::distance(Slice.begin(), It);
9424 size_t Sz = Slice.size() - Start;
9425 return Sz < VL.size() ||
9426 Slice.slice(Start, VL.size()) != VL;
9427 }))
9428 continue;
9429 // Try to build long masked gather loads.
9430 UserMaxVF = bit_ceil(UserMaxVF);
9431 if (InterleaveFactor == 0 &&
9432 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9433 [&, Slice = Slice](unsigned Idx) {
9434 OrdersType Order;
9435 SmallVector<Value *> PointerOps;
9436 return canVectorizeLoads(
9437 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9438 Slice[Idx * UserMaxVF], Order,
9439 PointerOps) ==
9440 LoadsState::ScatterVectorize;
9441 }))
9442 UserMaxVF = MaxVF;
9443 if (Slice.size() != ConsecutiveNodesSize)
9444 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9445 }
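// Greedily build gathered-load tree entries for the slice, halving the VF
// whenever a subslice cannot be vectorized.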
9446 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9447 bool IsVectorized = true;
9448 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9449 ArrayRef<Value *> SubSlice =
9450 Slice.slice(I, std::min(VF, E - I));
9451 if (isVectorized(SubSlice.front()))
9452 continue;
9453 // Check if the subslice belongs to a to-be-vectorized entry, which is
9454 // not equal to this entry.
9455 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9456 [&](const auto &P) {
9457 return !SubSlice.equals(
9458 VectorizableTree[std::get<0>(P)]
9459 ->Scalars) &&
9460 set_is_subset(SubSlice, std::get<1>(P));
9461 }))
9462 continue;
9463 unsigned Sz = VectorizableTree.size();
9464 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9465 if (Sz == VectorizableTree.size()) {
9466 IsVectorized = false;
9467 // Try non-interleaved vectorization with smaller vector
9468 // factor.
9469 if (InterleaveFactor > 0) {
9470 VF = 2 * (MaxVF / InterleaveFactor);
9471 InterleaveFactor = 0;
9472 }
9473 continue;
9474 }
9475 }
9476 if (IsVectorized)
9477 break;
9478 }
9479 }
9480 NonVectorized.append(SortedNonVectorized);
9481 }
9482 return NonVectorized;
9483 };
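// Run the vectorization attempt for every (block, base pointer, type) group;
// if enough loads remain scalar and masked gathers are legal, regroup the
// leftovers and try once more as a final attempt.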
9484 for (const auto &GLs : GatheredLoads) {
9485 const auto &Ref = GLs.second;
9486 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9487 if (!Ref.empty() && !NonVectorized.empty() &&
9488 std::accumulate(
9489 Ref.begin(), Ref.end(), 0u,
9490 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9491 -> unsigned { return S + LoadsDists.size(); }) !=
9492 NonVectorized.size() &&
9493 IsMaskedGatherSupported(NonVectorized)) {
9494 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9495 FinalGatheredLoads;
9496 for (LoadInst *LI : NonVectorized) {
9497 // Reinsert non-vectorized loads to other list of loads with the same
9498 // base pointers.
9499 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9500 FinalGatheredLoads,
9501 /*AddNew=*/false);
9502 }
9503 // Final attempt to vectorize non-vectorized loads.
9504 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9505 }
9506 }
9507 // Try to vectorize postponed load entries, previously marked as gathered.
9508 for (unsigned Idx : LoadEntriesToVectorize) {
9509 const TreeEntry &E = *VectorizableTree[Idx];
9510 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9511 // Avoid reordering, if possible.
9512 if (!E.ReorderIndices.empty()) {
9513 // Build a mask out of the reorder indices and reorder scalars per this
9514 // mask.
9515 SmallVector<int> ReorderMask;
9516 inversePermutation(E.ReorderIndices, ReorderMask);
9517 reorderScalars(GatheredScalars, ReorderMask);
9518 }
9519 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9520 }
9521 // If no new entries were created, there are no gathered load entries that
9522 // must be handled.
9523 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9524 VectorizableTree.size())
9525 GatheredLoadsEntriesFirst.reset();
9526}
9527
9528/// Generates key/subkey pair for the given value to provide effective sorting
9529/// of the values and better detection of the vectorizable values sequences. The
9530/// keys/subkeys can be used for better sorting of the values themselves (keys)
9531/// and in values subgroups (subkeys).
9532static std::pair<size_t, size_t> generateKeySubkey(
9533 Value *V, const TargetLibraryInfo *TLI,
9534 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9535 bool AllowAlternate) {
9536 hash_code Key = hash_value(V->getValueID() + 2);
9537 hash_code SubKey = hash_value(0);
9538 // Sort the loads by the distance between the pointers.
9539 if (auto *LI = dyn_cast<LoadInst>(V)) {
9540 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9541 if (LI->isSimple())
9542 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9543 else
9544 Key = SubKey = hash_value(LI);
9545 } else if (isVectorLikeInstWithConstOps(V)) {
9546 // Sort extracts by the vector operands.
9547 if (isa<ExtractElementInst, UndefValue>(V))
9548 Key = hash_value(Value::UndefValueVal + 1);
9549 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9550 if (!isUndefVector(EI->getVectorOperand()).all() &&
9551 !isa<UndefValue>(EI->getIndexOperand()))
9552 SubKey = hash_value(EI->getVectorOperand());
9553 }
9554 } else if (auto *I = dyn_cast<Instruction>(V)) {
9555 // Sort other instructions just by the opcodes except for CMPInst.
9556 // For CMP also sort by the predicate kind.
9557 if ((isa<BinaryOperator, CastInst>(I)) &&
9558 isValidForAlternation(I->getOpcode())) {
9559 if (AllowAlternate)
9560 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9561 else
9562 Key = hash_combine(hash_value(I->getOpcode()), Key);
9563 SubKey = hash_combine(
9564 hash_value(I->getOpcode()), hash_value(I->getType()),
9565 hash_value(isa<BinaryOperator>(I)
9566 ? I->getType()
9567 : cast<CastInst>(I)->getOperand(0)->getType()));
9568 // For casts, look through the only operand to improve compile time.
9569 if (isa<CastInst>(I)) {
9570 std::pair<size_t, size_t> OpVals =
9571 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9572 /*AllowAlternate=*/true);
9573 Key = hash_combine(OpVals.first, Key);
9574 SubKey = hash_combine(OpVals.first, SubKey);
9575 }
9576 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9577 CmpInst::Predicate Pred = CI->getPredicate();
9578 if (CI->isCommutative())
9579 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9580 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9581 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9582 hash_value(SwapPred),
9583 hash_value(CI->getOperand(0)->getType()));
9584 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9585 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9586 if (isTriviallyVectorizable(ID)) {
9587 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9588 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9589 SubKey = hash_combine(hash_value(I->getOpcode()),
9590 hash_value(Call->getCalledFunction()));
9591 } else {
9592 Key = hash_combine(hash_value(Call), Key);
9593 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9594 }
9595 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9596 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9597 hash_value(Op.Tag), SubKey);
9598 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9599 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9600 SubKey = hash_value(Gep->getPointerOperand());
9601 else
9602 SubKey = hash_value(Gep);
9603 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9604 !isa<ConstantInt>(I->getOperand(1))) {
9605 // Do not try to vectorize instructions with potentially high cost.
9606 SubKey = hash_value(I);
9607 } else {
9608 SubKey = hash_value(I->getOpcode());
9609 }
9610 Key = hash_combine(hash_value(I->getParent()), Key);
9611 }
9612 return std::make_pair(Key, SubKey);
9613}
9614
9615/// Checks if the specified instruction \p I is a main operation for the given
9616/// \p MainOp and \p AltOp instructions.
9617static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9618 Instruction *AltOp, const TargetLibraryInfo &TLI);
9619
9620bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9621 ArrayRef<Value *> VL) const {
9622 Type *ScalarTy = S.getMainOp()->getType();
9623 unsigned Opcode0 = S.getOpcode();
9624 unsigned Opcode1 = S.getAltOpcode();
9625 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9626 // If this pattern is supported by the target then consider it profitable.
9627 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9628 Opcode1, OpcodeMask))
9629 return true;
9630 SmallVector<SmallVector<Value *>> Operands;
9631 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9632 Operands.emplace_back();
9633 // Prepare the operand vector.
9634 for (Value *V : VL) {
9635 if (isa<PoisonValue>(V)) {
9636 Operands.back().push_back(
9637 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9638 continue;
9639 }
9640 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9641 }
9642 }
9643 if (Operands.size() == 2) {
9644 // Try to find the best operand candidates.
9645 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9646 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9647 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9648 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9649 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9650 std::optional<int> Res = findBestRootPair(Candidates);
9651 switch (Res.value_or(0)) {
9652 case 0:
9653 break;
9654 case 1:
9655 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9656 break;
9657 case 2:
9658 std::swap(Operands[0][I], Operands[1][I]);
9659 break;
9660 default:
9661 llvm_unreachable("Unexpected index.");
9662 }
9663 }
9664 }
9665 DenseSet<unsigned> UniqueOpcodes;
9666 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9667 unsigned NonInstCnt = 0;
9668 // Estimate number of instructions, required for the vectorized node and for
9669 // the buildvector node.
9670 unsigned UndefCnt = 0;
9671 // Count the number of extra shuffles, required for vector nodes.
9672 unsigned ExtraShuffleInsts = 0;
9673 // Check that operands do not contain same values and create either perfect
9674 // diamond match or shuffled match.
9675 if (Operands.size() == 2) {
9676 // Do not count same operands twice.
9677 if (Operands.front() == Operands.back()) {
9678 Operands.erase(Operands.begin());
9679 } else if (!allConstant(Operands.front()) &&
9680 all_of(Operands.front(), [&](Value *V) {
9681 return is_contained(Operands.back(), V);
9682 })) {
9683 Operands.erase(Operands.begin());
9684 ++ExtraShuffleInsts;
9685 }
9686 }
9687 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9688 // Vectorize the node, if:
9689 // 1. At least a single operand is constant or splat.
9690 // 2. Operands have many loop invariants (the instructions are not loop
9691 // invariants).
9692 // 3. At least a single unique operand is supposed to be vectorized.
9693 return none_of(Operands,
9694 [&](ArrayRef<Value *> Op) {
9695 if (allConstant(Op) ||
9696 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9697 getSameOpcode(Op, *TLI)))
9698 return false;
9699 SmallDenseMap<Value *, unsigned, 16> Uniques;
9700 for (Value *V : Op) {
9701 if (isa<Constant, ExtractElementInst>(V) ||
9702 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9703 if (isa<UndefValue>(V))
9704 ++UndefCnt;
9705 continue;
9706 }
9707 auto Res = Uniques.try_emplace(V, 0);
9708 // Found first duplicate - need to add shuffle.
9709 if (!Res.second && Res.first->second == 1)
9710 ++ExtraShuffleInsts;
9711 ++Res.first->getSecond();
9712 if (auto *I = dyn_cast<Instruction>(V))
9713 UniqueOpcodes.insert(I->getOpcode());
9714 else if (Res.second)
9715 ++NonInstCnt;
9716 }
9717 return none_of(Uniques, [&](const auto &P) {
9718 return P.first->hasNUsesOrMore(P.second + 1) &&
9719 none_of(P.first->users(), [&](User *U) {
9720 return isVectorized(U) || Uniques.contains(U);
9721 });
9722 });
9723 }) ||
9724 // Do not vectorize node, if estimated number of vector instructions is
9725 // more than estimated number of buildvector instructions. Number of
9726 // vector operands is number of vector instructions + number of vector
9727 // instructions for operands (buildvectors). Number of buildvector
9728 // instructions is just number_of_operands * number_of_scalars.
9729 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9730 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9731 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9732}
9733
9734/// Builds the argument types vector for the given call instruction with the
9735/// given \p ID for the specified vector factor.
9736 static SmallVector<Type *>
9737 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9738 const unsigned VF, unsigned MinBW,
9739 const TargetTransformInfo *TTI) {
9740 SmallVector<Type *> ArgTys;
9741 for (auto [Idx, Arg] : enumerate(CI->args())) {
9742 if (ID != Intrinsic::not_intrinsic) {
9743 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9744 ArgTys.push_back(Arg->getType());
9745 continue;
9746 }
9747 if (MinBW > 0) {
9748 ArgTys.push_back(
9749 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9750 continue;
9751 }
9752 }
9753 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9754 }
9755 return ArgTys;
9756}
9757
9758/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9759/// function (if possible) calls. Returns invalid cost for the corresponding
9760/// calls, if they cannot be vectorized/will be scalarized.
9761static std::pair<InstructionCost, InstructionCost>
9762 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9763 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9764 ArrayRef<Type *> ArgTys) {
9765 auto Shape = VFShape::get(CI->getFunctionType(),
9766 ElementCount::getFixed(VecTy->getNumElements()),
9767 false /*HasGlobalPred*/);
9768 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9769 auto LibCost = InstructionCost::getInvalid();
9770 if (!CI->isNoBuiltin() && VecFunc) {
9771 // Calculate the cost of the vector library call.
9772 // If the corresponding vector call is cheaper, return its cost.
9773 LibCost =
9774 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9775 }
9776 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9777
9778 // Calculate the cost of the vector intrinsic call.
9779 FastMathFlags FMF;
9780 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9781 FMF = FPCI->getFastMathFlags();
9782 const InstructionCost ScalarLimit = 10000;
9783 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9784 LibCost.isValid() ? LibCost : ScalarLimit);
9785 auto IntrinsicCost =
9786 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9787 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9788 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9789 IntrinsicCost = InstructionCost::getInvalid();
9790
9791 return {IntrinsicCost, LibCost};
9792}
9793
9794BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9795 const InstructionsState &S, ArrayRef<Value *> VL,
9796 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9797 SmallVectorImpl<Value *> &PointerOps) {
9798 assert(S.getMainOp() &&
9799 "Expected instructions with same/alternate opcodes only.");
9800
9801 unsigned ShuffleOrOp =
9802 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9803 Instruction *VL0 = S.getMainOp();
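// Dispatch on the opcode of the bundle (ShuffleVector for alternate
// sequences) and decide which kind of tree entry the scalars can form.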
9804 switch (ShuffleOrOp) {
9805 case Instruction::PHI: {
9806 // Too many operands - gather, most probably won't be vectorized.
9807 if (VL0->getNumOperands() > MaxPHINumOperands)
9808 return TreeEntry::NeedToGather;
9809 // Check for terminator values (e.g. invoke).
9810 for (Value *V : VL) {
9811 auto *PHI = dyn_cast<PHINode>(V);
9812 if (!PHI)
9813 continue;
9814 for (Value *Incoming : PHI->incoming_values()) {
9815 Instruction *Term = dyn_cast<Instruction>(Incoming);
9816 if (Term && Term->isTerminator()) {
9818 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9819 return TreeEntry::NeedToGather;
9820 }
9821 }
9822 }
9823
9824 return TreeEntry::Vectorize;
9825 }
9826 case Instruction::ExtractElement:
9827 if (any_of(VL, [&](Value *V) {
9828 auto *EI = dyn_cast<ExtractElementInst>(V);
9829 if (!EI)
9830 return true;
9831 return isVectorized(EI->getOperand(0));
9832 }))
9833 return TreeEntry::NeedToGather;
9834 [[fallthrough]];
9835 case Instruction::ExtractValue: {
9836 bool Reuse = canReuseExtract(VL, CurrentOrder);
9837 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9838 // non-full registers).
9839 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9840 return TreeEntry::NeedToGather;
9841 if (Reuse || !CurrentOrder.empty())
9842 return TreeEntry::Vectorize;
9843 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9844 return TreeEntry::NeedToGather;
9845 }
9846 case Instruction::InsertElement: {
9847 // Check that we have a buildvector and not a shuffle of 2 or more
9848 // different vectors.
9849 ValueSet SourceVectors;
9850 for (Value *V : VL) {
9851 if (isa<PoisonValue>(V)) {
9852 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
9853 return TreeEntry::NeedToGather;
9854 }
9855 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
9856 assert(getElementIndex(V) != std::nullopt &&
9857 "Non-constant or undef index?");
9858 }
9859
9860 if (count_if(VL, [&SourceVectors](Value *V) {
9861 return !SourceVectors.contains(V);
9862 }) >= 2) {
9863 // Found 2nd source vector - cancel.
9864 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9865 "different source vectors.\n");
9866 return TreeEntry::NeedToGather;
9867 }
9868
9869 if (any_of(VL, [&SourceVectors](Value *V) {
9870 // The last InsertElement can have multiple uses.
9871 return SourceVectors.contains(V) && !V->hasOneUse();
9872 })) {
9873 assert(SLPReVec && "Only supported by REVEC.");
9874 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9875 "multiple uses.\n");
9876 return TreeEntry::NeedToGather;
9877 }
9878
9879 return TreeEntry::Vectorize;
9880 }
9881 case Instruction::Load: {
9882 // Check that a vectorized load would load the same memory as a scalar
9883 // load. For example, we don't want to vectorize loads that are smaller
9884 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9885 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9886 // from such a struct, we read/write packed bits disagreeing with the
9887 // unvectorized version.
9888 auto IsGatheredNode = [&]() {
9889 if (!GatheredLoadsEntriesFirst)
9890 return false;
9891 return all_of(VL, [&](Value *V) {
9892 if (isa<PoisonValue>(V))
9893 return true;
9894 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
9895 return TE->Idx >= *GatheredLoadsEntriesFirst;
9896 });
9897 });
9898 };
9899 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
9900 case LoadsState::Vectorize:
9901 return TreeEntry::Vectorize;
9902 case LoadsState::CompressVectorize:
9903 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9904 // Delay slow vectorized nodes for better vectorization attempts.
9905 LoadEntriesToVectorize.insert(VectorizableTree.size());
9906 return TreeEntry::NeedToGather;
9907 }
9908 return IsGatheredNode() ? TreeEntry::NeedToGather
9909 : TreeEntry::CompressVectorize;
9910 case LoadsState::ScatterVectorize:
9911 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9912 // Delay slow vectorized nodes for better vectorization attempts.
9913 LoadEntriesToVectorize.insert(VectorizableTree.size());
9914 return TreeEntry::NeedToGather;
9915 }
9916 return IsGatheredNode() ? TreeEntry::NeedToGather
9917 : TreeEntry::ScatterVectorize;
9918 case LoadsState::StridedVectorize:
9919 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9920 // Delay slow vectorized nodes for better vectorization attempts.
9921 LoadEntriesToVectorize.insert(VectorizableTree.size());
9922 return TreeEntry::NeedToGather;
9923 }
9924 return IsGatheredNode() ? TreeEntry::NeedToGather
9925 : TreeEntry::StridedVectorize;
9926 case LoadsState::Gather:
9927#ifndef NDEBUG
9928 Type *ScalarTy = VL0->getType();
9929 if (DL->getTypeSizeInBits(ScalarTy) !=
9930 DL->getTypeAllocSizeInBits(ScalarTy))
9931 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9932 else if (any_of(VL, [](Value *V) {
9933 auto *LI = dyn_cast<LoadInst>(V);
9934 return !LI || !LI->isSimple();
9935 }))
9936 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9937 else
9938 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9939#endif // NDEBUG
9941 return TreeEntry::NeedToGather;
9942 }
9943 llvm_unreachable("Unexpected state of loads");
9944 }
9945 case Instruction::ZExt:
9946 case Instruction::SExt:
9947 case Instruction::FPToUI:
9948 case Instruction::FPToSI:
9949 case Instruction::FPExt:
9950 case Instruction::PtrToInt:
9951 case Instruction::IntToPtr:
9952 case Instruction::SIToFP:
9953 case Instruction::UIToFP:
9954 case Instruction::Trunc:
9955 case Instruction::FPTrunc:
9956 case Instruction::BitCast: {
9957 Type *SrcTy = VL0->getOperand(0)->getType();
9958 for (Value *V : VL) {
9959 if (isa<PoisonValue>(V))
9960 continue;
9961 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
9962 if (Ty != SrcTy || !isValidElementType(Ty)) {
9963 LLVM_DEBUG(
9964 dbgs() << "SLP: Gathering casts with different src types.\n");
9965 return TreeEntry::NeedToGather;
9966 }
9967 }
9968 return TreeEntry::Vectorize;
9969 }
9970 case Instruction::ICmp:
9971 case Instruction::FCmp: {
9972 // Check that all of the compares have the same predicate.
9973 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
9974 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
9975 Type *ComparedTy = VL0->getOperand(0)->getType();
9976 for (Value *V : VL) {
9977 if (isa<PoisonValue>(V))
9978 continue;
9979 auto *Cmp = cast<CmpInst>(V);
9980 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
9981 Cmp->getOperand(0)->getType() != ComparedTy) {
9982 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
9983 return TreeEntry::NeedToGather;
9984 }
9985 }
9986 return TreeEntry::Vectorize;
9987 }
9988 case Instruction::Select:
9989 case Instruction::FNeg:
9990 case Instruction::Add:
9991 case Instruction::FAdd:
9992 case Instruction::Sub:
9993 case Instruction::FSub:
9994 case Instruction::Mul:
9995 case Instruction::FMul:
9996 case Instruction::UDiv:
9997 case Instruction::SDiv:
9998 case Instruction::FDiv:
9999 case Instruction::URem:
10000 case Instruction::SRem:
10001 case Instruction::FRem:
10002 case Instruction::Shl:
10003 case Instruction::LShr:
10004 case Instruction::AShr:
10005 case Instruction::And:
10006 case Instruction::Or:
10007 case Instruction::Xor:
10008 case Instruction::Freeze:
10009 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10010 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10011 auto *I = dyn_cast<Instruction>(V);
10012 return I && I->isBinaryOp() && !I->isFast();
10013 }))
10014 return TreeEntry::NeedToGather;
10015 return TreeEntry::Vectorize;
10016 case Instruction::GetElementPtr: {
10017 // We don't combine GEPs with complicated (nested) indexing.
10018 for (Value *V : VL) {
10019 auto *I = dyn_cast<GetElementPtrInst>(V);
10020 if (!I)
10021 continue;
10022 if (I->getNumOperands() != 2) {
10023 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10024 return TreeEntry::NeedToGather;
10025 }
10026 }
10027
10028 // We can't combine several GEPs into one vector if they operate on
10029 // different types.
10030 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10031 for (Value *V : VL) {
10032 auto *GEP = dyn_cast<GEPOperator>(V);
10033 if (!GEP)
10034 continue;
10035 Type *CurTy = GEP->getSourceElementType();
10036 if (Ty0 != CurTy) {
10037 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10038 return TreeEntry::NeedToGather;
10039 }
10040 }
10041
10042 // We don't combine GEPs with non-constant indexes.
10043 Type *Ty1 = VL0->getOperand(1)->getType();
10044 for (Value *V : VL) {
10045 auto *I = dyn_cast<GetElementPtrInst>(V);
10046 if (!I)
10047 continue;
10048 auto *Op = I->getOperand(1);
10049 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10050 (Op->getType() != Ty1 &&
10051 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10052 Op->getType()->getScalarSizeInBits() >
10053 DL->getIndexSizeInBits(
10054 V->getType()->getPointerAddressSpace())))) {
10055 LLVM_DEBUG(
10056 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10057 return TreeEntry::NeedToGather;
10058 }
10059 }
10060
10061 return TreeEntry::Vectorize;
10062 }
10063 case Instruction::Store: {
10064 // Check if the stores are consecutive or if we need to swizzle them.
10065 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10066 // Avoid types that are padded when being allocated as scalars, while
10067 // being packed together in a vector (such as i1).
10068 if (DL->getTypeSizeInBits(ScalarTy) !=
10069 DL->getTypeAllocSizeInBits(ScalarTy)) {
10070 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10071 return TreeEntry::NeedToGather;
10072 }
10073 // Make sure all stores in the bundle are simple - we can't vectorize
10074 // atomic or volatile stores.
10075 for (Value *V : VL) {
10076 auto *SI = cast<StoreInst>(V);
10077 if (!SI->isSimple()) {
10078 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10079 return TreeEntry::NeedToGather;
10080 }
10081 PointerOps.push_back(SI->getPointerOperand());
10082 }
10083
10084 // Check the order of pointer operands.
10085 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10086 Value *Ptr0;
10087 Value *PtrN;
10088 if (CurrentOrder.empty()) {
10089 Ptr0 = PointerOps.front();
10090 PtrN = PointerOps.back();
10091 } else {
10092 Ptr0 = PointerOps[CurrentOrder.front()];
10093 PtrN = PointerOps[CurrentOrder.back()];
10094 }
10095 std::optional<int64_t> Dist =
10096 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10097 // Check that the sorted pointer operands are consecutive.
10098 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10099 return TreeEntry::Vectorize;
10100 }
10101
10102 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10103 return TreeEntry::NeedToGather;
10104 }
10105 case Instruction::Call: {
10106 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10107 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10108 auto *I = dyn_cast<Instruction>(V);
10109 return I && !I->isFast();
10110 }))
10111 return TreeEntry::NeedToGather;
10112 // Check if the calls are all to the same vectorizable intrinsic or
10113 // library function.
10114 CallInst *CI = cast<CallInst>(VL0);
10115 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10116
10117 VFShape Shape = VFShape::get(
10118 CI->getFunctionType(),
10119 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10120 false /*HasGlobalPred*/);
10121 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10122
10123 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10124 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10125 return TreeEntry::NeedToGather;
10126 }
10127 Function *F = CI->getCalledFunction();
10128 unsigned NumArgs = CI->arg_size();
10129 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10130 for (unsigned J = 0; J != NumArgs; ++J)
10131 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10132 ScalarArgs[J] = CI->getArgOperand(J);
10133 for (Value *V : VL) {
10134 CallInst *CI2 = dyn_cast<CallInst>(V);
10135 if (!CI2 || CI2->getCalledFunction() != F ||
10136 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10137 (VecFunc &&
10138 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10140 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10141 << "\n");
10142 return TreeEntry::NeedToGather;
10143 }
10144 // Some intrinsics have scalar arguments and should be same in order for
10145 // them to be vectorized.
10146 for (unsigned J = 0; J != NumArgs; ++J) {
10147 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10148 Value *A1J = CI2->getArgOperand(J);
10149 if (ScalarArgs[J] != A1J) {
10150 LLVM_DEBUG(dbgs()
10151 << "SLP: mismatched arguments in call:" << *CI
10152 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10153 return TreeEntry::NeedToGather;
10154 }
10155 }
10156 }
10157 // Verify that the bundle operands are identical between the two calls.
10158 if (CI->hasOperandBundles() &&
10159 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10160 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10161 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10162 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10163 << "!=" << *V << '\n');
10164 return TreeEntry::NeedToGather;
10165 }
10166 }
10167 SmallVector<Type *> ArgTys =
10168 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10169 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10170 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10171 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10172 return TreeEntry::NeedToGather;
10173
10174 return TreeEntry::Vectorize;
10175 }
10176 case Instruction::ShuffleVector: {
10177 if (!S.isAltShuffle()) {
10178 // REVEC can support non-alternate shuffles.
10179 if (SLPReVec && getShufflevectorNumGroups(VL))
10180 return TreeEntry::Vectorize;
10181 // If this is not an alternate sequence of opcode like add-sub
10182 // then do not vectorize this instruction.
10183 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10184 return TreeEntry::NeedToGather;
10185 }
10186 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10187 LLVM_DEBUG(
10188 dbgs()
10189 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10190 "the whole alt sequence is not profitable.\n");
10191 return TreeEntry::NeedToGather;
10192 }
10193
10194 return TreeEntry::Vectorize;
10195 }
10196 default:
10197 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10198 return TreeEntry::NeedToGather;
10199 }
10200}
10201
10202namespace {
10203/// Allows to correctly handle operands of the phi nodes based on the \p Main
10204/// PHINode order of incoming basic blocks/values.
10205class PHIHandler {
10206 DominatorTree &DT;
10207 PHINode *Main = nullptr;
10208 SmallVector<Value *> Phis;
10209 SmallVector<SmallVector<Value *>> Operands;
10210
10211public:
10212 PHIHandler() = delete;
10213 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10214 : DT(DT), Main(Main), Phis(Phis),
10215 Operands(Main->getNumIncomingValues(),
10216 SmallVector<Value *>(Phis.size(), nullptr)) {}
10217 void buildOperands() {
10218 constexpr unsigned FastLimit = 4;
10219 if (Main->getNumIncomingValues() <= FastLimit) {
10220 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10221 BasicBlock *InBB = Main->getIncomingBlock(I);
10222 if (!DT.isReachableFromEntry(InBB)) {
10223 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10224 continue;
10225 }
10226 // Prepare the operand vector.
10227 for (auto [Idx, V] : enumerate(Phis)) {
10228 auto *P = dyn_cast<PHINode>(V);
10229 if (!P) {
10230 assert(isa<PoisonValue>(V) &&
10231 "Expected isa instruction or poison value.");
10232 Operands[I][Idx] = V;
10233 continue;
10234 }
10235 if (P->getIncomingBlock(I) == InBB)
10236 Operands[I][Idx] = P->getIncomingValue(I);
10237 else
10238 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10239 }
10240 }
10241 return;
10242 }
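// Slow path for phis with many incoming values: group the incoming blocks
// first so that repeated blocks are processed once and their operand lists
// can simply be copied.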
10243 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10244 Blocks;
10245 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10246 BasicBlock *InBB = Main->getIncomingBlock(I);
10247 if (!DT.isReachableFromEntry(InBB)) {
10248 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10249 continue;
10250 }
10251 Blocks.try_emplace(InBB).first->second.push_back(I);
10252 }
10253 for (auto [Idx, V] : enumerate(Phis)) {
10254 if (isa<PoisonValue>(V)) {
10255 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10256 Operands[I][Idx] = V;
10257 continue;
10258 }
10259 auto *P = cast<PHINode>(V);
10260 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10261 BasicBlock *InBB = P->getIncomingBlock(I);
10262 if (InBB == Main->getIncomingBlock(I)) {
10263 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10264 continue;
10265 Operands[I][Idx] = P->getIncomingValue(I);
10266 continue;
10267 }
10268 auto *It = Blocks.find(InBB);
10269 if (It == Blocks.end())
10270 continue;
10271 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10272 }
10273 }
10274 for (const auto &P : Blocks) {
10275 ArrayRef<unsigned> IncomingValues = P.second;
10276 if (IncomingValues.size() <= 1)
10277 continue;
10278 unsigned BasicI = IncomingValues.consume_front();
10279 for (unsigned I : IncomingValues) {
10280 assert(all_of(enumerate(Operands[I]),
10281 [&](const auto &Data) {
10282 return !Data.value() ||
10283 Data.value() == Operands[BasicI][Data.index()];
10284 }) &&
10285 "Expected empty operands list.");
10286 Operands[I] = Operands[BasicI];
10287 }
10288 }
10289 }
10290 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10291};
10292} // namespace
10293
10294/// Returns the main/alternate instructions for the given \p VL. Unlike
10295/// getSameOpcode, it supports non-compatible instructions for better
10296/// SplitVectorize node support.
10297/// \returns the first main/alt instructions if the list contains only poisons
10298/// and instructions with only 2 opcodes. Returns a pair of nullptrs otherwise.
10299static std::pair<Instruction *, Instruction *>
10301 Instruction *MainOp = nullptr;
10302 Instruction *AltOp = nullptr;
10303 for (Value *V : VL) {
10304 if (isa<PoisonValue>(V))
10305 continue;
10306 auto *I = dyn_cast<Instruction>(V);
10307 if (!I)
10308 return {};
10309 if (!MainOp) {
10310 MainOp = I;
10311 continue;
10312 }
10313 if (MainOp->getOpcode() == I->getOpcode()) {
10314 if (I->getParent() != MainOp->getParent())
10315 return {};
10316 continue;
10317 }
10318 if (!AltOp) {
10319 AltOp = I;
10320 continue;
10321 }
10322 if (AltOp->getOpcode() == I->getOpcode()) {
10323 if (I->getParent() != AltOp->getParent())
10324 return {};
10325 continue;
10326 }
10327 return {};
10328 }
10329 if (!AltOp)
10330 return {};
10331 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10332 "Expected different main and alt instructions.");
10333 return std::make_pair(MainOp, AltOp);
10334}
10335
10336/// Checks that every instruction appears once in the list and if not, packs
10337/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10338/// unique scalars is extended by poison values to the whole register size.
10339///
10340/// \returns false if \p VL could not be uniquified, in which case \p VL is
10341/// unchanged and \p ReuseShuffleIndices is empty.
10342 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10343 SmallVectorImpl<int> &ReuseShuffleIndices,
10344 const TargetTransformInfo &TTI,
10345 const TargetLibraryInfo &TLI,
10346 const InstructionsState &S,
10347 const BoUpSLP::EdgeInfo &UserTreeIdx,
10348 bool TryPad = false) {
10349 // Check that every instruction appears once in this bundle.
10350 SmallVector<Value *> UniqueValues;
10351 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10352 for (Value *V : VL) {
10353 if (isConstant(V)) {
10354 // Constants are always considered distinct, even if the same constant
10355 // appears multiple times in VL.
10356 ReuseShuffleIndices.emplace_back(
10357 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10358 UniqueValues.emplace_back(V);
10359 continue;
10360 }
10361 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10362 ReuseShuffleIndices.emplace_back(Res.first->second);
10363 if (Res.second)
10364 UniqueValues.emplace_back(V);
10365 }
10366
10367 // Easy case: VL has unique values and a "natural" size
10368 size_t NumUniqueScalarValues = UniqueValues.size();
10369 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10370 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10371 if (NumUniqueScalarValues == VL.size() &&
10372 (VectorizeNonPowerOf2 || IsFullVectors)) {
10373 ReuseShuffleIndices.clear();
10374 return true;
10375 }
10376
10377 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10378 if ((UserTreeIdx.UserTE &&
10379 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10381 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10382 "for nodes with padding.\n");
10383 ReuseShuffleIndices.clear();
10384 return false;
10385 }
10386
10387 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10388 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10389 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10390 return isa<UndefValue>(V) || !isConstant(V);
10391 }))) {
10392 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10393 S.getMainOp()->isSafeToRemove() &&
10394 (S.areInstructionsWithCopyableElements() ||
10395 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10396 // Find the number of elements, which forms full vectors.
10397 unsigned PWSz = getFullVectorNumberOfElements(
10398 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10399 PWSz = std::min<unsigned>(PWSz, VL.size());
10400 if (PWSz == VL.size()) {
10401 // We ended up with the same size after removing duplicates and
10402 // upgrading the resulting vector size to a "nice size". Just keep
10403 // the initial VL then.
10404 ReuseShuffleIndices.clear();
10405 } else {
10406 // Pad unique values with poison to grow the vector to a "nice" size
10407 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10408 UniqueValues.end());
10409 PaddedUniqueValues.append(
10410 PWSz - UniqueValues.size(),
10411 PoisonValue::get(UniqueValues.front()->getType()));
10412 // Check that the operations extended with poisons/copyable operations are
10413 // still valid for vectorization (div/rem are not allowed).
10414 if (!S.areInstructionsWithCopyableElements() &&
10415 !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
10416 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10417 ReuseShuffleIndices.clear();
10418 return false;
10419 }
10420 VL = std::move(PaddedUniqueValues);
10421 }
10422 return true;
10423 }
10424 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10425 ReuseShuffleIndices.clear();
10426 return false;
10427 }
10428 VL = std::move(UniqueValues);
10429 return true;
10430}
10431
10432bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10433 const InstructionsState &LocalState,
10434 SmallVectorImpl<Value *> &Op1,
10435 SmallVectorImpl<Value *> &Op2,
10436 OrdersType &ReorderIndices) const {
10437 constexpr unsigned SmallNodeSize = 4;
10438 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10439 !SplitAlternateInstructions)
10440 return false;
10441
10442 // Check if this is a duplicate of another split entry.
10443 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10444 << ".\n");
10445 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10446 if (E->isSame(VL)) {
10447 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10448 << *LocalState.getMainOp() << ".\n");
10449 return false;
10450 }
10451 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10452 if (all_of(VL, [&](Value *V) {
10453 return isa<PoisonValue>(V) || Values.contains(V);
10454 })) {
10455 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10456 return false;
10457 }
10458 }
10459
10460 ReorderIndices.assign(VL.size(), VL.size());
10461 SmallBitVector Op1Indices(VL.size());
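// Partition the scalars into the main-opcode group (Op1) and the
// alternate-opcode group (Op2), remembering each value's original position.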
10462 for (auto [Idx, V] : enumerate(VL)) {
10463 auto *I = dyn_cast<Instruction>(V);
10464 if (!I) {
10465 Op1.push_back(V);
10466 Op1Indices.set(Idx);
10467 continue;
10468 }
10469 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10470 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10471 *TLI)) ||
10472 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10473 !isAlternateInstruction(I, LocalState.getMainOp(),
10474 LocalState.getAltOp(), *TLI))) {
10475 Op1.push_back(V);
10476 Op1Indices.set(Idx);
10477 continue;
10478 }
10479 Op2.push_back(V);
10480 }
10481 Type *ScalarTy = getValueType(VL.front());
10482 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10483 unsigned Opcode0 = LocalState.getOpcode();
10484 unsigned Opcode1 = LocalState.getAltOpcode();
10485 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10486 // Enable the split node only if the values do not form a legal alternate
10487 // instruction (like X86 addsub).
10488 SmallPtrSet<Value *, 8> UOp1(llvm::from_range, Op1);
10489 SmallPtrSet<Value *, 8> UOp2(llvm::from_range, Op2);
10490 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10491 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10492 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10493 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10494 return false;
10495 // Enable split node, only if all nodes are power-of-2/full registers.
10496 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10497 for (unsigned Idx : seq<unsigned>(VL.size())) {
10498 if (Op1Indices.test(Idx)) {
10499 ReorderIndices[Op1Cnt] = Idx;
10500 ++Op1Cnt;
10501 } else {
10502 ReorderIndices[Op2Cnt] = Idx;
10503 ++Op2Cnt;
10504 }
10505 }
10506 if (isIdentityOrder(ReorderIndices))
10507 ReorderIndices.clear();
10508 SmallVector<int> Mask;
10509 if (!ReorderIndices.empty())
10510 inversePermutation(ReorderIndices, Mask);
10511 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10512 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10513 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10514 // Check for non-profitable single-register ops, which are better
10515 // represented as alternate ops.
10516 if (NumParts >= VL.size())
10517 return false;
10518 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10519 InstructionCost InsertCost = ::getShuffleCost(
10520 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10521 FixedVectorType *SubVecTy =
10522 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10523 InstructionCost NewShuffleCost =
10524 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10525 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10526 (Mask.empty() || InsertCost >= NewShuffleCost))
10527 return false;
10528 if ((LocalState.getMainOp()->isBinaryOp() &&
10529 LocalState.getAltOp()->isBinaryOp() &&
10530 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10531 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10532 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10533 (LocalState.getMainOp()->isUnaryOp() &&
10534 LocalState.getAltOp()->isUnaryOp())) {
10535 InstructionCost OriginalVecOpsCost =
10536 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10537 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10538 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10539 for (unsigned Idx : seq<unsigned>(VL.size())) {
10540 if (isa<PoisonValue>(VL[Idx]))
10541 continue;
10542 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10543 }
10544 InstructionCost OriginalCost =
10545 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10546 VecTy, OriginalMask, Kind);
10547 InstructionCost NewVecOpsCost =
10548 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10549 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10550 InstructionCost NewCost =
10551 NewVecOpsCost + InsertCost +
10552 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10553 VectorizableTree.front()->getOpcode() == Instruction::Store
10554 ? NewShuffleCost
10555 : 0);
10556 // If not profitable to split - exit.
10557 if (NewCost >= OriginalCost)
10558 return false;
10559 }
10560 return true;
10561}
10562
10563namespace {
10564/// Class accepts an incoming list of values, checks if it is able to model
10565/// "copyable" values as compatible operations, and generates the list of values
10566/// for scheduling and the list of operands for the new nodes.
10567class InstructionsCompatibilityAnalysis {
10568 DominatorTree &DT;
10569 const DataLayout &DL;
10570 const TargetTransformInfo &TTI;
10571 const TargetLibraryInfo &TLI;
10572 unsigned MainOpcode = 0;
10573 Instruction *MainOp = nullptr;
10574
10575 /// Checks if the opcode is supported as the main opcode for copyable
10576 /// elements.
10577 static bool isSupportedOpcode(const unsigned Opcode) {
10578 return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10579 }
10580
10581 /// Identifies the best candidate value, which represents the main opcode
10582 /// operation.
10583 /// Currently the best candidate is the Add instruction whose parent block
10584 /// has the highest DFS incoming number (the block that dominates the others).
10585 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10586 BasicBlock *Parent = nullptr;
10587 // Checks if the instruction has supported opcode.
10588 auto IsSupportedInstruction = [&](Instruction *I) {
10589 return I && isSupportedOpcode(I->getOpcode()) &&
10590 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10591 };
10592 // Exclude operand instructions immediately to improve compile time; they
10593 // cannot be scheduled anyway.
10594 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10595 SmallDenseSet<Value *, 8> Operands;
10596 for (Value *V : VL) {
10597 auto *I = dyn_cast<Instruction>(V);
10598 if (!I)
10599 continue;
10600 if (!DT.isReachableFromEntry(I->getParent()))
10601 continue;
10602 if (Candidates.empty()) {
10603 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10604 Parent = I->getParent();
10605 Operands.insert(I->op_begin(), I->op_end());
10606 continue;
10607 }
10608 if (Parent == I->getParent()) {
10609 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10610 Operands.insert(I->op_begin(), I->op_end());
10611 continue;
10612 }
10613 auto *NodeA = DT.getNode(Parent);
10614 auto *NodeB = DT.getNode(I->getParent());
10615 assert(NodeA && "Should only process reachable instructions");
10616 assert(NodeB && "Should only process reachable instructions");
10617 assert((NodeA == NodeB) ==
10618 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10619 "Different nodes should have different DFS numbers");
10620 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10621 Candidates.clear();
10622 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10623 Parent = I->getParent();
10624 Operands.clear();
10625 Operands.insert(I->op_begin(), I->op_end());
10626 }
10627 }
10628 unsigned BestOpcodeNum = 0;
10629 MainOp = nullptr;
10630 for (const auto &P : Candidates) {
10631 if (P.second.size() < BestOpcodeNum)
10632 continue;
10633 for (Instruction *I : P.second) {
10634 if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10635 MainOp = I;
10636 BestOpcodeNum = P.second.size();
10637 break;
10638 }
10639 }
10640 }
10641 if (MainOp)
10642 MainOpcode = MainOp->getOpcode();
10643 }
10644
10645 /// Returns the idempotent value for the \p MainOp with the detected \p
10646 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10647 /// the operand itself, since V or V == V.
10648 Value *selectBestIdempotentValue() const {
10649 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10650 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10651 !MainOp->isCommutative());
10652 }
10653
10654 /// Returns the value and operands for \p V, depending on whether it is an
10655 /// original instruction, whose actual operands should be returned, or a
10656 /// copyable element, which should be represented as an idempotent instruction.
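/// For example (illustrative names): with Add as the main opcode, an original
/// instruction `a + b` yields the operands {a, b}, while a copyable element
/// `c` yields {c, 0}, i.e. the value itself plus the idempotent operand.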
10657 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10658 if (isa<PoisonValue>(V))
10659 return {V, V};
10660 if (!S.isCopyableElement(V))
10661 return convertTo(cast<Instruction>(V), S).second;
10662 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10663 return {V, selectBestIdempotentValue()};
10664 }
10665
10666 /// Builds operands for the original instructions.
10667 void
10668 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10669 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10670
10671 unsigned ShuffleOrOp =
10672 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10673 Instruction *VL0 = S.getMainOp();
10674
10675 switch (ShuffleOrOp) {
10676 case Instruction::PHI: {
10677 auto *PH = cast<PHINode>(VL0);
10678
10679 // Keeps the reordered operands to avoid code duplication.
10680 PHIHandler Handler(DT, PH, VL);
10681 Handler.buildOperands();
10682 Operands.assign(PH->getNumOperands(), {});
10683 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10684 Operands[I].assign(Handler.getOperands(I).begin(),
10685 Handler.getOperands(I).end());
10686 return;
10687 }
10688 case Instruction::ExtractValue:
10689 case Instruction::ExtractElement:
10690 // This is a special case, as it does not gather, but at the same time
10691 // we are not extending buildTreeRec() towards the operands.
10692 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10693 return;
10694 case Instruction::InsertElement:
10695 Operands.assign(2, {VL.size(), nullptr});
10696 for (auto [Idx, V] : enumerate(VL)) {
10697 auto *IE = cast<InsertElementInst>(V);
10698 for (auto [OpIdx, Ops] : enumerate(Operands))
10699 Ops[Idx] = IE->getOperand(OpIdx);
10700 }
10701 return;
10702 case Instruction::Load:
10703 Operands.assign(
10704 1, {VL.size(),
10705 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10706 for (auto [V, Op] : zip(VL, Operands.back())) {
10707 auto *LI = dyn_cast<LoadInst>(V);
10708 if (!LI)
10709 continue;
10710 Op = LI->getPointerOperand();
10711 }
10712 return;
10713 case Instruction::ZExt:
10714 case Instruction::SExt:
10715 case Instruction::FPToUI:
10716 case Instruction::FPToSI:
10717 case Instruction::FPExt:
10718 case Instruction::PtrToInt:
10719 case Instruction::IntToPtr:
10720 case Instruction::SIToFP:
10721 case Instruction::UIToFP:
10722 case Instruction::Trunc:
10723 case Instruction::FPTrunc:
10724 case Instruction::BitCast:
10725 case Instruction::ICmp:
10726 case Instruction::FCmp:
10727 case Instruction::Select:
10728 case Instruction::FNeg:
10729 case Instruction::Add:
10730 case Instruction::FAdd:
10731 case Instruction::Sub:
10732 case Instruction::FSub:
10733 case Instruction::Mul:
10734 case Instruction::FMul:
10735 case Instruction::UDiv:
10736 case Instruction::SDiv:
10737 case Instruction::FDiv:
10738 case Instruction::URem:
10739 case Instruction::SRem:
10740 case Instruction::FRem:
10741 case Instruction::Shl:
10742 case Instruction::LShr:
10743 case Instruction::AShr:
10744 case Instruction::And:
10745 case Instruction::Or:
10746 case Instruction::Xor:
10747 case Instruction::Freeze:
10748 case Instruction::Store:
10749 case Instruction::ShuffleVector:
10750 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10751 for (auto [Idx, V] : enumerate(VL)) {
10752 auto *I = dyn_cast<Instruction>(V);
10753 if (!I) {
10754 for (auto [OpIdx, Ops] : enumerate(Operands))
10755 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10756 continue;
10757 }
10758 auto [Op, ConvertedOps] = convertTo(I, S);
10759 for (auto [OpIdx, Ops] : enumerate(Operands))
10760 Ops[Idx] = ConvertedOps[OpIdx];
10761 }
10762 return;
10763 case Instruction::GetElementPtr: {
10764 Operands.assign(2, {VL.size(), nullptr});
10765 // Need to cast all indices to the same type before vectorization to
10766 // avoid crash.
10767 // Required to be able to find correct matches between different gather
10768 // nodes and reuse the vectorized values rather than trying to gather them
10769 // again.
10770 const unsigned IndexIdx = 1;
10771 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10772 Type *Ty =
10773 all_of(VL,
10774 [&](Value *V) {
10775 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10776 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10777 })
10778 ? VL0Ty
10779 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10780 ->getPointerOperandType()
10781 ->getScalarType());
10782 for (auto [Idx, V] : enumerate(VL)) {
10783 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10784 if (!GEP) {
10785 Operands[0][Idx] = V;
10786 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10787 continue;
10788 }
10789 Operands[0][Idx] = GEP->getPointerOperand();
10790 auto *Op = GEP->getOperand(IndexIdx);
10791 auto *CI = dyn_cast<ConstantInt>(Op);
10792 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10793 CI, Ty, CI->getValue().isSignBitSet(), DL)
10794 : Op;
10795 }
10796 return;
10797 }
10798 case Instruction::Call: {
10799 auto *CI = cast<CallInst>(VL0);
10800 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
10801 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10802 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
10803 continue;
10804 auto &Ops = Operands.emplace_back();
10805 for (Value *V : VL) {
10806 auto *I = dyn_cast<Instruction>(V);
10807 Ops.push_back(I ? I->getOperand(Idx)
10808 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10809 }
10810 }
10811 return;
10812 }
10813 default:
10814 break;
10815 }
10816 llvm_unreachable("Unexpected vectorization of the instructions.");
10817 }
10818
10819public:
10820 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10821 const TargetTransformInfo &TTI,
10822 const TargetLibraryInfo &TLI)
10823 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10824
10825 InstructionsState
10826 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
10827 bool TryCopyableElementsVectorization,
10828 bool WithProfitabilityCheck = false,
10829 bool SkipSameCodeCheck = false) {
10830 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
10831 ? InstructionsState::invalid()
10832 : getSameOpcode(VL, TLI);
10833 if (S)
10834 return S;
10835 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
10836 return S;
10837 findAndSetMainInstruction(VL, R);
10838 if (!MainOp)
10839 return InstructionsState::invalid();
10840 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
10841 if (!WithProfitabilityCheck)
10842 return S;
10843 // Check if it is profitable to vectorize the instruction.
10844 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
10845 auto BuildCandidates =
10846 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
10847 Value *V2) {
10848 if (V1 != V2 && isa<PHINode>(V1))
10849 return;
10850 auto *I1 = dyn_cast<Instruction>(V1);
10851 auto *I2 = dyn_cast<Instruction>(V2);
10852 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
10853 I1->getParent() != I2->getParent())
10854 return;
10855 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
10856 };
10857 if (VL.size() == 2) {
10858 // Check if the operands allow better vectorization.
10859 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
10860 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
10861 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
10862 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
10863 R.findBestRootPair(Candidates1) &&
10864 R.findBestRootPair(Candidates2);
10865 if (!Res && isCommutative(MainOp)) {
10866 Candidates1.clear();
10867 Candidates2.clear();
10868 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
10869 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
10870 Res = !Candidates1.empty() && !Candidates2.empty() &&
10871 R.findBestRootPair(Candidates1) &&
10872 R.findBestRootPair(Candidates2);
10873 }
10874 if (!Res)
10875 return InstructionsState::invalid();
10876 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10877 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10878 InstructionCost VectorCost;
10879 FixedVectorType *VecTy =
10880 getWidenedType(S.getMainOp()->getType(), VL.size());
10881 switch (MainOpcode) {
10882 case Instruction::Add:
10883 case Instruction::LShr:
10884 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10885 break;
10886 default:
10887 llvm_unreachable("Unexpected instruction.");
10888 }
10889 if (VectorCost > ScalarCost)
10890 return InstructionsState::invalid();
10891 return S;
10892 }
10893 assert(Operands.size() == 2 && "Unexpected number of operands!");
10894 unsigned CopyableNum =
10895 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
10896 if (CopyableNum < VL.size() / 2)
10897 return S;
10898 // Too many phi copyables - exit.
10899 const unsigned Limit = VL.size() / 24;
10900 if ((CopyableNum >= VL.size() - Limit ||
10901 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
10902 CopyableNum >= MaxPHINumOperands) &&
10903 all_of(VL, [&](Value *V) {
10904 return isa<PHINode>(V) || !S.isCopyableElement(V);
10905 }))
10906 return InstructionsState::invalid();
10907 // Check profitability if number of copyables > VL.size() / 2.
10908 // 1. Reorder operands for better matching.
10909 if (isCommutative(MainOp)) {
10910 for (auto &Ops : Operands) {
10911 // Make instructions the first operands.
10912 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
10913 std::swap(Ops.front(), Ops.back());
10914 continue;
10915 }
10916 // Make constants the second operands.
10917 if (isa<Constant>(Ops.front())) {
10918 std::swap(Ops.front(), Ops.back());
10919 continue;
10920 }
10921 }
10922 }
10923 // 2. Check if the operands can be vectorized.
10924 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
10925 return InstructionsState::invalid();
10926 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
10927 if (allConstant(Ops) || isSplat(Ops))
10928 return true;
10929 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
10930 // single one is different.
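// For example, the operand list {a, a, a, b} has two distinct values and one
// of them occurs exactly once, so it is treated here like a splat.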
10931 constexpr unsigned Limit = 4;
10932 if (Operands.front().size() >= Limit) {
10933 SmallDenseMap<Value *, unsigned> Counters;
10934 for (Value *V : Ops) {
10935 if (isa<UndefValue>(V))
10936 continue;
10937 ++Counters[V];
10938 }
10939 if (Counters.size() == 2 &&
10940 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
10941 return C.second == 1;
10942 }))
10943 return true;
10944 }
10945 // First operand not a constant or splat? Last attempt - check for
10946 // potential vectorization.
10947 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
10948 InstructionsState OpS = Analysis.buildInstructionsState(
10949 Ops, R, /*TryCopyableElementsVectorization=*/true);
10950 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
10951 return false;
10952 unsigned CopyableNum =
10953 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
10954 return CopyableNum <= VL.size() / 2;
10955 };
10956 if (!CheckOperand(Operands.front()))
10957 return InstructionsState::invalid();
10958
10959 return S;
10960 }
10961
10962 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
10963 ArrayRef<Value *> VL) {
10964 assert(S && "Invalid state!");
10965 SmallVector<BoUpSLP::ValueList> Operands;
10966 if (S.areInstructionsWithCopyableElements()) {
10967 MainOp = S.getMainOp();
10968 MainOpcode = S.getOpcode();
10969 Operands.assign(MainOp->getNumOperands(),
10970 BoUpSLP::ValueList(VL.size(), nullptr));
10971 for (auto [Idx, V] : enumerate(VL)) {
10972 SmallVector<Value *> OperandsForValue = getOperands(S, V);
10973 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
10974 Operands[OperandIdx][Idx] = Operand;
10975 }
10976 } else {
10977 buildOriginalOperands(S, VL, Operands);
10978 }
10979 return Operands;
10980 }
10981};
10982} // namespace
10983
10984BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
10985 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
10986 bool TryCopyableElementsVectorization) const {
10987 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
10988
10989 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
10990 InstructionsState S = Analysis.buildInstructionsState(
10991 VL, *this, TryCopyableElementsVectorization,
10992 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
10993
10994 // Don't go into catchswitch blocks, which can happen with PHIs.
10995 // Such blocks can only have PHIs and the catchswitch. There is no
10996 // place to insert a shuffle if we need to, so just avoid that issue.
10997 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
10998 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
10999 // Do not try to pack to avoid extra instructions here.
11000 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11001 /*TryToFindDuplicates=*/false);
11002 }
11003
11004 // Check if this is a duplicate of another entry.
11005 if (S) {
11006 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11007 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11008 if (E->isSame(VL)) {
11009 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11010 << ".\n");
11011 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11012 }
11013 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11014 if (all_of(VL, [&](Value *V) {
11015 return isa<PoisonValue>(V) || Values.contains(V);
11016 })) {
11017 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11018 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11019 }
11020 }
11021 }
11022
11023 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11024 // a load), in which case peek through to include it in the tree, without
11025 // ballooning over-budget.
11026 if (Depth >= RecursionMaxDepth &&
11027 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11028 (match(S.getMainOp(), m_Load(m_Value())) ||
11029 all_of(VL, [&S](const Value *I) {
11030 return match(I,
11031 m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
11032 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11033 })))) {
11034 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11035 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11036 }
11037
11038 // Don't handle scalable vectors
11039 if (S && S.getOpcode() == Instruction::ExtractElement &&
11040 isa<ScalableVectorType>(
11041 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11042 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11043 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11044 }
11045
11046 // Don't handle vectors.
11047 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11048 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11049 // Do not try to pack to avoid extra instructions here.
11050 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11051 /*TryToFindDuplicates=*/false);
11052 }
11053
11054 // If all of the operands are identical or constant we have a simple solution.
11055 // If we deal with insert/extract instructions, they all must have constant
11056 // indices, otherwise we should gather them, not try to vectorize.
11057 // If this is an alternate op node with 2 elements whose operands would be
11058 // gathered, do not vectorize.
11059 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11060 if (!S || !S.isAltShuffle() || VL.size() > 2)
11061 return false;
11062 if (VectorizableTree.size() < MinTreeSize)
11063 return false;
11064 if (Depth >= RecursionMaxDepth - 1)
11065 return true;
11066 // Check if all operands are extracts, are part of a vector node, or can
11067 // build a regular vectorized node.
11068 SmallVector<unsigned, 8> InstsCount;
11069 for (Value *V : VL) {
11070 auto *I = cast<Instruction>(V);
11071 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11072 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11073 }));
11074 }
11075 bool IsCommutative =
11076 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11077 if ((IsCommutative &&
11078 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11079 (!IsCommutative &&
11080 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11081 return true;
11082 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11083 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11084 auto *I1 = cast<Instruction>(VL.front());
11085 auto *I2 = cast<Instruction>(VL.back());
11086 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11087 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11088 I2->getOperand(Op));
11089 if (static_cast<unsigned>(count_if(
11090 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11091 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11092 })) >= S.getMainOp()->getNumOperands() / 2)
11093 return false;
11094 if (S.getMainOp()->getNumOperands() > 2)
11095 return true;
11096 if (IsCommutative) {
11097 // Check permuted operands.
11098 Candidates.clear();
11099 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11100 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11101 I2->getOperand((Op + 1) % E));
11102 if (any_of(
11103 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11104 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11105 }))
11106 return false;
11107 }
11108 return true;
11109 };
11110 SmallVector<unsigned> SortedIndices;
11111 BasicBlock *BB = nullptr;
11112 bool IsScatterVectorizeUserTE =
11113 UserTreeIdx.UserTE &&
11114 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11115 bool AreAllSameBlock = S.valid();
11116 bool AreScatterAllGEPSameBlock =
11117 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11118 VL.size() > 2 &&
11119 all_of(VL,
11120 [&BB](Value *V) {
11121 auto *I = dyn_cast<GetElementPtrInst>(V);
11122 if (!I)
11123 return doesNotNeedToBeScheduled(V);
11124 if (!BB)
11125 BB = I->getParent();
11126 return BB == I->getParent() && I->getNumOperands() == 2;
11127 }) &&
11128 BB &&
11129 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11130 SortedIndices));
11131 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11132 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11133 (S &&
11134 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11135 S.getMainOp()) &&
11136 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11137 NotProfitableForVectorization(VL)) {
11138 if (!S) {
11139 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11140 "C,S,B,O, small shuffle. \n";
11141 dbgs() << "[";
11142 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11143 dbgs() << "]\n");
11144 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11145 /*TryToFindDuplicates=*/true,
11146 /*TrySplitVectorize=*/true);
11147 }
11148 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11149 dbgs() << "[";
11150 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11151 dbgs() << "]\n");
11152 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11153 }
11154
11155 // Don't vectorize ephemeral values.
11156 if (S && !EphValues.empty()) {
11157 for (Value *V : VL) {
11158 if (EphValues.count(V)) {
11159 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11160 << ") is ephemeral.\n");
11161 // Do not try to pack to avoid extra instructions here.
11162 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11163 /*TryToFindDuplicates=*/false);
11164 }
11165 }
11166 }
11167
11168 // We now know that this is a vector of instructions of the same type from
11169 // the same block.
11170
11171 // Check that none of the instructions in the bundle are already in the tree
11172 // and that the node is not unprofitable to vectorize as a small alternate
11173 // node.
11174 if (S && S.isAltShuffle()) {
11175 auto GetNumVectorizedExtracted = [&]() {
11176 APInt Extracted = APInt::getZero(VL.size());
11177 APInt Vectorized = APInt::getAllOnes(VL.size());
11178 for (auto [Idx, V] : enumerate(VL)) {
11179 auto *I = dyn_cast<Instruction>(V);
11180 if (!I || doesNotNeedToBeScheduled(I) ||
11181 all_of(I->operands(), [&](const Use &U) {
11182 return isa<ExtractElementInst>(U.get());
11183 }))
11184 continue;
11185 if (isVectorized(I))
11186 Vectorized.clearBit(Idx);
11187 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11188 Extracted.setBit(Idx);
11189 }
11190 return std::make_pair(Vectorized, Extracted);
11191 };
11192 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11193 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11194 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11195 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11196 // Rough cost estimation of whether the vector code (+ potential extracts)
11197 // is more profitable than the scalar code + buildvector.
11198 Type *ScalarTy = VL.front()->getType();
11199 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11200 InstructionCost VectorizeCostEstimate =
11202 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11203 /*Insert=*/false, /*Extract=*/true, Kind);
11204 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11205 *TTI, ScalarTy, VecTy, Vectorized,
11206 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11207 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11208 }
11209 if (PreferScalarize) {
11210 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11211 "node is not profitable.\n");
11212 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11213 }
11214 }
11215
11216 // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
11217 if (UserIgnoreList && !UserIgnoreList->empty()) {
11218 for (Value *V : VL) {
11219 if (UserIgnoreList->contains(V)) {
11220 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11221 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11222 }
11223 }
11224 }
11225
11226 // Special processing for sorted pointers for ScatterVectorize node with
11227 // constant indices only.
11228 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11229 assert(VL.front()->getType()->isPointerTy() &&
11230 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
11231 "Expected pointers only.");
11232 // Reset S to make it GetElementPtr kind of node.
11233 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11234 assert(It != VL.end() && "Expected at least one GEP.");
11235 S = getSameOpcode(*It, *TLI);
11236 }
11237
11238 // Check that all of the users of the scalars that we want to vectorize are
11239 // schedulable.
11240 Instruction *VL0 = S.getMainOp();
11241 BB = VL0->getParent();
11242
11243 if (S &&
11244 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11245 !DT->isReachableFromEntry(BB))) {
11246 // Don't go into unreachable blocks. They may contain instructions with
11247 // dependency cycles which confuse the final scheduling.
11248 // Do not vectorize EH and non-returning blocks, not profitable in most
11249 // cases.
11250 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11251 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11252 }
11253 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11254}
11255
11256void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11257 const EdgeInfo &UserTreeIdx,
11258 unsigned InterleaveFactor) {
11259 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11260
11261 SmallVector<int> ReuseShuffleIndices;
11262 SmallVector<Value *> VL(VLRef);
11263
11264 // Tries to build split node.
11265 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11266 SmallVector<Value *> Op1, Op2;
11267 OrdersType ReorderIndices;
11268 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11269 return false;
11270
11271 SmallVector<Value *> NewVL(VL.size());
11272 copy(Op1, NewVL.begin());
11273 copy(Op2, std::next(NewVL.begin(), Op1.size()));
11274 auto Invalid = ScheduleBundle::invalid();
11275 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11276 UserTreeIdx, {}, ReorderIndices);
11277 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11278 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11279 InstructionsState S = getSameOpcode(Op, *TLI);
11280 if (S && (isa<LoadInst>(S.getMainOp()) ||
11281 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11282 // Build gather node for loads, they will be gathered later.
11283 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11284 Idx == 0 ? 0 : Op1.size());
11285 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11286 } else {
11287 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11288 Idx == 0 ? 0 : Op1.size());
11289 buildTreeRec(Op, Depth, {TE, Idx});
11290 }
11291 };
11292 AddNode(Op1, 0);
11293 AddNode(Op2, 1);
11294 return true;
11295 };
11296
11297 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11298 bool AreConsts = false;
11299 for (Value *V : VL) {
11300 if (isa<PoisonValue>(V))
11301 continue;
11302 if (isa<Constant>(V)) {
11303 AreConsts = true;
11304 continue;
11305 }
11306 if (!isa<PHINode>(V))
11307 return false;
11308 }
11309 return AreConsts;
11310 };
11311 if (AreOnlyConstsWithPHIs(VL)) {
11312 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11313 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11314 return;
11315 }
11316
11317 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11318 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11319 InstructionsState S = Legality.getInstructionsState();
11320 if (!Legality.isLegal()) {
11321 if (Legality.trySplitVectorize()) {
11322 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11323 // Last chance to try to vectorize alternate node.
11324 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11325 return;
11326 }
11327 if (!S)
11328 Legality = getScalarsVectorizationLegality(
11329 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11330 if (!Legality.isLegal()) {
11331 if (Legality.tryToFindDuplicates())
11332 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11333 UserTreeIdx);
11334
11335 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11336 return;
11337 }
11338 S = Legality.getInstructionsState();
11339 }
11340
11341 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11342 if (S.isAltShuffle() && TrySplitNode(S))
11343 return;
11344
11345 // Check that every instruction appears once in this bundle.
11346 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11347 /*TryPad=*/true)) {
11348 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11349 return;
11350 }
11351
11352 // Perform specific checks for each particular instruction kind.
11353 bool IsScatterVectorizeUserTE =
11354 UserTreeIdx.UserTE &&
11355 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11356 OrdersType CurrentOrder;
11357 SmallVector<Value *> PointerOps;
11358 TreeEntry::EntryState State = getScalarsVectorizationState(
11359 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
11360 if (State == TreeEntry::NeedToGather) {
11361 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11362 return;
11363 }
11364
11365 Instruction *VL0 = S.getMainOp();
11366 BasicBlock *BB = VL0->getParent();
11367 auto &BSRef = BlocksSchedules[BB];
11368 if (!BSRef)
11369 BSRef = std::make_unique<BlockScheduling>(BB);
11370
11371 BlockScheduling &BS = *BSRef;
11372
11373 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11374 std::optional<ScheduleBundle *> BundlePtr =
11375 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11376#ifdef EXPENSIVE_CHECKS
11377 // Make sure we didn't break any internal invariants
11378 BS.verify();
11379#endif
11380 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11381 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11382 // Last chance to try to vectorize alternate node.
11383 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11384 return;
11385 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11386 NonScheduledFirst.insert(VL.front());
11387 if (S.getOpcode() == Instruction::Load &&
11388 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11389 registerNonVectorizableLoads(ArrayRef(VL));
11390 return;
11391 }
11392 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11393 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11394 ScheduleBundle Empty;
11395 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11396 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11397
11398 unsigned ShuffleOrOp =
11399 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11400 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11401 // Postpone PHI nodes creation
11402 SmallVector<unsigned> PHIOps;
11403 for (unsigned I : seq<unsigned>(Operands.size())) {
11404 ArrayRef<Value *> Op = Operands[I];
11405 if (Op.empty())
11406 continue;
11407 InstructionsState S = getSameOpcode(Op, *TLI);
11408 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11409 buildTreeRec(Op, Depth + 1, {TE, I});
11410 else
11411 PHIOps.push_back(I);
11412 }
11413 for (unsigned I : PHIOps)
11414 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11415 };
11416 switch (ShuffleOrOp) {
11417 case Instruction::PHI: {
11418 TreeEntry *TE =
11419 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11420 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11421 TE->dump());
11422
11423 TE->setOperands(Operands);
11424 CreateOperandNodes(TE, Operands);
11425 return;
11426 }
11427 case Instruction::ExtractValue:
11428 case Instruction::ExtractElement: {
11429 if (CurrentOrder.empty()) {
11430 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11431 } else {
11432 LLVM_DEBUG({
11433 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11434 "with order";
11435 for (unsigned Idx : CurrentOrder)
11436 dbgs() << " " << Idx;
11437 dbgs() << "\n";
11438 });
11439 fixupOrderingIndices(CurrentOrder);
11440 }
11441 // Insert new order with initial value 0, if it does not exist,
11442 // otherwise return the iterator to the existing one.
11443 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11444 ReuseShuffleIndices, CurrentOrder);
11445 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11446 "(ExtractValueInst/ExtractElementInst).\n";
11447 TE->dump());
11448 // This is a special case, as it does not gather, but at the same time
11449 // we are not extending buildTreeRec() towards the operands.
11450 TE->setOperands(Operands);
11451 return;
11452 }
11453 case Instruction::InsertElement: {
11454 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11455
11456 auto OrdCompare = [](const std::pair<int, int> &P1,
11457 const std::pair<int, int> &P2) {
11458 return P1.first > P2.first;
11459 };
11460 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11461 decltype(OrdCompare)>
11462 Indices(OrdCompare);
11463 for (int I = 0, E = VL.size(); I < E; ++I) {
11464 unsigned Idx = *getElementIndex(VL[I]);
11465 Indices.emplace(Idx, I);
11466 }
11467 OrdersType CurrentOrder(VL.size(), VL.size());
11468 bool IsIdentity = true;
11469 for (int I = 0, E = VL.size(); I < E; ++I) {
11470 CurrentOrder[Indices.top().second] = I;
11471 IsIdentity &= Indices.top().second == I;
11472 Indices.pop();
11473 }
11474 if (IsIdentity)
11475 CurrentOrder.clear();
11476 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11477 {}, CurrentOrder);
11478 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11479 TE->dump());
11480
11481 TE->setOperands(Operands);
11482 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11483 return;
11484 }
11485 case Instruction::Load: {
11486 // Check that a vectorized load would load the same memory as a scalar
11487 // load. For example, we don't want to vectorize loads that are smaller
11488 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11489 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11490 // from such a struct, we read/write packed bits disagreeing with the
11491 // unvectorized version.
11492 TreeEntry *TE = nullptr;
11493 fixupOrderingIndices(CurrentOrder);
11494 switch (State) {
11495 case TreeEntry::Vectorize:
11496 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11497 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11498 if (CurrentOrder.empty())
11499 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11500 TE->dump());
11501 else
11503 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11504 TE->dump());
11505 break;
11506 case TreeEntry::CompressVectorize:
11507 // Vectorizing non-consecutive loads with (masked)load + compress.
11508 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11509 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11510 LLVM_DEBUG(
11511 dbgs()
11512 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11513 TE->dump());
11514 break;
11515 case TreeEntry::StridedVectorize:
11516 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11517 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11518 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11519 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11520 TE->dump());
11521 break;
11522 case TreeEntry::ScatterVectorize:
11523 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11524 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11525 UserTreeIdx, ReuseShuffleIndices);
11526 LLVM_DEBUG(
11527 dbgs()
11528 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11529 TE->dump());
11530 break;
11531 case TreeEntry::CombinedVectorize:
11532 case TreeEntry::SplitVectorize:
11533 case TreeEntry::NeedToGather:
11534 llvm_unreachable("Unexpected loads state.");
11535 }
11536 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11537 assert(Operands.size() == 1 && "Expected a single operand only");
11538 SmallVector<int> Mask;
11539 inversePermutation(CurrentOrder, Mask);
11540 reorderScalars(Operands.front(), Mask);
11541 }
11542 TE->setOperands(Operands);
11543 if (State == TreeEntry::ScatterVectorize)
11544 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11545 return;
11546 }
11547 case Instruction::ZExt:
11548 case Instruction::SExt:
11549 case Instruction::FPToUI:
11550 case Instruction::FPToSI:
11551 case Instruction::FPExt:
11552 case Instruction::PtrToInt:
11553 case Instruction::IntToPtr:
11554 case Instruction::SIToFP:
11555 case Instruction::UIToFP:
11556 case Instruction::Trunc:
11557 case Instruction::FPTrunc:
11558 case Instruction::BitCast: {
11559 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11560 std::make_pair(std::numeric_limits<unsigned>::min(),
11561 std::numeric_limits<unsigned>::max()));
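// Record the widest and narrowest integer bit widths seen at the ends of
// ext/trunc casts in the tree; these bounds feed the later bit-width
// minimization analysis.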
11562 if (ShuffleOrOp == Instruction::ZExt ||
11563 ShuffleOrOp == Instruction::SExt) {
11564 CastMaxMinBWSizes = std::make_pair(
11565 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11566 PrevMaxBW),
11567 std::min<unsigned>(
11568 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11569 PrevMinBW));
11570 } else if (ShuffleOrOp == Instruction::Trunc) {
11571 CastMaxMinBWSizes = std::make_pair(
11572 std::max<unsigned>(
11573 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11574 PrevMaxBW),
11575 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11576 PrevMinBW));
11577 }
11578 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11579 ReuseShuffleIndices);
11580 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11581 TE->dump());
11582
11583 TE->setOperands(Operands);
11584 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11585 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11586 if (ShuffleOrOp == Instruction::Trunc) {
11587 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11588 } else if (ShuffleOrOp == Instruction::SIToFP ||
11589 ShuffleOrOp == Instruction::UIToFP) {
11590 unsigned NumSignBits =
11591 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11592 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11593 APInt Mask = DB->getDemandedBits(OpI);
11594 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11595 }
11596 if (NumSignBits * 2 >=
11597 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11598 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11599 }
11600 return;
11601 }
11602 case Instruction::ICmp:
11603 case Instruction::FCmp: {
11604 // Check that all of the compares have the same predicate.
11605 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11606 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11607 ReuseShuffleIndices);
11608 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11609 TE->dump());
11610
11611 VLOperands Ops(VL, Operands, S, *this);
11612 if (cast<CmpInst>(VL0)->isCommutative()) {
11613 // Commutative predicate - collect + sort operands of the instructions
11614 // so that each side is more likely to have the same opcode.
11616 "Commutative Predicate mismatch");
11617 Ops.reorder();
11618 Operands.front() = Ops.getVL(0);
11619 Operands.back() = Ops.getVL(1);
11620 } else {
11621 // Collect operands - commute if it uses the swapped predicate.
11622 for (auto [Idx, V] : enumerate(VL)) {
11623 if (isa<PoisonValue>(V))
11624 continue;
11625 auto *Cmp = cast<CmpInst>(V);
11626 if (Cmp->getPredicate() != P0)
11627 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11628 }
11629 }
11630 TE->setOperands(Operands);
11631 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11632 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
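// If the compared values effectively fit into half of their bit width
// (judged by the number of known sign bits), record the operand nodes as
// candidates for bit-width minimization.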
11633 if (ShuffleOrOp == Instruction::ICmp) {
11634 unsigned NumSignBits0 =
11635 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11636 if (NumSignBits0 * 2 >=
11637 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11638 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11639 unsigned NumSignBits1 =
11640 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11641 if (NumSignBits1 * 2 >=
11642 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11643 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11644 }
11645 return;
11646 }
11647 case Instruction::Select:
11648 case Instruction::FNeg:
11649 case Instruction::Add:
11650 case Instruction::FAdd:
11651 case Instruction::Sub:
11652 case Instruction::FSub:
11653 case Instruction::Mul:
11654 case Instruction::FMul:
11655 case Instruction::UDiv:
11656 case Instruction::SDiv:
11657 case Instruction::FDiv:
11658 case Instruction::URem:
11659 case Instruction::SRem:
11660 case Instruction::FRem:
11661 case Instruction::Shl:
11662 case Instruction::LShr:
11663 case Instruction::AShr:
11664 case Instruction::And:
11665 case Instruction::Or:
11666 case Instruction::Xor:
11667 case Instruction::Freeze: {
11668 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11669 ReuseShuffleIndices);
11670 LLVM_DEBUG(
11671 dbgs() << "SLP: added a new TreeEntry "
11672 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11673 TE->dump());
11674
11675 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11676 VLOperands Ops(VL, Operands, S, *this);
11677 Ops.reorder();
11678 Operands[0] = Ops.getVL(0);
11679 Operands[1] = Ops.getVL(1);
11680 }
11681 TE->setOperands(Operands);
11682 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11683 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11684 return;
11685 }
11686 case Instruction::GetElementPtr: {
11687 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11688 ReuseShuffleIndices);
11689 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11690 TE->dump());
11691 TE->setOperands(Operands);
11692
11693 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11694 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11695 return;
11696 }
11697 case Instruction::Store: {
11698 bool Consecutive = CurrentOrder.empty();
11699 if (!Consecutive)
11700 fixupOrderingIndices(CurrentOrder);
11701 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11702 ReuseShuffleIndices, CurrentOrder);
11703 if (Consecutive)
11704 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11705 TE->dump());
11706 else
11707 LLVM_DEBUG(
11708 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11709 TE->dump());
11710 TE->setOperands(Operands);
11711 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11712 return;
11713 }
11714 case Instruction::Call: {
11715 // Check if the calls are all to the same vectorizable intrinsic or
11716 // library function.
11717 CallInst *CI = cast<CallInst>(VL0);
11718 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11719
11720 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11721 ReuseShuffleIndices);
11722 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11723 TE->dump());
11724 if (isCommutative(VL0)) {
11725 VLOperands Ops(VL, Operands, S, *this);
11726 Ops.reorder();
11727 Operands[0] = Ops.getVL(0);
11728 Operands[1] = Ops.getVL(1);
11729 }
11730 TE->setOperands(Operands);
11731 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11732 // For scalar operands there is no need to create an entry since there is
11733 // nothing to vectorize.
11734 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
11735 continue;
11736 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11737 }
11738 return;
11739 }
11740 case Instruction::ShuffleVector: {
11741 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11742 ReuseShuffleIndices);
11743 if (S.isAltShuffle()) {
11744 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11745 TE->dump());
11746 } else {
11747 assert(SLPReVec && "Only supported by REVEC.");
11748 LLVM_DEBUG(
11749 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11750 TE->dump());
11751 }
11752
11753 // Reorder operands if reordering would enable vectorization.
11754 auto *CI = dyn_cast<CmpInst>(VL0);
11755 if (CI && any_of(VL, [](Value *V) {
11756 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11757 })) {
11758 auto *MainCI = cast<CmpInst>(S.getMainOp());
11759 auto *AltCI = cast<CmpInst>(S.getAltOp());
11760 CmpInst::Predicate MainP = MainCI->getPredicate();
11761 CmpInst::Predicate AltP = AltCI->getPredicate();
11762 assert(MainP != AltP &&
11763 "Expected different main/alternate predicates.");
11764 // Collect operands - commute if it uses the swapped predicate or
11765 // alternate operation.
11766 for (auto [Idx, V] : enumerate(VL)) {
11767 if (isa<PoisonValue>(V))
11768 continue;
11769 auto *Cmp = cast<CmpInst>(V);
11770
11771 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11772 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11773 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11774 } else {
11775 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11776 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11777 }
11778 }
11779 TE->setOperands(Operands);
11780 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11781 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11782 return;
11783 }
11784
11785 if (isa<BinaryOperator>(VL0) || CI) {
11786 VLOperands Ops(VL, Operands, S, *this);
11787 Ops.reorder();
11788 Operands[0] = Ops.getVL(0);
11789 Operands[1] = Ops.getVL(1);
11790 }
11791 TE->setOperands(Operands);
11792 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11793 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11794 return;
11795 }
11796 default:
11797 break;
11798 }
11799 llvm_unreachable("Unexpected vectorization of the instructions.");
11800}
11801
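// Illustrative note: an aggregate such as { [4 x i32] } maps to 4 x i32
// (N == 4), provided the widened type's store size matches the aggregate's
// store size and lies within [MinVecRegSize, MaxVecRegSize].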
11802 unsigned BoUpSLP::canMapToVector(Type *T) const {
11803 unsigned N = 1;
11804 Type *EltTy = T;
11805
11806 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
11807 if (EltTy->isEmptyTy())
11808 return 0;
11809 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11810 // Check that struct is homogeneous.
11811 for (const auto *Ty : ST->elements())
11812 if (Ty != *ST->element_begin())
11813 return 0;
11814 N *= ST->getNumElements();
11815 EltTy = *ST->element_begin();
11816 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11817 N *= AT->getNumElements();
11818 EltTy = AT->getElementType();
11819 } else {
11820 auto *VT = cast<FixedVectorType>(EltTy);
11821 N *= VT->getNumElements();
11822 EltTy = VT->getElementType();
11823 }
11824 }
11825
11826 if (!isValidElementType(EltTy))
11827 return 0;
11828 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
11829 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
11830 VTSize != DL->getTypeStoreSizeInBits(T))
11831 return 0;
11832 return N;
11833}
11834
11835bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
11836 SmallVectorImpl<unsigned> &CurrentOrder,
11837 bool ResizeAllowed) const {
11838 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
11839 assert(It != VL.end() && "Expected at least one extract instruction.");
11840 auto *E0 = cast<Instruction>(*It);
11841 assert(
11842 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
11843 "Invalid opcode");
11844 // Check if all of the extracts come from the same vector and from the
11845 // correct offset.
11846 Value *Vec = E0->getOperand(0);
11847
11848 CurrentOrder.clear();
11849
11850 // We have to extract from a vector/aggregate with the same number of elements.
11851 unsigned NElts;
11852 if (E0->getOpcode() == Instruction::ExtractValue) {
11853 NElts = canMapToVector(Vec->getType());
11854 if (!NElts)
11855 return false;
11856 // Check if load can be rewritten as load of vector.
11857 LoadInst *LI = dyn_cast<LoadInst>(Vec);
11858 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
11859 return false;
11860 } else {
11861 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
11862 }
11863
11864 unsigned E = VL.size();
11865 if (!ResizeAllowed && NElts != E)
11866 return false;
11867 SmallVector<int> Indices(E, PoisonMaskElem);
11868 unsigned MinIdx = NElts, MaxIdx = 0;
11869 for (auto [I, V] : enumerate(VL)) {
11870 auto *Inst = dyn_cast<Instruction>(V);
11871 if (!Inst)
11872 continue;
11873 if (Inst->getOperand(0) != Vec)
11874 return false;
11875 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
11876 if (isa<UndefValue>(EE->getIndexOperand()))
11877 continue;
11878 std::optional<unsigned> Idx = getExtractIndex(Inst);
11879 if (!Idx)
11880 return false;
11881 const unsigned ExtIdx = *Idx;
11882 if (ExtIdx >= NElts)
11883 continue;
11884 Indices[I] = ExtIdx;
11885 if (MinIdx > ExtIdx)
11886 MinIdx = ExtIdx;
11887 if (MaxIdx < ExtIdx)
11888 MaxIdx = ExtIdx;
11889 }
11890 if (MaxIdx - MinIdx + 1 > E)
11891 return false;
11892 if (MaxIdx + 1 <= E)
11893 MinIdx = 0;
11894
11895 // Check that all of the indices extract from the correct offset.
11896 bool ShouldKeepOrder = true;
11897 // Assign to all items the initial value E + 1 so we can check if the extract
11898 // instruction index was used already.
11899 // Also, later we can check that all the indices are used and we have a
11900 // consecutive access in the extract instructions, by checking that no
11901 // element of CurrentOrder still has value E + 1.
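// For example, extract indices {1, 0, 3, 2} over a 4-element source fill
// CurrentOrder with {1, 0, 3, 2}, while the identity indices {0, 1, 2, 3}
// keep ShouldKeepOrder true and clear CurrentOrder below.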
11902 CurrentOrder.assign(E, E);
11903 for (unsigned I = 0; I < E; ++I) {
11904 if (Indices[I] == PoisonMaskElem)
11905 continue;
11906 const unsigned ExtIdx = Indices[I] - MinIdx;
11907 if (CurrentOrder[ExtIdx] != E) {
11908 CurrentOrder.clear();
11909 return false;
11910 }
11911 ShouldKeepOrder &= ExtIdx == I;
11912 CurrentOrder[ExtIdx] = I;
11913 }
11914 if (ShouldKeepOrder)
11915 CurrentOrder.clear();
11916
11917 return ShouldKeepOrder;
11918}
11919
11920bool BoUpSLP::areAllUsersVectorized(
11921 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
11922 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
11923 all_of(I->users(), [this](User *U) {
11924 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
11925 (isa<ExtractElementInst>(U) && MustGather.contains(U));
11926 });
11927}
11928
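// For example, for the alternate bundle {a + b, c - d, e + f, g - h} with
// IsAltOp selecting the Sub instructions (and empty ReorderIndices and
// ReuseShuffleIndices), the resulting mask is <0, Sz + 1, 2, Sz + 3> ==
// <0, 5, 2, 7> for Sz == 4; OpScalars collects the Add instructions and
// AltScalars collects the Sub instructions.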
11929void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
11930 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
11931 SmallVectorImpl<Value *> *OpScalars,
11932 SmallVectorImpl<Value *> *AltScalars) const {
11933 unsigned Sz = Scalars.size();
11934 Mask.assign(Sz, PoisonMaskElem);
11935 SmallVector<int> OrderMask;
11936 if (!ReorderIndices.empty())
11937 inversePermutation(ReorderIndices, OrderMask);
11938 for (unsigned I = 0; I < Sz; ++I) {
11939 unsigned Idx = I;
11940 if (!ReorderIndices.empty())
11941 Idx = OrderMask[I];
11942 if (isa<PoisonValue>(Scalars[Idx]))
11943 continue;
11944 auto *OpInst = cast<Instruction>(Scalars[Idx]);
11945 if (IsAltOp(OpInst)) {
11946 Mask[I] = Sz + Idx;
11947 if (AltScalars)
11948 AltScalars->push_back(OpInst);
11949 } else {
11950 Mask[I] = Idx;
11951 if (OpScalars)
11952 OpScalars->push_back(OpInst);
11953 }
11954 }
11955 if (!ReuseShuffleIndices.empty()) {
11956 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
11957 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
11958 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
11959 });
11960 Mask.swap(NewMask);
11961 }
11962}
11963
11965 Instruction *AltOp,
11966 const TargetLibraryInfo &TLI) {
11967 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
11968}
11969
11970 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
11971 Instruction *AltOp,
11972 const TargetLibraryInfo &TLI) {
11973 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
11974 auto *AltCI = cast<CmpInst>(AltOp);
11975 CmpInst::Predicate MainP = MainCI->getPredicate();
11976 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
11977 assert(MainP != AltP && "Expected different main/alternate predicates.");
11978 auto *CI = cast<CmpInst>(I);
11979 if (isCmpSameOrSwapped(MainCI, CI, TLI))
11980 return false;
11981 if (isCmpSameOrSwapped(AltCI, CI, TLI))
11982 return true;
11983 CmpInst::Predicate P = CI->getPredicate();
11984 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
11985
11986 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
11987 "CmpInst expected to match either main or alternate predicate or "
11988 "their swap.");
11989 return MainP != P && MainP != SwappedP;
11990 }
11991 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
11992}
11993
11994TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
11995 assert(!Ops.empty());
11996 const auto *Op0 = Ops.front();
11997
11998 const bool IsConstant = all_of(Ops, [](Value *V) {
11999 // TODO: We should allow undef elements here
12000 return isConstant(V) && !isa<UndefValue>(V);
12001 });
12002 const bool IsUniform = all_of(Ops, [=](Value *V) {
12003 // TODO: We should allow undef elements here
12004 return V == Op0;
12005 });
12006 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12007 // TODO: We should allow undef elements here
12008 if (auto *CI = dyn_cast<ConstantInt>(V))
12009 return CI->getValue().isPowerOf2();
12010 return false;
12011 });
12012 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12013 // TODO: We should allow undef elements here
12014 if (auto *CI = dyn_cast<ConstantInt>(V))
12015 return CI->getValue().isNegatedPowerOf2();
12016 return false;
12017 });
12018
12019 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12020 if (IsConstant && IsUniform)
12021 VK = TTI::OK_UniformConstantValue;
12022 else if (IsConstant)
12023 VK = TTI::OK_NonUniformConstantValue;
12024 else if (IsUniform)
12025 VK = TTI::OK_UniformValue;
12026
12027 TTI::OperandValueProperties VP = TTI::OP_None;
12028 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12029 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12030
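// For example, Ops == {i32 4, i32 4, i32 4, i32 4} yields
// {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}, while a list of distinct
// non-constant values yields {TTI::OK_AnyValue, TTI::OP_None}.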
12031 return {VK, VP};
12032}
12033
12034namespace {
12035/// The base class for shuffle instruction emission and shuffle cost estimation.
12036class BaseShuffleAnalysis {
12037protected:
12038 Type *ScalarTy = nullptr;
12039
12040 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12041
12042 /// V is expected to be a vectorized value.
12043 /// When REVEC is disabled, there is no difference between VF and
12044 /// VNumElements.
12045 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12046 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12047 /// of 8.
12048 unsigned getVF(Value *V) const {
12049 assert(V && "V cannot be nullptr");
12050 assert(isa<FixedVectorType>(V->getType()) &&
12051 "V does not have FixedVectorType");
12052 assert(ScalarTy && "ScalarTy cannot be nullptr");
12053 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12054 unsigned VNumElements =
12055 cast<FixedVectorType>(V->getType())->getNumElements();
12056 assert(VNumElements > ScalarTyNumElements &&
12057 "the number of elements of V is not large enough");
12058 assert(VNumElements % ScalarTyNumElements == 0 &&
12059 "the number of elements of V is not a vectorized value");
12060 return VNumElements / ScalarTyNumElements;
12061 }
12062
12063 /// Checks if the mask is an identity mask.
12064 /// \param IsStrict if is true the function returns false if mask size does
12065 /// not match vector size.
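/// For example, for a <4 x ty> source the mask <0, 1, 2, 3> is an identity
/// mask; with \p IsStrict == false the resizing masks <0, 1> (an extract of
/// the low subvector) and <0, 1, 2, 3, poison, poison, poison, poison>
/// (identity submasks) are also accepted.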
12066 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12067 bool IsStrict) {
12068 int Limit = Mask.size();
12069 int VF = VecTy->getNumElements();
12070 int Index = -1;
12071 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12072 return true;
12073 if (!IsStrict) {
12074 // Consider extract subvector starting from index 0.
12075 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12076 Index == 0)
12077 return true;
12078 // All VF-size submasks are identity (e.g.
12079 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12080 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12081 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12082 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12083 ShuffleVectorInst::isIdentityMask(Slice, VF);
12084 }))
12085 return true;
12086 }
12087 return false;
12088 }
12089
12090 /// Tries to combine 2 different masks into a single one.
12091 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12092 /// change the size of the vector, \p LocalVF is the original size of the
12093 /// shuffled vector.
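/// For example, with \p LocalVF == 4, \p Mask == <1, 0, 3, 2> and
/// \p ExtMask == <2, 3, 0, 1>, the combined mask is <3, 2, 1, 0>: element I of
/// the result is Mask[ExtMask[I]] (taken modulo \p LocalVF).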
12094 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12095 ArrayRef<int> ExtMask) {
12096 unsigned VF = Mask.size();
12097 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12098 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12099 if (ExtMask[I] == PoisonMaskElem)
12100 continue;
12101 int MaskedIdx = Mask[ExtMask[I] % VF];
12102 NewMask[I] =
12103 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12104 }
12105 Mask.swap(NewMask);
12106 }
12107
12108 /// Looks through shuffles trying to reduce final number of shuffles in the
12109 /// code. The function looks through the previously emitted shuffle
12110 /// instructions and properly marks indices in the mask as undef.
12111 /// For example, given the code
12112 /// \code
12113 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12114 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12115 /// \endcode
12116 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12117 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12118 /// <0, 1, 2, 3> for the shuffle.
12119 /// If the 2 operands are of different sizes, the smaller one is resized and
12120 /// the mask is recalculated properly.
12121 /// For example, given the code
12122 /// \code
12123 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12124 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12125 /// \endcode
12126 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12127 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12128 /// <0, 1, 2, 3> for the shuffle.
12129 /// So, it tries to transform permutations to simple vector merge, if
12130 /// possible.
12131 /// \param V The input vector which must be shuffled using the given \p Mask.
12132 /// If the better candidate is found, \p V is set to this best candidate
12133 /// vector.
12134 /// \param Mask The input mask for the shuffle. If the best candidate is found
12135 /// during looking-through-shuffles attempt, it is updated accordingly.
12136 /// \param SinglePermute true if the shuffle operation is originally a
12137 /// single-value-permutation. In this case the look-through-shuffles procedure
12138 /// may look for resizing shuffles as the best candidates.
12139 /// \return true if the shuffle results in the non-resizing identity shuffle
12140 /// (and thus can be ignored), false - otherwise.
12141 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12142 bool SinglePermute) {
12143 Value *Op = V;
12144 ShuffleVectorInst *IdentityOp = nullptr;
12145 SmallVector<int> IdentityMask;
12146 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12147 // Exit if not a fixed vector type or changing size shuffle.
12148 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12149 if (!SVTy)
12150 break;
12151 // Remember the identity or broadcast mask, if it is not a resizing
12152 // shuffle. If no better candidates are found, this Op and Mask will be
12153 // used in the final shuffle.
12154 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12155 if (!IdentityOp || !SinglePermute ||
12156 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12157 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12158 IdentityMask.size()))) {
12159 IdentityOp = SV;
12160 // Store the current mask in IdentityMask so we do not lose this info
12161 // later if IdentityOp is selected as the best candidate for the
12162 // permutation.
12163 IdentityMask.assign(Mask);
12164 }
12165 }
12166 // Remember the broadcast mask. If no better candidates are found, this Op
12167 // and Mask will be used in the final shuffle.
12168 // Zero splat can be used as identity too, since it might be used with
12169 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12170 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12171 // expensive, and the analysis finds out that the source vector is just a
12172 // broadcast, this original mask can be transformed to the identity mask
12173 // <0, 1, 2, 3>.
12174 // \code
12175 // %0 = shuffle %v, poison, zeroinitializer
12176 // %res = shuffle %0, poison, <3, 1, 2, 0>
12177 // \endcode
12178 // may be transformed to
12179 // \code
12180 // %0 = shuffle %v, poison, zeroinitializer
12181 // %res = shuffle %0, poison, <0, 1, 2, 3>
12182 // \endcode
12183 if (SV->isZeroEltSplat()) {
12184 IdentityOp = SV;
12185 IdentityMask.assign(Mask);
12186 }
12187 int LocalVF = Mask.size();
12188 if (auto *SVOpTy =
12189 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12190 LocalVF = SVOpTy->getNumElements();
12191 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12192 for (auto [Idx, I] : enumerate(Mask)) {
12193 if (I == PoisonMaskElem ||
12194 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12195 continue;
12196 ExtMask[Idx] = SV->getMaskValue(I);
12197 }
12198 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12199 SV->getOperand(0),
12200 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12201 .all();
12202 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12203 SV->getOperand(1),
12204 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12205 .all();
12206 if (!IsOp1Undef && !IsOp2Undef) {
12207 // Update mask and mark undef elems.
12208 for (int &I : Mask) {
12209 if (I == PoisonMaskElem)
12210 continue;
12211 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12212 PoisonMaskElem)
12213 I = PoisonMaskElem;
12214 }
12215 break;
12216 }
12217 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12218 combineMasks(LocalVF, ShuffleMask, Mask);
12219 Mask.swap(ShuffleMask);
12220 if (IsOp2Undef)
12221 Op = SV->getOperand(0);
12222 else
12223 Op = SV->getOperand(1);
12224 }
12225 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12226 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12227 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12228 if (IdentityOp) {
12229 V = IdentityOp;
12230 assert(Mask.size() == IdentityMask.size() &&
12231 "Expected masks of same sizes.");
12232 // Clear known poison elements.
12233 for (auto [I, Idx] : enumerate(Mask))
12234 if (Idx == PoisonMaskElem)
12235 IdentityMask[I] = PoisonMaskElem;
12236 Mask.swap(IdentityMask);
12237 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12238 return SinglePermute &&
12239 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12240 /*IsStrict=*/true) ||
12241 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12242 Shuffle->isZeroEltSplat() &&
12243 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12244 all_of(enumerate(Mask), [&](const auto &P) {
12245 return P.value() == PoisonMaskElem ||
12246 Shuffle->getShuffleMask()[P.index()] == 0;
12247 })));
12248 }
12249 V = Op;
12250 return false;
12251 }
12252 V = Op;
12253 return true;
12254 }
12255
12256 /// Smart shuffle instruction emission, walks through shuffles trees and
12257 /// tries to find the best matching vector for the actual shuffle
12258 /// instruction.
12259 template <typename T, typename ShuffleBuilderTy>
12260 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12261 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12262 assert(V1 && "Expected at least one vector value.");
12263 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12264 SmallVector<int> NewMask(Mask);
12265 if (ScalarTyNumElements != 1) {
12266 assert(SLPReVec && "FixedVectorType is not expected.");
12267 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12268 Mask = NewMask;
12269 }
12270 if (V2)
12271 Builder.resizeToMatch(V1, V2);
12272 int VF = Mask.size();
12273 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12274 VF = FTy->getNumElements();
12275 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
12276 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12277 .all()) {
12278 // Peek through shuffles.
12279 Value *Op1 = V1;
12280 Value *Op2 = V2;
12281 int VF =
12282 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12283 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12284 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12285 for (int I = 0, E = Mask.size(); I < E; ++I) {
12286 if (Mask[I] < VF)
12287 CombinedMask1[I] = Mask[I];
12288 else
12289 CombinedMask2[I] = Mask[I] - VF;
12290 }
12291 Value *PrevOp1;
12292 Value *PrevOp2;
12293 do {
12294 PrevOp1 = Op1;
12295 PrevOp2 = Op2;
12296 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12297 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12298 // Check if we have 2 resizing shuffles - need to peek through operands
12299 // again.
12300 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12301 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12302 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12303 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12304 if (I == PoisonMaskElem)
12305 continue;
12306 ExtMask1[Idx] = SV1->getMaskValue(I);
12307 }
12308 SmallBitVector UseMask1 = buildUseMask(
12309 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12310 ->getNumElements(),
12311 ExtMask1, UseMask::SecondArg);
12312 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12313 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12314 if (I == PoisonMaskElem)
12315 continue;
12316 ExtMask2[Idx] = SV2->getMaskValue(I);
12317 }
12318 SmallBitVector UseMask2 = buildUseMask(
12319 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12320 ->getNumElements(),
12321 ExtMask2, UseMask::SecondArg);
12322 if (SV1->getOperand(0)->getType() ==
12323 SV2->getOperand(0)->getType() &&
12324 SV1->getOperand(0)->getType() != SV1->getType() &&
12325 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12326 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12327 Op1 = SV1->getOperand(0);
12328 Op2 = SV2->getOperand(0);
12329 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12330 int LocalVF = ShuffleMask1.size();
12331 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12332 LocalVF = FTy->getNumElements();
12333 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12334 CombinedMask1.swap(ShuffleMask1);
12335 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12336 LocalVF = ShuffleMask2.size();
12337 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12338 LocalVF = FTy->getNumElements();
12339 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12340 CombinedMask2.swap(ShuffleMask2);
12341 }
12342 }
12343 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12344 Builder.resizeToMatch(Op1, Op2);
12345 VF = std::max(cast<VectorType>(Op1->getType())
12346 ->getElementCount()
12347 .getKnownMinValue(),
12348 cast<VectorType>(Op2->getType())
12349 ->getElementCount()
12350 .getKnownMinValue());
12351 for (int I = 0, E = Mask.size(); I < E; ++I) {
12352 if (CombinedMask2[I] != PoisonMaskElem) {
12353 assert(CombinedMask1[I] == PoisonMaskElem &&
12354 "Expected undefined mask element");
12355 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12356 }
12357 }
12358 if (Op1 == Op2 &&
12359 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12360 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12361 isa<ShuffleVectorInst>(Op1) &&
12362 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12363 ArrayRef(CombinedMask1))))
12364 return Builder.createIdentity(Op1);
12365 return Builder.createShuffleVector(
12366 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12367 CombinedMask1);
12368 }
12369 if (isa<PoisonValue>(V1))
12370 return Builder.createPoison(
12371 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12372 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12373 assert(V1 && "Expected non-null value after looking through shuffles.");
12374
12375 if (!IsIdentity)
12376 return Builder.createShuffleVector(V1, NewMask);
12377 return Builder.createIdentity(V1);
12378 }
12379
12380 /// Transforms the mask \p CommonMask per the given \p Mask so that it refers
12381 /// to the result of the just-emitted shuffle.
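/// For example, if \p CommonMask and \p Mask are both <2, poison, 0, poison>
/// before the shuffle is emitted, \p CommonMask becomes <0, poison, 2, poison>
/// afterwards: every defined lane now refers to its own position in the
/// just-emitted shuffle result.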
12382 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12383 ArrayRef<int> Mask) {
12384 for (unsigned I : seq<unsigned>(CommonMask.size()))
12385 if (Mask[I] != PoisonMaskElem)
12386 CommonMask[I] = I;
12387 }
12388};
12389} // namespace
12390
12391 /// Calculate the scalar and the vector costs of vectorizing a set of GEPs.
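/// For illustration only (hypothetical IR), given adjacent scalar loads
/// \code
///   %p1 = getelementptr inbounds float, ptr %base, i64 1
///   %l0 = load float, ptr %base
///   %l1 = load float, ptr %p1
/// \endcode
/// vectorizing them into one wide load from %base makes single-use GEPs such
/// as %p1 disappear, which is the saving that the scalar vs. vector
/// pointer-chain costs computed below try to capture.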
12392static std::pair<InstructionCost, InstructionCost>
12393 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12394 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12395 Type *ScalarTy, VectorType *VecTy) {
12396 InstructionCost ScalarCost = 0;
12397 InstructionCost VecCost = 0;
12398 // Here we differentiate two cases: (1) when Ptrs represent a regular
12399 // vectorization tree node (as they are pointer arguments of scattered
12400 // loads) or (2) when Ptrs are the arguments of loads or stores being
12401 // vectorized as a plain wide unit-stride load/store since all the
12402 // loads/stores are known to be from/to adjacent locations.
12403 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12404 // Case 2: estimate costs for pointer related costs when vectorizing to
12405 // a wide load/store.
12406 // Scalar cost is estimated as a set of pointers with known relationship
12407 // between them.
12408 // For vector code we will use BasePtr as argument for the wide load/store
12409 // but we also need to account all the instructions which are going to
12410 // stay in vectorized code due to uses outside of these scalar
12411 // loads/stores.
12412 ScalarCost = TTI.getPointersChainCost(
12413 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12414 CostKind);
12415
12416 SmallVector<const Value *> PtrsRetainedInVecCode;
12417 for (Value *V : Ptrs) {
12418 if (V == BasePtr) {
12419 PtrsRetainedInVecCode.push_back(V);
12420 continue;
12421 }
12422 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12423 // For simplicity, assume Ptr stays in the vectorized code if it is not a
12424 // GEP instruction. We don't care since its cost is considered free.
12425 // TODO: We should check for any uses outside of vectorizable tree
12426 // rather than just single use.
12427 if (!Ptr || !Ptr->hasOneUse())
12428 PtrsRetainedInVecCode.push_back(V);
12429 }
12430
12431 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12432 // If all pointers stay in vectorized code then we don't have
12433 // any savings on that.
12434 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12435 }
12436 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12437 TTI::PointersChainInfo::getKnownStride(),
12438 VecTy, CostKind);
12439 } else {
12440 // Case 1: Ptrs are the arguments of loads that we are going to transform
12441 // into masked gather load intrinsic.
12442 // All the scalar GEPs will be removed as a result of vectorization.
12443 // For any external uses of some lanes extract element instructions will
12444 // be generated (which cost is estimated separately).
12445 TTI::PointersChainInfo PtrsInfo =
12446 all_of(Ptrs,
12447 [](const Value *V) {
12448 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12449 return Ptr && !Ptr->hasAllConstantIndices();
12450 })
12451 ? TTI::PointersChainInfo::getUnknownStride()
12452 : TTI::PointersChainInfo::getKnownStride();
12453
12454 ScalarCost =
12455 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12456 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12457 if (!BaseGEP) {
12458 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12459 if (It != Ptrs.end())
12460 BaseGEP = cast<GEPOperator>(*It);
12461 }
12462 if (BaseGEP) {
12463 SmallVector<const Value *> Indices(BaseGEP->indices());
12464 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12465 BaseGEP->getPointerOperand(), Indices, VecTy,
12466 CostKind);
12467 }
12468 }
12469
12470 return std::make_pair(ScalarCost, VecCost);
12471}
12472
12473void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12474 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12475 "Expected gather node without reordering.");
12476 SmallDenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12477 SmallSet<size_t, 2> LoadKeyUsed;
12478
12479 // Do not reorder the node if it is small (just 2 elements), all-constant, or
12480 // if all instructions already have the same opcode.
12481 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12482 all_of(TE.Scalars, isConstant))
12483 return;
12484
12485 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12486 return VectorizableTree[Idx]->isSame(TE.Scalars);
12487 }))
12488 return;
12489
12490 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12491 Key = hash_combine(hash_value(LI->getParent()), Key);
12492 Value *Ptr =
12493 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12494 if (LoadKeyUsed.contains(Key)) {
12495 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12496 if (LIt != LoadsMap.end()) {
12497 for (LoadInst *RLI : LIt->second) {
12498 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12499 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12500 /*StrictCheck=*/true))
12501 return hash_value(RLI->getPointerOperand());
12502 }
12503 for (LoadInst *RLI : LIt->second) {
12504 if (arePointersCompatible(RLI->getPointerOperand(),
12505 LI->getPointerOperand(), *TLI)) {
12506 hash_code SubKey = hash_value(RLI->getPointerOperand());
12507 return SubKey;
12508 }
12509 }
12510 if (LIt->second.size() > 2) {
12511 hash_code SubKey =
12512 hash_value(LIt->second.back()->getPointerOperand());
12513 return SubKey;
12514 }
12515 }
12516 }
12517 LoadKeyUsed.insert(Key);
12518 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12519 return hash_value(LI->getPointerOperand());
12520 };
12521 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12522 SmallDenseMap<Value *, SmallVector<unsigned>, 16> KeyToIndex;
12523 bool IsOrdered = true;
12524 unsigned NumInstructions = 0;
12525 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12526 // nodes.
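// For example (hypothetical scalars), a gather of {add, load, add, load} may
// be reordered to {add, add, load, load} so that each group of equal-opcode
// scalars can form its own subvector and be vectorized separately.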
12527 for (auto [I, V] : enumerate(TE.Scalars)) {
12528 size_t Key = 1, Idx = 1;
12529 if (auto *Inst = dyn_cast<Instruction>(V);
12530 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
12531 !isDeleted(Inst) && !isVectorized(V)) {
12532 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12533 /*AllowAlternate=*/false);
12534 ++NumInstructions;
12535 }
12536 auto &Container = SortedValues[Key];
12537 if (IsOrdered && !KeyToIndex.contains(V) &&
12538 !(isa<Constant, ExtractElementInst>(V) ||
12539 isVectorLikeInstWithConstOps(V)) &&
12540 ((Container.contains(Idx) &&
12541 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12542 (!Container.empty() && !Container.contains(Idx) &&
12543 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12544 IsOrdered = false;
12545 auto &KTI = KeyToIndex[V];
12546 if (KTI.empty())
12547 Container[Idx].push_back(V);
12548 KTI.push_back(I);
12549 }
12550 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12551 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12552 if (!IsOrdered && NumInstructions > 1) {
12553 unsigned Cnt = 0;
12554 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12555 for (const auto &D : SortedValues) {
12556 for (const auto &P : D.second) {
12557 unsigned Sz = 0;
12558 for (Value *V : P.second) {
12559 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12560 for (auto [K, Idx] : enumerate(Indices)) {
12561 TE.ReorderIndices[Cnt + K] = Idx;
12562 TE.Scalars[Cnt + K] = V;
12563 }
12564 Sz += Indices.size();
12565 Cnt += Indices.size();
12566 }
12567 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12568 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12569 *TTI, TE.Scalars.front()->getType(), Sz);
12570 SubVectors.emplace_back(Cnt - Sz, SubVF);
12571 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12572 DemandedElts.clearBit(I);
12573 } else if (!P.second.empty() && isConstant(P.second.front())) {
12574 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12575 DemandedElts.clearBit(I);
12576 }
12577 }
12578 }
12579 }
12580 // Reuses always require shuffles, so consider it as profitable.
12581 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12582 return;
12583 // Do simple cost estimation.
12584 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12585 InstructionCost Cost = 0;
12586 auto *ScalarTy = TE.Scalars.front()->getType();
12587 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12588 for (auto [Idx, Sz] : SubVectors) {
12589 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12590 Idx, getWidenedType(ScalarTy, Sz));
12591 }
12592 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12593 /*Insert=*/true,
12594 /*Extract=*/false, CostKind);
12595 int Sz = TE.Scalars.size();
12596 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12597 TE.ReorderIndices.end());
12598 for (unsigned I : seq<unsigned>(Sz)) {
12599 Value *V = TE.getOrdered(I);
12600 if (isa<PoisonValue>(V)) {
12601 ReorderMask[I] = PoisonMaskElem;
12602 } else if (isConstant(V) || DemandedElts[I]) {
12603 ReorderMask[I] = I + TE.ReorderIndices.size();
12604 }
12605 }
12606 Cost += ::getShuffleCost(*TTI,
12607 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12608 ? TTI::SK_PermuteTwoSrc
12609 : TTI::SK_PermuteSingleSrc,
12610 VecTy, ReorderMask);
12611 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12612 ReorderMask.assign(Sz, PoisonMaskElem);
12613 for (unsigned I : seq<unsigned>(Sz)) {
12614 Value *V = TE.getOrdered(I);
12615 if (isConstant(V)) {
12616 DemandedElts.clearBit(I);
12617 if (!isa<PoisonValue>(V))
12618 ReorderMask[I] = I;
12619 } else {
12620 ReorderMask[I] = I + Sz;
12621 }
12622 }
12623 InstructionCost BVCost =
12624 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12625 /*Insert=*/true, /*Extract=*/false, CostKind);
12626 if (!DemandedElts.isAllOnes())
12627 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12628 if (Cost >= BVCost) {
12629 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12630 reorderScalars(TE.Scalars, Mask);
12631 TE.ReorderIndices.clear();
12632 }
12633}
12634
12635 /// Check if we can convert an fadd/fsub sequence to FMA.
12636 /// \returns the cost of the FMA if conversion is possible, an invalid cost otherwise.
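/// For example (hypothetical IR), a contractable multiply feeding an add
/// \code
///   %m = fmul contract double %a, %b
///   %r = fadd contract double %m, %c
/// \endcode
/// can be lowered as a single @llvm.fmuladd(%a, %b, %c); this helper compares
/// the cost of that intrinsic against the separate fmul + fadd.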
12637 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12638 const InstructionsState &S,
12639 DominatorTree &DT, const DataLayout &DL,
12640 const TargetTransformInfo &TTI,
12641 const TargetLibraryInfo &TLI) {
12642 assert(all_of(VL,
12643 [](Value *V) {
12644 return V->getType()->getScalarType()->isFloatingPointTy();
12645 }) &&
12646 "Can only convert to FMA for floating point types");
12647 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12648
12649 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12650 FastMathFlags FMF;
12651 FMF.set();
12652 for (Value *V : VL) {
12653 auto *I = dyn_cast<Instruction>(V);
12654 if (!I)
12655 continue;
12656 if (S.isCopyableElement(I))
12657 continue;
12658 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12659 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12660 continue;
12661 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12662 FMF &= FPCI->getFastMathFlags();
12663 }
12664 return FMF.allowContract();
12665 };
12666 if (!CheckForContractable(VL))
12667 return InstructionCost::getInvalid();
12668 // The fmul also should be contractable.
12669 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12670 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12671
12672 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12673 if (!OpS.valid())
12674 return InstructionCost::getInvalid();
12675
12676 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12677 return InstructionCost::getInvalid();
12678 if (!CheckForContractable(Operands.front()))
12679 return InstructionCost::getInvalid();
12680 // Compare the costs.
12681 InstructionCost FMulPlusFAddCost = 0;
12682 InstructionCost FMACost = 0;
12683 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12684 FastMathFlags FMF;
12685 FMF.set();
12686 for (Value *V : VL) {
12687 auto *I = dyn_cast<Instruction>(V);
12688 if (!I)
12689 continue;
12690 if (!S.isCopyableElement(I))
12691 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12692 FMF &= FPCI->getFastMathFlags();
12693 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12694 }
12695 unsigned NumOps = 0;
12696 for (auto [V, Op] : zip(VL, Operands.front())) {
12697 if (S.isCopyableElement(V))
12698 continue;
12699 auto *I = dyn_cast<Instruction>(Op);
12700 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12701 if (auto *OpI = dyn_cast<Instruction>(V))
12702 FMACost += TTI.getInstructionCost(OpI, CostKind);
12703 if (I)
12704 FMACost += TTI.getInstructionCost(I, CostKind);
12705 continue;
12706 }
12707 ++NumOps;
12708 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12709 FMF &= FPCI->getFastMathFlags();
12710 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12711 }
12712 Type *Ty = VL.front()->getType();
12713 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12714 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12715 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12716}
12717
12718 void BoUpSLP::transformNodes() {
12719 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12720 BaseGraphSize = VectorizableTree.size();
12721 // Turn graph transforming mode on and off, when done.
12722 class GraphTransformModeRAAI {
12723 bool &SavedIsGraphTransformMode;
12724
12725 public:
12726 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12727 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12728 IsGraphTransformMode = true;
12729 }
12730 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12731 } TransformContext(IsGraphTransformMode);
12732 // Operands are profitable if they are:
12733 // 1. At least one constant
12734 // or
12735 // 2. Splats
12736 // or
12737 // 3. Results in good vectorization opportunity, i.e. may generate vector
12738 // nodes and reduce cost of the graph.
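// For instance (hypothetical values), operand pairs such as (%x, %x) or
// (%x, 7) are accepted directly by the checks below; other pairs rely on a
// separate profitability estimate.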
12739 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12740 const InstructionsState &S) {
12741 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12742 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12743 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12744 I2->getOperand(Op));
12745 return all_of(
12746 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12747 return all_of(Cand,
12748 [](const std::pair<Value *, Value *> &P) {
12749 return isa<Constant>(P.first) ||
12750 isa<Constant>(P.second) || P.first == P.second;
12751 }) ||
12752 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12753 });
12754 };
12755
12756 // Try to reorder gather nodes for better vectorization opportunities.
12757 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12758 TreeEntry &E = *VectorizableTree[Idx];
12759 if (E.isGather())
12760 reorderGatherNode(E);
12761 }
12762
12763 // Better to use the full gathered loads analysis, if there are only 2
12764 // gathered load nodes, each having fewer than 16 elements.
12765 constexpr unsigned VFLimit = 16;
12766 bool ForceLoadGather =
12767 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12768 return TE->isGather() && TE->hasState() &&
12769 TE->getOpcode() == Instruction::Load &&
12770 TE->getVectorFactor() < VFLimit;
12771 }) == 2;
12772
12773 // Checks if the scalars are used in another node.
12774 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12775 function_ref<bool(Value *)> CheckContainer) {
12776 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12777 if (isa<PoisonValue>(V))
12778 return true;
12779 auto *I = dyn_cast<Instruction>(V);
12780 if (!I)
12781 return false;
12782 return is_contained(TE->Scalars, I) || CheckContainer(I);
12783 });
12784 };
12785 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12786 if (E.hasState()) {
12787 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12788 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12789 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12790 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12791 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12792 return is_contained(TEs, TE);
12793 });
12794 });
12795 }))
12796 return true;
12797
12798 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12799 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12800 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12801 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12802 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12803 return is_contained(TEs, TE);
12804 });
12805 });
12806 }))
12807 return true;
12808 } else {
12809 // Check if the gather node is a full copy of a split node.
12810 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12811 if (It != E.Scalars.end()) {
12812 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12813 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12814 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12815 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12816 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12817 return is_contained(TEs, TE);
12818 });
12819 });
12820 }))
12821 return true;
12822 }
12823 }
12824 return false;
12825 };
12826 // The tree may grow here, so iterate over the nodes built before.
12827 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12828 TreeEntry &E = *VectorizableTree[Idx];
12829 if (E.isGather()) {
12830 ArrayRef<Value *> VL = E.Scalars;
12831 const unsigned Sz = getVectorElementSize(VL.front());
12832 unsigned MinVF = getMinVF(2 * Sz);
12833 // Do not try partial vectorization for small nodes (<= 2), nodes with the
12834 // same opcode and same parent block or all constants.
12835 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
12836 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
12837 // We use allSameOpcode instead of isAltShuffle because we don't
12838 // want to use interchangeable instruction here.
12839 !allSameOpcode(VL) || !allSameBlock(VL)) ||
12840 allConstant(VL) || isSplat(VL))
12841 continue;
12842 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
12843 continue;
12844 // Check if the node is a copy of other vector nodes.
12845 if (CheckForSameVectorNodes(E))
12846 continue;
12847 // Try to find vectorizable sequences and transform them into a series of
12848 // insertvector instructions.
12849 unsigned StartIdx = 0;
12850 unsigned End = VL.size();
12851 for (unsigned VF = getFloorFullVectorNumberOfElements(
12852 *TTI, VL.front()->getType(), VL.size() - 1);
12853 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
12854 *TTI, VL.front()->getType(), VF - 1)) {
12855 if (StartIdx + VF > End)
12856 continue;
12857 SmallVector<std::pair<unsigned, unsigned>> Slices;
12858 bool AllStrided = true;
12859 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
12860 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
12861 // If any instruction is vectorized already - do not try again.
12862 // Reuse the existing node, if it fully matches the slice.
12863 if (isVectorized(Slice.front()) &&
12864 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
12865 continue;
12866 // Constant already handled effectively - skip.
12867 if (allConstant(Slice))
12868 continue;
12869 // Do not try to vectorize small splats (less than vector register and
12870 // only with the single non-undef element).
12871 bool IsSplat = isSplat(Slice);
12872 bool IsTwoRegisterSplat = true;
12873 if (IsSplat && VF == 2) {
12874 unsigned NumRegs2VF = ::getNumberOfParts(
12875 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
12876 IsTwoRegisterSplat = NumRegs2VF == 2;
12877 }
12878 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
12879 count(Slice, Slice.front()) ==
12880 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
12881 : 1)) {
12882 if (IsSplat)
12883 continue;
12884 InstructionsState S = getSameOpcode(Slice, *TLI);
12885 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
12886 (S.getOpcode() == Instruction::Load &&
12887 areKnownNonVectorizableLoads(Slice)) ||
12888 (S.getOpcode() != Instruction::Load &&
12889 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
12890 continue;
12891 if (VF == 2) {
12892 // Try to vectorize reduced values or if all users are vectorized.
12893 // For expensive instructions extra extracts might be profitable.
12894 if ((!UserIgnoreList || E.Idx != 0) &&
12895 TTI->getInstructionCost(S.getMainOp(), CostKind) <
12896 TTI::TCC_Expensive &&
12897 !all_of(Slice, [&](Value *V) {
12898 if (isa<PoisonValue>(V))
12899 return true;
12900 return areAllUsersVectorized(cast<Instruction>(V),
12901 UserIgnoreList);
12902 }))
12903 continue;
12904 if (S.getOpcode() == Instruction::Load) {
12905 OrdersType Order;
12906 SmallVector<Value *> PointerOps;
12907 LoadsState Res =
12908 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
12909 AllStrided &= Res == LoadsState::StridedVectorize ||
12910 Res == LoadsState::ScatterVectorize ||
12911 Res == LoadsState::Gather;
12912 // Do not vectorize gathers.
12913 if (Res == LoadsState::ScatterVectorize ||
12914 Res == LoadsState::Gather) {
12915 if (Res == LoadsState::Gather) {
12916 registerNonVectorizableLoads(Slice);
12917 // If reductions and the scalars from the root node are
12918 // analyzed - mark as non-vectorizable reduction.
12919 if (UserIgnoreList && E.Idx == 0)
12920 analyzedReductionVals(Slice);
12921 }
12922 continue;
12923 }
12924 } else if (S.getOpcode() == Instruction::ExtractElement ||
12925 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
12926 TTI::TCC_Expensive &&
12927 !CheckOperandsProfitability(
12928 S.getMainOp(),
12929 cast<Instruction>(*find_if(reverse(Slice),
12930 IsaPred<Instruction>)),
12931 S))) {
12932 // Do not vectorize extractelements (handled effectively
12933 // already). Do not vectorize non-profitable instructions (with
12934 // low cost and non-vectorizable operands).
12935 continue;
12936 }
12937 }
12938 }
12939 Slices.emplace_back(Cnt, Slice.size());
12940 }
12941 // Do not try to vectorize if all slices are strided or gathered with
12942 // vector factor 2 and there are more than 2 slices. Better to handle
12943 // them in gathered loads analysis, may result in better vectorization.
12944 if (VF == 2 && AllStrided && Slices.size() > 2)
12945 continue;
12946 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
12947 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
12948 if (StartIdx == Cnt)
12949 StartIdx = Cnt + Sz;
12950 if (End == Cnt + Sz)
12951 End = Cnt;
12952 };
12953 for (auto [Cnt, Sz] : Slices) {
12954 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
12955 const TreeEntry *SameTE = nullptr;
12956 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
12957 It != Slice.end()) {
12958 // If any instruction is vectorized already - do not try again.
12959 SameTE = getSameValuesTreeEntry(*It, Slice);
12960 }
12961 unsigned PrevSize = VectorizableTree.size();
12962 [[maybe_unused]] unsigned PrevEntriesSize =
12963 LoadEntriesToVectorize.size();
12964 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
12965 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
12966 VectorizableTree[PrevSize]->isGather() &&
12967 VectorizableTree[PrevSize]->hasState() &&
12968 VectorizableTree[PrevSize]->getOpcode() !=
12969 Instruction::ExtractElement &&
12970 !isSplat(Slice)) {
12971 if (UserIgnoreList && E.Idx == 0 && VF == 2)
12972 analyzedReductionVals(Slice);
12973 VectorizableTree.pop_back();
12974 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
12975 "LoadEntriesToVectorize expected to remain the same");
12976 continue;
12977 }
12978 AddCombinedNode(PrevSize, Cnt, Sz);
12979 }
12980 }
12981 // Restore ordering, if no extra vectorization happened.
12982 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
12983 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
12984 reorderScalars(E.Scalars, Mask);
12985 E.ReorderIndices.clear();
12986 }
12987 }
12988 if (!E.hasState())
12989 continue;
12990 switch (E.getOpcode()) {
12991 case Instruction::Load: {
12992 // No need to reorder masked gather loads, just reorder the scalar
12993 // operands.
12994 if (E.State != TreeEntry::Vectorize)
12995 break;
12996 Type *ScalarTy = E.getMainOp()->getType();
12997 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
12998 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
12999 // Check if profitable to represent consecutive load + reverse as strided
13000 // load with stride -1.
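// E.g. scalar loads of a[3], a[2], a[1], a[0] can be emitted as one strided
// load starting at &a[3] with stride -1 instead of a wide load of a[0..3]
// followed by a reverse shuffle, when the target supports strided accesses.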
13001 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13002 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13003 SmallVector<int> Mask;
13004 inversePermutation(E.ReorderIndices, Mask);
13005 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13006 InstructionCost OriginalVecCost =
13007 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13008 BaseLI->getPointerAddressSpace(), CostKind,
13009 TTI::OperandValueInfo()) +
13010 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13011 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13012 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13013 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13014 if (StridedCost < OriginalVecCost)
13015 // Strided load is more profitable than consecutive load + reverse -
13016 // transform the node to strided load.
13017 E.State = TreeEntry::StridedVectorize;
13018 }
13019 break;
13020 }
13021 case Instruction::Store: {
13022 Type *ScalarTy =
13023 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13024 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13025 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13026 // Check if profitable to represent consecutive store + reverse as strided
13027 // store with stride -1.
13028 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13029 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13030 SmallVector<int> Mask;
13031 inversePermutation(E.ReorderIndices, Mask);
13032 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13033 InstructionCost OriginalVecCost =
13034 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13035 BaseSI->getPointerAddressSpace(), CostKind,
13036 TTI::OperandValueInfo()) +
13037 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13038 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13039 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13040 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13041 if (StridedCost < OriginalVecCost)
13042 // Strided store is more profitable than reverse + consecutive store -
13043 // transform the node to strided store.
13044 E.State = TreeEntry::StridedVectorize;
13045 } else if (!E.ReorderIndices.empty()) {
13046 // Check for interleaved stores.
13047 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13048 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13049 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13050 if (Mask.size() < 4)
13051 return 0u;
13052 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13053 if (ShuffleVectorInst::isInterleaveMask(
13054 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13055 TTI->isLegalInterleavedAccessType(
13056 VecTy, Factor, BaseSI->getAlign(),
13057 BaseSI->getPointerAddressSpace()))
13058 return Factor;
13059 }
13060
13061 return 0u;
13062 };
13063 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13064 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13065 if (InterleaveFactor != 0)
13066 E.setInterleave(InterleaveFactor);
13067 }
13068 break;
13069 }
13070 case Instruction::Select: {
13071 if (E.State != TreeEntry::Vectorize)
13072 break;
13073 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13074 if (MinMaxID == Intrinsic::not_intrinsic)
13075 break;
13076 // This node is a minmax node.
13077 E.CombinedOp = TreeEntry::MinMax;
13078 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13079 if (SelectOnly && CondEntry->UserTreeIndex &&
13080 CondEntry->State == TreeEntry::Vectorize) {
13081 // The condition node is part of the combined minmax node.
13082 CondEntry->State = TreeEntry::CombinedVectorize;
13083 }
13084 break;
13085 }
13086 case Instruction::FSub:
13087 case Instruction::FAdd: {
13088 // Check if possible to convert (a*b)+c to fma.
13089 if (E.State != TreeEntry::Vectorize ||
13090 !E.getOperations().isAddSubLikeOp())
13091 break;
13092 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13093 .isValid())
13094 break;
13095 // This node is a fmuladd node.
13096 E.CombinedOp = TreeEntry::FMulAdd;
13097 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13098 if (FMulEntry->UserTreeIndex &&
13099 FMulEntry->State == TreeEntry::Vectorize) {
13100 // The FMul node is part of the combined fmuladd node.
13101 FMulEntry->State = TreeEntry::CombinedVectorize;
13102 }
13103 break;
13104 }
13105 default:
13106 break;
13107 }
13108 }
13109
13110 if (LoadEntriesToVectorize.empty()) {
13111 // Single load node - exit.
13112 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13113 VectorizableTree.front()->getOpcode() == Instruction::Load)
13114 return;
13115 // Small graph with small VF - exit.
13116 constexpr unsigned SmallTree = 3;
13117 constexpr unsigned SmallVF = 2;
13118 if ((VectorizableTree.size() <= SmallTree &&
13119 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13120 (VectorizableTree.size() <= 2 && UserIgnoreList))
13121 return;
13122
13123 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13124 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13125 getCanonicalGraphSize() <= SmallTree &&
13126 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13127 [](const std::unique_ptr<TreeEntry> &TE) {
13128 return TE->isGather() && TE->hasState() &&
13129 TE->getOpcode() == Instruction::Load &&
13130 !allSameBlock(TE->Scalars);
13131 }) == 1)
13132 return;
13133 }
13134
13135 // A list of loads to be gathered during the vectorization process. We can
13136 // try to vectorize them at the end, if profitable.
13137 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13138 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13139 GatheredLoads;
13140
13141 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13142 TreeEntry &E = *TE;
13143 if (E.isGather() &&
13144 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13145 (!E.hasState() && any_of(E.Scalars,
13146 [&](Value *V) {
13147 return isa<LoadInst>(V) &&
13148 !isVectorized(V) &&
13149 !isDeleted(cast<Instruction>(V));
13150 }))) &&
13151 !isSplat(E.Scalars)) {
13152 for (Value *V : E.Scalars) {
13153 auto *LI = dyn_cast<LoadInst>(V);
13154 if (!LI)
13155 continue;
13156 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13157 continue;
13158 gatherPossiblyVectorizableLoads(
13159 *this, V, *DL, *SE, *TTI,
13160 GatheredLoads[std::make_tuple(
13161 LI->getParent(),
13162 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13163 LI->getType())]);
13164 }
13165 }
13166 }
13167 // Try to vectorize gathered loads if this is not just a gather of loads.
13168 if (!GatheredLoads.empty())
13169 tryToVectorizeGatheredLoads(GatheredLoads);
13170}
13171
13172/// Merges shuffle masks and emits final shuffle instruction, if required. It
13173 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
13174 /// where the actual shuffle instruction is generated only if it is actually
13175 /// required. Otherwise, the shuffle instruction emission is delayed till the
13176/// end of the process, to reduce the number of emitted instructions and further
13177/// analysis/transformations.
13178class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13179 bool IsFinalized = false;
13180 SmallVector<int> CommonMask;
13181 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13182 const TargetTransformInfo &TTI;
13183 InstructionCost Cost = 0;
13184 SmallDenseSet<Value *> VectorizedVals;
13185 BoUpSLP &R;
13186 SmallPtrSetImpl<Value *> &CheckedExtracts;
13187 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13188 /// While set, we are still trying to estimate the cost for the same nodes and
13189 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13190 /// This may help to better estimate the cost if the same nodes must be permuted
13191 /// and allows moving most of the long shuffle cost estimation to TTI.
13192 bool SameNodesEstimated = true;
13193
13194 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13195 if (Ty->getScalarType()->isPointerTy()) {
13196 Constant *Res = ConstantExpr::getIntToPtr(
13197 Constant::getAllOnesValue(
13198 IntegerType::get(Ty->getContext(),
13199 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13200 Ty->getScalarType());
13201 if (auto *VTy = dyn_cast<VectorType>(Ty))
13202 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13203 return Res;
13204 }
13205 return Constant::getAllOnesValue(Ty);
13206 }
13207
13208 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13209 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13210 return TTI::TCC_Free;
13211 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13212 InstructionCost GatherCost = 0;
13213 SmallVector<Value *> Gathers(VL);
13214 if (!Root && isSplat(VL)) {
13215 // Found the broadcasting of the single scalar, calculate the cost as
13216 // the broadcast.
13217 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13218 assert(It != VL.end() && "Expected at least one non-undef value.");
13219 // Add broadcast for non-identity shuffle only.
13220 bool NeedShuffle =
13221 count(VL, *It) > 1 &&
13222 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13223 if (!NeedShuffle) {
13224 if (isa<FixedVectorType>(ScalarTy)) {
13225 assert(SLPReVec && "FixedVectorType is not expected.");
13226 return TTI.getShuffleCost(
13227 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13228 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13229 cast<FixedVectorType>(ScalarTy));
13230 }
13231 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13232 CostKind, std::distance(VL.begin(), It),
13233 PoisonValue::get(VecTy), *It);
13234 }
13235
13236 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13237 transform(VL, ShuffleMask.begin(), [](Value *V) {
13238 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13239 });
13240 InstructionCost InsertCost =
13241 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13242 PoisonValue::get(VecTy), *It);
13243 return InsertCost + ::getShuffleCost(TTI,
13244 TTI::SK_Broadcast,
13245 VecTy, ShuffleMask, CostKind,
13246 /*Index=*/0, /*SubTp=*/nullptr,
13247 /*Args=*/*It);
13248 }
13249 return GatherCost +
13250 (all_of(Gathers, IsaPred<UndefValue>)
13251 ? TTI::TCC_Free
13252 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13253 ScalarTy));
13254 };
13255
13256 /// Compute the cost of creating a vector containing the extracted values from
13257 /// \p VL.
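/// For example, if every value in \p VL is an extractelement from the same wide
/// source vector, that source may be reusable directly, so the cost can be a
/// single per-register shuffle (or even free) instead of a full build-vector of
/// the extracted scalars.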
13258 InstructionCost
13259 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13260 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13261 unsigned NumParts) {
13262 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13263 unsigned NumElts =
13264 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13265 auto *EE = dyn_cast<ExtractElementInst>(V);
13266 if (!EE)
13267 return Sz;
13268 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13269 if (!VecTy)
13270 return Sz;
13271 return std::max(Sz, VecTy->getNumElements());
13272 });
13273 // FIXME: this must be moved to TTI for better estimation.
13274 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13275 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13276 SmallVectorImpl<unsigned> &Indices,
13277 SmallVectorImpl<unsigned> &SubVecSizes)
13278 -> std::optional<TTI::ShuffleKind> {
13279 if (NumElts <= EltsPerVector)
13280 return std::nullopt;
13281 int OffsetReg0 =
13282 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13283 [](int S, int I) {
13284 if (I == PoisonMaskElem)
13285 return S;
13286 return std::min(S, I);
13287 }),
13288 EltsPerVector);
13289 int OffsetReg1 = OffsetReg0;
13290 DenseSet<int> RegIndices;
13291 // Check if we are trying to permute the same single or 2 input vectors.
13292 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13293 int FirstRegId = -1;
13294 Indices.assign(1, OffsetReg0);
13295 for (auto [Pos, I] : enumerate(Mask)) {
13296 if (I == PoisonMaskElem)
13297 continue;
13298 int Idx = I - OffsetReg0;
13299 int RegId =
13300 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13301 if (FirstRegId < 0)
13302 FirstRegId = RegId;
13303 RegIndices.insert(RegId);
13304 if (RegIndices.size() > 2)
13305 return std::nullopt;
13306 if (RegIndices.size() == 2) {
13307 ShuffleKind = TTI::SK_PermuteTwoSrc;
13308 if (Indices.size() == 1) {
13309 OffsetReg1 = alignDown(
13310 std::accumulate(
13311 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13312 [&](int S, int I) {
13313 if (I == PoisonMaskElem)
13314 return S;
13315 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13316 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13317 if (RegId == FirstRegId)
13318 return S;
13319 return std::min(S, I);
13320 }),
13321 EltsPerVector);
13322 unsigned Index = OffsetReg1 % NumElts;
13323 Indices.push_back(Index);
13324 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13325 }
13326 Idx = I - OffsetReg1;
13327 }
13328 I = (Idx % NumElts) % EltsPerVector +
13329 (RegId == FirstRegId ? 0 : EltsPerVector);
13330 }
13331 return ShuffleKind;
13332 };
13333 InstructionCost Cost = 0;
13334
13335 // Process extracts in blocks of EltsPerVector to check if the source vector
13336 // operand can be re-used directly. If not, add the cost of creating a
13337 // shuffle to extract the values into a vector register.
13338 for (unsigned Part : seq<unsigned>(NumParts)) {
13339 if (!ShuffleKinds[Part])
13340 continue;
13341 ArrayRef<int> MaskSlice = Mask.slice(
13342 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13343 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13344 copy(MaskSlice, SubMask.begin());
13345 SmallVector<unsigned, 2> Indices;
13346 SmallVector<unsigned, 2> SubVecSizes;
13347 std::optional<TTI::ShuffleKind> RegShuffleKind =
13348 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13349 if (!RegShuffleKind) {
13350 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13351 !ShuffleVectorInst::isIdentityMask(
13352 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13353 Cost +=
13354 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13355 getWidenedType(ScalarTy, NumElts), MaskSlice);
13356 continue;
13357 }
13358 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13359 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13360 Cost +=
13361 ::getShuffleCost(TTI, *RegShuffleKind,
13362 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13363 }
13364 const unsigned BaseVF = getFullVectorNumberOfElements(
13365 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13366 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13367 assert((Idx + SubVecSize) <= BaseVF &&
13368 "SK_ExtractSubvector index out of range");
13369 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13370 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13371 Idx, getWidenedType(ScalarTy, SubVecSize));
13372 }
13373 // Second attempt to check, if just a permute is better estimated than
13374 // subvector extract.
13375 SubMask.assign(NumElts, PoisonMaskElem);
13376 copy(MaskSlice, SubMask.begin());
13377 InstructionCost OriginalCost = ::getShuffleCost(
13378 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13379 if (OriginalCost < Cost)
13380 Cost = OriginalCost;
13381 }
13382 return Cost;
13383 }
13384 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13385 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13386 /// elements.
13387 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13388 ArrayRef<int> Mask, unsigned Part,
13389 unsigned SliceSize) {
13390 if (SameNodesEstimated) {
13391 // Delay the cost estimation if the same nodes are reshuffling.
13392 // If we already requested the cost of reshuffling of E1 and E2 before, no
13393 // need to estimate another cost with the sub-Mask, instead include this
13394 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13395 // estimation.
13396 if ((InVectors.size() == 2 &&
13397 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13398 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13399 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13400 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13401 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13402 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13403 "Expected all poisoned elements.");
13404 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13405 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13406 return;
13407 }
13408 // Found non-matching nodes - need to estimate the cost for the matched
13409 // and transform mask.
13410 Cost += createShuffle(InVectors.front(),
13411 InVectors.size() == 1 ? nullptr : InVectors.back(),
13412 CommonMask);
13413 transformMaskAfterShuffle(CommonMask, CommonMask);
13414 } else if (InVectors.size() == 2) {
13415 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13416 transformMaskAfterShuffle(CommonMask, CommonMask);
13417 }
13418 SameNodesEstimated = false;
13419 if (!E2 && InVectors.size() == 1) {
13420 unsigned VF = E1.getVectorFactor();
13421 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13422 VF = std::max(VF, getVF(V1));
13423 } else {
13424 const auto *E = cast<const TreeEntry *>(InVectors.front());
13425 VF = std::max(VF, E->getVectorFactor());
13426 }
13427 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13428 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13429 CommonMask[Idx] = Mask[Idx] + VF;
13430 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13431 transformMaskAfterShuffle(CommonMask, CommonMask);
13432 } else {
13433 auto P = InVectors.front();
13434 Cost += createShuffle(&E1, E2, Mask);
13435 unsigned VF = Mask.size();
13436 if (Value *V1 = dyn_cast<Value *>(P)) {
13437 VF = std::max(VF,
13438 getNumElements(V1->getType()));
13439 } else {
13440 const auto *E = cast<const TreeEntry *>(P);
13441 VF = std::max(VF, E->getVectorFactor());
13442 }
13443 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13444 if (Mask[Idx] != PoisonMaskElem)
13445 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13446 Cost += createShuffle(P, InVectors.front(), CommonMask);
13447 transformMaskAfterShuffle(CommonMask, CommonMask);
13448 }
13449 }
13450
13451 class ShuffleCostBuilder {
13452 const TargetTransformInfo &TTI;
13453
13454 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13455 int Index = -1;
13456 return Mask.empty() ||
13457 (VF == Mask.size() &&
13458 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13459 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13460 Index == 0);
13461 }
13462
13463 public:
13464 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13465 ~ShuffleCostBuilder() = default;
13466 InstructionCost createShuffleVector(Value *V1, Value *,
13467 ArrayRef<int> Mask) const {
13468 // Empty mask or identity mask are free.
13469 unsigned VF =
13470 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13471 if (isEmptyOrIdentity(Mask, VF))
13472 return TTI::TCC_Free;
13473 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13474 cast<VectorType>(V1->getType()), Mask);
13475 }
13476 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13477 // Empty mask or identity mask are free.
13478 unsigned VF =
13479 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13480 if (isEmptyOrIdentity(Mask, VF))
13481 return TTI::TCC_Free;
13482 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13483 cast<VectorType>(V1->getType()), Mask);
13484 }
13485 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13486 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13487 return TTI::TCC_Free;
13488 }
13489 void resizeToMatch(Value *&, Value *&) const {}
13490 };
13491
13492 /// Smart shuffle instruction emission, walks through shuffles trees and
13493 /// tries to find the best matching vector for the actual shuffle
13494 /// instruction.
13495 InstructionCost
13496 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13497 const PointerUnion<Value *, const TreeEntry *> &P2,
13498 ArrayRef<int> Mask) {
13499 ShuffleCostBuilder Builder(TTI);
13500 SmallVector<int> CommonMask(Mask);
13501 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13502 unsigned CommonVF = Mask.size();
13503 InstructionCost ExtraCost = 0;
13504 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13505 unsigned VF) -> InstructionCost {
13506 if (E.isGather() && allConstant(E.Scalars))
13507 return TTI::TCC_Free;
13508 Type *EScalarTy = E.Scalars.front()->getType();
13509 bool IsSigned = true;
13510 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13511 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13512 IsSigned = It->second.second;
13513 }
13514 if (EScalarTy != ScalarTy) {
13515 unsigned CastOpcode = Instruction::Trunc;
13516 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13517 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13518 if (DstSz > SrcSz)
13519 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13520 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13521 getWidenedType(EScalarTy, VF),
13522 TTI::CastContextHint::None, CostKind);
13523 }
13524 return TTI::TCC_Free;
13525 };
13526 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13527 if (isa<Constant>(V))
13528 return TTI::TCC_Free;
13529 auto *VecTy = cast<VectorType>(V->getType());
13530 Type *EScalarTy = VecTy->getElementType();
13531 if (EScalarTy != ScalarTy) {
13532 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13533 unsigned CastOpcode = Instruction::Trunc;
13534 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13535 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13536 if (DstSz > SrcSz)
13537 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13538 return TTI.getCastInstrCost(
13539 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13540 VecTy, TTI::CastContextHint::None, CostKind);
13541 }
13542 return TTI::TCC_Free;
13543 };
13544 if (!V1 && !V2 && !P2.isNull()) {
13545 // Shuffle 2 entry nodes.
13546 const TreeEntry *E = cast<const TreeEntry *>(P1);
13547 unsigned VF = E->getVectorFactor();
13548 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13549 CommonVF = std::max(VF, E2->getVectorFactor());
13550 assert(all_of(Mask,
13551 [=](int Idx) {
13552 return Idx < 2 * static_cast<int>(CommonVF);
13553 }) &&
13554 "All elements in mask must be less than 2 * CommonVF.");
13555 if (E->Scalars.size() == E2->Scalars.size()) {
13556 SmallVector<int> EMask = E->getCommonMask();
13557 SmallVector<int> E2Mask = E2->getCommonMask();
13558 if (!EMask.empty() || !E2Mask.empty()) {
13559 for (int &Idx : CommonMask) {
13560 if (Idx == PoisonMaskElem)
13561 continue;
13562 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13563 Idx = EMask[Idx];
13564 else if (Idx >= static_cast<int>(CommonVF))
13565 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13566 E->Scalars.size();
13567 }
13568 }
13569 CommonVF = E->Scalars.size();
13570 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13571 GetNodeMinBWAffectedCost(*E2, CommonVF);
13572 } else {
13573 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13574 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13575 }
13576 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13577 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13578 } else if (!V1 && P2.isNull()) {
13579 // Shuffle single entry node.
13580 const TreeEntry *E = cast<const TreeEntry *>(P1);
13581 unsigned VF = E->getVectorFactor();
13582 CommonVF = VF;
13583 assert(
13584 all_of(Mask,
13585 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13586 "All elements in mask must be less than CommonVF.");
13587 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13588 SmallVector<int> EMask = E->getCommonMask();
13589 assert(!EMask.empty() && "Expected non-empty common mask.");
13590 for (int &Idx : CommonMask) {
13591 if (Idx != PoisonMaskElem)
13592 Idx = EMask[Idx];
13593 }
13594 CommonVF = E->Scalars.size();
13595 } else if (unsigned Factor = E->getInterleaveFactor();
13596 Factor > 0 && E->Scalars.size() != Mask.size() &&
13597 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13598 Factor)) {
13599 // Deinterleaved nodes are free.
13600 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13601 }
13602 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13603 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13604 // Not identity/broadcast? Try to see if the original vector is better.
13605 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13606 CommonVF == CommonMask.size() &&
13607 any_of(enumerate(CommonMask),
13608 [](const auto &&P) {
13609 return P.value() != PoisonMaskElem &&
13610 static_cast<unsigned>(P.value()) != P.index();
13611 }) &&
13612 any_of(CommonMask,
13613 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13614 SmallVector<int> ReorderMask;
13615 inversePermutation(E->ReorderIndices, ReorderMask);
13616 ::addMask(CommonMask, ReorderMask);
13617 }
13618 } else if (V1 && P2.isNull()) {
13619 // Shuffle single vector.
13620 ExtraCost += GetValueMinBWAffectedCost(V1);
13621 CommonVF = getVF(V1);
13622 assert(
13623 all_of(Mask,
13624 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13625 "All elements in mask must be less than CommonVF.");
13626 } else if (V1 && !V2) {
13627 // Shuffle vector and tree node.
13628 unsigned VF = getVF(V1);
13629 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13630 CommonVF = std::max(VF, E2->getVectorFactor());
13631 assert(all_of(Mask,
13632 [=](int Idx) {
13633 return Idx < 2 * static_cast<int>(CommonVF);
13634 }) &&
13635 "All elements in mask must be less than 2 * CommonVF.");
13636 if (E2->Scalars.size() == VF && VF != CommonVF) {
13637 SmallVector<int> E2Mask = E2->getCommonMask();
13638 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13639 for (int &Idx : CommonMask) {
13640 if (Idx == PoisonMaskElem)
13641 continue;
13642 if (Idx >= static_cast<int>(CommonVF))
13643 Idx = E2Mask[Idx - CommonVF] + VF;
13644 }
13645 CommonVF = VF;
13646 }
13647 ExtraCost += GetValueMinBWAffectedCost(V1);
13648 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13649 ExtraCost += GetNodeMinBWAffectedCost(
13650 *E2, std::min(CommonVF, E2->getVectorFactor()));
13651 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13652 } else if (!V1 && V2) {
13653 // Shuffle vector and tree node.
13654 unsigned VF = getVF(V2);
13655 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13656 CommonVF = std::max(VF, E1->getVectorFactor());
13657 assert(all_of(Mask,
13658 [=](int Idx) {
13659 return Idx < 2 * static_cast<int>(CommonVF);
13660 }) &&
13661 "All elements in mask must be less than 2 * CommonVF.");
13662 if (E1->Scalars.size() == VF && VF != CommonVF) {
13663 SmallVector<int> E1Mask = E1->getCommonMask();
13664 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13665 for (int &Idx : CommonMask) {
13666 if (Idx == PoisonMaskElem)
13667 continue;
13668 if (Idx >= static_cast<int>(CommonVF))
13669 Idx = E1Mask[Idx - CommonVF] + VF;
13670 else
13671 Idx = E1Mask[Idx];
13672 }
13673 CommonVF = VF;
13674 }
13675 ExtraCost += GetNodeMinBWAffectedCost(
13676 *E1, std::min(CommonVF, E1->getVectorFactor()));
13677 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13678 ExtraCost += GetValueMinBWAffectedCost(V2);
13679 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13680 } else {
13681 assert(V1 && V2 && "Expected both vectors.");
13682 unsigned VF = getVF(V1);
13683 CommonVF = std::max(VF, getVF(V2));
13684 assert(all_of(Mask,
13685 [=](int Idx) {
13686 return Idx < 2 * static_cast<int>(CommonVF);
13687 }) &&
13688 "All elements in mask must be less than 2 * CommonVF.");
13689 ExtraCost +=
13690 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13691 if (V1->getType() != V2->getType()) {
13692 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13693 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13694 } else {
13695 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13696 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13697 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13698 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13699 }
13700 }
13701 InVectors.front() =
13702 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13703 if (InVectors.size() == 2)
13704 InVectors.pop_back();
13705 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13706 V1, V2, CommonMask, Builder, ScalarTy);
13707 }
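// Worked example for the index adjustment above (hypothetical values): with
// two tree entries of 4 scalars each, CommonVF is 4 and a two-source
// CommonMask such as {0, 5, 2, 7} takes lanes 0 and 2 from the first entry
// (remapped through its own common mask when that is non-empty) and lanes 1
// and 3 from the second entry (remapped and offset by E->Scalars.size()).
// The sources themselves are replaced by placeholder constants of the
// widened type, since only the shuffle and cast costs matter here.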
13708
13709public:
13710 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13711 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13712 SmallPtrSetImpl<Value *> &CheckedExtracts)
13713 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13714 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13715 CheckedExtracts(CheckedExtracts) {}
13716 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13717 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13718 unsigned NumParts, bool &UseVecBaseAsInput) {
13719 UseVecBaseAsInput = false;
13720 if (Mask.empty())
13721 return nullptr;
13722 Value *VecBase = nullptr;
13723 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13724 if (!E->ReorderIndices.empty()) {
13725 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13726 E->ReorderIndices.end());
13727 reorderScalars(VL, ReorderMask);
13728 }
13729 // Check if the extracts can be considered reused if the same
13730 // extractelements were already vectorized in an earlier tree entry.
13731 bool PrevNodeFound = any_of(
13732 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13733 [&](const std::unique_ptr<TreeEntry> &TE) {
13734 return ((TE->hasState() && !TE->isAltShuffle() &&
13735 TE->getOpcode() == Instruction::ExtractElement) ||
13736 TE->isGather()) &&
13737 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13738 return VL.size() > Data.index() &&
13739 (Mask[Data.index()] == PoisonMaskElem ||
13740 isa<UndefValue>(VL[Data.index()]) ||
13741 Data.value() == VL[Data.index()]);
13742 });
13743 });
13744 SmallPtrSet<Value *, 4> UniqueBases;
13745 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13746 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13747 for (unsigned Part : seq<unsigned>(NumParts)) {
13748 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13749 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13750 for (auto [I, V] :
13751 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13752 // Ignore non-extractelement scalars.
13753 if (isa<UndefValue>(V) ||
13754 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13755 continue;
13756 // If all users of the instruction are going to be vectorized and the
13757 // instruction itself is not going to be vectorized, consider the
13758 // instruction dead and remove its cost from the final cost of the
13759 // vectorized tree.
13760 // Also, avoid adjusting the cost for extractelements with multiple uses
13761 // in different graph entries.
13762 auto *EE = cast<ExtractElementInst>(V);
13763 VecBase = EE->getVectorOperand();
13764 UniqueBases.insert(VecBase);
13765 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13766 if (!CheckedExtracts.insert(V).second ||
13767 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13768 any_of(EE->users(),
13769 [&](User *U) {
13770 return isa<GetElementPtrInst>(U) &&
13771 !R.areAllUsersVectorized(cast<Instruction>(U),
13772 &VectorizedVals);
13773 }) ||
13774 (!VEs.empty() && !is_contained(VEs, E)))
13775 continue;
13776 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13777 if (!EEIdx)
13778 continue;
13779 unsigned Idx = *EEIdx;
13780 // Take credit for instruction that will become dead.
13781 if (EE->hasOneUse() || !PrevNodeFound) {
13782 Instruction *Ext = EE->user_back();
13783 if (isa<SExtInst, ZExtInst>(Ext) &&
13784 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13785 // Use getExtractWithExtendCost() to calculate the cost of
13786 // extractelement/ext pair.
13787 Cost -= TTI.getExtractWithExtendCost(
13788 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13789 Idx, CostKind);
13790 // Add back the cost of the s|zext, which is subtracted separately.
13791 Cost += TTI.getCastInstrCost(
13792 Ext->getOpcode(), Ext->getType(), EE->getType(),
13793 TTI::getCastContextHint(Ext), CostKind, Ext);
13794 continue;
13795 }
13796 }
13797 APInt &DemandedElts =
13798 VectorOpsToExtracts
13799 .try_emplace(VecBase,
13800 APInt::getZero(getNumElements(VecBase->getType())))
13801 .first->getSecond();
13802 DemandedElts.setBit(Idx);
13803 }
13804 }
13805 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13806 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13807 DemandedElts, /*Insert=*/false,
13808 /*Extract=*/true, CostKind);
13809 // Check that the gather of extractelements can be represented as just a
13810 // shuffle of one or two vectors from which the scalars are extracted.
13811 // We found a group of extractelement instructions that must be gathered
13812 // into a vector and can be represented as a permutation of the elements
13813 // of one or two input vectors.
13814 // Skipped if the same extractelements were already vectorized earlier.
13815 if (!PrevNodeFound)
13816 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
13817 InVectors.assign(1, E);
13818 CommonMask.assign(Mask.begin(), Mask.end());
13819 transformMaskAfterShuffle(CommonMask, CommonMask);
13820 SameNodesEstimated = false;
13821 if (NumParts != 1 && UniqueBases.size() != 1) {
13822 UseVecBaseAsInput = true;
13823 VecBase =
13824 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13825 }
13826 return VecBase;
13827 }
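// Illustrative example (hypothetical IR): if the gathered scalars are
//   %x = extractelement <4 x i32> %v, i32 1
//   %y = extractelement <4 x i32> %v, i32 3
// and both extracts become dead once their users are vectorized, DemandedElts
// for %v gets bits 1 and 3 set and the corresponding extract scalarization
// overhead is subtracted from the cost as credit for the removed
// instructions.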
13828 /// Checks if the specified entry \p E needs to be delayed because of its
13829 /// dependency nodes.
13830 std::optional<InstructionCost>
13831 needToDelay(const TreeEntry *,
13833 // No need to delay the cost estimation during analysis.
13834 return std::nullopt;
13835 }
13836 /// Reset the builder to handle perfect diamond match.
13838 IsFinalized = false;
13839 CommonMask.clear();
13840 InVectors.clear();
13841 Cost = 0;
13842 VectorizedVals.clear();
13843 SameNodesEstimated = true;
13844 }
13845 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
13846 if (&E1 == &E2) {
13847 assert(all_of(Mask,
13848 [&](int Idx) {
13849 return Idx < static_cast<int>(E1.getVectorFactor());
13850 }) &&
13851 "Expected single vector shuffle mask.");
13852 add(E1, Mask);
13853 return;
13854 }
13855 if (InVectors.empty()) {
13856 CommonMask.assign(Mask.begin(), Mask.end());
13857 InVectors.assign({&E1, &E2});
13858 return;
13859 }
13860 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13861 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13862 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13863 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13864 const auto *It =
13865 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13866 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13867 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
13868 }
13869 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
13870 if (InVectors.empty()) {
13871 CommonMask.assign(Mask.begin(), Mask.end());
13872 InVectors.assign(1, &E1);
13873 return;
13874 }
13875 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13876 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13877 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13878 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13879 const auto *It =
13880 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13881 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13882 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
13883 if (!SameNodesEstimated && InVectors.size() == 1)
13884 InVectors.emplace_back(&E1);
13885 }
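// Example of the part bookkeeping above (hypothetical numbers): for
// Mask.size() == 8 split into NumParts == 2 registers, SliceSize is 4; if
// the first non-poison element of Mask sits at position 6, Part is
// 6 / 4 == 1, so the permutation cost is estimated only for the second
// slice of the common mask.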
13886 /// Adds 2 input vectors and the mask for their shuffling.
13887 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
13888 // This is reached only when shuffling 2 vectors of extractelements, which
13889 // is already handled in adjustExtracts.
13890 assert(InVectors.size() == 1 &&
13891 all_of(enumerate(CommonMask),
13892 [&](auto P) {
13893 if (P.value() == PoisonMaskElem)
13894 return Mask[P.index()] == PoisonMaskElem;
13895 auto *EI = cast<ExtractElementInst>(
13896 cast<const TreeEntry *>(InVectors.front())
13897 ->getOrdered(P.index()));
13898 return EI->getVectorOperand() == V1 ||
13899 EI->getVectorOperand() == V2;
13900 }) &&
13901 "Expected extractelement vectors.");
13902 }
13903 /// Adds one more input vector and the mask for its shuffling.
13904 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
13905 if (InVectors.empty()) {
13906 assert(CommonMask.empty() && !ForExtracts &&
13907 "Expected empty input mask/vectors.");
13908 CommonMask.assign(Mask.begin(), Mask.end());
13909 InVectors.assign(1, V1);
13910 return;
13911 }
13912 if (ForExtracts) {
13913 // No need to add vectors here; they were already handled in adjustExtracts.
13914 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
13915 !CommonMask.empty() &&
13916 all_of(enumerate(CommonMask),
13917 [&](auto P) {
13918 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
13919 ->getOrdered(P.index());
13920 if (P.value() == PoisonMaskElem)
13921 return P.value() == Mask[P.index()] ||
13922 isa<UndefValue>(Scalar);
13923 if (isa<Constant>(V1))
13924 return true;
13925 auto *EI = cast<ExtractElementInst>(Scalar);
13926 return EI->getVectorOperand() == V1;
13927 }) &&
13928 "Expected only tree entry for extractelement vectors.");
13929 return;
13930 }
13931 assert(!InVectors.empty() && !CommonMask.empty() &&
13932 "Expected only tree entries from extracts/reused buildvectors.");
13933 unsigned VF = getVF(V1);
13934 if (InVectors.size() == 2) {
13935 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13936 transformMaskAfterShuffle(CommonMask, CommonMask);
13937 VF = std::max<unsigned>(VF, CommonMask.size());
13938 } else if (const auto *InTE =
13939 InVectors.front().dyn_cast<const TreeEntry *>()) {
13940 VF = std::max(VF, InTE->getVectorFactor());
13941 } else {
13942 VF = std::max(
13943 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
13944 ->getNumElements());
13945 }
13946 InVectors.push_back(V1);
13947 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13948 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13949 CommonMask[Idx] = Mask[Idx] + VF;
13950 }
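// Example of the mask merge above (hypothetical values): if the vectors
// accumulated so far cover VF == 8 lanes and the incoming Mask selects lane
// 2 of V1 for a position where CommonMask is still poison, that position
// becomes 2 + VF == 10, i.e. it now refers to the newly appended V1 in the
// combined two-source mask.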
13951 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
13952 Value *Root = nullptr) {
13953 Cost += getBuildVectorCost(VL, Root);
13954 if (!Root) {
13955 // FIXME: Need to find a way to avoid use of getNullValue here.
13957 unsigned VF = VL.size();
13958 if (MaskVF != 0)
13959 VF = std::min(VF, MaskVF);
13960 Type *VLScalarTy = VL.front()->getType();
13961 for (Value *V : VL.take_front(VF)) {
13962 Type *ScalarTy = VLScalarTy->getScalarType();
13963 if (isa<PoisonValue>(V)) {
13964 Vals.push_back(PoisonValue::get(ScalarTy));
13965 continue;
13966 }
13967 if (isa<UndefValue>(V)) {
13968 Vals.push_back(UndefValue::get(ScalarTy));
13969 continue;
13970 }
13971 Vals.push_back(Constant::getNullValue(ScalarTy));
13972 }
13973 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
13974 assert(SLPReVec && "FixedVectorType is not expected.");
13975 // When REVEC is enabled, we need to expand vector types into scalar
13976 // types.
13977 Vals = replicateMask(Vals, VecTy->getNumElements());
13978 }
13979 return ConstantVector::get(Vals);
13980 }
13981 return ConstantVector::getSplat(
13982 ElementCount::getFixed(
13983 cast<FixedVectorType>(Root->getType())->getNumElements()),
13984 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
13985 }
13987 /// Finalize emission of the shuffles.
13989 ArrayRef<int> ExtMask,
13990 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
13991 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
13994 Action = {}) {
13995 IsFinalized = true;
13996 if (Action) {
13997 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
13998 if (InVectors.size() == 2)
13999 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14000 else
14001 Cost += createShuffle(Vec, nullptr, CommonMask);
14002 transformMaskAfterShuffle(CommonMask, CommonMask);
14003 assert(VF > 0 &&
14004 "Expected vector length for the final value before action.");
14005 Value *V = cast<Value *>(Vec);
14006 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14007 Cost += createShuffle(V1, V2, Mask);
14008 return V1;
14009 });
14010 InVectors.front() = V;
14011 }
14012 if (!SubVectors.empty()) {
14013 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14014 if (InVectors.size() == 2)
14015 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14016 else
14017 Cost += createShuffle(Vec, nullptr, CommonMask);
14018 transformMaskAfterShuffle(CommonMask, CommonMask);
14019 // Add subvectors permutation cost.
14020 if (!SubVectorsMask.empty()) {
14021 assert(SubVectorsMask.size() <= CommonMask.size() &&
14022 "Expected same size of masks for subvectors and common mask.");
14023 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14024 copy(SubVectorsMask, SVMask.begin());
14025 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14026 if (I2 != PoisonMaskElem) {
14027 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14028 I1 = I2 + CommonMask.size();
14029 }
14030 }
14032 getWidenedType(ScalarTy, CommonMask.size()),
14033 SVMask, CostKind);
14034 }
14035 for (auto [E, Idx] : SubVectors) {
14036 Type *EScalarTy = E->Scalars.front()->getType();
14037 bool IsSigned = true;
14038 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14039 EScalarTy =
14040 IntegerType::get(EScalarTy->getContext(), It->second.first);
14041 IsSigned = It->second.second;
14042 }
14043 if (ScalarTy != EScalarTy) {
14044 unsigned CastOpcode = Instruction::Trunc;
14045 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14046 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14047 if (DstSz > SrcSz)
14048 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14050 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14051 getWidenedType(EScalarTy, E->getVectorFactor()),
14053 }
14056 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14057 getWidenedType(ScalarTy, E->getVectorFactor()));
14058 if (!CommonMask.empty()) {
14059 std::iota(std::next(CommonMask.begin(), Idx),
14060 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14061 Idx);
14062 }
14063 }
14064 }
14065
14066 if (!ExtMask.empty()) {
14067 if (CommonMask.empty()) {
14068 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14069 } else {
14070 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14071 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14072 if (ExtMask[I] == PoisonMaskElem)
14073 continue;
14074 NewMask[I] = CommonMask[ExtMask[I]];
14075 }
14076 CommonMask.swap(NewMask);
14077 }
14078 }
14079 if (CommonMask.empty()) {
14080 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14081 return Cost;
14082 }
14083 return Cost +
14084 createShuffle(InVectors.front(),
14085 InVectors.size() == 2 ? InVectors.back() : nullptr,
14086 CommonMask);
14087 }
14088
14090 assert((IsFinalized || CommonMask.empty()) &&
14091 "Shuffle construction must be finalized.");
14092 }
14093};
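// Note on the extension-mask handling in finalize() above (hypothetical
// values): applying ExtMask on top of an existing CommonMask composes the two
// permutations as NewMask[I] = CommonMask[ExtMask[I]]. For example, with
// CommonMask = {3, 1, 0, 2} and ExtMask = {2, 2, PoisonMaskElem, 0} the
// result is {0, 0, PoisonMaskElem, 3}.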
14094
14095const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14096 unsigned Idx) const {
14097 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14098 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14099 return Op;
14100}
14101
14102TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14103 if (TE.State == TreeEntry::ScatterVectorize ||
14104 TE.State == TreeEntry::StridedVectorize)
14105 return TTI::CastContextHint::GatherScatter;
14106 if (TE.State == TreeEntry::CompressVectorize)
14107 return TTI::CastContextHint::Masked;
14108 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14109 !TE.isAltShuffle()) {
14110 if (TE.ReorderIndices.empty())
14111 return TTI::CastContextHint::Normal;
14112 SmallVector<int> Mask;
14113 inversePermutation(TE.ReorderIndices, Mask);
14114 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14115 return TTI::CastContextHint::Reversed;
14116 }
14117 return TTI::CastContextHint::None;
14118}
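// Illustrative example (hypothetical): a vectorized load whose
// ReorderIndices invert to a reverse mask such as {3, 2, 1, 0} yields
// CastContextHint::Reversed for casts fed by it; gathered or strided loads
// yield GatherScatter, compressed loads yield Masked, and in-order loads
// yield Normal.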
14119
14120InstructionCost
14121BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14122 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14123 ArrayRef<Value *> VL = E->Scalars;
14124
14125 Type *ScalarTy = getValueType(VL[0]);
14126 if (!isValidElementType(ScalarTy))
14127 return InstructionCost::getInvalid();
14129
14130 // If we have computed a smaller type for the expression, update VecTy so
14131 // that the costs will be accurate.
14132 auto It = MinBWs.find(E);
14133 Type *OrigScalarTy = ScalarTy;
14134 if (It != MinBWs.end()) {
14135 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14136 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14137 if (VecTy)
14138 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14139 }
14140 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14141 unsigned EntryVF = E->getVectorFactor();
14142 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14143
14144 if (E->isGather()) {
14145 if (allConstant(VL))
14146 return 0;
14147 if (isa<InsertElementInst>(VL[0]))
14148 return InstructionCost::getInvalid();
14149 if (isa<CmpInst>(VL.front()))
14150 ScalarTy = VL.front()->getType();
14151 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14152 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14153 }
14154 if (E->State == TreeEntry::SplitVectorize) {
14155 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14156 "Expected exactly 2 combined entries.");
14157 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14158 InstructionCost VectorCost = 0;
14159 if (E->ReorderIndices.empty()) {
14160 VectorCost = ::getShuffleCost(
14161 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14162 E->CombinedEntriesWithIndices.back().second,
14164 ScalarTy,
14165 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14166 ->getVectorFactor()));
14167 } else {
14168 unsigned CommonVF =
14169 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14170 ->getVectorFactor(),
14171 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14172 ->getVectorFactor());
14174 getWidenedType(ScalarTy, CommonVF),
14175 E->getSplitMask(), CostKind);
14176 }
14177 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14178 return VectorCost;
14179 }
14180 InstructionCost CommonCost = 0;
14182 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14183 (E->State != TreeEntry::StridedVectorize ||
14184 !isReverseOrder(E->ReorderIndices))) {
14185 SmallVector<int> NewMask;
14186 if (E->getOpcode() == Instruction::Store) {
14187 // For stores the order is actually a mask.
14188 NewMask.resize(E->ReorderIndices.size());
14189 copy(E->ReorderIndices, NewMask.begin());
14190 } else {
14191 inversePermutation(E->ReorderIndices, NewMask);
14192 }
14193 ::addMask(Mask, NewMask);
14194 }
14195 if (!E->ReuseShuffleIndices.empty())
14196 ::addMask(Mask, E->ReuseShuffleIndices);
14197 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14198 CommonCost =
14199 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14200 assert((E->State == TreeEntry::Vectorize ||
14201 E->State == TreeEntry::ScatterVectorize ||
14202 E->State == TreeEntry::StridedVectorize ||
14203 E->State == TreeEntry::CompressVectorize) &&
14204 "Unhandled state");
14205 assert(E->getOpcode() &&
14206 ((allSameType(VL) && allSameBlock(VL)) ||
14207 (E->getOpcode() == Instruction::GetElementPtr &&
14208 E->getMainOp()->getType()->isPointerTy()) ||
14209 E->hasCopyableElements()) &&
14210 "Invalid VL");
14211 Instruction *VL0 = E->getMainOp();
14212 unsigned ShuffleOrOp =
14213 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14214 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14215 ShuffleOrOp = E->CombinedOp;
14216 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14217 const unsigned Sz = UniqueValues.size();
14218 SmallBitVector UsedScalars(Sz, false);
14219 for (unsigned I = 0; I < Sz; ++I) {
14220 if (isa<Instruction>(UniqueValues[I]) &&
14221 !E->isCopyableElement(UniqueValues[I]) &&
14222 getTreeEntries(UniqueValues[I]).front() == E)
14223 continue;
14224 UsedScalars.set(I);
14225 }
14226 auto GetCastContextHint = [&](Value *V) {
14227 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14228 return getCastContextHint(*OpTEs.front());
14229 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14230 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14231 !SrcState.isAltShuffle())
14232 return TTI::CastContextHint::GatherScatter;
14233 return TTI::CastContextHint::None;
14234 };
14235 auto GetCostDiff =
14236 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14238 // Calculate the cost of this instruction.
14239 InstructionCost ScalarCost = 0;
14240 if (isa<CastInst, CallInst>(VL0)) {
14241 // For some of the instructions no need to calculate cost for each
14242 // particular instruction, we can use the cost of the single
14243 // instruction x total number of scalar instructions.
14244 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14245 } else {
14246 for (unsigned I = 0; I < Sz; ++I) {
14247 if (UsedScalars.test(I))
14248 continue;
14249 ScalarCost += ScalarEltCost(I);
14250 }
14251 }
14252
14253 InstructionCost VecCost = VectorCost(CommonCost);
14254 // Check if the current node must be resized, if the parent node is not
14255 // resized.
14256 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14257 E->Idx != 0 &&
14258 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14259 const EdgeInfo &EI = E->UserTreeIndex;
14260 if (!EI.UserTE->hasState() ||
14261 EI.UserTE->getOpcode() != Instruction::Select ||
14262 EI.EdgeIdx != 0) {
14263 auto UserBWIt = MinBWs.find(EI.UserTE);
14264 Type *UserScalarTy =
14265 (EI.UserTE->isGather() ||
14266 EI.UserTE->State == TreeEntry::SplitVectorize)
14267 ? EI.UserTE->Scalars.front()->getType()
14268 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14269 if (UserBWIt != MinBWs.end())
14270 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14271 UserBWIt->second.first);
14272 if (ScalarTy != UserScalarTy) {
14273 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14274 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14275 unsigned VecOpcode;
14276 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14277 if (BWSz > SrcBWSz)
14278 VecOpcode = Instruction::Trunc;
14279 else
14280 VecOpcode =
14281 It->second.second ? Instruction::SExt : Instruction::ZExt;
14282 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14283 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14284 CostKind);
14285 }
14286 }
14287 }
14288 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14289 ScalarCost, "Calculated costs for Tree"));
14290 return VecCost - ScalarCost;
14291 };
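// Worked example for GetCostDiff (hypothetical costs): with 4 used scalars
// costing 1 each (ScalarCost == 4) and a vector replacement costing 2 plus a
// CommonCost of 1 for the final permutation (VecCost == 3), the returned
// difference is 3 - 4 == -1, so vectorizing this node is modeled as
// profitable; a positive difference means the scalar form is cheaper.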
14292 // Calculate cost difference from vectorizing set of GEPs.
14293 // Negative value means vectorizing is profitable.
14294 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14295 assert((E->State == TreeEntry::Vectorize ||
14296 E->State == TreeEntry::StridedVectorize ||
14297 E->State == TreeEntry::CompressVectorize) &&
14298 "Entry state expected to be Vectorize, StridedVectorize or "
14299 "MaskedLoadCompressVectorize here.");
14300 InstructionCost ScalarCost = 0;
14301 InstructionCost VecCost = 0;
14302 std::tie(ScalarCost, VecCost) = getGEPCosts(
14303 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14304 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14305 "Calculated GEPs cost for Tree"));
14306
14307 return VecCost - ScalarCost;
14308 };
14309
14310 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14311 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14312 if (MinMaxID == Intrinsic::not_intrinsic)
14313 return InstructionCost::getInvalid();
14314 Type *CanonicalType = Ty;
14315 if (CanonicalType->isPtrOrPtrVectorTy())
14316 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14317 CanonicalType->getContext(),
14318 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14319
14320 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14321 {CanonicalType, CanonicalType});
14323 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14324 // If the selects are the only uses of the compares, they will be
14325 // dead and we can adjust the cost by removing their cost.
14326 if (VI && SelectOnly) {
14327 assert((!Ty->isVectorTy() || SLPReVec) &&
14328 "Expected only for scalar type.");
14329 auto *CI = cast<CmpInst>(VI->getOperand(0));
14331 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14332 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14333 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14334 }
14335 return IntrinsicCost;
14336 };
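// Illustrative example (hypothetical IR): the pair
//   %c = icmp sgt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
// is recognized as smax(%a, %b); GetMinMaxCost then uses the intrinsic cost
// and, when the compare is only used by the select, removes the compare cost
// since that instruction would become dead.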
14337 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14338 Instruction *VI) {
14339 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14340 return Cost;
14341 };
14342 switch (ShuffleOrOp) {
14343 case Instruction::PHI: {
14344 // Count reused scalars.
14345 InstructionCost ScalarCost = 0;
14347 for (Value *V : UniqueValues) {
14348 auto *PHI = dyn_cast<PHINode>(V);
14349 if (!PHI)
14350 continue;
14351
14352 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14353 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14354 Value *Op = PHI->getIncomingValue(I);
14355 Operands[I] = Op;
14356 }
14357 if (const TreeEntry *OpTE =
14358 getSameValuesTreeEntry(Operands.front(), Operands))
14359 if (CountedOps.insert(OpTE).second &&
14360 !OpTE->ReuseShuffleIndices.empty())
14361 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14362 OpTE->Scalars.size());
14363 }
14364
14365 return CommonCost - ScalarCost;
14366 }
14367 case Instruction::ExtractValue:
14368 case Instruction::ExtractElement: {
14369 APInt DemandedElts;
14370 VectorType *SrcVecTy = nullptr;
14371 auto GetScalarCost = [&](unsigned Idx) {
14372 if (isa<PoisonValue>(UniqueValues[Idx]))
14374
14375 auto *I = cast<Instruction>(UniqueValues[Idx]);
14376 if (!SrcVecTy) {
14377 if (ShuffleOrOp == Instruction::ExtractElement) {
14378 auto *EE = cast<ExtractElementInst>(I);
14379 SrcVecTy = EE->getVectorOperandType();
14380 } else {
14381 auto *EV = cast<ExtractValueInst>(I);
14382 Type *AggregateTy = EV->getAggregateOperand()->getType();
14383 unsigned NumElts;
14384 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14385 NumElts = ATy->getNumElements();
14386 else
14387 NumElts = AggregateTy->getStructNumElements();
14388 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14389 }
14390 }
14391 if (I->hasOneUse()) {
14392 Instruction *Ext = I->user_back();
14393 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14394 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14395 // Use getExtractWithExtendCost() to calculate the cost of
14396 // extractelement/ext pair.
14397 InstructionCost Cost = TTI->getExtractWithExtendCost(
14398 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14399 CostKind);
14400 // Subtract the cost of the s|zext, which is subtracted separately.
14401 Cost -= TTI->getCastInstrCost(
14402 Ext->getOpcode(), Ext->getType(), I->getType(),
14403 TTI::getCastContextHint(Ext), CostKind, Ext);
14404 return Cost;
14405 }
14406 }
14407 if (DemandedElts.isZero())
14408 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14409 DemandedElts.setBit(*getExtractIndex(I));
14410 return TTI::TCC_Free;
14411 };
14412 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14413 return CommonCost - (DemandedElts.isZero()
14416 SrcVecTy, DemandedElts, /*Insert=*/false,
14417 /*Extract=*/true, CostKind));
14418 };
14419 return GetCostDiff(GetScalarCost, GetVectorCost);
14420 }
14421 case Instruction::InsertElement: {
14422 assert(E->ReuseShuffleIndices.empty() &&
14423 "Unique insertelements only are expected.");
14424 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14425 unsigned const NumElts = SrcVecTy->getNumElements();
14426 unsigned const NumScalars = VL.size();
14427
14428 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14429
14430 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14431 unsigned OffsetBeg = *getElementIndex(VL.front());
14432 unsigned OffsetEnd = OffsetBeg;
14433 InsertMask[OffsetBeg] = 0;
14434 for (auto [I, V] : enumerate(VL.drop_front())) {
14435 unsigned Idx = *getElementIndex(V);
14436 if (OffsetBeg > Idx)
14437 OffsetBeg = Idx;
14438 else if (OffsetEnd < Idx)
14439 OffsetEnd = Idx;
14440 InsertMask[Idx] = I + 1;
14441 }
14442 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14443 if (NumOfParts > 0 && NumOfParts < NumElts)
14444 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14445 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14446 VecScalarsSz;
14447 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14448 unsigned InsertVecSz = std::min<unsigned>(
14449 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14450 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14451 bool IsWholeSubvector =
14452 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14453 // Check if we can safely insert a subvector. If it is not possible, just
14454 // generate a whole-sized vector and shuffle the source vector and the new
14455 // subvector.
14456 if (OffsetBeg + InsertVecSz > VecSz) {
14457 // Align OffsetBeg to generate correct mask.
14458 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14459 InsertVecSz = VecSz;
14460 }
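// Worked example (hypothetical sizes): inserting 4 scalars at indices 5..8
// of a 16-element destination with VecScalarsSz == 8 gives OffsetBeg == 5,
// OffsetEnd == 8, Offset == 0, VecSz == 16 and InsertVecSz == 4; since
// OffsetBeg + InsertVecSz <= VecSz, no realignment of OffsetBeg is needed
// and the inserted scalars are modeled as a 4-element subvector.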
14461
14462 APInt DemandedElts = APInt::getZero(NumElts);
14463 // TODO: Add support for Instruction::InsertValue.
14465 if (!E->ReorderIndices.empty()) {
14466 inversePermutation(E->ReorderIndices, Mask);
14467 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14468 } else {
14469 Mask.assign(VecSz, PoisonMaskElem);
14470 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14471 }
14472 bool IsIdentity = true;
14473 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14474 Mask.swap(PrevMask);
14475 for (unsigned I = 0; I < NumScalars; ++I) {
14476 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14477 DemandedElts.setBit(InsertIdx);
14478 IsIdentity &= InsertIdx - OffsetBeg == I;
14479 Mask[InsertIdx - OffsetBeg] = I;
14480 }
14481 assert(Offset < NumElts && "Failed to find vector index offset");
14482
14484 Cost -=
14485 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14486 /*Insert*/ true, /*Extract*/ false, CostKind);
14487
14488 // First cost - resize to actual vector size if not identity shuffle or
14489 // need to shift the vector.
14490 // Do not calculate the cost if the actual size is the register size and
14491 // we can merge this shuffle with the following SK_Select.
14492 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14493 if (!IsIdentity)
14495 InsertVecTy, Mask);
14496 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14497 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14498 }));
14499 // Second cost - permutation with subvector, if some elements are from the
14500 // initial vector or inserting a subvector.
14501 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14502 // subvector of ActualVecTy.
14503 SmallBitVector InMask =
14504 isUndefVector(FirstInsert->getOperand(0),
14505 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14506 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14507 if (InsertVecSz != VecSz) {
14508 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14509 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14510 CostKind, OffsetBeg - Offset, InsertVecTy);
14511 } else {
14512 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14513 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14514 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14515 I <= End; ++I)
14516 if (Mask[I] != PoisonMaskElem)
14517 Mask[I] = I + VecSz;
14518 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14519 Mask[I] =
14520 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14521 Cost +=
14522 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14523 }
14524 }
14525 return Cost;
14526 }
14527 case Instruction::ZExt:
14528 case Instruction::SExt:
14529 case Instruction::FPToUI:
14530 case Instruction::FPToSI:
14531 case Instruction::FPExt:
14532 case Instruction::PtrToInt:
14533 case Instruction::IntToPtr:
14534 case Instruction::SIToFP:
14535 case Instruction::UIToFP:
14536 case Instruction::Trunc:
14537 case Instruction::FPTrunc:
14538 case Instruction::BitCast: {
14539 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14540 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14541 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14542 unsigned Opcode = ShuffleOrOp;
14543 unsigned VecOpcode = Opcode;
14544 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14545 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14546 // Check if the values are candidates to demote.
14547 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14548 if (SrcIt != MinBWs.end()) {
14549 SrcBWSz = SrcIt->second.first;
14550 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14551 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14552 SrcVecTy =
14553 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14554 }
14555 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14556 if (BWSz == SrcBWSz) {
14557 VecOpcode = Instruction::BitCast;
14558 } else if (BWSz < SrcBWSz) {
14559 VecOpcode = Instruction::Trunc;
14560 } else if (It != MinBWs.end()) {
14561 assert(BWSz > SrcBWSz && "Invalid cast!");
14562 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14563 } else if (SrcIt != MinBWs.end()) {
14564 assert(BWSz > SrcBWSz && "Invalid cast!");
14565 VecOpcode =
14566 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14567 }
14568 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14569 !SrcIt->second.second) {
14570 VecOpcode = Instruction::UIToFP;
14571 }
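// Illustrative example (hypothetical bit widths): if both the node and its
// source were demoted to an effective i16, the vector cast degenerates to a
// free bitcast (BWSz == SrcBWSz); if only the source was demoted to i8 while
// the node stays i32, the cast is re-costed as a sext/zext from the narrow
// type, chosen by the recorded signedness, instead of the original opcode.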
14572 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14573 assert(Idx == 0 && "Expected 0 index only");
14574 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14575 VL0->getOperand(0)->getType(),
14577 };
14578 auto GetVectorCost = [=](InstructionCost CommonCost) {
14579 // Do not count cost here if minimum bitwidth is in effect and it is just
14580 // a bitcast (here it is just a noop).
14581 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14582 return CommonCost;
14583 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14584 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14585
14586 bool IsArithmeticExtendedReduction =
14587 E->Idx == 0 && UserIgnoreList &&
14588 all_of(*UserIgnoreList, [](Value *V) {
14589 auto *I = cast<Instruction>(V);
14590 return is_contained({Instruction::Add, Instruction::FAdd,
14591 Instruction::Mul, Instruction::FMul,
14592 Instruction::And, Instruction::Or,
14593 Instruction::Xor},
14594 I->getOpcode());
14595 });
14596 if (IsArithmeticExtendedReduction &&
14597 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14598 return CommonCost;
14599 return CommonCost +
14600 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14601 VecOpcode == Opcode ? VI : nullptr);
14602 };
14603 return GetCostDiff(GetScalarCost, GetVectorCost);
14604 }
14605 case Instruction::FCmp:
14606 case Instruction::ICmp:
14607 case Instruction::Select: {
14608 CmpPredicate VecPred, SwappedVecPred;
14609 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14610 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14611 match(VL0, MatchCmp))
14612 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14613 else
14614 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14617 auto GetScalarCost = [&](unsigned Idx) {
14618 if (isa<PoisonValue>(UniqueValues[Idx]))
14620
14621 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14622 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14625 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14626 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14627 !match(VI, MatchCmp)) ||
14628 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14629 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14630 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14633
14635 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14636 CostKind, getOperandInfo(VI->getOperand(0)),
14637 getOperandInfo(VI->getOperand(1)), VI);
14638 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14639 if (IntrinsicCost.isValid())
14640 ScalarCost = IntrinsicCost;
14641
14642 return ScalarCost;
14643 };
14644 auto GetVectorCost = [&](InstructionCost CommonCost) {
14645 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14646
14647 InstructionCost VecCost =
14648 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14649 CostKind, getOperandInfo(E->getOperand(0)),
14650 getOperandInfo(E->getOperand(1)), VL0);
14651 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14652 auto *CondType =
14653 getWidenedType(SI->getCondition()->getType(), VL.size());
14654 unsigned CondNumElements = CondType->getNumElements();
14655 unsigned VecTyNumElements = getNumElements(VecTy);
14656 assert(VecTyNumElements >= CondNumElements &&
14657 VecTyNumElements % CondNumElements == 0 &&
14658 "Cannot vectorize Instruction::Select");
14659 if (CondNumElements != VecTyNumElements) {
14660 // When the condition type has fewer elements than the vectorized value
14661 // type, the condition value needs to be replicated to match.
14662 VecCost += ::getShuffleCost(
14663 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14664 createReplicatedMask(VecTyNumElements / CondNumElements,
14665 CondNumElements));
14666 }
14667 }
14668 return VecCost + CommonCost;
14669 };
14670 return GetCostDiff(GetScalarCost, GetVectorCost);
14671 }
14672 case TreeEntry::MinMax: {
14673 auto GetScalarCost = [&](unsigned Idx) {
14674 return GetMinMaxCost(OrigScalarTy);
14675 };
14676 auto GetVectorCost = [&](InstructionCost CommonCost) {
14677 InstructionCost VecCost = GetMinMaxCost(VecTy);
14678 return VecCost + CommonCost;
14679 };
14680 return GetCostDiff(GetScalarCost, GetVectorCost);
14681 }
14682 case TreeEntry::FMulAdd: {
14683 auto GetScalarCost = [&](unsigned Idx) {
14684 if (isa<PoisonValue>(UniqueValues[Idx]))
14686 return GetFMulAddCost(E->getOperations(),
14687 cast<Instruction>(UniqueValues[Idx]));
14688 };
14689 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14690 FastMathFlags FMF;
14691 FMF.set();
14692 for (Value *V : E->Scalars) {
14693 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14694 FMF &= FPCI->getFastMathFlags();
14695 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14696 FMF &= FPCIOp->getFastMathFlags();
14697 }
14698 }
14699 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14700 {VecTy, VecTy, VecTy}, FMF);
14702 return VecCost + CommonCost;
14703 };
14704 return GetCostDiff(GetScalarCost, GetVectorCost);
14705 }
14706 case Instruction::FNeg:
14707 case Instruction::Add:
14708 case Instruction::FAdd:
14709 case Instruction::Sub:
14710 case Instruction::FSub:
14711 case Instruction::Mul:
14712 case Instruction::FMul:
14713 case Instruction::UDiv:
14714 case Instruction::SDiv:
14715 case Instruction::FDiv:
14716 case Instruction::URem:
14717 case Instruction::SRem:
14718 case Instruction::FRem:
14719 case Instruction::Shl:
14720 case Instruction::LShr:
14721 case Instruction::AShr:
14722 case Instruction::And:
14723 case Instruction::Or:
14724 case Instruction::Xor: {
14725 auto GetScalarCost = [&](unsigned Idx) {
14726 if (isa<PoisonValue>(UniqueValues[Idx]))
14728
14729 // We cannot retrieve the operand from UniqueValues[Idx] because an
14730 // interchangeable instruction may be used. The order and the actual
14731 // operand might differ from what is retrieved from UniqueValues[Idx].
14732 Value *Op1 = E->getOperand(0)[Idx];
14733 Value *Op2;
14735 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14736 Op2 = Op1;
14737 } else {
14738 Op2 = E->getOperand(1)[Idx];
14739 Operands.push_back(Op2);
14740 }
14744 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14745 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14746 I && (ShuffleOrOp == Instruction::FAdd ||
14747 ShuffleOrOp == Instruction::FSub)) {
14748 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14749 if (IntrinsicCost.isValid())
14750 ScalarCost = IntrinsicCost;
14751 }
14752 return ScalarCost;
14753 };
14754 auto GetVectorCost = [=](InstructionCost CommonCost) {
14755 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14756 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14757 ArrayRef<Value *> Ops = E->getOperand(I);
14758 if (all_of(Ops, [&](Value *Op) {
14759 auto *CI = dyn_cast<ConstantInt>(Op);
14760 return CI && CI->getValue().countr_one() >= It->second.first;
14761 }))
14762 return CommonCost;
14763 }
14764 }
14765 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14766 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14767 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14768 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14769 Op2Info, {}, nullptr, TLI) +
14770 CommonCost;
14771 };
14772 return GetCostDiff(GetScalarCost, GetVectorCost);
14773 }
14774 case Instruction::GetElementPtr: {
14775 return CommonCost + GetGEPCostDiff(VL, VL0);
14776 }
14777 case Instruction::Load: {
14778 auto GetScalarCost = [&](unsigned Idx) {
14779 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14780 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14781 VI->getAlign(), VI->getPointerAddressSpace(),
14783 };
14784 auto *LI0 = cast<LoadInst>(VL0);
14785 auto GetVectorCost = [&](InstructionCost CommonCost) {
14786 InstructionCost VecLdCost;
14787 switch (E->State) {
14788 case TreeEntry::Vectorize:
14789 if (unsigned Factor = E->getInterleaveFactor()) {
14790 VecLdCost = TTI->getInterleavedMemoryOpCost(
14791 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14792 LI0->getPointerAddressSpace(), CostKind);
14793
14794 } else {
14795 VecLdCost = TTI->getMemoryOpCost(
14796 Instruction::Load, VecTy, LI0->getAlign(),
14797 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14798 }
14799 break;
14800 case TreeEntry::StridedVectorize: {
14801 Align CommonAlignment =
14802 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14803 VecLdCost = TTI->getStridedMemoryOpCost(
14804 Instruction::Load, VecTy, LI0->getPointerOperand(),
14805 /*VariableMask=*/false, CommonAlignment, CostKind);
14806 break;
14807 }
14808 case TreeEntry::CompressVectorize: {
14809 bool IsMasked;
14810 unsigned InterleaveFactor;
14811 SmallVector<int> CompressMask;
14812 VectorType *LoadVecTy;
14813 SmallVector<Value *> Scalars(VL);
14814 if (!E->ReorderIndices.empty()) {
14815 SmallVector<int> Mask(E->ReorderIndices.begin(),
14816 E->ReorderIndices.end());
14817 reorderScalars(Scalars, Mask);
14818 }
14819 SmallVector<Value *> PointerOps(Scalars.size());
14820 for (auto [I, V] : enumerate(Scalars))
14821 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14822 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
14823 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
14824 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
14825 CompressMask, LoadVecTy);
14826 assert(IsVectorized && "Failed to vectorize load");
14827 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
14828 InterleaveFactor, IsMasked);
14829 Align CommonAlignment = LI0->getAlign();
14830 if (InterleaveFactor) {
14831 VecLdCost = TTI->getInterleavedMemoryOpCost(
14832 Instruction::Load, LoadVecTy, InterleaveFactor, {},
14833 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
14834 } else if (IsMasked) {
14835 VecLdCost = TTI->getMaskedMemoryOpCost(
14836 Instruction::Load, LoadVecTy, CommonAlignment,
14837 LI0->getPointerAddressSpace(), CostKind);
14838 // TODO: include this cost into CommonCost.
14840 LoadVecTy, CompressMask, CostKind);
14841 } else {
14842 VecLdCost = TTI->getMemoryOpCost(
14843 Instruction::Load, LoadVecTy, CommonAlignment,
14844 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14845 // TODO: include this cost into CommonCost.
14847 LoadVecTy, CompressMask, CostKind);
14848 }
14849 break;
14850 }
14851 case TreeEntry::ScatterVectorize: {
14852 Align CommonAlignment =
14853 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14854 VecLdCost = TTI->getGatherScatterOpCost(
14855 Instruction::Load, VecTy, LI0->getPointerOperand(),
14856 /*VariableMask=*/false, CommonAlignment, CostKind);
14857 break;
14858 }
14859 case TreeEntry::CombinedVectorize:
14860 case TreeEntry::SplitVectorize:
14861 case TreeEntry::NeedToGather:
14862 llvm_unreachable("Unexpected vectorization state.");
14863 }
14864 return VecLdCost + CommonCost;
14865 };
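// Example of the state-based selection above (hypothetical): four
// consecutive i32 loads are costed as a single <4 x i32> getMemoryOpCost;
// a strided pattern goes through getStridedMemoryOpCost, a gather of
// arbitrary pointers through getGatherScatterOpCost, and a compressed load
// is costed as the wider LoadVecTy load (masked if needed) plus the
// CompressMask shuffle.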
14866
14867 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
14868 // If this node generates a masked gather load then it is not a terminal
14869 // node, and the address operand cost is estimated separately.
14870 if (E->State == TreeEntry::ScatterVectorize)
14871 return Cost;
14872
14873 // Estimate the cost of the GEPs since this tree node is a terminal node.
14874 SmallVector<Value *> PointerOps(VL.size());
14875 for (auto [I, V] : enumerate(VL))
14876 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14877 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
14878 }
14879 case Instruction::Store: {
14880 bool IsReorder = !E->ReorderIndices.empty();
14881 auto GetScalarCost = [=](unsigned Idx) {
14882 auto *VI = cast<StoreInst>(VL[Idx]);
14883 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
14884 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
14885 VI->getAlign(), VI->getPointerAddressSpace(),
14886 CostKind, OpInfo, VI);
14887 };
14888 auto *BaseSI =
14889 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
14890 auto GetVectorCost = [=](InstructionCost CommonCost) {
14891 // We know that we can merge the stores. Calculate the cost.
14892 InstructionCost VecStCost;
14893 if (E->State == TreeEntry::StridedVectorize) {
14894 Align CommonAlignment =
14895 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
14896 VecStCost = TTI->getStridedMemoryOpCost(
14897 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
14898 /*VariableMask=*/false, CommonAlignment, CostKind);
14899 } else {
14900 assert(E->State == TreeEntry::Vectorize &&
14901 "Expected either strided or consecutive stores.");
14902 if (unsigned Factor = E->getInterleaveFactor()) {
14903 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
14904 "No reused shuffles expected");
14905 CommonCost = 0;
14906 VecStCost = TTI->getInterleavedMemoryOpCost(
14907 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
14908 BaseSI->getPointerAddressSpace(), CostKind);
14909 } else {
14910 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
14911 VecStCost = TTI->getMemoryOpCost(
14912 Instruction::Store, VecTy, BaseSI->getAlign(),
14913 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
14914 }
14915 }
14916 return VecStCost + CommonCost;
14917 };
14918 SmallVector<Value *> PointerOps(VL.size());
14919 for (auto [I, V] : enumerate(VL)) {
14920 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
14921 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
14922 }
14923
14924 return GetCostDiff(GetScalarCost, GetVectorCost) +
14925 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
14926 }
14927 case Instruction::Call: {
14928 auto GetScalarCost = [&](unsigned Idx) {
14929 auto *CI = cast<CallInst>(UniqueValues[Idx]);
14932 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
14933 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14934 }
14937 CI->getFunctionType()->params(), CostKind);
14938 };
14939 auto GetVectorCost = [=](InstructionCost CommonCost) {
14940 auto *CI = cast<CallInst>(VL0);
14943 CI, ID, VecTy->getNumElements(),
14944 It != MinBWs.end() ? It->second.first : 0, TTI);
14945 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
14946 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
14947 };
14948 return GetCostDiff(GetScalarCost, GetVectorCost);
14949 }
14950 case Instruction::ShuffleVector: {
14951 if (!SLPReVec || E->isAltShuffle())
14952 assert(E->isAltShuffle() &&
14953 ((Instruction::isBinaryOp(E->getOpcode()) &&
14954 Instruction::isBinaryOp(E->getAltOpcode())) ||
14955 (Instruction::isCast(E->getOpcode()) &&
14956 Instruction::isCast(E->getAltOpcode())) ||
14957 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
14958 "Invalid Shuffle Vector Operand");
14959 // Try to find the previous shuffle node with the same operands and same
14960 // main/alternate ops.
14961 auto TryFindNodeWithEqualOperands = [=]() {
14962 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14963 if (TE.get() == E)
14964 break;
14965 if (TE->hasState() && TE->isAltShuffle() &&
14966 ((TE->getOpcode() == E->getOpcode() &&
14967 TE->getAltOpcode() == E->getAltOpcode()) ||
14968 (TE->getOpcode() == E->getAltOpcode() &&
14969 TE->getAltOpcode() == E->getOpcode())) &&
14970 TE->hasEqualOperands(*E))
14971 return true;
14972 }
14973 return false;
14974 };
14975 auto GetScalarCost = [&](unsigned Idx) {
14976 if (isa<PoisonValue>(UniqueValues[Idx]))
14978
14979 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14980 assert(E->getMatchingMainOpOrAltOp(VI) &&
14981 "Unexpected main/alternate opcode");
14982 (void)E;
14983 return TTI->getInstructionCost(VI, CostKind);
14984 };
14985 // Need to clear CommonCost since the final shuffle cost is included in the
14986 // vector cost.
14987 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
14988 // VecCost is equal to the sum of the cost of creating 2 vectors
14989 // and the cost of creating the shuffle.
14990 InstructionCost VecCost = 0;
14991 if (TryFindNodeWithEqualOperands()) {
14992 LLVM_DEBUG({
14993 dbgs() << "SLP: diamond match for alternate node found.\n";
14994 E->dump();
14995 });
14996 // No need to add new vector costs here since we're going to reuse
14997 // same main/alternate vector ops, just do different shuffling.
14998 } else if (Instruction::isBinaryOp(E->getOpcode())) {
14999 VecCost =
15000 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15001 VecCost +=
15002 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15003 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15004 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15005 VecCost = TTIRef.getCmpSelInstrCost(
15006 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15007 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15008 VL0);
15009 VecCost += TTIRef.getCmpSelInstrCost(
15010 E->getOpcode(), VecTy, MaskTy,
15011 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15012 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15013 E->getAltOp());
15014 } else {
15015 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15016 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15017 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15018 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15019 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15020 unsigned SrcBWSz =
15021 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15022 if (SrcIt != MinBWs.end()) {
15023 SrcBWSz = SrcIt->second.first;
15024 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15025 SrcTy = getWidenedType(SrcSclTy, VL.size());
15026 }
15027 if (BWSz <= SrcBWSz) {
15028 if (BWSz < SrcBWSz)
15029 VecCost =
15030 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15032 LLVM_DEBUG({
15033 dbgs()
15034 << "SLP: alternate extension, which should be truncated.\n";
15035 E->dump();
15036 });
15037 return VecCost;
15038 }
15039 }
15040 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15042 VecCost +=
15043 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15045 }
15046 SmallVector<int> Mask;
15047 E->buildAltOpShuffleMask(
15048 [&](Instruction *I) {
15049 assert(E->getMatchingMainOpOrAltOp(I) &&
15050 "Unexpected main/alternate opcode");
15051 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15052 *TLI);
15053 },
15054 Mask);
15056 FinalVecTy, Mask, CostKind);
15057 // Patterns like [fadd,fsub] can be combined into a single instruction
15058 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15059 // need to take into account their order when looking for the most used
15060 // order.
15061 unsigned Opcode0 = E->getOpcode();
15062 unsigned Opcode1 = E->getAltOpcode();
15063 SmallBitVector OpcodeMask(
15064 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
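// Illustrative example: for scalars {a0+b0, a1-b1, a2+b2, a3-b3} the
// alternate-opcode mask has the bits for the fsub lanes set. A target with a
// combined add/sub instruction (e.g. x86 addsub) may report this pattern as
// legal via isLegalAltInstr, in which case getAltInstrCost is used below if it
// is cheaper than the cost computed above.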
15065 // If this pattern is supported by the target then we consider the
15066 // order.
15067 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15068 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15069 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15070 return AltVecCost < VecCost ? AltVecCost : VecCost;
15071 }
15072 // TODO: Check the reverse order too.
15073 return VecCost;
15074 };
15075 if (SLPReVec && !E->isAltShuffle())
15076 return GetCostDiff(
15077 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15078 // If a group uses mask in order, the shufflevector can be
15079 // eliminated by instcombine. Then the cost is 0.
15080 assert(isa<ShuffleVectorInst>(VL.front()) &&
15081 "Not supported shufflevector usage.");
15082 auto *SV = cast<ShuffleVectorInst>(VL.front());
15083 unsigned SVNumElements =
15084 cast<FixedVectorType>(SV->getOperand(0)->getType())
15085 ->getNumElements();
15086 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15087 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15088 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15089 int NextIndex = 0;
15090 if (!all_of(Group, [&](Value *V) {
15091 assert(isa<ShuffleVectorInst>(V) &&
15092 "Not supported shufflevector usage.");
15093 auto *SV = cast<ShuffleVectorInst>(V);
15094 int Index;
15095 [[maybe_unused]] bool IsExtractSubvectorMask =
15096 SV->isExtractSubvectorMask(Index);
15097 assert(IsExtractSubvectorMask &&
15098 "Not supported shufflevector usage.");
15099 if (NextIndex != Index)
15100 return false;
15101 NextIndex += SV->getShuffleMask().size();
15102 return true;
15103 }))
15104 return ::getShuffleCost(
15106 calculateShufflevectorMask(E->Scalars));
15107 }
15108 return TTI::TCC_Free;
15109 });
15110 return GetCostDiff(GetScalarCost, GetVectorCost);
15111 }
15112 case Instruction::Freeze:
15113 return CommonCost;
15114 default:
15115 llvm_unreachable("Unknown instruction");
15116 }
15117}
15118
15119bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15120 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15121 << VectorizableTree.size() << " is fully vectorizable.\n");
15122
15123 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15124 SmallVector<int> Mask;
15125 return TE->isGather() &&
15126 !any_of(TE->Scalars,
15127 [this](Value *V) { return EphValues.contains(V); }) &&
15128 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15129 TE->Scalars.size() < Limit ||
15130 (((TE->hasState() &&
15131 TE->getOpcode() == Instruction::ExtractElement) ||
15132 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15133 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15134 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15135 !TE->isAltShuffle()) ||
15136 any_of(TE->Scalars, IsaPred<LoadInst>));
15137 };
15138
15139 // We only handle trees of heights 1 and 2.
15140 if (VectorizableTree.size() == 1 &&
15141 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15142 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15143 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15144 (ForReduction &&
15145 AreVectorizableGathers(VectorizableTree[0].get(),
15146 VectorizableTree[0]->Scalars.size()) &&
15147 VectorizableTree[0]->getVectorFactor() > 2)))
15148 return true;
15149
15150 if (VectorizableTree.size() != 2)
15151 return false;
15152
15153 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15154 // whose second node is a gather with fewer scalar operands than the initial
15155 // tree element (it may be profitable to shuffle the second gather), or whose
15156 // scalars are extractelements that form a shuffle.
15157 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15158 AreVectorizableGathers(VectorizableTree[1].get(),
15159 VectorizableTree[0]->Scalars.size()))
15160 return true;
15161
15162 // Gathering cost would be too much for tiny trees.
15163 if (VectorizableTree[0]->isGather() ||
15164 (VectorizableTree[1]->isGather() &&
15165 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15166 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15167 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15168 return false;
15169
15170 return true;
15171}
15172
15173static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15174 TargetTransformInfo *TTI,
15175 bool MustMatchOrInst) {
15176 // Look past the root to find a source value. Arbitrarily follow the
15177 // path through operand 0 of any 'or'. Also, peek through optional
15178 // shift-left-by-multiple-of-8-bits.
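// For example (illustrative only), the walk accepts a chain such as
//   ((zext i8 %b1 to i32) << 8) | (zext i8 %b0 to i32)
// where %b0 and %b1 are loaded bytes, following operand 0 of each 'or'/'shl'
// until the zext'ed load is found.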
15179 Value *ZextLoad = Root;
15180 const APInt *ShAmtC;
15181 bool FoundOr = false;
15182 while (!isa<ConstantExpr>(ZextLoad) &&
15183 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15184 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15185 ShAmtC->urem(8) == 0))) {
15186 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15187 ZextLoad = BinOp->getOperand(0);
15188 if (BinOp->getOpcode() == Instruction::Or)
15189 FoundOr = true;
15190 }
15191 // Check if the input is an extended load of the required or/shift expression.
15192 Value *Load;
15193 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15194 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15195 return false;
15196
15197 // Require that the total load bit width is a legal integer type.
15198 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15199 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15200 Type *SrcTy = Load->getType();
15201 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15202 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15203 return false;
15204
15205 // Everything matched - assume that we can fold the whole sequence using
15206 // load combining.
15207 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15208 << *(cast<Instruction>(Root)) << "\n");
15209
15210 return true;
15211}
15212
15213bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15214 if (RdxKind != RecurKind::Or)
15215 return false;
15216
15217 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15218 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15219 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15220 /* MatchOr */ false);
15221}
15222
15223bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15224 // Peek through a final sequence of stores and check if all operations are
15225 // likely to be load-combined.
15226 unsigned NumElts = Stores.size();
15227 for (Value *Scalar : Stores) {
15228 Value *X;
15229 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15230 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15231 return false;
15232 }
15233 return true;
15234}
15235
15236bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15237 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15238 return true;
15239
15240 // Graph is empty - do nothing.
15241 if (VectorizableTree.empty()) {
15242 assert(ExternalUses.empty() && "We shouldn't have any external users");
15243
15244 return true;
15245 }
15246
15247 // No need to vectorize inserts of gathered values.
15248 if (VectorizableTree.size() == 2 &&
15249 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15250 VectorizableTree[1]->isGather() &&
15251 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15252 !(isSplat(VectorizableTree[1]->Scalars) ||
15253 allConstant(VectorizableTree[1]->Scalars))))
15254 return true;
15255
15256 // If the graph includes only PHI nodes and gathers, it is definitely not
15257 // profitable for vectorization; we can skip it if the cost threshold is the
15258 // default. The cost of vectorized PHI nodes is almost always 0 plus the cost
15259 // of the gathers/buildvectors.
15260 constexpr int Limit = 4;
15261 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15262 !VectorizableTree.empty() &&
15263 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15264 return (TE->isGather() &&
15265 (!TE->hasState() ||
15266 TE->getOpcode() != Instruction::ExtractElement) &&
15267 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15268 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15269 }))
15270 return true;
15271
15272 // Do not vectorize small tree of phis only, if all vector phis are also
15273 // gathered.
15274 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15275 VectorizableTree.size() <= Limit &&
15276 all_of(VectorizableTree,
15277 [&](const std::unique_ptr<TreeEntry> &TE) {
15278 return (TE->isGather() &&
15279 (!TE->hasState() ||
15280 TE->getOpcode() != Instruction::ExtractElement) &&
15281 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15282 Limit) ||
15283 (TE->hasState() &&
15284 (TE->getOpcode() == Instruction::InsertElement ||
15285 (TE->getOpcode() == Instruction::PHI &&
15286 all_of(TE->Scalars, [&](Value *V) {
15287 return isa<PoisonValue>(V) || MustGather.contains(V);
15288 }))));
15289 }) &&
15290 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15291 return TE->State == TreeEntry::Vectorize &&
15292 TE->getOpcode() == Instruction::PHI;
15293 }))
15294 return true;
15295
15296 // If the tree contains only phis, buildvectors, split nodes and
15297 // small nodes with reuses, we can skip it.
15298 SmallVector<const TreeEntry *> StoreLoadNodes;
15299 unsigned NumGathers = 0;
15300 constexpr int LimitTreeSize = 36;
15301 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15302 all_of(VectorizableTree,
15303 [&](const std::unique_ptr<TreeEntry> &TE) {
15304 if (!TE->isGather() && TE->hasState() &&
15305 (TE->getOpcode() == Instruction::Load ||
15306 TE->getOpcode() == Instruction::Store)) {
15307 StoreLoadNodes.push_back(TE.get());
15308 return true;
15309 }
15310 if (TE->isGather())
15311 ++NumGathers;
15312 return TE->State == TreeEntry::SplitVectorize ||
15313 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15314 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15315 VectorizableTree.size() > LimitTreeSize) ||
15316 (TE->isGather() &&
15317 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15318 (TE->hasState() &&
15319 (TE->getOpcode() == Instruction::PHI ||
15320 (TE->hasCopyableElements() &&
15321 static_cast<unsigned>(count_if(
15322 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15323 TE->Scalars.size() / 2) ||
15324 ((!TE->ReuseShuffleIndices.empty() ||
15325 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15326 TE->Scalars.size() == 2)));
15327 }) &&
15328 (StoreLoadNodes.empty() ||
15329 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15330 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15331 return TE->getOpcode() == Instruction::Store ||
15332 all_of(TE->Scalars, [&](Value *V) {
15333 return !isa<LoadInst>(V) ||
15334 areAllUsersVectorized(cast<Instruction>(V));
15335 });
15336 })))))
15337 return true;
15338
15339 // If the tree contains only a buildvector root, 2 non-buildvector nodes
15340 // (used by the root tree node) and other buildvectors, we can skip it.
15341 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15342 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15343 VectorizableTree.size() >= Limit &&
15344 count_if(ArrayRef(VectorizableTree).drop_front(),
15345 [&](const std::unique_ptr<TreeEntry> &TE) {
15346 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15347 TE->UserTreeIndex.UserTE->Idx == 0;
15348 }) == 2)
15349 return true;
15350
15351 // If the tree only vectorizes a phi node feeding an insertelement
15352 // buildvector, with all remaining nodes gathered, skip it.
15353 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15354 VectorizableTree.size() > 2 &&
15355 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15356 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15357 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15358 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15359 all_of(
15360 ArrayRef(VectorizableTree).drop_front(2),
15361 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15362 return true;
15363
15364 // We can vectorize the tree if its size is greater than or equal to the
15365 // minimum size specified by the MinTreeSize command line option.
15366 if (VectorizableTree.size() >= MinTreeSize)
15367 return false;
15368
15369 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15370 // can vectorize it if we can prove it fully vectorizable.
15371 if (isFullyVectorizableTinyTree(ForReduction))
15372 return false;
15373
15374 // Check if any of the gather nodes forms an insertelement buildvector
15375 // somewhere.
15376 bool IsAllowedSingleBVNode =
15377 VectorizableTree.size() > 1 ||
15378 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15379 !VectorizableTree.front()->isAltShuffle() &&
15380 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15381 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15382 allSameBlock(VectorizableTree.front()->Scalars));
15383 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15384 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15385 return isa<ExtractElementInst, Constant>(V) ||
15386 (IsAllowedSingleBVNode &&
15387 !V->hasNUsesOrMore(UsesLimit) &&
15388 any_of(V->users(), IsaPred<InsertElementInst>));
15389 });
15390 }))
15391 return false;
15392
15393 if (VectorizableTree.back()->isGather() &&
15394 VectorizableTree.back()->hasState() &&
15395 VectorizableTree.back()->isAltShuffle() &&
15396 VectorizableTree.back()->getVectorFactor() > 2 &&
15397 allSameBlock(VectorizableTree.back()->Scalars) &&
15398 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15400 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15401 VectorizableTree.back()->getVectorFactor()),
15402 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15403 /*Insert=*/true, /*Extract=*/false,
15405 return false;
15406
15407 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15408 // vectorizable.
15409 return true;
15410}
15411
15414 constexpr unsigned SmallTree = 3;
15415 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15416 getCanonicalGraphSize() <= SmallTree &&
15417 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15418 [](const std::unique_ptr<TreeEntry> &TE) {
15419 return TE->isGather() && TE->hasState() &&
15420 TE->getOpcode() == Instruction::Load &&
15421 !allSameBlock(TE->Scalars);
15422 }) == 1)
15423 return true;
15424 return false;
15425 }
15426 bool Res = false;
15427 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15428 TreeEntry &E = *VectorizableTree[Idx];
15429 if (E.State == TreeEntry::SplitVectorize)
15430 return false;
15431 if (!E.isGather())
15432 continue;
15433 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15434 (!E.hasState() &&
15435 all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
15436 (isa<ExtractElementInst>(E.Scalars.front()) &&
15437 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15438 return false;
15439 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15440 continue;
15441 Res = true;
15442 }
15443 return Res;
15444}
15445
15446InstructionCost BoUpSLP::getSpillCost() {
15447 // Walk from the bottom of the tree to the top, tracking which values are
15448 // live. When we see a call instruction that is not part of our tree,
15449 // query TTI to see if there is a cost to keeping values live over it
15450 // (for example, if spills and fills are required).
15451
15452 const TreeEntry *Root = VectorizableTree.front().get();
15453 if (Root->isGather())
15454 return 0;
15455
15456 InstructionCost Cost = 0;
15457 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15458 EntriesToOperands;
15459 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15460 SmallPtrSet<const Instruction *, 8> LastInstructions;
15461 for (const auto &TEPtr : VectorizableTree) {
15462 if (!TEPtr->isGather()) {
15463 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15464 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15465 LastInstructions.insert(LastInst);
15466 }
15467 if (TEPtr->UserTreeIndex)
15468 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15469 }
15470
15471 auto NoCallIntrinsic = [this](const Instruction *I) {
15472 const auto *II = dyn_cast<IntrinsicInst>(I);
15473 if (!II)
15474 return false;
15475 if (II->isAssumeLikeIntrinsic())
15476 return true;
15477 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15478 InstructionCost IntrCost =
15479 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15480 InstructionCost CallCost = TTI->getCallInstrCost(
15481 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15482 return IntrCost < CallCost;
15483 };
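// Illustrative note: NoCallIntrinsic returns true for intrinsics that are not
// expected to behave like real calls for spill purposes - assume-like
// intrinsics, or intrinsics that TTI reports as cheaper than an equivalent
// libcall (e.g. a target-lowered llvm.fabs typically does not clobber
// registers the way an actual libm call would).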
15484
15485 // Maps the last instruction of an entry to the last instruction of one of
15486 // its operand entries plus a flag. If the flag is true, there are no calls
15487 // in between these instructions.
15488 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15489 CheckedInstructions;
15490 unsigned Budget = 0;
15491 const unsigned BudgetLimit =
15492 ScheduleRegionSizeBudget / VectorizableTree.size();
15493 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15494 const Instruction *Last) {
15495 assert(First->getParent() == Last->getParent() &&
15496 "Expected instructions in same block.");
15497 if (auto It = CheckedInstructions.find(Last);
15498 It != CheckedInstructions.end()) {
15499 const Instruction *Checked = It->second.getPointer();
15500 if (Checked == First || Checked->comesBefore(First))
15501 return It->second.getInt() != 0;
15502 Last = Checked;
15503 } else if (Last == First || Last->comesBefore(First)) {
15504 return true;
15505 }
15507 ++First->getIterator().getReverse(),
15508 PrevInstIt =
15509 Last->getIterator().getReverse();
15510 SmallVector<const Instruction *> LastInstsInRange;
15511 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15512 // Debug information does not impact spill cost.
15513 // Vectorized calls, represented as vector intrinsics, do not impact spill
15514 // cost.
15515 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15516 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15517 for (const Instruction *LastInst : LastInstsInRange)
15518 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15519 return false;
15520 }
15521 if (LastInstructions.contains(&*PrevInstIt))
15522 LastInstsInRange.push_back(&*PrevInstIt);
15523
15524 ++PrevInstIt;
15525 ++Budget;
15526 }
15527 for (const Instruction *LastInst : LastInstsInRange)
15528 CheckedInstructions.try_emplace(
15529 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15530 Budget <= BudgetLimit ? 1 : 0);
15531 return Budget <= BudgetLimit;
15532 };
15533 auto AddCosts = [&](const TreeEntry *Op) {
15534 Type *ScalarTy = Op->Scalars.front()->getType();
15535 auto It = MinBWs.find(Op);
15536 if (It != MinBWs.end())
15537 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15538 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15540 if (ScalarTy->isVectorTy()) {
15541 // Handle revec dead vector instructions.
15542 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15543 }
15544 };
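// Illustrative note: when one of the checks below fails (i.e. a non-vectorized
// call may execute between an operand's definition and its use), AddCosts
// charges TTI->getCostOfKeepingLiveOverCall for the operand's widened vector
// type (e.g. <4 x float>), which typically models the spill and reload around
// the call.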
15545 // Memoize the relationship between blocks, i.e. whether there is (at least
15546 // one) non-vectorized call between the blocks. This allows skipping the
15547 // analysis of the same block paths multiple times.
15548 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15549 ParentOpParentToPreds;
15550 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15551 BasicBlock *OpParent) {
15552 auto Key = std::make_pair(Root, OpParent);
15553 if (auto It = ParentOpParentToPreds.find(Key);
15554 It != ParentOpParentToPreds.end())
15555 return It->second;
15556 SmallVector<BasicBlock *> Worklist;
15557 if (Pred)
15558 Worklist.push_back(Pred);
15559 else
15560 Worklist.append(pred_begin(Root), pred_end(Root));
15563 ParentsPairsToAdd;
15564 bool Res = false;
15565 auto Cleanup = make_scope_exit([&]() {
15566 for (const auto &KeyPair : ParentsPairsToAdd) {
15567 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15568 "Should not have been added before.");
15569 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15570 }
15571 });
15572 while (!Worklist.empty()) {
15573 BasicBlock *BB = Worklist.pop_back_val();
15574 if (BB == OpParent || !Visited.insert(BB).second)
15575 continue;
15576 auto Pair = std::make_pair(BB, OpParent);
15577 if (auto It = ParentOpParentToPreds.find(Pair);
15578 It != ParentOpParentToPreds.end()) {
15579 Res = It->second;
15580 return Res;
15581 }
15582 ParentsPairsToAdd.insert(Pair);
15583 unsigned BlockSize = BB->size();
15584 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15585 return Res;
15586 Budget += BlockSize;
15587 if (Budget > BudgetLimit)
15588 return Res;
15589 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15590 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15591 BB->getTerminator()))
15592 return Res;
15593 Worklist.append(pred_begin(BB), pred_end(BB));
15594 }
15595 Res = true;
15596 return Res;
15597 };
15598 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15599 while (!LiveEntries.empty()) {
15600 const TreeEntry *Entry = LiveEntries.pop_back_val();
15601 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15602 if (Operands.empty())
15603 continue;
15604 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15605 BasicBlock *Parent = LastInst->getParent();
15606 for (const TreeEntry *Op : Operands) {
15607 if (!Op->isGather())
15608 LiveEntries.push_back(Op);
15609 if (Entry->State == TreeEntry::SplitVectorize ||
15610 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15611 (Op->isGather() && allConstant(Op->Scalars)))
15612 continue;
15613 Budget = 0;
15614 BasicBlock *Pred = nullptr;
15615 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15616 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15617 BasicBlock *OpParent;
15618 Instruction *OpLastInst;
15619 if (Op->isGather()) {
15620 assert(Entry->getOpcode() == Instruction::PHI &&
15621 "Expected phi node only.");
15622 OpParent = cast<PHINode>(Entry->getMainOp())
15623 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15624 OpLastInst = OpParent->getTerminator();
15625 for (Value *V : Op->Scalars) {
15626 auto *Inst = dyn_cast<Instruction>(V);
15627 if (!Inst)
15628 continue;
15629 if (isVectorized(V)) {
15630 OpParent = Inst->getParent();
15631 OpLastInst = Inst;
15632 break;
15633 }
15634 }
15635 } else {
15636 OpLastInst = EntriesToLastInstruction.at(Op);
15637 OpParent = OpLastInst->getParent();
15638 }
15639 // Check the call instructions within the same basic block.
15640 if (OpParent == Parent) {
15641 if (Entry->getOpcode() == Instruction::PHI) {
15642 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15643 AddCosts(Op);
15644 continue;
15645 }
15646 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15647 AddCosts(Op);
15648 continue;
15649 }
15650 // Check for call instruction in between blocks.
15651 // 1. Check entry's block to the head.
15652 if (Entry->getOpcode() != Instruction::PHI &&
15653 !CheckForNonVecCallsInSameBlock(
15654 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15655 LastInst)) {
15656 AddCosts(Op);
15657 continue;
15658 }
15659 // 2. Check op's block from the end.
15660 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15661 OpParent->getTerminator())) {
15662 AddCosts(Op);
15663 continue;
15664 }
15665 // 3. Check the predecessors of entry's block till op's block.
15666 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15667 AddCosts(Op);
15668 continue;
15669 }
15670 }
15671 }
15672
15673 return Cost;
15674}
15675
15676/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
15677/// buildvector sequence.
15678static bool isFirstInsertElement(const InsertElementInst *IE1,
15679 const InsertElementInst *IE2) {
15680 if (IE1 == IE2)
15681 return false;
15682 const auto *I1 = IE1;
15683 const auto *I2 = IE2;
15684 const InsertElementInst *PrevI1;
15685 const InsertElementInst *PrevI2;
15686 unsigned Idx1 = *getElementIndex(IE1);
15687 unsigned Idx2 = *getElementIndex(IE2);
15688 do {
15689 if (I2 == IE1)
15690 return true;
15691 if (I1 == IE2)
15692 return false;
15693 PrevI1 = I1;
15694 PrevI2 = I2;
15695 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15696 getElementIndex(I1).value_or(Idx2) != Idx2)
15697 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15698 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15699 getElementIndex(I2).value_or(Idx1) != Idx1)
15700 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15701 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15702 llvm_unreachable("Two different buildvectors not expected.");
15703}
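// Illustrative example: for the chain
//   %i0 = insertelement <4 x float> poison, float %a, i32 0
//   %i1 = insertelement <4 x float> %i0, float %b, i32 1
// isFirstInsertElement(%i0, %i1) returns true, since walking %i1's operand-0
// chain reaches %i0 first.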
15704
15705namespace {
15706/// Returns incoming Value *, if the requested type is Value * too, or a default
15707/// value, otherwise.
15708struct ValueSelect {
15709 template <typename U>
15710 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15711 return V;
15712 }
15713 template <typename U>
15714 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15715 return U();
15716 }
15717};
15718} // namespace
15719
15720/// Does the analysis of the provided shuffle masks and performs the requested
15721/// actions on the vectors with the given shuffle masks. It tries to do it in
15722/// several steps.
15723/// 1. If the Base vector is not an undef vector, resize the very first mask to
15724/// the common VF and perform the action for 2 input vectors (including the
15725/// non-undef Base). Other shuffle masks are combined with the result of the
15726/// first stage and processed as a shuffle of 2 elements.
15727/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15728/// the action only for 1 vector with the given mask, if it is not the identity
15729/// mask.
15730/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15731/// vectors, combining the masks properly between the steps.
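/// Illustrative example for the multi-vector case: with an undef Base and two
/// masks of VF 4, {0,1,P,P} from V1 and {P,P,2,3} from V2 (P = poison), the
/// combined mask becomes {0,1,6,7} and the action is performed once on
/// {V1, V2}.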
15732template <typename T>
15733static T *performExtractsShuffleAction(
15734 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15735 function_ref<unsigned(T *)> GetVF,
15736 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15737 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15738 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15739 SmallVector<int> Mask(ShuffleMask.begin()->second);
15740 auto VMIt = std::next(ShuffleMask.begin());
15741 T *Prev = nullptr;
15742 SmallBitVector UseMask =
15743 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15744 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15745 if (!IsBaseUndef.all()) {
15746 // Base is not undef, need to combine it with the next subvectors.
15747 std::pair<T *, bool> Res =
15748 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15749 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15750 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15751 if (Mask[Idx] == PoisonMaskElem)
15752 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15753 else
15754 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15755 }
15756 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15757 assert((!V || GetVF(V) == Mask.size()) &&
15758 "Expected base vector of VF number of elements.");
15759 Prev = Action(Mask, {nullptr, Res.first});
15760 } else if (ShuffleMask.size() == 1) {
15761 // Base is undef and only 1 vector is shuffled - perform the action only for
15762 // single vector, if the mask is not the identity mask.
15763 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15764 /*ForSingleMask=*/true);
15765 if (Res.second)
15766 // Identity mask is found.
15767 Prev = Res.first;
15768 else
15769 Prev = Action(Mask, {ShuffleMask.begin()->first});
15770 } else {
15771 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15772 // shuffles step by step, combining shuffle between the steps.
15773 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15774 unsigned Vec2VF = GetVF(VMIt->first);
15775 if (Vec1VF == Vec2VF) {
15776 // No need to resize the input vectors since they are of the same size, we
15777 // can shuffle them directly.
15778 ArrayRef<int> SecMask = VMIt->second;
15779 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15780 if (SecMask[I] != PoisonMaskElem) {
15781 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15782 Mask[I] = SecMask[I] + Vec1VF;
15783 }
15784 }
15785 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15786 } else {
15787 // Vectors of different sizes - resize and reshuffle.
15788 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15789 /*ForSingleMask=*/false);
15790 std::pair<T *, bool> Res2 =
15791 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15792 ArrayRef<int> SecMask = VMIt->second;
15793 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15794 if (Mask[I] != PoisonMaskElem) {
15795 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15796 if (Res1.second)
15797 Mask[I] = I;
15798 } else if (SecMask[I] != PoisonMaskElem) {
15799 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15800 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15801 }
15802 }
15803 Prev = Action(Mask, {Res1.first, Res2.first});
15804 }
15805 VMIt = std::next(VMIt);
15806 }
15807 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15808 // Perform requested actions for the remaining masks/vectors.
15809 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
15810 // Shuffle other input vectors, if any.
15811 std::pair<T *, bool> Res =
15812 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15813 ArrayRef<int> SecMask = VMIt->second;
15814 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15815 if (SecMask[I] != PoisonMaskElem) {
15816 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
15817 "Multiple uses of scalars.");
15818 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
15819 } else if (Mask[I] != PoisonMaskElem) {
15820 Mask[I] = I;
15821 }
15822 }
15823 Prev = Action(Mask, {Prev, Res.first});
15824 }
15825 return Prev;
15826}
15827
15828namespace {
15829/// Data type for handling buildvector sequences with the reused scalars from
15830/// other tree entries.
15831template <typename T> struct ShuffledInsertData {
15832 /// List of insertelements to be replaced by shuffles.
15833 SmallVector<InsertElementInst *> InsertElements;
15834 /// The parent vectors and shuffle mask for the given list of inserts.
15835 MapVector<T, SmallVector<int>> ValueMasks;
15836};
15837} // namespace
15838
15839InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
15840 InstructionCost ReductionCost) {
15841 InstructionCost Cost = ReductionCost;
15842 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
15843 << VectorizableTree.size() << ".\n");
15844
15845 SmallPtrSet<Value *, 4> CheckedExtracts;
15846 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
15847 TreeEntry &TE = *VectorizableTree[I];
15848 // No need to count the cost for combined entries, they are combined and
15849 // just skip their cost.
15850 if (TE.State == TreeEntry::CombinedVectorize) {
15851 LLVM_DEBUG(
15852 dbgs() << "SLP: Skipping cost for combined node that starts with "
15853 << *TE.Scalars[0] << ".\n";
15854 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
15855 continue;
15856 }
15857 if (TE.hasState() &&
15858 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
15859 if (const TreeEntry *E =
15860 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
15861 E && E->getVectorFactor() == TE.getVectorFactor()) {
15862 // Some gather nodes might be absolutely the same as some vectorizable
15863 // nodes after reordering, need to handle it.
15864 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
15865 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15866 << "SLP: Current total cost = " << Cost << "\n");
15867 continue;
15868 }
15869 }
15870
15871 // Exclude cost of gather loads nodes which are not used. These nodes were
15872 // built as part of the final attempt to vectorize gathered loads.
15873 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
15874 "Expected gather nodes with users only.");
15875
15876 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
15877 Cost += C;
15878 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
15879 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15880 << "SLP: Current total cost = " << Cost << "\n");
15881 }
15882
15883 if (Cost >= -SLPCostThreshold &&
15884 none_of(ExternalUses, [](const ExternalUser &EU) {
15885 return isa_and_nonnull<InsertElementInst>(EU.User);
15886 }))
15887 return Cost;
15888
15889 SmallPtrSet<Value *, 16> ExtractCostCalculated;
15890 InstructionCost ExtractCost = 0;
15892 SmallVector<APInt> DemandedElts;
15893 SmallDenseSet<Value *, 4> UsedInserts;
15895 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
15897 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
15898 // Keep track of each {Scalar, Index, User} tuple.
15899 // On AArch64, this helps in fusing a mov instruction, associated with
15900 // extractelement, with fmul in the backend so that extractelement is free.
15901 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
15902 for (ExternalUser &EU : ExternalUses) {
15903 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
15904 }
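// Illustrative note: these tuples are forwarded to getVectorInstrCost below so
// the target can, e.g., treat an extract that feeds an AArch64 fmul-by-lane as
// free because the lane read is folded into the user instruction.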
15905 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
15906 for (ExternalUser &EU : ExternalUses) {
15907 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
15908 << EU.E.Idx << " in lane " << EU.Lane << "\n");
15909 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
15910 else dbgs() << " User: nullptr\n");
15911 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
15912
15913 // Uses by ephemeral values are free (because the ephemeral value will be
15914 // removed prior to code generation, and so the extraction will be
15915 // removed as well).
15916 if (EphValues.count(EU.User))
15917 continue;
15918
15919 // Check if the scalar for the given user or all users is accounted already.
15920 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
15921 (EU.User &&
15922 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
15923 continue;
15924
15925 // Skip scalars used in unreachable blocks, in EH pads (rarely executed), or
15926 // in blocks terminated with an unreachable instruction.
15927 if (BasicBlock *UserParent =
15928 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
15929 UserParent &&
15930 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
15931 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
15932 continue;
15933
15934 // We only add extract cost once for the same scalar.
15935 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
15936 !ExtractCostCalculated.insert(EU.Scalar).second)
15937 continue;
15938
15939 // No extract cost for vector "scalar" if REVEC is disabled
15940 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
15941 continue;
15942
15943 // If found user is an insertelement, do not calculate extract cost but try
15944 // to detect it as a final shuffled/identity match.
15945 // TODO: what if a user is insertvalue when REVEC is enabled?
15946 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
15947 VU && VU->getOperand(1) == EU.Scalar) {
15948 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
15949 if (!UsedInserts.insert(VU).second)
15950 continue;
15951 std::optional<unsigned> InsertIdx = getElementIndex(VU);
15952 if (InsertIdx) {
15953 const TreeEntry *ScalarTE = &EU.E;
15954 auto *It = find_if(
15955 ShuffledInserts,
15956 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
15957 // Checks if 2 insertelements are from the same buildvector.
15958 InsertElementInst *VecInsert = Data.InsertElements.front();
15959 return areTwoInsertFromSameBuildVector(
15960 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
15961 Value *Op0 = II->getOperand(0);
15962 if (isVectorized(II) && !isVectorized(Op0))
15963 return nullptr;
15964 return Op0;
15965 });
15966 });
15967 int VecId = -1;
15968 if (It == ShuffledInserts.end()) {
15969 auto &Data = ShuffledInserts.emplace_back();
15970 Data.InsertElements.emplace_back(VU);
15971 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
15972 VecId = ShuffledInserts.size() - 1;
15973 auto It = MinBWs.find(ScalarTE);
15974 if (It != MinBWs.end() &&
15975 VectorCasts
15976 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
15977 .second) {
15978 unsigned BWSz = It->second.first;
15979 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
15980 unsigned VecOpcode;
15981 if (DstBWSz < BWSz)
15982 VecOpcode = Instruction::Trunc;
15983 else
15984 VecOpcode =
15985 It->second.second ? Instruction::SExt : Instruction::ZExt;
15988 VecOpcode, FTy,
15989 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
15990 FTy->getNumElements()),
15992 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
15993 << " for extending externally used vector with "
15994 "non-equal minimum bitwidth.\n");
15995 Cost += C;
15996 }
15997 } else {
15998 if (isFirstInsertElement(VU, It->InsertElements.front()))
15999 It->InsertElements.front() = VU;
16000 VecId = std::distance(ShuffledInserts.begin(), It);
16001 }
16002 int InIdx = *InsertIdx;
16003 SmallVectorImpl<int> &Mask =
16004 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16005 if (Mask.empty())
16006 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16007 Mask[InIdx] = EU.Lane;
16008 DemandedElts[VecId].setBit(InIdx);
16009 continue;
16010 }
16011 }
16012 }
16013
16015 // If we plan to rewrite the tree in a smaller type, we will need to sign
16016 // extend the extracted value back to the original type. Here, we account
16017 // for the extract and the added cost of the sign extend if needed.
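// Illustrative example: if a 32-bit tree was narrowed to i16 via MinBWs, an
// external use of lane L is costed as an extractelement from <VF x i16> plus
// a sext/zext of the i16 result back to i32 (see getExtractWithExtendCost).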
16018 InstructionCost ExtraCost = TTI::TCC_Free;
16019 auto *ScalarTy = EU.Scalar->getType();
16020 const unsigned BundleWidth = EU.E.getVectorFactor();
16021 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16022 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16023 const TreeEntry *Entry = &EU.E;
16024 auto It = MinBWs.find(Entry);
16025 if (It != MinBWs.end()) {
16026 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16027 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16028 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16029 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16030 ? Instruction::ZExt
16031 : Instruction::SExt;
16032 VecTy = getWidenedType(MinTy, BundleWidth);
16033 ExtraCost =
16034 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16035 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16036 << ExtraCost << "\n");
16037 } else {
16038 ExtraCost =
16039 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16040 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16041 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16042 << *VecTy << ": " << ExtraCost << "\n");
16043 }
16044 // Leave the scalar instructions as is if they are cheaper than extracts.
16045 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16046 Entry->getOpcode() == Instruction::Load) {
16047 // Checks if the user of the external scalar is phi in loop body.
16048 auto IsPhiInLoop = [&](const ExternalUser &U) {
16049 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16050 auto *I = cast<Instruction>(U.Scalar);
16051 const Loop *L = LI->getLoopFor(Phi->getParent());
16052 return L && (Phi->getParent() == I->getParent() ||
16053 L == LI->getLoopFor(I->getParent()));
16054 }
16055 return false;
16056 };
16057 if (!ValueToExtUses) {
16058 ValueToExtUses.emplace();
16059 for (const auto &P : enumerate(ExternalUses)) {
16060 // Ignore phis in loops.
16061 if (IsPhiInLoop(P.value()))
16062 continue;
16063
16064 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16065 }
16066 }
16067 // Can use original instruction, if no operands vectorized or they are
16068 // marked as externally used already.
16069 auto *Inst = cast<Instruction>(EU.Scalar);
16070 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16071 auto OperandIsScalar = [&](Value *V) {
16072 if (!isVectorized(V)) {
16073 // Some extractelements might not be vectorized, but
16074 // transformed into a shuffle and removed from the function;
16075 // consider that here.
16076 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16077 return !EE->hasOneUse() || !MustGather.contains(EE);
16078 return true;
16079 }
16080 return ValueToExtUses->contains(V);
16081 };
16082 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16083 bool CanBeUsedAsScalarCast = false;
16084 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16085 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16086 Op && all_of(Op->operands(), OperandIsScalar)) {
16087 InstructionCost OpCost =
16088 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16090 : 0;
16091 if (ScalarCost + OpCost <= ExtraCost) {
16092 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16093 ScalarCost += OpCost;
16094 }
16095 }
16096 }
16097 if (CanBeUsedAsScalar) {
16098 bool KeepScalar = ScalarCost <= ExtraCost;
16099 // Try to keep the original scalar if the user is a phi node from the same
16100 // block as the root phis currently being vectorized. This helps to keep
16101 // better ordering info for the PHIs being vectorized.
16102 bool IsProfitablePHIUser =
16103 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16104 VectorizableTree.front()->Scalars.size() > 2)) &&
16105 VectorizableTree.front()->hasState() &&
16106 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16107 !Inst->hasNUsesOrMore(UsesLimit) &&
16108 none_of(Inst->users(),
16109 [&](User *U) {
16110 auto *PHIUser = dyn_cast<PHINode>(U);
16111 return (!PHIUser ||
16112 PHIUser->getParent() !=
16113 cast<Instruction>(
16114 VectorizableTree.front()->getMainOp())
16115 ->getParent()) &&
16116 !isVectorized(U);
16117 }) &&
16118 count_if(Entry->Scalars, [&](Value *V) {
16119 return ValueToExtUses->contains(V);
16120 }) <= 2;
16121 if (IsProfitablePHIUser) {
16122 KeepScalar = true;
16123 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16124 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16125 (!GatheredLoadsEntriesFirst.has_value() ||
16126 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16127 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16128 return ValueToExtUses->contains(V);
16129 });
16130 auto It = ExtractsCount.find(Entry);
16131 if (It != ExtractsCount.end()) {
16132 assert(ScalarUsesCount >= It->getSecond().size() &&
16133 "Expected total number of external uses not less than "
16134 "number of scalar uses.");
16135 ScalarUsesCount -= It->getSecond().size();
16136 }
16137 // Keep the original scalar if the number of externally used instructions in
16138 // the same entry is not a power of 2. It may help to do some extra
16139 // vectorization for now.
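// Illustrative example (assuming the cost conditions above hold): if 3 of 4
// lanes of this entry have external uses, ScalarUsesCount == 3 is not a power
// of 2, so the original scalar is kept; with exactly 2 or 4 such uses the
// extractelement is preferred instead.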
16140 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16141 }
16142 if (KeepScalar) {
16143 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16144 for (Value *V : Inst->operands()) {
16145 auto It = ValueToExtUses->find(V);
16146 if (It != ValueToExtUses->end()) {
16147 // Replace all uses to avoid compiler crash.
16148 ExternalUses[It->second].User = nullptr;
16149 }
16150 }
16151 ExtraCost = ScalarCost;
16152 if (!IsPhiInLoop(EU))
16153 ExtractsCount[Entry].insert(Inst);
16154 if (CanBeUsedAsScalarCast) {
16155 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16156 // Update the users of the operands of the cast operand to avoid
16157 // compiler crash.
16158 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16159 for (Value *V : IOp->operands()) {
16160 auto It = ValueToExtUses->find(V);
16161 if (It != ValueToExtUses->end()) {
16162 // Replace all uses to avoid compiler crash.
16163 ExternalUses[It->second].User = nullptr;
16164 }
16165 }
16166 }
16167 }
16168 }
16169 }
16170 }
16171
16172 ExtractCost += ExtraCost;
16173 }
16174 // Insert externals for extract of operands of casts to be emitted as scalars
16175 // instead of extractelement.
16176 for (Value *V : ScalarOpsFromCasts) {
16177 ExternalUsesAsOriginalScalar.insert(V);
16178 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16179 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16180 TEs.front()->findLaneForValue(V));
16181 }
16182 }
16183 // Add reduced value cost, if resized.
16184 if (!VectorizedVals.empty()) {
16185 const TreeEntry &Root = *VectorizableTree.front();
16186 auto BWIt = MinBWs.find(&Root);
16187 if (BWIt != MinBWs.end()) {
16188 Type *DstTy = Root.Scalars.front()->getType();
16189 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16190 unsigned SrcSz =
16191 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16192 if (OriginalSz != SrcSz) {
16193 unsigned Opcode = Instruction::Trunc;
16194 if (OriginalSz > SrcSz)
16195 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16196 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16197 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16198 assert(SLPReVec && "Only supported by REVEC.");
16199 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16200 }
16201 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16204 }
16205 }
16206 }
16207
16208 Cost += ExtractCost;
16209 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16210 bool ForSingleMask) {
16211 InstructionCost C = 0;
16212 unsigned VF = Mask.size();
16213 unsigned VecVF = TE->getVectorFactor();
16214 bool HasLargeIndex =
16215 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16216 if ((VF != VecVF && HasLargeIndex) ||
16218
16219 if (HasLargeIndex) {
16220 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16221 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16222 OrigMask.begin());
16224 getWidenedType(TE->getMainOp()->getType(), VecVF),
16225 OrigMask);
16226 LLVM_DEBUG(
16227 dbgs() << "SLP: Adding cost " << C
16228 << " for final shuffle of insertelement external users.\n";
16229 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16230 Cost += C;
16231 return std::make_pair(TE, true);
16232 }
16233
16234 if (!ForSingleMask) {
16235 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16236 for (unsigned I = 0; I < VF; ++I) {
16237 if (Mask[I] != PoisonMaskElem)
16238 ResizeMask[Mask[I]] = Mask[I];
16239 }
16240 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16243 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16244 LLVM_DEBUG(
16245 dbgs() << "SLP: Adding cost " << C
16246 << " for final shuffle of insertelement external users.\n";
16247 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16248
16249 Cost += C;
16250 }
16251 }
16252 return std::make_pair(TE, false);
16253 };
16254 // Calculate the cost of the reshuffled vectors, if any.
16255 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16256 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16257 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16258 unsigned VF = 0;
16259 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16260 ArrayRef<const TreeEntry *> TEs) {
16261 assert((TEs.size() == 1 || TEs.size() == 2) &&
16262 "Expected exactly 1 or 2 tree entries.");
16263 if (TEs.size() == 1) {
16264 if (VF == 0)
16265 VF = TEs.front()->getVectorFactor();
16266 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16267 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16268 !all_of(enumerate(Mask), [=](const auto &Data) {
16269 return Data.value() == PoisonMaskElem ||
16270 (Data.index() < VF &&
16271 static_cast<int>(Data.index()) == Data.value());
16272 })) {
16275 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16276 << " for final shuffle of insertelement "
16277 "external users.\n";
16278 TEs.front()->dump();
16279 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16280 Cost += C;
16281 }
16282 } else {
16283 if (VF == 0) {
16284 if (TEs.front() &&
16285 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16286 VF = TEs.front()->getVectorFactor();
16287 else
16288 VF = Mask.size();
16289 }
16290 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16293 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16294 << " for final shuffle of vector node and external "
16295 "insertelement users.\n";
16296 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16297 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16298 Cost += C;
16299 }
16300 VF = Mask.size();
16301 return TEs.back();
16302 };
16303 (void)performExtractsShuffleAction<const TreeEntry>(
16304 MutableArrayRef(Vector.data(), Vector.size()), Base,
16305 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16306 EstimateShufflesCost);
16307 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16308 cast<FixedVectorType>(
16309 ShuffledInserts[I].InsertElements.front()->getType()),
16310 DemandedElts[I],
16311 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16312 Cost -= InsertCost;
16313 }
16314
16315 // Add the cost for reduced value resize (if required).
16316 if (ReductionBitWidth != 0) {
16317 assert(UserIgnoreList && "Expected reduction tree.");
16318 const TreeEntry &E = *VectorizableTree.front();
16319 auto It = MinBWs.find(&E);
16320 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16321 unsigned SrcSize = It->second.first;
16322 unsigned DstSize = ReductionBitWidth;
16323 unsigned Opcode = Instruction::Trunc;
16324 if (SrcSize < DstSize) {
16325 bool IsArithmeticExtendedReduction =
16326 all_of(*UserIgnoreList, [](Value *V) {
16327 auto *I = cast<Instruction>(V);
16328 return is_contained({Instruction::Add, Instruction::FAdd,
16329 Instruction::Mul, Instruction::FMul,
16330 Instruction::And, Instruction::Or,
16331 Instruction::Xor},
16332 I->getOpcode());
16333 });
16334 if (IsArithmeticExtendedReduction)
16335 Opcode =
16336 Instruction::BitCast; // Handle it by getExtendedReductionCost
16337 else
16338 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16339 }
16340 if (Opcode != Instruction::BitCast) {
16341 auto *SrcVecTy =
16342 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16343 auto *DstVecTy =
16344 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16345 TTI::CastContextHint CCH = getCastContextHint(E);
16346 InstructionCost CastCost;
16347 switch (E.getOpcode()) {
16348 case Instruction::SExt:
16349 case Instruction::ZExt:
16350 case Instruction::Trunc: {
16351 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16352 CCH = getCastContextHint(*OpTE);
16353 break;
16354 }
16355 default:
16356 break;
16357 }
16358 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16360 Cost += CastCost;
16361 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16362 << " for final resize for reduction from " << SrcVecTy
16363 << " to " << DstVecTy << "\n";
16364 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16365 }
16366 }
16367 }
16368
16369 std::optional<InstructionCost> SpillCost;
16370 if (Cost < -SLPCostThreshold) {
16371 SpillCost = getSpillCost();
16372 Cost += *SpillCost;
16373 }
16374#ifndef NDEBUG
16375 SmallString<256> Str;
16376 {
16377 raw_svector_ostream OS(Str);
16378 OS << "SLP: Spill Cost = ";
16379 if (SpillCost)
16380 OS << *SpillCost;
16381 else
16382 OS << "<skipped>";
16383 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16384 << "SLP: Total Cost = " << Cost << ".\n";
16385 }
16386 LLVM_DEBUG(dbgs() << Str);
16387 if (ViewSLPTree)
16388 ViewGraph(this, "SLP" + F->getName(), false, Str);
16389#endif
16390
16391 return Cost;
16392}
16393
16394/// Tries to find extractelement instructions with constant indices from a
16395/// fixed vector type and gather such instructions into a group, which is
16396/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
16397/// attempt was successful, the matched scalars are replaced by poison values
16398/// in \p VL for future analysis.
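/// Illustrative example: for VL = {extract %v[0], %x, extract %v[2], %y} the
/// matched extracts are replaced by poison in \p VL and the resulting mask is
/// roughly {0, poison, 2, poison}, describing a single-source shuffle of %v.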
16399std::optional<TTI::ShuffleKind>
16400BoUpSLP::tryToGatherSingleRegisterExtractElements(
16401 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16402 // Scan list of gathered scalars for extractelements that can be represented
16403 // as shuffles.
16404 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16405 SmallVector<int> UndefVectorExtracts;
16406 for (int I = 0, E = VL.size(); I < E; ++I) {
16407 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16408 if (!EI) {
16409 if (isa<UndefValue>(VL[I]))
16410 UndefVectorExtracts.push_back(I);
16411 continue;
16412 }
16413 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16414 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16415 continue;
16416 std::optional<unsigned> Idx = getExtractIndex(EI);
16417 // Undefined index.
16418 if (!Idx) {
16419 UndefVectorExtracts.push_back(I);
16420 continue;
16421 }
16422 if (Idx >= VecTy->getNumElements()) {
16423 UndefVectorExtracts.push_back(I);
16424 continue;
16425 }
16426 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16427 ExtractMask.reset(*Idx);
16428 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16429 UndefVectorExtracts.push_back(I);
16430 continue;
16431 }
16432 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16433 }
16434 // Sort the vector operands by the maximum number of uses in extractelements.
16435 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16436 VectorOpToIdx.takeVector();
16437 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16438 return P1.second.size() > P2.second.size();
16439 });
16440 // Find the best pair of the vectors or a single vector.
16441 const int UndefSz = UndefVectorExtracts.size();
16442 unsigned SingleMax = 0;
16443 unsigned PairMax = 0;
16444 if (!Vectors.empty()) {
16445 SingleMax = Vectors.front().second.size() + UndefSz;
16446 if (Vectors.size() > 1) {
16447 auto *ItNext = std::next(Vectors.begin());
16448 PairMax = SingleMax + ItNext->second.size();
16449 }
16450 }
16451 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16452 return std::nullopt;
16453 // Check if better to perform a shuffle of 2 vectors or just of a single
16454 // vector.
16455 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16456 SmallVector<Value *> GatheredExtracts(
16457 VL.size(), PoisonValue::get(VL.front()->getType()));
16458 if (SingleMax >= PairMax && SingleMax) {
16459 for (int Idx : Vectors.front().second)
16460 std::swap(GatheredExtracts[Idx], VL[Idx]);
16461 } else if (!Vectors.empty()) {
16462 for (unsigned Idx : {0, 1})
16463 for (int Idx : Vectors[Idx].second)
16464 std::swap(GatheredExtracts[Idx], VL[Idx]);
16465 }
16466 // Add extracts from undefs too.
16467 for (int Idx : UndefVectorExtracts)
16468 std::swap(GatheredExtracts[Idx], VL[Idx]);
16469 // Check that gather of extractelements can be represented as just a
16470 // shuffle of a single/two vectors the scalars are extracted from.
16471 std::optional<TTI::ShuffleKind> Res =
16472 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16473 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16474 // TODO: try to check other subsets if possible.
16475 // Restore the original VL if attempt was not successful.
16476 copy(SavedVL, VL.begin());
16477 return std::nullopt;
16478 }
16479 // Restore unused scalars from mask, if some of the extractelements were not
16480 // selected for shuffle.
16481 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16482 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16483 isa<UndefValue>(GatheredExtracts[I])) {
16484 std::swap(VL[I], GatheredExtracts[I]);
16485 continue;
16486 }
16487 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16488 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16489 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16490 is_contained(UndefVectorExtracts, I))
16491 continue;
16492 }
16493 return Res;
16494}
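// Illustrative sketch (standard C++ only, not part of the pass): the single-
// versus two-vector choice made above. Per source vector we count how many
// gather lanes its extracts cover; extracts from undef count towards every
// candidate. The function name and inputs below are hypothetical.
#include <algorithm>
#include <cstddef>
#include <vector>
static bool preferSingleSourceSketch(std::vector<std::size_t> UsesPerVector,
                                     std::size_t UndefUses) {
  std::sort(UsesPerVector.rbegin(), UsesPerVector.rend());
  std::size_t SingleMax =
      UsesPerVector.empty() ? 0 : UsesPerVector.front() + UndefUses;
  std::size_t PairMax =
      UsesPerVector.size() > 1 ? SingleMax + UsesPerVector[1] : 0;
  // Mirrors "SingleMax >= PairMax && SingleMax": keep a single source when it
  // alone covers at least as many gather lanes as the best pair would.
  return SingleMax >= PairMax && SingleMax != 0;
}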
16495
16496/// Tries to find extractelement instructions with constant indices from a
16497/// fixed vector type and gathers such instructions into a group, which can
16498/// likely be recognized as a shuffle of 1 or 2 input vectors. If this attempt
16499/// was successful, the matched scalars are replaced by poison values in \p VL
16500/// for future analysis.
16501SmallVector<std::optional<TTI::ShuffleKind>>
16502BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16503 SmallVectorImpl<int> &Mask,
16504 unsigned NumParts) const {
16505 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16506 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16507 Mask.assign(VL.size(), PoisonMaskElem);
16508 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16509 for (unsigned Part : seq<unsigned>(NumParts)) {
16510 // Scan list of gathered scalars for extractelements that can be represented
16511 // as shuffles.
16512 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16513 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16514 SmallVector<int> SubMask;
16515 std::optional<TTI::ShuffleKind> Res =
16516 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16517 ShufflesRes[Part] = Res;
16518 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16519 }
16520 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16521 return Res.has_value();
16522 }))
16523 ShufflesRes.clear();
16524 return ShufflesRes;
16525}
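// Illustrative sketch (standard C++, hypothetical names): the per-part loop
// above walks VL in register-sized slices and stitches each per-part sub-mask
// back into the full mask at offset Part * SliceSize. This sketch only shows
// the slice/stitch pattern with a plain ceiling split; the real
// getPartNumElems/getNumElems helpers may choose slice sizes differently.
#include <algorithm>
#include <cstddef>
#include <vector>
static std::vector<std::vector<int>>
splitIntoPartsSketch(std::size_t Size, std::size_t NumParts) {
  std::size_t SliceSize = (Size + NumParts - 1) / NumParts; // plain ceil split
  std::vector<std::vector<int>> Parts(NumParts);
  for (std::size_t Part = 0; Part < NumParts; ++Part) {
    std::size_t Begin = Part * SliceSize;
    std::size_t End = std::min(Size, Begin + SliceSize);
    for (std::size_t I = Begin; I < End; ++I)
      Parts[Part].push_back(static_cast<int>(I)); // indices of this slice
  }
  return Parts;
}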
16526
16527std::optional<TargetTransformInfo::ShuffleKind>
16528BoUpSLP::isGatherShuffledSingleRegisterEntry(
16529 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16530 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16531 Entries.clear();
16532 // TODO: currently checking only for Scalars in the tree entry, need to count
16533 // reused elements too for better cost estimation.
16534 auto GetUserEntry = [&](const TreeEntry *TE) {
16535 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16536 TE = TE->UserTreeIndex.UserTE;
16537 if (TE == VectorizableTree.front().get())
16538 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16539 return TE->UserTreeIndex;
16540 };
16541 auto HasGatherUser = [&](const TreeEntry *TE) {
16542 while (TE->Idx != 0 && TE->UserTreeIndex) {
16543 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16544 return true;
16545 TE = TE->UserTreeIndex.UserTE;
16546 }
16547 return false;
16548 };
16549 const EdgeInfo TEUseEI = GetUserEntry(TE);
16550 if (!TEUseEI)
16551 return std::nullopt;
16552 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16553 const BasicBlock *TEInsertBlock = nullptr;
16554 // Main node of PHI entries keeps the correct order of operands/incoming
16555 // blocks.
16556 if (auto *PHI = dyn_cast_or_null<PHINode>(
16557 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16558 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16559 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16560 TEInsertPt = TEInsertBlock->getTerminator();
16561 } else {
16562 TEInsertBlock = TEInsertPt->getParent();
16563 }
16564 if (!DT->isReachableFromEntry(TEInsertBlock))
16565 return std::nullopt;
16566 auto *NodeUI = DT->getNode(TEInsertBlock);
16567 assert(NodeUI && "Should only process reachable instructions");
16568 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16569 auto CheckOrdering = [&](const Instruction *InsertPt) {
16570 // Argument InsertPt is an instruction where vector code for some other
16571 // tree entry (one that shares one or more scalars with TE) is going to be
16572 // generated. This lambda returns true if insertion point of vector code
16573 // for the TE dominates that point (otherwise dependency is the other way
16574 // around). The other node is not limited to be of a gather kind. Gather
16575 // nodes are not scheduled and their vector code is inserted before their
16576 // first user. If user is PHI, that is supposed to be at the end of a
16577 // predecessor block. Otherwise it is the last instruction among scalars of
16578 // the user node. So, instead of checking dependency between instructions
16579 // themselves, we check dependency between their insertion points for vector
16580 // code (since each scalar instruction ends up as a lane of a vector
16581 // instruction).
16582 const BasicBlock *InsertBlock = InsertPt->getParent();
16583 auto *NodeEUI = DT->getNode(InsertBlock);
16584 if (!NodeEUI)
16585 return false;
16586 assert((NodeUI == NodeEUI) ==
16587 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16588 "Different nodes should have different DFS numbers");
16589 // Check the order of the gather nodes users.
16590 if (TEInsertPt->getParent() != InsertBlock &&
16591 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16592 return false;
16593 if (TEInsertPt->getParent() == InsertBlock &&
16594 TEInsertPt->comesBefore(InsertPt))
16595 return false;
16596 return true;
16597 };
16598 // Find all tree entries used by the gathered values. If no common entries
16599 // found - not a shuffle.
16600 // Here we build a set of tree nodes for each gathered value and try to
16601 // find the intersection between these sets. If we have at least one common
16602 // tree node for each gathered value - we have just a permutation of the
16603 // single vector. If we have 2 different sets, we're in a situation where we
16604 // have a permutation of 2 input vectors.
16605 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16606 SmallDenseMap<Value *, int> UsedValuesEntry;
16607 SmallPtrSet<const Value *, 16> VisitedValue;
16608 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16609 // The node is reused - exit.
16610 if ((TEPtr->getVectorFactor() != VL.size() &&
16611 TEPtr->Scalars.size() != VL.size()) ||
16612 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16613 return false;
16614 UsedTEs.clear();
16615 UsedTEs.emplace_back().insert(TEPtr);
16616 for (Value *V : VL) {
16617 if (isConstant(V))
16618 continue;
16619 UsedValuesEntry.try_emplace(V, 0);
16620 }
16621 return true;
16622 };
16623 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16624 unsigned EdgeIdx) {
16625 const TreeEntry *Ptr1 = User1;
16626 const TreeEntry *Ptr2 = User2;
16627 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16628 while (Ptr2) {
16629 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16630 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16631 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16632 }
16633 while (Ptr1) {
16634 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16635 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16636 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16637 return Idx < It->second;
16638 }
16639 return false;
16640 };
16641 for (Value *V : VL) {
16642 if (isConstant(V) || !VisitedValue.insert(V).second)
16643 continue;
16644 // Build a list of tree entries where V is used.
16645 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16646 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16647 if (TEPtr == TE || TEPtr->Idx == 0)
16648 continue;
16649 assert(any_of(TEPtr->Scalars,
16650 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16651 "Must contain at least single gathered value.");
16652 assert(TEPtr->UserTreeIndex &&
16653 "Expected only single user of a gather node.");
16654 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16655
16656 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16657 UseEI.UserTE->hasState())
16658 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16659 : nullptr;
16660 Instruction *InsertPt =
16661 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16662 : &getLastInstructionInBundle(UseEI.UserTE);
16663 if (TEInsertPt == InsertPt) {
16664 // Check nodes, which might be emitted first.
16665 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16666 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16667 TEUseEI.UserTE->isAltShuffle()) &&
16668 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16669 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16670 (UseEI.UserTE->hasState() &&
16671 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16672 !UseEI.UserTE->isAltShuffle()) ||
16673 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16674 continue;
16675 }
16676
16677 // If the schedulable insertion point is used in multiple entries - just
16678 // exit, no known ordering at this point, available only after real
16679 // scheduling.
16680 if (!doesNotNeedToBeScheduled(InsertPt) &&
16681 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16682 continue;
16683 // If the users are the PHI nodes with the same incoming blocks - skip.
16684 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16685 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16686 UseEI.UserTE->State == TreeEntry::Vectorize &&
16687 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16688 TEUseEI.UserTE != UseEI.UserTE)
16689 continue;
16690 // If 2 gathers are operands of the same entry (regardless of whether
16691 // user is PHI or else), compare operands indices, use the earlier one
16692 // as the base.
16693 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16694 continue;
16695 // If the user instruction is used for some reason in different
16696 // vectorized nodes - make it depend on index.
16697 if (TEUseEI.UserTE != UseEI.UserTE &&
16698 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16699 HasGatherUser(TEUseEI.UserTE)))
16700 continue;
16701 // If the user node is the operand of the other user node - skip.
16702 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16703 continue;
16704 }
16705
16706 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16707 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16708 UseEI.UserTE->doesNotNeedToSchedule() &&
16709 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16710 continue;
16711 // Check if the user node of the TE comes after user node of TEPtr,
16712 // otherwise TEPtr depends on TE.
16713 if ((TEInsertBlock != InsertPt->getParent() ||
16714 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16715 !CheckOrdering(InsertPt))
16716 continue;
16717 // The node is reused - exit.
16718 if (CheckAndUseSameNode(TEPtr))
16719 break;
16720 VToTEs.insert(TEPtr);
16721 }
16722 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16723 const auto *It = find_if(
16724 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16725 if (It != VTEs.end()) {
16726 const TreeEntry *VTE = *It;
16727 if (none_of(TE->CombinedEntriesWithIndices,
16728 [&](const auto &P) { return P.first == VTE->Idx; })) {
16729 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16730 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16731 continue;
16732 }
16733 // The node is reused - exit.
16734 if (CheckAndUseSameNode(VTE))
16735 break;
16736 VToTEs.insert(VTE);
16737 }
16738 }
16739 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16740 const TreeEntry *VTE = VTEs.front();
16741 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16742 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16743 VTEs = VTEs.drop_front();
16744 // Iterate through all vectorized nodes.
16745 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16746 return MTE->State == TreeEntry::Vectorize;
16747 });
16748 if (MIt == VTEs.end())
16749 continue;
16750 VTE = *MIt;
16751 }
16752 if (none_of(TE->CombinedEntriesWithIndices,
16753 [&](const auto &P) { return P.first == VTE->Idx; })) {
16754 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16755 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16756 continue;
16757 }
16758 // The node is reused - exit.
16759 if (CheckAndUseSameNode(VTE))
16760 break;
16761 VToTEs.insert(VTE);
16762 }
16763 if (VToTEs.empty())
16764 continue;
16765 if (UsedTEs.empty()) {
16766 // The first iteration, just insert the list of nodes to vector.
16767 UsedTEs.push_back(VToTEs);
16768 UsedValuesEntry.try_emplace(V, 0);
16769 } else {
16770 // Need to check if there are any previously used tree nodes which use V.
16771 // If there are no such nodes, consider that we have another input
16772 // vector.
16773 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16774 unsigned Idx = 0;
16775 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16776 // Do we have a non-empty intersection of previously listed tree entries
16777 // and tree entries using current V?
16778 set_intersect(VToTEs, Set);
16779 if (!VToTEs.empty()) {
16780 // Yes, write the new subset and continue analysis for the next
16781 // scalar.
16782 Set.swap(VToTEs);
16783 break;
16784 }
16785 VToTEs = SavedVToTEs;
16786 ++Idx;
16787 }
16788 // No non-empty intersection found - need to add a second set of possible
16789 // source vectors.
16790 if (Idx == UsedTEs.size()) {
16791 // If the number of input vectors is greater than 2 - not a permutation,
16792 // fallback to the regular gather.
16793 // TODO: support multiple reshuffled nodes.
16794 if (UsedTEs.size() == 2)
16795 continue;
16796 UsedTEs.push_back(SavedVToTEs);
16797 Idx = UsedTEs.size() - 1;
16798 }
16799 UsedValuesEntry.try_emplace(V, Idx);
16800 }
16801 }
16802
16803 if (UsedTEs.empty()) {
16804 Entries.clear();
16805 return std::nullopt;
16806 }
16807
16808 unsigned VF = 0;
16809 if (UsedTEs.size() == 1) {
16810 // Keep the order to avoid non-determinism.
16811 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
16812 UsedTEs.front().end());
16813 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16814 return TE1->Idx < TE2->Idx;
16815 });
16816 // Try to find the perfect match in another gather node at first.
16817 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
16818 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
16819 });
16820 if (It != FirstEntries.end() &&
16821 ((*It)->getVectorFactor() == VL.size() ||
16822 ((*It)->getVectorFactor() == TE->Scalars.size() &&
16823 TE->ReuseShuffleIndices.size() == VL.size() &&
16824 (*It)->isSame(TE->Scalars)))) {
16825 Entries.push_back(*It);
16826 if ((*It)->getVectorFactor() == VL.size()) {
16827 std::iota(std::next(Mask.begin(), Part * VL.size()),
16828 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
16829 } else {
16830 SmallVector<int> CommonMask = TE->getCommonMask();
16831 copy(CommonMask, Mask.begin());
16832 }
16833 // Clear undef scalars.
16834 for (unsigned I : seq<unsigned>(VL.size()))
16835 if (isa<PoisonValue>(VL[I]))
16836 Mask[Part * VL.size() + I] = PoisonMaskElem;
16837 return TargetTransformInfo::SK_PermuteSingleSrc;
16838 }
16839 // No perfect match, just shuffle, so choose the first tree node from the
16840 // tree.
16841 Entries.push_back(FirstEntries.front());
16842 // Update mapping between values and corresponding tree entries.
16843 for (auto &P : UsedValuesEntry)
16844 P.second = 0;
16845 VF = FirstEntries.front()->getVectorFactor();
16846 } else {
16847 // Try to find nodes with the same vector factor.
16848 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
16849 // Keep the order of tree nodes to avoid non-determinism.
16850 DenseMap<unsigned, const TreeEntry *> VFToTE;
16851 for (const TreeEntry *TE : UsedTEs.front()) {
16852 unsigned VF = TE->getVectorFactor();
16853 auto It = VFToTE.find(VF);
16854 if (It != VFToTE.end()) {
16855 if (It->second->Idx > TE->Idx)
16856 It->getSecond() = TE;
16857 continue;
16858 }
16859 VFToTE.try_emplace(VF, TE);
16860 }
16861 // Same, keep the order to avoid non-determinism.
16862 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
16863 UsedTEs.back().end());
16864 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16865 return TE1->Idx < TE2->Idx;
16866 });
16867 for (const TreeEntry *TE : SecondEntries) {
16868 auto It = VFToTE.find(TE->getVectorFactor());
16869 if (It != VFToTE.end()) {
16870 VF = It->first;
16871 Entries.push_back(It->second);
16872 Entries.push_back(TE);
16873 break;
16874 }
16875 }
16876 // No 2 source vectors with the same vector factor - just choose 2 with max
16877 // index.
16878 if (Entries.empty()) {
16879 Entries.push_back(*llvm::max_element(
16880 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
16881 return TE1->Idx < TE2->Idx;
16882 }));
16883 Entries.push_back(SecondEntries.front());
16884 VF = std::max(Entries.front()->getVectorFactor(),
16885 Entries.back()->getVectorFactor());
16886 } else {
16887 VF = Entries.front()->getVectorFactor();
16888 }
16889 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
16890 for (const TreeEntry *E : Entries)
16891 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
16892 E->Scalars.end());
16893 // Update mapping between values and corresponding tree entries.
16894 for (auto &P : UsedValuesEntry) {
16895 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
16896 if (ValuesToEntries[Idx].contains(P.first)) {
16897 P.second = Idx;
16898 break;
16899 }
16900 }
16901 }
16902
16903 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
16904 // Checks if the 2 PHIs are compatible in terms of high possibility to be
16905 // vectorized.
16906 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
16907 auto *PHI = cast<PHINode>(V);
16908 auto *PHI1 = cast<PHINode>(V1);
16909 // Check that all incoming values are compatible/from same parent (if they
16910 // are instructions).
16911 // The incoming values are compatible if they all are constants, or
16912 // instruction with the same/alternate opcodes from the same basic block.
16913 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
16914 Value *In = PHI->getIncomingValue(I);
16915 Value *In1 = PHI1->getIncomingValue(I);
16916 if (isConstant(In) && isConstant(In1))
16917 continue;
16918 if (!getSameOpcode({In, In1}, *TLI))
16919 return false;
16920 if (cast<Instruction>(In)->getParent() !=
16921 cast<Instruction>(In1)->getParent())
16922 return false;
16923 }
16924 return true;
16925 };
16926 // Check if the value can be ignored during analysis for shuffled gathers.
16927 // We assume it is better to ignore instructions which do not form splats,
16928 // are not vectorized/not extractelements (these instructions will be handled
16929 // by the extractelements processing) or may form a vector node in the future.
16930 auto MightBeIgnored = [=](Value *V) {
16931 auto *I = dyn_cast<Instruction>(V);
16932 return I && !IsSplatOrUndefs && !isVectorized(I) &&
16933 !isVectorLikeInstWithConstOps(I) &&
16934 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
16935 };
16936 // Check that the neighbor instruction may form a full vector node with the
16937 // current instruction V. It is possible, if they have same/alternate opcode
16938 // and same parent basic block.
16939 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
16940 Value *V1 = VL[Idx];
16941 bool UsedInSameVTE = false;
16942 auto It = UsedValuesEntry.find(V1);
16943 if (It != UsedValuesEntry.end())
16944 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
16945 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
16946 getSameOpcode({V, V1}, *TLI) &&
16947 cast<Instruction>(V)->getParent() ==
16948 cast<Instruction>(V1)->getParent() &&
16949 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
16950 };
16951 // Build a shuffle mask for better cost estimation and vector emission.
16952 SmallBitVector UsedIdxs(Entries.size());
16953 SmallVector<std::pair<unsigned, int>> EntryLanes;
16954 for (int I = 0, E = VL.size(); I < E; ++I) {
16955 Value *V = VL[I];
16956 auto It = UsedValuesEntry.find(V);
16957 if (It == UsedValuesEntry.end())
16958 continue;
16959 // Do not try to shuffle scalars, if they are constants, or instructions
16960 // that can be vectorized as a result of the following vector build
16961 // vectorization.
16962 if (isConstant(V) || (MightBeIgnored(V) &&
16963 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
16964 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
16965 continue;
16966 unsigned Idx = It->second;
16967 EntryLanes.emplace_back(Idx, I);
16968 UsedIdxs.set(Idx);
16969 }
16970 // Iterate through all shuffled scalars and select entries, which can be used
16971 // for final shuffle.
16972 SmallVector<const TreeEntry *> TempEntries;
16973 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
16974 if (!UsedIdxs.test(I))
16975 continue;
16976 // Fix the entry number for the given scalar. If it is the first entry, set
16977 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
16978 // These indices are used when calculating final shuffle mask as the vector
16979 // offset.
16980 for (std::pair<unsigned, int> &Pair : EntryLanes)
16981 if (Pair.first == I)
16982 Pair.first = TempEntries.size();
16983 TempEntries.push_back(Entries[I]);
16984 }
16985 Entries.swap(TempEntries);
16986 if (EntryLanes.size() == Entries.size() &&
16987 !VL.equals(ArrayRef(TE->Scalars)
16988 .slice(Part * VL.size(),
16989 std::min<int>(VL.size(), TE->Scalars.size())))) {
16990 // We may have here 1 or 2 entries only. If the number of scalars is equal
16991 // to the number of entries, no need to do the analysis, it is not very
16992 // profitable. Since VL is not the same as TE->Scalars, it means we already
16993 // have some shuffles before. Cut off not profitable case.
16994 Entries.clear();
16995 return std::nullopt;
16996 }
16997 // Build the final mask, check for the identity shuffle, if possible.
16998 bool IsIdentity = Entries.size() == 1;
16999 // Pair.first is the offset to the vector, while Pair.second is the index of
17000 // scalar in the list.
17001 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17002 unsigned Idx = Part * VL.size() + Pair.second;
17003 Mask[Idx] =
17004 Pair.first * VF +
17005 (ForOrder ? std::distance(
17006 Entries[Pair.first]->Scalars.begin(),
17007 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17008 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17009 IsIdentity &= Mask[Idx] == Pair.second;
17010 }
17011 if (ForOrder || IsIdentity || Entries.empty()) {
17012 switch (Entries.size()) {
17013 case 1:
17014 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17015 return TargetTransformInfo::SK_PermuteSingleSrc;
17016 break;
17017 case 2:
17018 if (EntryLanes.size() > 2 || VL.size() <= 2)
17019 return TargetTransformInfo::SK_PermuteTwoSrc;
17020 break;
17021 default:
17022 break;
17023 }
17024 } else if (!isa<VectorType>(VL.front()->getType()) &&
17025 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17026 // Do the cost estimation to check if the shuffle is more beneficial than a buildvector.
17027 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17028 std::next(Mask.begin(), (Part + 1) * VL.size()));
17029 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17030 for (int Idx : SubMask) {
17031 if (Idx == PoisonMaskElem)
17032 continue;
17033 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17034 MinElement = Idx;
17035 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17036 MaxElement = Idx;
17037 }
17038 assert(MaxElement >= 0 && MinElement >= 0 &&
17039 MaxElement % VF >= MinElement % VF &&
17040 "Expected at least single element.");
17041 unsigned NewVF = std::max<unsigned>(
17042 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17043 (MaxElement % VF) -
17044 (MinElement % VF) + 1));
17045 if (NewVF < VF) {
17046 for (int &Idx : SubMask) {
17047 if (Idx == PoisonMaskElem)
17048 continue;
17049 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17050 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17051 }
17052 } else {
17053 NewVF = VF;
17054 }
17055
17056 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17057 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17058 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17059 auto GetShuffleCost = [&,
17060 &TTI = *TTI](ArrayRef<int> Mask,
17061 ArrayRef<const TreeEntry *> Entries,
17062 VectorType *VecTy) -> InstructionCost {
17063 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17064 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17065 Mask, Entries.front()->getInterleaveFactor()))
17066 return TTI::TCC_Free;
17067 return ::getShuffleCost(TTI,
17068 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17069 : TTI::SK_PermuteSingleSrc,
17070 VecTy, Mask, CostKind);
17071 };
17072 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17073 InstructionCost FirstShuffleCost = 0;
17074 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17075 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17076 FirstShuffleCost = ShuffleCost;
17077 } else {
17078 // Transform mask to include only first entry.
17079 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17080 bool IsIdentity = true;
17081 for (auto [I, Idx] : enumerate(FirstMask)) {
17082 if (Idx >= static_cast<int>(NewVF)) {
17083 Idx = PoisonMaskElem;
17084 } else {
17085 DemandedElts.clearBit(I);
17086 if (Idx != PoisonMaskElem)
17087 IsIdentity &= static_cast<int>(I) == Idx;
17088 }
17089 }
17090 if (!IsIdentity)
17091 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17092 FirstShuffleCost += getScalarizationOverhead(
17093 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17094 /*Extract=*/false, CostKind);
17095 }
17096 InstructionCost SecondShuffleCost = 0;
17097 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17098 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17099 SecondShuffleCost = ShuffleCost;
17100 } else {
17101 // Transform mask to include only the second entry.
17102 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17103 bool IsIdentity = true;
17104 for (auto [I, Idx] : enumerate(SecondMask)) {
17105 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17106 Idx = PoisonMaskElem;
17107 } else {
17108 DemandedElts.clearBit(I);
17109 if (Idx != PoisonMaskElem) {
17110 Idx -= NewVF;
17111 IsIdentity &= static_cast<int>(I) == Idx;
17112 }
17113 }
17114 }
17115 if (!IsIdentity)
17116 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17117 SecondShuffleCost += getScalarizationOverhead(
17118 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17119 /*Extract=*/false, CostKind);
17120 }
17121 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17122 for (auto [I, Idx] : enumerate(SubMask))
17123 if (Idx == PoisonMaskElem)
17124 DemandedElts.clearBit(I);
17125 InstructionCost BuildVectorCost = getScalarizationOverhead(
17126 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17127 /*Extract=*/false, CostKind);
17128 const TreeEntry *BestEntry = nullptr;
17129 if (FirstShuffleCost < ShuffleCost) {
17130 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17131 std::next(Mask.begin(), (Part + 1) * VL.size()),
17132 [&](int &Idx) {
17133 if (Idx >= static_cast<int>(VF))
17134 Idx = PoisonMaskElem;
17135 });
17136 BestEntry = Entries.front();
17137 ShuffleCost = FirstShuffleCost;
17138 }
17139 if (SecondShuffleCost < ShuffleCost) {
17140 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17141 std::next(Mask.begin(), (Part + 1) * VL.size()),
17142 [&](int &Idx) {
17143 if (Idx < static_cast<int>(VF))
17144 Idx = PoisonMaskElem;
17145 else
17146 Idx -= VF;
17147 });
17148 BestEntry = Entries[1];
17149 ShuffleCost = SecondShuffleCost;
17150 }
17151 if (BuildVectorCost >= ShuffleCost) {
17152 if (BestEntry) {
17153 Entries.clear();
17154 Entries.push_back(BestEntry);
17155 }
17156 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17157 : TargetTransformInfo::SK_PermuteSingleSrc;
17158 }
17159 }
17160 Entries.clear();
17161 // Clear the corresponding mask elements.
17162 std::fill(std::next(Mask.begin(), Part * VL.size()),
17163 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17164 return std::nullopt;
17165}
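// Illustrative sketch (standard C++, not part of the pass): how the final mask
// entries above are encoded. Each selected lane stores "entry offset * VF +
// lane within that entry", so up to two source entries share one flat shuffle
// mask. The per-part offset (Part * VL.size()) is omitted here; names are
// hypothetical.
#include <cstddef>
#include <utility>
#include <vector>
static std::vector<int>
buildFinalMaskSketch(std::size_t VLSize, std::size_t VF,
                     const std::vector<std::pair<unsigned, int>> &EntryLanes,
                     const std::vector<int> &LaneInEntry) {
  std::vector<int> Mask(VLSize, -1); // -1 stands in for PoisonMaskElem
  for (std::size_t K = 0; K < EntryLanes.size(); ++K) {
    unsigned EntryIdx = EntryLanes[K].first; // which of the (at most 2) entries
    int Lane = EntryLanes[K].second;         // position of the scalar in VL
    Mask[Lane] = static_cast<int>(EntryIdx * VF) + LaneInEntry[K];
  }
  return Mask;
}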
17166
17167SmallVector<std::optional<TTI::ShuffleKind>>
17168BoUpSLP::isGatherShuffledEntry(
17169 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17170 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17171 bool ForOrder) {
17172 assert(NumParts > 0 && NumParts < VL.size() &&
17173 "Expected positive number of registers.");
17174 Entries.clear();
17175 // No need to check for the topmost gather node.
17176 if (TE == VectorizableTree.front().get() &&
17177 (!GatheredLoadsEntriesFirst.has_value() ||
17178 none_of(ArrayRef(VectorizableTree).drop_front(),
17179 [](const std::unique_ptr<TreeEntry> &TE) {
17180 return !TE->isGather();
17181 })))
17182 return {};
17183 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17184 // implemented yet.
17185 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17186 return {};
17187 Mask.assign(VL.size(), PoisonMaskElem);
17188 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17189 "Expected only single user of the gather node.");
17190 assert(VL.size() % NumParts == 0 &&
17191 "Number of scalars must be divisible by NumParts.");
17192 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17193 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17194 (TE->Idx == 0 ||
17195 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17196 isSplat(TE->Scalars) ||
17197 (TE->hasState() &&
17198 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17199 return {};
17200 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17201 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17202 for (unsigned Part : seq<unsigned>(NumParts)) {
17203 ArrayRef<Value *> SubVL =
17204 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17205 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17206 std::optional<TTI::ShuffleKind> SubRes =
17207 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17208 ForOrder);
17209 if (!SubRes)
17210 SubEntries.clear();
17211 Res.push_back(SubRes);
17212 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17213 SubEntries.front()->getVectorFactor() == VL.size() &&
17214 (SubEntries.front()->isSame(TE->Scalars) ||
17215 SubEntries.front()->isSame(VL))) {
17216 SmallVector<const TreeEntry *> LocalSubEntries;
17217 LocalSubEntries.swap(SubEntries);
17218 Entries.clear();
17219 Res.clear();
17220 std::iota(Mask.begin(), Mask.end(), 0);
17221 // Clear undef scalars.
17222 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17223 if (isa<PoisonValue>(VL[I]))
17224 Mask[I] = PoisonMaskElem;
17225 Entries.emplace_back(1, LocalSubEntries.front());
17226 Res.push_back(TTI::SK_PermuteSingleSrc);
17227 return Res;
17228 }
17229 }
17230 if (all_of(Res,
17231 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17232 Entries.clear();
17233 return {};
17234 }
17235 return Res;
17236}
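// Illustrative sketch (standard C++, not part of the pass): the early-out
// above that collapses all parts into a single identity permute when one tree
// entry already provides the whole vector; lanes holding poison scalars keep
// the poison marker (-1 here). Names are hypothetical.
#include <cstddef>
#include <vector>
static std::vector<int>
identityMaskWithPoisonSketch(const std::vector<bool> &IsPoisonLane) {
  std::vector<int> Mask(IsPoisonLane.size());
  for (std::size_t I = 0; I < Mask.size(); ++I)
    Mask[I] = IsPoisonLane[I] ? -1 : static_cast<int>(I);
  return Mask;
}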
17237
17238InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17239 Type *ScalarTy) const {
17240 const unsigned VF = VL.size();
17241 auto *VecTy = getWidenedType(ScalarTy, VF);
17242 // Find the cost of inserting/extracting values from the vector.
17243 // Check if the same elements are inserted several times and count them as
17244 // shuffle candidates.
17245 APInt DemandedElements = APInt::getZero(VF);
17246 InstructionCost Cost;
17247 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17248 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17249 DemandedElements.setBit(I);
17250 if (V->getType() != ScalarTy)
17251 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17252 TTI::CastContextHint::None, CostKind);
17253 };
17254 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17255 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17256 for (auto [I, V] : enumerate(VL)) {
17257 // No need to shuffle duplicates for constants.
17258 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17259 continue;
17260
17261 if (isConstant(V)) {
17262 ConstantShuffleMask[I] = I + VF;
17263 continue;
17264 }
17265 EstimateInsertCost(I, V);
17266 }
17267 // FIXME: add a cost for constant vector materialization.
17268 bool IsAnyNonUndefConst =
17269 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17270 // 1. Shuffle input source vector and constant vector.
17271 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17272 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
17273 ConstantShuffleMask);
17274 }
17275
17276 // 2. Insert unique non-constants.
17277 if (!DemandedElements.isZero())
17278 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17279 /*Insert=*/true,
17280 /*Extract=*/false, CostKind,
17281 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17282 return Cost;
17283}
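// Illustrative sketch (standard C++, not part of the pass): the lane
// classification used above. Non-undef constants are blended in via a
// two-source shuffle mask (lane I + VF), while unique non-constant scalars are
// marked as demanded insertelement lanes; the ForPoisonSrc special case and the
// extra cast cost for mismatched types are omitted. Names are hypothetical.
#include <cstddef>
#include <vector>
struct GatherLanesSketch {
  std::vector<int> ConstantShuffleMask; // I to keep a lane, I + VF for a const
  std::vector<bool> DemandedElements;   // lanes that need an insertelement
};
static GatherLanesSketch
classifyGatherLanesSketch(const std::vector<int> &Kind) {
  // Kind[I]: 0 = undef/poison, 1 = non-undef constant, 2 = non-constant scalar.
  const int VF = static_cast<int>(Kind.size());
  GatherLanesSketch R;
  R.DemandedElements.assign(Kind.size(), false);
  for (int I = 0; I < VF; ++I) {
    R.ConstantShuffleMask.push_back(I); // identity lane by default
    if (Kind[I] == 1)
      R.ConstantShuffleMask[I] = I + VF; // take the lane from the const vector
    else if (Kind[I] == 2)
      R.DemandedElements[I] = true; // pays for inserting this scalar
  }
  return R;
}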
17284
17285Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17286 auto It = EntryToLastInstruction.find(E);
17287 if (It != EntryToLastInstruction.end())
17288 return *cast<Instruction>(It->second);
17289 Instruction *Res = nullptr;
17290 // Get the basic block this bundle is in. All instructions in the bundle
17291 // should be in this block (except for extractelement-like instructions with
17292 // constant indices or gathered loads or copyables).
17293 Instruction *Front;
17294 unsigned Opcode;
17295 if (E->hasState()) {
17296 Front = E->getMainOp();
17297 Opcode = E->getOpcode();
17298 } else {
17299 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17300 Opcode = Front->getOpcode();
17301 }
17302 auto *BB = Front->getParent();
17303 assert(
17304 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17305 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17306 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17307 all_of(E->Scalars,
17308 [=](Value *V) -> bool {
17309 if (Opcode == Instruction::GetElementPtr &&
17310 !isa<GetElementPtrInst>(V))
17311 return true;
17312 auto *I = dyn_cast<Instruction>(V);
17313 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17314 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17315 })) &&
17316 "Expected gathered loads or GEPs or instructions from same basic "
17317 "block.");
17318
17319 auto FindLastInst = [&]() {
17320 Instruction *LastInst = Front;
17321 for (Value *V : E->Scalars) {
17322 auto *I = dyn_cast<Instruction>(V);
17323 if (!I)
17324 continue;
17325 if (E->isCopyableElement(I))
17326 continue;
17327 if (LastInst->getParent() == I->getParent()) {
17328 if (LastInst->comesBefore(I))
17329 LastInst = I;
17330 continue;
17331 }
17332 assert(((Opcode == Instruction::GetElementPtr &&
17333 !isa<GetElementPtrInst>(I)) ||
17334 E->State == TreeEntry::SplitVectorize ||
17335 (isVectorLikeInstWithConstOps(LastInst) &&
17336 isVectorLikeInstWithConstOps(I)) ||
17337 (GatheredLoadsEntriesFirst.has_value() &&
17338 Opcode == Instruction::Load && E->isGather() &&
17339 E->Idx < *GatheredLoadsEntriesFirst)) &&
17340 "Expected vector-like or non-GEP in GEP node insts only.");
17341 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17342 LastInst = I;
17343 continue;
17344 }
17345 if (!DT->isReachableFromEntry(I->getParent()))
17346 continue;
17347 auto *NodeA = DT->getNode(LastInst->getParent());
17348 auto *NodeB = DT->getNode(I->getParent());
17349 assert(NodeA && "Should only process reachable instructions");
17350 assert(NodeB && "Should only process reachable instructions");
17351 assert((NodeA == NodeB) ==
17352 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17353 "Different nodes should have different DFS numbers");
17354 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17355 LastInst = I;
17356 }
17357 BB = LastInst->getParent();
17358 return LastInst;
17359 };
17360
17361 auto FindFirstInst = [&]() {
17362 Instruction *FirstInst = Front;
17363 for (Value *V : E->Scalars) {
17364 auto *I = dyn_cast<Instruction>(V);
17365 if (!I)
17366 continue;
17367 if (E->isCopyableElement(I))
17368 continue;
17369 if (FirstInst->getParent() == I->getParent()) {
17370 if (I->comesBefore(FirstInst))
17371 FirstInst = I;
17372 continue;
17373 }
17374 assert(((Opcode == Instruction::GetElementPtr &&
17375 !isa<GetElementPtrInst>(I)) ||
17376 (isVectorLikeInstWithConstOps(FirstInst) &&
17377 isVectorLikeInstWithConstOps(I))) &&
17378 "Expected vector-like or non-GEP in GEP node insts only.");
17379 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17380 FirstInst = I;
17381 continue;
17382 }
17383 if (!DT->isReachableFromEntry(I->getParent()))
17384 continue;
17385 auto *NodeA = DT->getNode(FirstInst->getParent());
17386 auto *NodeB = DT->getNode(I->getParent());
17387 assert(NodeA && "Should only process reachable instructions");
17388 assert(NodeB && "Should only process reachable instructions");
17389 assert((NodeA == NodeB) ==
17390 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17391 "Different nodes should have different DFS numbers");
17392 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17393 FirstInst = I;
17394 }
17395 return FirstInst;
17396 };
17397
17398 if (E->State == TreeEntry::SplitVectorize) {
17399 Res = FindLastInst();
17400 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17401 for (auto *E : Entries) {
17402 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17403 if (!I)
17404 I = &getLastInstructionInBundle(E);
17405 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17406 Res = I;
17407 }
17408 }
17409 EntryToLastInstruction.try_emplace(E, Res);
17410 return *Res;
17411 }
17412
17413 // Set the insert point for gathered loads to the very first load.
17414 if (GatheredLoadsEntriesFirst.has_value() &&
17415 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17416 Opcode == Instruction::Load) {
17417 Res = FindFirstInst();
17418 EntryToLastInstruction.try_emplace(E, Res);
17419 return *Res;
17420 }
17421
17422 // Set the insert point to the beginning of the basic block if the entry
17423 // should not be scheduled.
17424 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17425 if (E->isGather())
17426 return nullptr;
17427 // We previously found that the instructions do not need to be scheduled.
17428 const auto *It = BlocksSchedules.find(BB);
17429 if (It == BlocksSchedules.end())
17430 return nullptr;
17431 for (Value *V : E->Scalars) {
17432 auto *I = dyn_cast<Instruction>(V);
17433 if (!I || isa<PHINode>(I) ||
17434 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17435 continue;
17436 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17437 if (Bundles.empty())
17438 continue;
17439 const auto *It = find_if(
17440 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17441 if (It != Bundles.end())
17442 return *It;
17443 }
17444 return nullptr;
17445 };
17446 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17447 if (!E->isGather() && !Bundle) {
17448 if ((Opcode == Instruction::GetElementPtr &&
17449 any_of(E->Scalars,
17450 [](Value *V) {
17451 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17452 })) ||
17453 all_of(E->Scalars, [&](Value *V) {
17454 return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
17455 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17456 }))
17457 Res = FindLastInst();
17458 else
17459 Res = FindFirstInst();
17460 EntryToLastInstruction.try_emplace(E, Res);
17461 return *Res;
17462 }
17463
17464 // Find the last instruction. The common case should be that BB has been
17465 // scheduled, and the last instruction is VL.back(). So we start with
17466 // VL.back() and iterate over schedule data until we reach the end of the
17467 // bundle. The end of the bundle is marked by null ScheduleData.
17468 if (Bundle) {
17469 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17470 Res = Bundle->getBundle().back()->getInst();
17471 EntryToLastInstruction.try_emplace(E, Res);
17472 return *Res;
17473 }
17474
17475 // LastInst can still be null at this point if there's either not an entry
17476 // for BB in BlocksSchedules or there's no ScheduleData available for
17477 // VL.back(). This can be the case if buildTreeRec aborts for various
17478 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17479 // size is reached, etc.). ScheduleData is initialized in the scheduling
17480 // "dry-run".
17481 //
17482 // If this happens, we can still find the last instruction by brute force. We
17483 // iterate forwards from Front (inclusive) until we either see all
17484 // instructions in the bundle or reach the end of the block. If Front is the
17485 // last instruction in program order, LastInst will be set to Front, and we
17486 // will visit all the remaining instructions in the block.
17487 //
17488 // One of the reasons we exit early from buildTreeRec is to place an upper
17489 // bound on compile-time. Thus, taking an additional compile-time hit here is
17490 // not ideal. However, this should be exceedingly rare since it requires that
17491 // we both exit early from buildTreeRec and that the bundle be out-of-order
17492 // (causing us to iterate all the way to the end of the block).
17493 if (!Res)
17494 Res = FindLastInst();
17495 assert(Res && "Failed to find last instruction in bundle");
17496 EntryToLastInstruction.try_emplace(E, Res);
17497 return *Res;
17498}
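// Illustrative sketch (standard C++, hypothetical data): the "which instruction
// is last" choice made by FindLastInst above. Within one block, program order
// decides; across blocks, the instruction whose block has the larger
// dominator-tree DFS-in number is treated as later. The unreachable-block
// special cases are omitted.
struct InstPosSketch {
  int Block;     // stand-in for the parent basic block
  int OrderInBB; // stand-in for Instruction::comesBefore() ordering
  int DFSNumIn;  // stand-in for the dominator-tree DFS-in number of the block
};
static InstPosSketch laterOfSketch(InstPosSketch A, InstPosSketch B) {
  if (A.Block == B.Block)                 // same block: program order decides
    return A.OrderInBB < B.OrderInBB ? B : A;
  return A.DFSNumIn < B.DFSNumIn ? B : A; // different blocks: later DFS-in wins
}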
17499
17500void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17501 auto *Front = E->getMainOp();
17502 Instruction *LastInst = &getLastInstructionInBundle(E);
17503 assert(LastInst && "Failed to find last instruction in bundle");
17504 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17505 // If the instruction is PHI, set the insert point after all the PHIs.
17506 bool IsPHI = isa<PHINode>(LastInst);
17507 if (IsPHI) {
17508 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17509 if (LastInstIt != LastInst->getParent()->end() &&
17510 LastInstIt->getParent()->isLandingPad())
17511 LastInstIt = std::next(LastInstIt);
17512 }
17513 if (IsPHI ||
17514 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17515 E->doesNotNeedToSchedule()) ||
17516 (GatheredLoadsEntriesFirst.has_value() &&
17517 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17518 E->getOpcode() == Instruction::Load)) {
17519 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17520 } else {
17521 // Set the insertion point after the last instruction in the bundle. Set the
17522 // debug location to Front.
17523 Builder.SetInsertPoint(
17524 LastInst->getParent(),
17525 LastInst->getNextNode()->getIterator());
17526 }
17527 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17528}
17529
17530Value *BoUpSLP::gather(
17531 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17532 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17533 // List of instructions/lanes from current block and/or the blocks which are
17534 // part of the current loop. These instructions will be inserted at the end to
17535 // make it possible to optimize loops and hoist invariant instructions out of
17536 // the loop's body with better chances for success.
17537 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17538 SmallSet<int, 4> PostponedIndices;
17539 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17540 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17541 SmallPtrSet<BasicBlock *, 4> Visited;
17542 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17543 InsertBB = InsertBB->getSinglePredecessor();
17544 return InsertBB && InsertBB == InstBB;
17545 };
17546 for (int I = 0, E = VL.size(); I < E; ++I) {
17547 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17548 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17549 isVectorized(Inst) ||
17550 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17551 PostponedIndices.insert(I).second)
17552 PostponedInsts.emplace_back(Inst, I);
17553 }
17554
17555 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17556 Type *Ty) {
17557 Value *Scalar = V;
17558 if (Scalar->getType() != Ty) {
17559 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17560 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17561 Value *V = Scalar;
17562 if (auto *CI = dyn_cast<CastInst>(Scalar);
17563 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
17564 Value *Op = CI->getOperand(0);
17565 if (auto *IOp = dyn_cast<Instruction>(Op);
17566 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17567 V = Op;
17568 }
17569 Scalar = Builder.CreateIntCast(
17570 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17571 }
17572
17573 Instruction *InsElt;
17574 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17575 assert(SLPReVec && "FixedVectorType is not expected.");
17576 Vec =
17577 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17578 auto *II = dyn_cast<Instruction>(Vec);
17579 if (!II)
17580 return Vec;
17581 InsElt = II;
17582 } else {
17583 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17584 InsElt = dyn_cast<InsertElementInst>(Vec);
17585 if (!InsElt)
17586 return Vec;
17587 }
17588 GatherShuffleExtractSeq.insert(InsElt);
17589 CSEBlocks.insert(InsElt->getParent());
17590 // Add to our 'need-to-extract' list.
17591 if (isa<Instruction>(V)) {
17592 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17593 // Find which lane we need to extract.
17594 User *UserOp = nullptr;
17595 if (Scalar != V) {
17596 if (auto *SI = dyn_cast<Instruction>(Scalar))
17597 UserOp = SI;
17598 } else {
17599 if (V->getType()->isVectorTy()) {
17600 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17601 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17602 // Find shufflevector, caused by resize.
17603 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17604 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17605 if (SV->getOperand(0) == V)
17606 return SV;
17607 if (SV->getOperand(1) == V)
17608 return SV;
17609 }
17610 return nullptr;
17611 };
17612 InsElt = nullptr;
17613 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17614 InsElt = User;
17615 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17616 InsElt = User;
17617 assert(InsElt &&
17618 "Failed to find shufflevector, caused by resize.");
17619 }
17620 }
17621 UserOp = InsElt;
17622 }
17623 if (UserOp) {
17624 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17625 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17626 }
17627 }
17628 }
17629 return Vec;
17630 };
17631 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17632 Value *Vec = PoisonValue::get(VecTy);
17633 SmallVector<int> NonConsts;
17634 SmallVector<int> Mask(VL.size());
17635 std::iota(Mask.begin(), Mask.end(), 0);
17636 Value *OriginalRoot = Root;
17637 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17638 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17639 SV->getOperand(0)->getType() == VecTy) {
17640 Root = SV->getOperand(0);
17641 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17642 }
17643 // Insert constant values at first.
17644 for (int I = 0, E = VL.size(); I < E; ++I) {
17645 if (PostponedIndices.contains(I))
17646 continue;
17647 if (!isConstant(VL[I])) {
17648 NonConsts.push_back(I);
17649 continue;
17650 }
17651 if (isa<PoisonValue>(VL[I]))
17652 continue;
17653 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17654 Mask[I] = I + E;
17655 }
17656 if (Root) {
17657 if (isa<PoisonValue>(Vec)) {
17658 Vec = OriginalRoot;
17659 } else {
17660 Vec = CreateShuffle(Root, Vec, Mask);
17661 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17662 OI && OI->use_empty() &&
17663 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17664 return TE->VectorizedValue == OI;
17665 }))
17666 eraseInstruction(OI);
17667 }
17668 }
17669 // Insert non-constant values.
17670 for (int I : NonConsts)
17671 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17672 // Append instructions which are/may be part of the loop at the end to make
17673 // it possible to hoist non-loop-based instructions.
17674 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17675 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17676
17677 return Vec;
17678}
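// Illustrative sketch (standard C++, not part of the pass): the two-phase
// insertion order used above. Constants are inserted first (and blended with
// Root when one exists), remaining unique scalars follow, and scalars that are
// postponed (e.g. because they live inside the current loop) are emitted last
// so loop-invariant code can still be hoisted. Names are hypothetical.
#include <cstddef>
#include <vector>
static std::vector<std::size_t>
insertionOrderSketch(const std::vector<bool> &IsConstant,
                     const std::vector<bool> &IsPostponed) {
  std::vector<std::size_t> Order, NonConst, Postponed;
  for (std::size_t I = 0; I < IsConstant.size(); ++I) {
    if (IsPostponed[I])
      Postponed.push_back(I); // loop-resident scalars go last
    else if (IsConstant[I])
      Order.push_back(I);     // constants are inserted first
    else
      NonConst.push_back(I);  // remaining unique non-constant scalars
  }
  Order.insert(Order.end(), NonConst.begin(), NonConst.end());
  Order.insert(Order.end(), Postponed.begin(), Postponed.end());
  return Order;
}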
17679
17680/// Merges shuffle masks and emits the final shuffle instruction, if required.
17681/// It supports shuffling of 2 input vectors. It implements lazy shuffle
17682/// emission: the actual shuffle instruction is generated only if it is really
17683/// required. Otherwise, the shuffle instruction emission is delayed till the
17684/// end of the process, to reduce the number of emitted instructions and further
17685/// analysis/transformations.
17686/// The class will also look through the previously emitted shuffle instructions
17687/// and properly mark indices in the mask as undef.
17688/// For example, given the code
17689/// \code
17690/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17691/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17692/// \endcode
17693/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17694/// look through %s1 and %s2 and emit
17695/// \code
17696/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17697/// \endcode
17698/// instead.
17699/// If 2 operands are of different size, the smallest one will be resized and
17700/// the mask recalculated properly.
17701/// For example, given the code
17702/// \code
17703/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17704/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17705/// \endcode
17706/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17707/// look through %s1 and %s2 and emit
17708/// \code
17709/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17710/// \endcode
17711/// instead.
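/// As a hedged illustration of the mask composition itself (the values below
/// follow the first example above, where both %s1 and %s2 use the inner mask
/// <1, 0> over 2-element inputs): a request for lane i of %s1 is rewritten to
/// lane s1_mask[i] of %0, so the requested mask <1, 0, 3, 2> over (%s1, %s2)
/// becomes <0, 1, 2, 3> over (%0, %1):
/// \code
///   // Composed[I] = Inner[Outer[I] % 2] + 2 * (Outer[I] / 2)
///   int Outer[4] = {1, 0, 3, 2};
///   int Inner[2] = {1, 0}; // the same mask is used for %s1 and %s2
///   int Composed[4];
///   for (int I = 0; I < 4; ++I)
///     Composed[I] = Inner[Outer[I] % 2] + 2 * (Outer[I] / 2);
///   // Composed == {0, 1, 2, 3}
/// \endcode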
17712class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17713 bool IsFinalized = false;
17714 /// Combined mask for all applied operands and masks. It is built during
17715 /// analysis and actual emission of shuffle vector instructions.
17716 SmallVector<int> CommonMask;
17717 /// List of operands for the shuffle vector instruction. It holds at most 2
17718 /// operands; if a 3rd one is going to be added, the first 2 are combined into
17719 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
17720 /// resulting shuffle and the second operand is set to be the newly added
17721 /// operand. The \p CommonMask is transformed in the proper way after that.
17722 SmallVector<Value *, 2> InVectors;
17723 IRBuilderBase &Builder;
17724 BoUpSLP &R;
17725
17726 class ShuffleIRBuilder {
17727 IRBuilderBase &Builder;
17728 /// Holds all of the instructions that we gathered.
17729 SetVector<Instruction *> &GatherShuffleExtractSeq;
17730 /// A list of blocks that we are going to CSE.
17731 DenseSet<BasicBlock *> &CSEBlocks;
17732 /// Data layout.
17733 const DataLayout &DL;
17734
17735 public:
17736 ShuffleIRBuilder(IRBuilderBase &Builder,
17737 SetVector<Instruction *> &GatherShuffleExtractSeq,
17738 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17739 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17740 CSEBlocks(CSEBlocks), DL(DL) {}
17741 ~ShuffleIRBuilder() = default;
17742 /// Creates shufflevector for the 2 operands with the given mask.
17743 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17744 if (V1->getType() != V2->getType()) {
17745 assert(V1->getType()->isIntOrIntVectorTy() &&
17746 V2->getType()->isIntOrIntVectorTy() &&
17747 "Expected integer vector types only.");
17748 if (V1->getType() != V2->getType()) {
17749 if (cast<VectorType>(V2->getType())
17750 ->getElementType()
17751 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17752 ->getElementType()
17753 ->getIntegerBitWidth())
17754 V2 = Builder.CreateIntCast(
17755 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17756 else
17757 V1 = Builder.CreateIntCast(
17758 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17759 }
17760 }
17761 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17762 if (auto *I = dyn_cast<Instruction>(Vec)) {
17763 GatherShuffleExtractSeq.insert(I);
17764 CSEBlocks.insert(I->getParent());
17765 }
17766 return Vec;
17767 }
17768 /// Creates permutation of the single vector operand with the given mask, if
17769 /// it is not identity mask.
17770 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17771 if (Mask.empty())
17772 return V1;
17773 unsigned VF = Mask.size();
17774 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17775 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17776 return V1;
17777 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17778 if (auto *I = dyn_cast<Instruction>(Vec)) {
17779 GatherShuffleExtractSeq.insert(I);
17780 CSEBlocks.insert(I->getParent());
17781 }
17782 return Vec;
17783 }
17784 Value *createIdentity(Value *V) { return V; }
17785 Value *createPoison(Type *Ty, unsigned VF) {
17786 return PoisonValue::get(getWidenedType(Ty, VF));
17787 }
17788 /// Resizes 2 input vectors to match in size, if they are not equal
17789 /// yet. The smaller vector is resized to the size of the larger vector.
17790 void resizeToMatch(Value *&V1, Value *&V2) {
17791 if (V1->getType() == V2->getType())
17792 return;
17793 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17794 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17795 int VF = std::max(V1VF, V2VF);
17796 int MinVF = std::min(V1VF, V2VF);
17797 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17798 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17799 0);
17800 Value *&Op = MinVF == V1VF ? V1 : V2;
17801 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17802 if (auto *I = dyn_cast<Instruction>(Op)) {
17803 GatherShuffleExtractSeq.insert(I);
17804 CSEBlocks.insert(I->getParent());
17805 }
17806 if (MinVF == V1VF)
17807 V1 = Op;
17808 else
17809 V2 = Op;
17810 }
17811 };
17812
17813 /// Smart shuffle instruction emission, walks through shuffles trees and
17814 /// tries to find the best matching vector for the actual shuffle
17815 /// instruction.
17816 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
17817 assert(V1 && "Expected at least one vector value.");
17818 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17819 R.CSEBlocks, *R.DL);
17820 return BaseShuffleAnalysis::createShuffle<Value *>(
17821 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17822 }
17823
17824 /// Cast value \p V to the vector type with the same number of elements, but
17825 /// the base type \p ScalarTy.
17826 Value *castToScalarTyElem(Value *V,
17827 std::optional<bool> IsSigned = std::nullopt) {
17828 auto *VecTy = cast<VectorType>(V->getType());
17829 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
17830 if (VecTy->getElementType() == ScalarTy->getScalarType())
17831 return V;
17832 return Builder.CreateIntCast(
17833 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
17834 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
17835 }
17836
17837 public:
17838 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
17839 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17840
17841 /// Adjusts extractelements after reusing them.
17842 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
17843 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17844 unsigned NumParts, bool &UseVecBaseAsInput) {
17845 UseVecBaseAsInput = false;
17846 SmallPtrSet<Value *, 4> UniqueBases;
17847 Value *VecBase = nullptr;
17848 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
17849 if (!E->ReorderIndices.empty()) {
17850 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
17851 E->ReorderIndices.end());
17852 reorderScalars(VL, ReorderMask);
17853 }
17854 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17855 int Idx = Mask[I];
17856 if (Idx == PoisonMaskElem)
17857 continue;
17858 auto *EI = cast<ExtractElementInst>(VL[I]);
17859 VecBase = EI->getVectorOperand();
17860 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
17861 VecBase = TEs.front()->VectorizedValue;
17862 assert(VecBase && "Expected vectorized value.");
17863 UniqueBases.insert(VecBase);
17864 // If the single use of the extractelement is vectorized - the
17865 // extractelement itself can be deleted.
17866 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17867 (NumParts != 1 && count(VL, EI) > 1) ||
17868 any_of(EI->users(), [&](User *U) {
17869 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17870 return UTEs.empty() || UTEs.size() > 1 ||
17871 (isa<GetElementPtrInst>(U) &&
17872 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17873 (!UTEs.empty() &&
17874 count_if(R.VectorizableTree,
17875 [&](const std::unique_ptr<TreeEntry> &TE) {
17876 return TE->UserTreeIndex.UserTE ==
17877 UTEs.front() &&
17878 is_contained(VL, EI);
17879 }) != 1);
17880 }))
17881 continue;
17882 R.eraseInstruction(EI);
17883 }
17884 if (NumParts == 1 || UniqueBases.size() == 1) {
17885 assert(VecBase && "Expected vectorized value.");
17886 return castToScalarTyElem(VecBase);
17887 }
17888 UseVecBaseAsInput = true;
17889 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
17890 for (auto [I, Idx] : enumerate(Mask))
17891 if (Idx != PoisonMaskElem)
17892 Idx = I;
17893 };
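// Editorial example: TransformToIdentity turns <-1, 5, 2, -1> into
// <-1, 1, 2, -1>, i.e. every used lane is redirected to its own position.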
17894 // Perform a multi-register vector shuffle, joining the parts into a single
17895 // virtual long vector.
17896 // Each part needs to be shuffled independently and then all the parts are
17897 // inserted into a long virtual vector register, forming the original vector.
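// Editorial example (illustrative): with NumParts == 2 and a 16-element mask,
// lanes 0..7 are shuffled out of the (at most two) extract bases of part 0 and
// lanes 8..15 out of the bases of part 1; the partial results are then joined
// by one more two-source shuffle into the final 16-element vector.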
17898 Value *Vec = nullptr;
17899 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17900 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17901 for (unsigned Part : seq<unsigned>(NumParts)) {
17902 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
17903 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
17904 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
17905 constexpr int MaxBases = 2;
17906 SmallVector<Value *, MaxBases> Bases(MaxBases);
17907 auto VLMask = zip(SubVL, SubMask);
17908 const unsigned VF = std::accumulate(
17909 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
17910 if (std::get<1>(D) == PoisonMaskElem)
17911 return S;
17912 Value *VecOp =
17913 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
17914 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
17915 !TEs.empty())
17916 VecOp = TEs.front()->VectorizedValue;
17917 assert(VecOp && "Expected vectorized value.");
17918 const unsigned Size =
17919 cast<FixedVectorType>(VecOp->getType())->getNumElements();
17920 return std::max(S, Size);
17921 });
17922 for (const auto [V, I] : VLMask) {
17923 if (I == PoisonMaskElem)
17924 continue;
17925 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
17926 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
17927 VecOp = TEs.front()->VectorizedValue;
17928 assert(VecOp && "Expected vectorized value.");
17929 VecOp = castToScalarTyElem(VecOp);
17930 Bases[I / VF] = VecOp;
17931 }
17932 if (!Bases.front())
17933 continue;
17934 Value *SubVec;
17935 if (Bases.back()) {
17936 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
17937 TransformToIdentity(SubMask);
17938 } else {
17939 SubVec = Bases.front();
17940 }
17941 if (!Vec) {
17942 Vec = SubVec;
17943 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
17944 [&](unsigned P) {
17945 ArrayRef<int> SubMask =
17946 Mask.slice(P * SliceSize,
17947 getNumElems(Mask.size(),
17948 SliceSize, P));
17949 return all_of(SubMask, [](int Idx) {
17950 return Idx == PoisonMaskElem;
17951 });
17952 })) &&
17953 "Expected first part or all previous parts masked.");
17954 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17955 } else {
17956 unsigned NewVF =
17957 cast<FixedVectorType>(Vec->getType())->getNumElements();
17958 if (Vec->getType() != SubVec->getType()) {
17959 unsigned SubVecVF =
17960 cast<FixedVectorType>(SubVec->getType())->getNumElements();
17961 NewVF = std::max(NewVF, SubVecVF);
17962 }
17963 // Adjust SubMask.
17964 for (int &Idx : SubMask)
17965 if (Idx != PoisonMaskElem)
17966 Idx += NewVF;
17967 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17968 Vec = createShuffle(Vec, SubVec, VecMask);
17969 TransformToIdentity(VecMask);
17970 }
17971 }
17972 copy(VecMask, Mask.begin());
17973 return Vec;
17974 }
17975 /// Checks if the specified entry \p E needs to be delayed because of its
17976 /// dependency nodes.
17977 std::optional<Value *>
17978 needToDelay(const TreeEntry *E,
17980 // No need to delay emission if all deps are ready.
17981 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
17982 return all_of(
17983 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
17984 }))
17985 return std::nullopt;
17986 // Postpone gather emission, will be emitted after the end of the
17987 // process to keep correct order.
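// Editorial note: the aligned load from a poison pointer below is only a
// type-correct placeholder for the postponed gather; it is replaced by the
// real gather sequence once all dependent entries have been vectorized.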
17988 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
17989 return Builder.CreateAlignedLoad(
17990 ResVecTy,
17991 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
17992 MaybeAlign());
17993 }
17994 /// Reset the builder to handle perfect diamond match.
17995 void resetForSameNode() {
17996 IsFinalized = false;
17997 CommonMask.clear();
17998 InVectors.clear();
17999 }
18000 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
18001 /// shuffling.
18002 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18003 Value *V1 = E1.VectorizedValue;
18004 if (V1->getType()->isIntOrIntVectorTy())
18005 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
18006 if (isa<PoisonValue>(V))
18007 return false;
18008 return !isKnownNonNegative(
18009 V, SimplifyQuery(*R.DL));
18010 }));
18011 Value *V2 = E2.VectorizedValue;
18012 if (V2->getType()->isIntOrIntVectorTy())
18013 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
18014 if (isa<PoisonValue>(V))
18015 return false;
18016 return !isKnownNonNegative(
18017 V, SimplifyQuery(*R.DL));
18018 }));
18019 add(V1, V2, Mask);
18020 }
18021 /// Adds a single input vector (in the form of a tree entry) and the mask for
18022 /// its shuffling.
18023 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18024 Value *V1 = E1.VectorizedValue;
18025 if (V1->getType()->isIntOrIntVectorTy())
18026 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
18027 if (isa<PoisonValue>(V))
18028 return false;
18029 return !isKnownNonNegative(
18030 V, SimplifyQuery(*R.DL));
18031 }));
18032 add(V1, Mask);
18033 }
18034 /// Adds 2 input vectors and the mask for their shuffling.
18035 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18036 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18037 assert(isa<FixedVectorType>(V1->getType()) &&
18038 isa<FixedVectorType>(V2->getType()) &&
18039 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18040 V1 = castToScalarTyElem(V1);
18041 V2 = castToScalarTyElem(V2);
18042 if (InVectors.empty()) {
18043 InVectors.push_back(V1);
18044 InVectors.push_back(V2);
18045 CommonMask.assign(Mask.begin(), Mask.end());
18046 return;
18047 }
18048 Value *Vec = InVectors.front();
18049 if (InVectors.size() == 2) {
18050 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18051 transformMaskAfterShuffle(CommonMask, CommonMask);
18052 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18053 Mask.size()) {
18054 Vec = createShuffle(Vec, nullptr, CommonMask);
18055 transformMaskAfterShuffle(CommonMask, CommonMask);
18056 }
18057 V1 = createShuffle(V1, V2, Mask);
18058 unsigned VF = std::max(getVF(V1), getVF(Vec));
18059 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18060 if (Mask[Idx] != PoisonMaskElem)
18061 CommonMask[Idx] = Idx + VF;
18062 InVectors.front() = Vec;
18063 if (InVectors.size() == 2)
18064 InVectors.back() = V1;
18065 else
18066 InVectors.push_back(V1);
18067 }
18068 /// Adds one more input vector and the mask for its shuffling.
18069 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18070 assert(isa<FixedVectorType>(V1->getType()) &&
18071 "castToScalarTyElem expects V1 to be FixedVectorType");
18072 V1 = castToScalarTyElem(V1);
18073 if (InVectors.empty()) {
18074 InVectors.push_back(V1);
18075 CommonMask.assign(Mask.begin(), Mask.end());
18076 return;
18077 }
18078 const auto *It = find(InVectors, V1);
18079 if (It == InVectors.end()) {
18080 if (InVectors.size() == 2 ||
18081 InVectors.front()->getType() != V1->getType()) {
18082 Value *V = InVectors.front();
18083 if (InVectors.size() == 2) {
18084 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18085 transformMaskAfterShuffle(CommonMask, CommonMask);
18086 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18087 CommonMask.size()) {
18088 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18089 transformMaskAfterShuffle(CommonMask, CommonMask);
18090 }
18091 unsigned VF = std::max(CommonMask.size(), Mask.size());
18092 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18093 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18094 CommonMask[Idx] = V->getType() != V1->getType()
18095 ? Idx + VF
18096 : Mask[Idx] + getVF(V1);
18097 if (V->getType() != V1->getType())
18098 V1 = createShuffle(V1, nullptr, Mask);
18099 InVectors.front() = V;
18100 if (InVectors.size() == 2)
18101 InVectors.back() = V1;
18102 else
18103 InVectors.push_back(V1);
18104 return;
18105 }
18106 // Check if the second vector is really required, i.e. if the used elements
18107 // are not all already covered by the first one.
18108 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18109 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18110 InVectors.push_back(V1);
18111 break;
18112 }
18113 }
18114 unsigned VF = 0;
18115 for (Value *V : InVectors)
18116 VF = std::max(VF, getVF(V));
18117 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18118 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18119 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18120 }
18121 /// Adds one more input vector and the mask for its shuffling.
18122 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18123 SmallVector<int> NewMask;
18124 inversePermutation(Order, NewMask);
18125 add(V1, NewMask);
18126 }
18127 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18128 Value *Root = nullptr) {
18129 return R.gather(VL, Root, ScalarTy,
18130 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18131 return createShuffle(V1, V2, Mask);
18132 });
18133 }
18134 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18135 /// Finalize emission of the shuffles.
18136 /// \param Action the action (if any) to be performed before the final
18137 /// application of the \p ExtMask mask.
18138 Value *finalize(
18139 ArrayRef<int> ExtMask,
18140 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18141 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18142 function_ref<void(Value *&, SmallVectorImpl<int> &,
18143 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18144 Action = {}) {
18145 IsFinalized = true;
18146 if (Action) {
18147 Value *Vec = InVectors.front();
18148 if (InVectors.size() == 2) {
18149 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18150 InVectors.pop_back();
18151 } else {
18152 Vec = createShuffle(Vec, nullptr, CommonMask);
18153 }
18154 transformMaskAfterShuffle(CommonMask, CommonMask);
18155 assert(VF > 0 &&
18156 "Expected vector length for the final value before action.");
18157 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18158 if (VecVF < VF) {
18159 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18160 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18161 Vec = createShuffle(Vec, nullptr, ResizeMask);
18162 }
18163 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18164 return createShuffle(V1, V2, Mask);
18165 });
18166 InVectors.front() = Vec;
18167 }
18168 if (!SubVectors.empty()) {
18169 Value *Vec = InVectors.front();
18170 if (InVectors.size() == 2) {
18171 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18172 InVectors.pop_back();
18173 } else {
18174 Vec = createShuffle(Vec, nullptr, CommonMask);
18175 }
18176 transformMaskAfterShuffle(CommonMask, CommonMask);
18177 auto CreateSubVectors = [&](Value *Vec,
18178 SmallVectorImpl<int> &CommonMask) {
18179 for (auto [E, Idx] : SubVectors) {
18180 Value *V = E->VectorizedValue;
18181 if (V->getType()->isIntOrIntVectorTy())
18182 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
18183 if (isa<PoisonValue>(V))
18184 return false;
18185 return !isKnownNonNegative(
18186 V, SimplifyQuery(*R.DL));
18187 }));
18188 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18189 // Use the scalar version of ScalarTy to correctly handle shuffles
18190 // for revectorization. The revectorization mode operates on the
18191 // vectors, but here we need to operate on the scalars, because the
18192 // masks were already transformed for the vector elements and we don't
18193 // need to apply this transformation again.
18194 Type *OrigScalarTy = ScalarTy;
18195 ScalarTy = ScalarTy->getScalarType();
18196 Vec = createInsertVector(
18197 Builder, Vec, V, InsertionIndex,
18198 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18199 _3));
18200 ScalarTy = OrigScalarTy;
18201 if (!CommonMask.empty()) {
18202 std::iota(std::next(CommonMask.begin(), Idx),
18203 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18204 Idx);
18205 }
18206 }
18207 return Vec;
18208 };
18209 if (SubVectorsMask.empty()) {
18210 Vec = CreateSubVectors(Vec, CommonMask);
18211 } else {
18212 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18213 copy(SubVectorsMask, SVMask.begin());
18214 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18215 if (I2 != PoisonMaskElem) {
18216 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18217 I1 = I2 + CommonMask.size();
18218 }
18219 }
18220 Value *InsertVec =
18221 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18222 Vec = createShuffle(InsertVec, Vec, SVMask);
18223 transformMaskAfterShuffle(CommonMask, SVMask);
18224 }
18225 InVectors.front() = Vec;
18226 }
18227
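// Editorial example: the block below composes ExtMask on top of the
// accumulated CommonMask; e.g. with CommonMask == <2, 0, 1, 3> and
// ExtMask == <1, 1, -1, 0> the combined mask becomes <0, 0, -1, 2>.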
18228 if (!ExtMask.empty()) {
18229 if (CommonMask.empty()) {
18230 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18231 } else {
18232 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18233 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18234 if (ExtMask[I] == PoisonMaskElem)
18235 continue;
18236 NewMask[I] = CommonMask[ExtMask[I]];
18237 }
18238 CommonMask.swap(NewMask);
18239 }
18240 }
18241 if (CommonMask.empty()) {
18242 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18243 return InVectors.front();
18244 }
18245 if (InVectors.size() == 2)
18246 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18247 return createShuffle(InVectors.front(), nullptr, CommonMask);
18248 }
18249
18250 ~ShuffleInstructionBuilder() {
18251 assert((IsFinalized || CommonMask.empty()) &&
18252 "Shuffle construction must be finalized.");
18253 }
18254};
18255
18256Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18257 return vectorizeTree(getOperandEntry(E, NodeIdx));
18258}
18259
18260template <typename BVTy, typename ResTy, typename... Args>
18261ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18262 Args &...Params) {
18263 assert(E->isGather() && "Expected gather node.");
18264 unsigned VF = E->getVectorFactor();
18265
18266 bool NeedFreeze = false;
18267 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18268 // Clear values, to be replaced by insertvector instructions.
18269 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18270 for_each(MutableArrayRef(GatheredScalars)
18271 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18272 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18273 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18274 E->CombinedEntriesWithIndices.size());
18275 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18276 [&](const auto &P) {
18277 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18278 });
18279 // Build a mask out of the reorder indices and reorder scalars per this
18280 // mask.
18281 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18282 E->ReorderIndices.end());
18283 if (!ReorderMask.empty())
18284 reorderScalars(GatheredScalars, ReorderMask);
18285 SmallVector<int> SubVectorsMask;
18286 inversePermutation(E->ReorderIndices, SubVectorsMask);
18287 // Transform non-clustered elements in the mask to poison (-1).
18288 // "Clustered" operations will be reordered using this mask later.
18289 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18290 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18291 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18292 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18293 } else {
18294 SubVectorsMask.clear();
18295 }
18296 SmallVector<Value *> StoredGS(GatheredScalars);
18297 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18298 unsigned I, unsigned SliceSize,
18299 bool IsNotPoisonous) {
18300 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18301 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18302 }))
18303 return false;
18304 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18305 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18306 if (UserTE->getNumOperands() != 2)
18307 return false;
18308 if (!IsNotPoisonous) {
18309 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18310 [=](const std::unique_ptr<TreeEntry> &TE) {
18311 return TE->UserTreeIndex.UserTE == UserTE &&
18312 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18313 });
18314 if (It == VectorizableTree.end())
18315 return false;
18316 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18317 if (!(*It)->ReorderIndices.empty()) {
18318 inversePermutation((*It)->ReorderIndices, ReorderMask);
18319 reorderScalars(GS, ReorderMask);
18320 }
18321 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18322 Value *V0 = std::get<0>(P);
18323 Value *V1 = std::get<1>(P);
18324 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18325 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18326 is_contained(E->Scalars, V1));
18327 }))
18328 return false;
18329 }
18330 int Idx;
18331 if ((Mask.size() < InputVF &&
18332 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18333 Idx == 0) ||
18334 (Mask.size() == InputVF &&
18335 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18336 std::iota(
18337 std::next(Mask.begin(), I * SliceSize),
18338 std::next(Mask.begin(),
18339 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18340 0);
18341 } else {
18342 unsigned IVal =
18343 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18344 std::fill(
18345 std::next(Mask.begin(), I * SliceSize),
18346 std::next(Mask.begin(),
18347 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18348 IVal);
18349 }
18350 return true;
18351 };
18352 BVTy ShuffleBuilder(ScalarTy, Params...);
18353 ResTy Res = ResTy();
18354 SmallVector<int> Mask;
18355 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18356 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18357 Value *ExtractVecBase = nullptr;
18358 bool UseVecBaseAsInput = false;
18359 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18360 SmallVector<SmallVector<const TreeEntry *>> Entries;
18361 Type *OrigScalarTy = GatheredScalars.front()->getType();
18362 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18363 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18364 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18365 // Check for gathered extracts.
18366 bool Resized = false;
18367 ExtractShuffles =
18368 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18369 if (!ExtractShuffles.empty()) {
18370 SmallVector<const TreeEntry *> ExtractEntries;
18371 for (auto [Idx, I] : enumerate(ExtractMask)) {
18372 if (I == PoisonMaskElem)
18373 continue;
18374 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18375 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18376 !TEs.empty())
18377 ExtractEntries.append(TEs.begin(), TEs.end());
18378 }
18379 if (std::optional<ResTy> Delayed =
18380 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18381 // Delay emission of gathers which are not ready yet.
18382 PostponedGathers.insert(E);
18383 // Postpone gather emission, will be emitted after the end of the
18384 // process to keep correct order.
18385 return *Delayed;
18386 }
18387 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18388 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18389 ExtractVecBase = VecBase;
18390 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18391 if (VF == VecBaseTy->getNumElements() &&
18392 GatheredScalars.size() != VF) {
18393 Resized = true;
18394 GatheredScalars.append(VF - GatheredScalars.size(),
18395 PoisonValue::get(OrigScalarTy));
18396 NumParts =
18397 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18398 }
18399 }
18400 }
18401 // Gather extracts after we check for full matched gathers only.
18402 if (!ExtractShuffles.empty() || !E->hasState() ||
18403 E->getOpcode() != Instruction::Load ||
18404 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18405 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18406 any_of(E->Scalars,
18407 [this](Value *V) {
18408 return isa<LoadInst>(V) && isVectorized(V);
18409 })) ||
18410 (E->hasState() && E->isAltShuffle()) ||
18411 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18412 isSplat(E->Scalars) ||
18413 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18414 GatherShuffles =
18415 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18416 }
18417 if (!GatherShuffles.empty()) {
18418 if (std::optional<ResTy> Delayed =
18419 ShuffleBuilder.needToDelay(E, Entries)) {
18420 // Delay emission of gathers which are not ready yet.
18421 PostponedGathers.insert(E);
18422 // Postpone gather emission, will be emitted after the end of the
18423 // process to keep correct order.
18424 return *Delayed;
18425 }
18426 if (GatherShuffles.size() == 1 &&
18427 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18428 Entries.front().front()->isSame(E->Scalars)) {
18429 // Perfect match in the graph, will reuse the previously vectorized
18430 // node. Cost is 0.
18431 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18432 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18433 // Restore the mask for previous partially matched values.
18434 Mask.resize(E->Scalars.size());
18435 const TreeEntry *FrontTE = Entries.front().front();
18436 if (FrontTE->ReorderIndices.empty() &&
18437 ((FrontTE->ReuseShuffleIndices.empty() &&
18438 E->Scalars.size() == FrontTE->Scalars.size()) ||
18439 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18440 std::iota(Mask.begin(), Mask.end(), 0);
18441 } else {
18442 for (auto [I, V] : enumerate(E->Scalars)) {
18443 if (isa<PoisonValue>(V)) {
18444 Mask[I] = PoisonMaskElem;
18445 continue;
18446 }
18447 Mask[I] = FrontTE->findLaneForValue(V);
18448 }
18449 }
18450 // Reset the builder(s) to correctly handle perfect diamond matched
18451 // nodes.
18452 ShuffleBuilder.resetForSameNode();
18453 ShuffleBuilder.add(*FrontTE, Mask);
18454 // Full matched entry found, no need to insert subvectors.
18455 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18456 return Res;
18457 }
18458 if (!Resized) {
18459 if (GatheredScalars.size() != VF &&
18460 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18461 return any_of(TEs, [&](const TreeEntry *TE) {
18462 return TE->getVectorFactor() == VF;
18463 });
18464 }))
18465 GatheredScalars.append(VF - GatheredScalars.size(),
18466 PoisonValue::get(OrigScalarTy));
18467 }
18468 // Remove shuffled elements from list of gathers.
18469 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18470 if (Mask[I] != PoisonMaskElem)
18471 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18472 }
18473 }
18474 }
18475 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18476 SmallVectorImpl<int> &ReuseMask,
18477 bool IsRootPoison) {
18478 // For splats we can emit broadcasts instead of gathers, so try to find
18479 // such sequences.
18480 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18481 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18482 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18483 SmallVector<int> UndefPos;
18484 DenseMap<Value *, unsigned> UniquePositions;
18485 // Gather unique non-const values and all constant values.
18486 // For repeated values, just shuffle them.
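// Editorial example: for Scalars = {%a, %b, %a, %b} (not a splat) only the
// first occurrence of each value is kept, so Scalars becomes
// {%a, %b, poison, poison} and ReuseMask becomes <0, 1, 0, 1>.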
18487 int NumNonConsts = 0;
18488 int SinglePos = 0;
18489 for (auto [I, V] : enumerate(Scalars)) {
18490 if (isa<UndefValue>(V)) {
18491 if (!isa<PoisonValue>(V)) {
18492 ReuseMask[I] = I;
18493 UndefPos.push_back(I);
18494 }
18495 continue;
18496 }
18497 if (isConstant(V)) {
18498 ReuseMask[I] = I;
18499 continue;
18500 }
18501 ++NumNonConsts;
18502 SinglePos = I;
18503 Value *OrigV = V;
18504 Scalars[I] = PoisonValue::get(OrigScalarTy);
18505 if (IsSplat) {
18506 Scalars.front() = OrigV;
18507 ReuseMask[I] = 0;
18508 } else {
18509 const auto Res = UniquePositions.try_emplace(OrigV, I);
18510 Scalars[Res.first->second] = OrigV;
18511 ReuseMask[I] = Res.first->second;
18512 }
18513 }
18514 if (NumNonConsts == 1) {
18515 // Restore single insert element.
18516 if (IsSplat) {
18517 ReuseMask.assign(VF, PoisonMaskElem);
18518 std::swap(Scalars.front(), Scalars[SinglePos]);
18519 if (!UndefPos.empty() && UndefPos.front() == 0)
18520 Scalars.front() = UndefValue::get(OrigScalarTy);
18521 }
18522 ReuseMask[SinglePos] = SinglePos;
18523 } else if (!UndefPos.empty() && IsSplat) {
18524 // For undef values, try to replace them with a simple broadcast.
18525 // We can do it if the broadcasted value is guaranteed to be
18526 // non-poisonous, or by freezing the incoming scalar value first.
18527 auto *It = find_if(Scalars, [this, E](Value *V) {
18528 return !isa<UndefValue>(V) &&
18529 (isGuaranteedNotToBePoison(V, AC) ||
18530 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18531 // Check if the value already used in the same operation in
18532 // one of the nodes already.
18533 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18534 is_contained(E->UserTreeIndex.UserTE->Scalars,
18535 U.getUser());
18536 })));
18537 });
18538 if (It != Scalars.end()) {
18539 // Replace undefs by the non-poisoned scalars and emit broadcast.
18540 int Pos = std::distance(Scalars.begin(), It);
18541 for (int I : UndefPos) {
18542 // Set the undef position to the non-poisoned scalar.
18543 ReuseMask[I] = Pos;
18544 // Replace the undef by poison; in the mask it has already been
18545 // replaced by the non-poisoned scalar.
18546 if (I != Pos)
18547 Scalars[I] = PoisonValue::get(OrigScalarTy);
18548 }
18549 } else {
18550 // Replace undefs by poisons, emit the broadcast and then emit a
18551 // freeze.
18552 for (int I : UndefPos) {
18553 ReuseMask[I] = PoisonMaskElem;
18554 if (isa<UndefValue>(Scalars[I]))
18555 Scalars[I] = PoisonValue::get(OrigScalarTy);
18556 }
18557 NeedFreeze = true;
18558 }
18559 }
18560 };
18561 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18562 bool IsNonPoisoned = true;
18563 bool IsUsedInExpr = true;
18564 Value *Vec1 = nullptr;
18565 if (!ExtractShuffles.empty()) {
18566 // A gather of extractelements can be represented as just a shuffle of
18567 // the one or two vectors the scalars are extracted from.
18568 // Find the input vectors.
18569 Value *Vec2 = nullptr;
18570 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18571 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18572 ExtractMask[I] = PoisonMaskElem;
18573 }
18574 if (UseVecBaseAsInput) {
18575 Vec1 = ExtractVecBase;
18576 } else {
18577 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18578 if (ExtractMask[I] == PoisonMaskElem)
18579 continue;
18580 if (isa<UndefValue>(StoredGS[I]))
18581 continue;
18582 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18583 Value *VecOp = EI->getVectorOperand();
18584 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18585 !TEs.empty() && TEs.front()->VectorizedValue)
18586 VecOp = TEs.front()->VectorizedValue;
18587 if (!Vec1) {
18588 Vec1 = VecOp;
18589 } else if (Vec1 != VecOp) {
18590 assert((!Vec2 || Vec2 == VecOp) &&
18591 "Expected only 1 or 2 vectors shuffle.");
18592 Vec2 = VecOp;
18593 }
18594 }
18595 }
18596 if (Vec2) {
18597 IsUsedInExpr = false;
18598 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18599 isGuaranteedNotToBePoison(Vec2, AC);
18600 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18601 } else if (Vec1) {
18602 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18603 IsUsedInExpr &= FindReusedSplat(
18604 ExtractMask,
18605 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18606 ExtractMask.size(), IsNotPoisonedVec);
18607 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18608 IsNonPoisoned &= IsNotPoisonedVec;
18609 } else {
18610 IsUsedInExpr = false;
18611 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18612 /*ForExtracts=*/true);
18613 }
18614 }
18615 if (!GatherShuffles.empty()) {
18616 unsigned SliceSize =
18617 getPartNumElems(E->Scalars.size(),
18618 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18619 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18620 for (const auto [I, TEs] : enumerate(Entries)) {
18621 if (TEs.empty()) {
18622 assert(!GatherShuffles[I] &&
18623 "No shuffles with empty entries list expected.");
18624 continue;
18625 }
18626 assert((TEs.size() == 1 || TEs.size() == 2) &&
18627 "Expected shuffle of 1 or 2 entries.");
18628 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18629 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18630 VecMask.assign(VecMask.size(), PoisonMaskElem);
18631 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18632 if (TEs.size() == 1) {
18633 bool IsNotPoisonedVec =
18634 TEs.front()->VectorizedValue
18635 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18636 : true;
18637 IsUsedInExpr &=
18638 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18639 SliceSize, IsNotPoisonedVec);
18640 ShuffleBuilder.add(*TEs.front(), VecMask);
18641 IsNonPoisoned &= IsNotPoisonedVec;
18642 } else {
18643 IsUsedInExpr = false;
18644 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18645 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18646 IsNonPoisoned &=
18647 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18648 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18649 }
18650 }
18651 }
18652 // Try to figure out the best way to combine the values: build a shuffle and
18653 // insert elements, or just build several shuffles.
18654 // Insert non-constant scalars.
18655 SmallVector<Value *> NonConstants(GatheredScalars);
18656 int EMSz = ExtractMask.size();
18657 int MSz = Mask.size();
18658 // Try to build a constant vector and shuffle with it only if currently we
18659 // have a single permutation and more than one scalar constant.
18660 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18661 bool IsIdentityShuffle =
18662 ((UseVecBaseAsInput ||
18663 all_of(ExtractShuffles,
18664 [](const std::optional<TTI::ShuffleKind> &SK) {
18665 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18666 TTI::SK_PermuteSingleSrc;
18667 })) &&
18668 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18669 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18670 (!GatherShuffles.empty() &&
18671 all_of(GatherShuffles,
18672 [](const std::optional<TTI::ShuffleKind> &SK) {
18673 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18674 TTI::SK_PermuteSingleSrc;
18675 }) &&
18676 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18677 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18678 bool EnoughConstsForShuffle =
18679 IsSingleShuffle &&
18680 (none_of(GatheredScalars,
18681 [](Value *V) {
18682 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18683 }) ||
18684 any_of(GatheredScalars,
18685 [](Value *V) {
18686 return isa<Constant>(V) && !isa<UndefValue>(V);
18687 })) &&
18688 (!IsIdentityShuffle ||
18689 (GatheredScalars.size() == 2 &&
18690 any_of(GatheredScalars,
18691 [](Value *V) { return !isa<UndefValue>(V); })) ||
18692 count_if(GatheredScalars, [](Value *V) {
18693 return isa<Constant>(V) && !isa<PoisonValue>(V);
18694 }) > 1);
18695 // NonConstants contains just the non-constant values; GatheredScalars
18696 // contains only the constants used to build the final vector, which is then shuffled.
18697 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18698 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18699 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18700 else
18701 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18702 }
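// Editorial example: for GatheredScalars = {%a, 3, %b, 5} with
// EnoughConstsForShuffle, the loop above leaves {poison, 3, poison, 5} in
// GatheredScalars and {%a, poison, %b, poison} in NonConstants; the constants
// and the variables are built into separate vectors that are combined by the
// final shuffle below.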
18703 // Generate constants for final shuffle and build a mask for them.
18704 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18705 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18706 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18707 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18708 ShuffleBuilder.add(BV, BVMask);
18709 }
18710 if (all_of(NonConstants, [=](Value *V) {
18711 return isa<PoisonValue>(V) ||
18712 (IsSingleShuffle && ((IsIdentityShuffle &&
18713 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18714 }))
18715 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18716 SubVectorsMask);
18717 else
18718 Res = ShuffleBuilder.finalize(
18719 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18720 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18721 bool IsSplat = isSplat(NonConstants);
18722 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18723 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18724 auto CheckIfSplatIsProfitable = [&]() {
18725 // Estimate the cost of splatting + shuffle and compare with
18726 // insert + shuffle.
18727 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18728 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18729 if (isa<ExtractElementInst>(V) || isVectorized(V))
18730 return false;
18731 InstructionCost SplatCost = TTI->getVectorInstrCost(
18732 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18733 PoisonValue::get(VecTy), V);
18734 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18735 for (auto [Idx, I] : enumerate(BVMask))
18736 if (I != PoisonMaskElem)
18737 NewMask[Idx] = Mask.size();
18738 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18739 NewMask, CostKind);
18740 InstructionCost BVCost = TTI->getVectorInstrCost(
18741 Instruction::InsertElement, VecTy, CostKind,
18742 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18743 Vec, V);
18744 // Shuffle required?
18745 if (count(BVMask, PoisonMaskElem) <
18746 static_cast<int>(BVMask.size() - 1)) {
18747 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18748 for (auto [Idx, I] : enumerate(BVMask))
18749 if (I != PoisonMaskElem)
18750 NewMask[Idx] = I;
18751 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
18752 VecTy, NewMask, CostKind);
18753 }
18754 return SplatCost <= BVCost;
18755 };
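// Editorial note: the profitability check above feeds the branch below, which
// selects between two strategies for the remaining non-constant lanes: the
// default path inserts them into Vec one by one (a gather with Vec as the
// root), while the splat path materializes the value once, broadcasts it with
// a splat mask and blends the broadcast into Vec with a single two-source
// shuffle.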
18756 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18757 for (auto [Idx, I] : enumerate(BVMask))
18758 if (I != PoisonMaskElem)
18759 Mask[Idx] = I;
18760 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18761 } else {
18762 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18763 SmallVector<Value *> Values(NonConstants.size(),
18764 PoisonValue::get(ScalarTy));
18765 Values[0] = V;
18766 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18767 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18768 transform(BVMask, SplatMask.begin(), [](int I) {
18769 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18770 });
18771 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18772 BV = CreateShuffle(BV, nullptr, SplatMask);
18773 for (auto [Idx, I] : enumerate(BVMask))
18774 if (I != PoisonMaskElem)
18775 Mask[Idx] = BVMask.size() + Idx;
18776 Vec = CreateShuffle(Vec, BV, Mask);
18777 for (auto [Idx, I] : enumerate(Mask))
18778 if (I != PoisonMaskElem)
18779 Mask[Idx] = Idx;
18780 }
18781 });
18782 } else if (!allConstant(GatheredScalars)) {
18783 // Gather unique scalars and all constants.
18784 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18785 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18786 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18787 ShuffleBuilder.add(BV, ReuseMask);
18788 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18789 SubVectorsMask);
18790 } else {
18791 // Gather all constants.
18792 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18793 for (auto [I, V] : enumerate(GatheredScalars)) {
18794 if (!isa<PoisonValue>(V))
18795 Mask[I] = I;
18796 }
18797 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18798 ShuffleBuilder.add(BV, Mask);
18799 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18800 SubVectorsMask);
18801 }
18802
18803 if (NeedFreeze)
18804 Res = ShuffleBuilder.createFreeze(Res);
18805 return Res;
18806}
18807
18808Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18809 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18810 (void)vectorizeTree(VectorizableTree[EIdx].get());
18811 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18812 Builder, *this);
18813}
18814
18815/// \returns \p I after propagating metadata from \p VL only for instructions in
18816/// \p VL.
18817 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18818 SmallVector<Value *> Insts;
18819 for (Value *V : VL)
18820 if (isa<Instruction>(V))
18821 Insts.push_back(V);
18822 return llvm::propagateMetadata(Inst, Insts);
18823}
18824
18826 if (DebugLoc DL = PN.getDebugLoc())
18827 return DL;
18828 return DebugLoc::getUnknown();
18829}
18830
18831Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
18832 IRBuilderBase::InsertPointGuard Guard(Builder);
18833
18834 Value *V = E->Scalars.front();
18835 Type *ScalarTy = V->getType();
18836 if (!isa<CmpInst>(V))
18837 ScalarTy = getValueType(V);
18838 auto It = MinBWs.find(E);
18839 if (It != MinBWs.end()) {
18840 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18841 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
18842 if (VecTy)
18843 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
18844 }
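// Editorial note: when E has an entry in MinBWs the node is emitted in the
// demoted integer type; e.g. (illustrative) an entry analyzed down to 16 bits
// produces an <N x i16> value here, and extensions back to the original width
// are inserted later wherever wider users require it.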
18845 if (E->VectorizedValue)
18846 return E->VectorizedValue;
18847 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
18848 if (E->isGather()) {
18849 // Set insert point for non-reduction initial nodes.
18850 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18851 setInsertPointAfterBundle(E);
18852 Value *Vec = createBuildVector(E, ScalarTy);
18853 E->VectorizedValue = Vec;
18854 return Vec;
18855 }
18856 if (E->State == TreeEntry::SplitVectorize) {
18857 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18858 "Expected exactly 2 combined entries.");
18859 setInsertPointAfterBundle(E);
18860 TreeEntry &OpTE1 =
18861 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18862 assert(OpTE1.isSame(
18863 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18864 "Expected same first part of scalars.");
18865 Value *Op1 = vectorizeTree(&OpTE1);
18866 TreeEntry &OpTE2 =
18867 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18868 assert(
18869 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18870 "Expected same second part of scalars.");
18871 Value *Op2 = vectorizeTree(&OpTE2);
18872 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
18873 bool IsSigned = false;
18874 auto It = MinBWs.find(OpE);
18875 if (It != MinBWs.end())
18876 IsSigned = It->second.second;
18877 else
18878 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18879 if (isa<PoisonValue>(R))
18880 return false;
18881 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18882 });
18883 return IsSigned;
18884 };
18885 if (cast<VectorType>(Op1->getType())->getElementType() !=
18886 ScalarTy->getScalarType()) {
18887 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18888 Op1 = Builder.CreateIntCast(
18889 Op1,
18890 getWidenedType(
18891 ScalarTy,
18892 cast<FixedVectorType>(Op1->getType())->getNumElements()),
18893 GetOperandSignedness(&OpTE1));
18894 }
18895 if (cast<VectorType>(Op2->getType())->getElementType() !=
18896 ScalarTy->getScalarType()) {
18897 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18898 Op2 = Builder.CreateIntCast(
18899 Op2,
18900 getWidenedType(
18901 ScalarTy,
18902 cast<FixedVectorType>(Op2->getType())->getNumElements()),
18903 GetOperandSignedness(&OpTE2));
18904 }
18905 if (E->ReorderIndices.empty()) {
18906 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
18907 std::iota(
18908 Mask.begin(),
18909 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
18910 0);
18911 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18912 if (ScalarTyNumElements != 1) {
18913 assert(SLPReVec && "Only supported by REVEC.");
18914 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
18915 }
18916 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
18917 Vec = createInsertVector(Builder, Vec, Op2,
18918 E->CombinedEntriesWithIndices.back().second *
18919 ScalarTyNumElements);
18920 E->VectorizedValue = Vec;
18921 return Vec;
18922 }
18923 unsigned CommonVF =
18924 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18925 if (getNumElements(Op1->getType()) != CommonVF) {
18926 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18927 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
18928 0);
18929 Op1 = Builder.CreateShuffleVector(Op1, Mask);
18930 }
18931 if (getNumElements(Op2->getType()) != CommonVF) {
18932 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18933 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
18934 0);
18935 Op2 = Builder.CreateShuffleVector(Op2, Mask);
18936 }
18937 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
18938 E->VectorizedValue = Vec;
18939 return Vec;
18940 }
18941
18942 bool IsReverseOrder =
18943 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
18944 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
18945 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
18946 if (E->getOpcode() == Instruction::Store &&
18947 E->State == TreeEntry::Vectorize) {
18948 ArrayRef<int> Mask =
18949 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
18950 E->ReorderIndices.size());
18951 ShuffleBuilder.add(V, Mask);
18952 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
18953 E->State == TreeEntry::CompressVectorize) {
18954 ShuffleBuilder.addOrdered(V, {});
18955 } else {
18956 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
18957 }
18958 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18959 E->CombinedEntriesWithIndices.size());
18960 transform(
18961 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
18962 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18963 });
18964 assert(
18965 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
18966 "Expected either combined subnodes or reordering");
18967 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
18968 };
18969
18970 assert(!E->isGather() && "Unhandled state");
18971 unsigned ShuffleOrOp =
18972 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
18973 Instruction *VL0 = E->getMainOp();
18974 auto GetOperandSignedness = [&](unsigned Idx) {
18975 const TreeEntry *OpE = getOperandEntry(E, Idx);
18976 bool IsSigned = false;
18977 auto It = MinBWs.find(OpE);
18978 if (It != MinBWs.end())
18979 IsSigned = It->second.second;
18980 else
18981 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19982 if (isa<PoisonValue>(R))
18983 return false;
18984 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18985 });
18986 return IsSigned;
18987 };
18988 switch (ShuffleOrOp) {
18989 case Instruction::PHI: {
18990 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
18991 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
18992 "PHI reordering is free.");
18993 auto *PH = cast<PHINode>(VL0);
18994 Builder.SetInsertPoint(PH->getParent(),
18995 PH->getParent()->getFirstNonPHIIt());
18997 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
18998 Value *V = NewPhi;
18999
19000 // Adjust insertion point once all PHI's have been generated.
19001 Builder.SetInsertPoint(PH->getParent(),
19002 PH->getParent()->getFirstInsertionPt());
19004
19005 V = FinalShuffle(V, E);
19006
19007 E->VectorizedValue = V;
19008 // If phi node is fully emitted - exit.
19009 if (NewPhi->getNumIncomingValues() != 0)
19010 return NewPhi;
19011
19012 // PHINodes may have multiple entries from the same block. We want to
19013 // visit every block once.
19014 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19015
19016 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19017 BasicBlock *IBB = PH->getIncomingBlock(I);
19018
19019 // Stop emission if all incoming values are generated.
19020 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19021 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19022 return NewPhi;
19023 }
19024
19025 if (!VisitedBBs.insert(IBB).second) {
19026 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19027 NewPhi->addIncoming(VecOp, IBB);
19028 TreeEntry *OpTE = getOperandEntry(E, I);
19029 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19030 OpTE->VectorizedValue = VecOp;
19031 continue;
19032 }
19033
19034 Builder.SetInsertPoint(IBB->getTerminator());
19036 Value *Vec = vectorizeOperand(E, I);
19037 if (VecTy != Vec->getType()) {
19038 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19039 MinBWs.contains(getOperandEntry(E, I))) &&
19040 "Expected item in MinBWs.");
19041 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19042 }
19043 NewPhi->addIncoming(Vec, IBB);
19044 }
19045
19046 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19047 "Invalid number of incoming values");
19048 assert(E->VectorizedValue && "Expected vectorized value.");
19049 return E->VectorizedValue;
19050 }
19051
19052 case Instruction::ExtractElement: {
19053 Value *V = E->getSingleOperand(0);
19054 setInsertPointAfterBundle(E);
19055 V = FinalShuffle(V, E);
19056 E->VectorizedValue = V;
19057 return V;
19058 }
19059 case Instruction::ExtractValue: {
19060 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19061 Builder.SetInsertPoint(LI);
19062 Value *Ptr = LI->getPointerOperand();
19063 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19064 Value *NewV = ::propagateMetadata(V, E->Scalars);
19065 NewV = FinalShuffle(NewV, E);
19066 E->VectorizedValue = NewV;
19067 return NewV;
19068 }
19069 case Instruction::InsertElement: {
19070 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19071 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19072 Value *V = vectorizeOperand(E, 1);
19073 ArrayRef<Value *> Op = E->getOperand(1);
19074 Type *ScalarTy = Op.front()->getType();
19075 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19076 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19077 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19078 assert(Res.first > 0 && "Expected item in MinBWs.");
19079 V = Builder.CreateIntCast(
19080 V,
19081 getWidenedType(
19082 ScalarTy,
19083 cast<FixedVectorType>(V->getType())->getNumElements()),
19084 Res.second);
19085 }
19086
19087 // Create InsertVector shuffle if necessary
19088 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19089 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19090 }));
19091 const unsigned NumElts =
19092 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19093 const unsigned NumScalars = E->Scalars.size();
19094
19095 unsigned Offset = *getElementIndex(VL0);
19096 assert(Offset < NumElts && "Failed to find vector index offset");
19097
19098 // Create shuffle to resize vector
19099 SmallVector<int> Mask;
19100 if (!E->ReorderIndices.empty()) {
19101 inversePermutation(E->ReorderIndices, Mask);
19102 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19103 } else {
19104 Mask.assign(NumElts, PoisonMaskElem);
19105 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19106 }
19107 // Create InsertVector shuffle if necessary
19108 bool IsIdentity = true;
19109 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19110 Mask.swap(PrevMask);
19111 for (unsigned I = 0; I < NumScalars; ++I) {
19112 Value *Scalar = E->Scalars[PrevMask[I]];
19113 unsigned InsertIdx = *getElementIndex(Scalar);
19114 IsIdentity &= InsertIdx - Offset == I;
19115 Mask[InsertIdx - Offset] = I;
19116 }
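// Editorial example: four scalars inserted into lanes 2..5 of an 8-wide
// destination give Offset == 2 and Mask == <0, 1, 2, 3, -1, -1, -1, -1>;
// IsIdentity stays true, but because NumElts != NumScalars a widening shuffle
// is still emitted below.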
19117 if (!IsIdentity || NumElts != NumScalars) {
19118 Value *V2 = nullptr;
19119 bool IsVNonPoisonous =
19121 SmallVector<int> InsertMask(Mask);
19122 if (NumElts != NumScalars && Offset == 0) {
19123 // Follow all insert element instructions from the current buildvector
19124 // sequence.
19125 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19126 do {
19127 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19128 if (!InsertIdx)
19129 break;
19130 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19131 InsertMask[*InsertIdx] = *InsertIdx;
19132 if (!Ins->hasOneUse())
19133 break;
19134 Ins = dyn_cast_or_null<InsertElementInst>(
19135 Ins->getUniqueUndroppableUser());
19136 } while (Ins);
19137 SmallBitVector UseMask =
19138 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19139 SmallBitVector IsFirstPoison =
19140 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19141 SmallBitVector IsFirstUndef =
19142 isUndefVector(FirstInsert->getOperand(0), UseMask);
19143 if (!IsFirstPoison.all()) {
19144 unsigned Idx = 0;
19145 for (unsigned I = 0; I < NumElts; I++) {
19146 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19147 IsFirstUndef.test(I)) {
19148 if (IsVNonPoisonous) {
19149 InsertMask[I] = I < NumScalars ? I : 0;
19150 continue;
19151 }
19152 if (!V2)
19153 V2 = UndefValue::get(V->getType());
19154 if (Idx >= NumScalars)
19155 Idx = NumScalars - 1;
19156 InsertMask[I] = NumScalars + Idx;
19157 ++Idx;
19158 } else if (InsertMask[I] != PoisonMaskElem &&
19159 Mask[I] == PoisonMaskElem) {
19160 InsertMask[I] = PoisonMaskElem;
19161 }
19162 }
19163 } else {
19164 InsertMask = Mask;
19165 }
19166 }
19167 if (!V2)
19168 V2 = PoisonValue::get(V->getType());
19169 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19170 if (auto *I = dyn_cast<Instruction>(V)) {
19171 GatherShuffleExtractSeq.insert(I);
19172 CSEBlocks.insert(I->getParent());
19173 }
19174 }
19175
19176 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19177 for (unsigned I = 0; I < NumElts; I++) {
19178 if (Mask[I] != PoisonMaskElem)
19179 InsertMask[Offset + I] = I;
19180 }
19181 SmallBitVector UseMask =
19182 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19183 SmallBitVector IsFirstUndef =
19184 isUndefVector(FirstInsert->getOperand(0), UseMask);
19185 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19186 NumElts != NumScalars) {
19187 if (IsFirstUndef.all()) {
19188 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19189 SmallBitVector IsFirstPoison =
19190 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19191 if (!IsFirstPoison.all()) {
19192 for (unsigned I = 0; I < NumElts; I++) {
19193 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19194 InsertMask[I] = I + NumElts;
19195 }
19196 }
19197 V = Builder.CreateShuffleVector(
19198 V,
19199 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19200 : FirstInsert->getOperand(0),
19201 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19202 if (auto *I = dyn_cast<Instruction>(V)) {
19203 GatherShuffleExtractSeq.insert(I);
19204 CSEBlocks.insert(I->getParent());
19205 }
19206 }
19207 } else {
19208 SmallBitVector IsFirstPoison =
19209 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19210 for (unsigned I = 0; I < NumElts; I++) {
19211 if (InsertMask[I] == PoisonMaskElem)
19212 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19213 else
19214 InsertMask[I] += NumElts;
19215 }
19216 V = Builder.CreateShuffleVector(
19217 FirstInsert->getOperand(0), V, InsertMask,
19218 cast<Instruction>(E->Scalars.back())->getName());
19219 if (auto *I = dyn_cast<Instruction>(V)) {
19220 GatherShuffleExtractSeq.insert(I);
19221 CSEBlocks.insert(I->getParent());
19222 }
19223 }
19224 }
19225
19226 ++NumVectorInstructions;
19227 E->VectorizedValue = V;
19228 return V;
19229 }
19230 case Instruction::ZExt:
19231 case Instruction::SExt:
19232 case Instruction::FPToUI:
19233 case Instruction::FPToSI:
19234 case Instruction::FPExt:
19235 case Instruction::PtrToInt:
19236 case Instruction::IntToPtr:
19237 case Instruction::SIToFP:
19238 case Instruction::UIToFP:
19239 case Instruction::Trunc:
19240 case Instruction::FPTrunc:
19241 case Instruction::BitCast: {
19242 setInsertPointAfterBundle(E);
19243
19244 Value *InVec = vectorizeOperand(E, 0);
19245
19246 auto *CI = cast<CastInst>(VL0);
19247 Instruction::CastOps VecOpcode = CI->getOpcode();
19248 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19249 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19250 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19251 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19252 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19253 // Check if the values are candidates to demote.
19254 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19255 if (SrcIt != MinBWs.end())
19256 SrcBWSz = SrcIt->second.first;
19257 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19258 if (BWSz == SrcBWSz) {
19259 VecOpcode = Instruction::BitCast;
19260 } else if (BWSz < SrcBWSz) {
19261 VecOpcode = Instruction::Trunc;
19262 } else if (It != MinBWs.end()) {
19263 assert(BWSz > SrcBWSz && "Invalid cast!");
19264 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19265 } else if (SrcIt != MinBWs.end()) {
19266 assert(BWSz > SrcBWSz && "Invalid cast!");
19267 VecOpcode =
19268 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19269 }
19270 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19271 !SrcIt->second.second) {
19272 VecOpcode = Instruction::UIToFP;
19273 }
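// Editorial example: for a sext from i8 to i32 where both this node and its
// operand were demoted to i16, no cast is emitted at all and the operand is
// reused; if only the operand was demoted, a sext or zext from <N x i16> is
// emitted instead, depending on the operand's recorded signedness.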
19274 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19275 ? InVec
19276 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19277 V = FinalShuffle(V, E);
19278
19279 E->VectorizedValue = V;
19280 ++NumVectorInstructions;
19281 return V;
19282 }
19283 case Instruction::FCmp:
19284 case Instruction::ICmp: {
19285 setInsertPointAfterBundle(E);
19286
19287 Value *L = vectorizeOperand(E, 0);
19288 Value *R = vectorizeOperand(E, 1);
19289 if (L->getType() != R->getType()) {
19290 assert((getOperandEntry(E, 0)->isGather() ||
19291 getOperandEntry(E, 1)->isGather() ||
19292 MinBWs.contains(getOperandEntry(E, 0)) ||
19293 MinBWs.contains(getOperandEntry(E, 1))) &&
19294 "Expected item in MinBWs.");
19295 if (cast<VectorType>(L->getType())
19296 ->getElementType()
19297 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19298 ->getElementType()
19299 ->getIntegerBitWidth()) {
19300 Type *CastTy = R->getType();
19301 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19302 } else {
19303 Type *CastTy = L->getType();
19304 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19305 }
19306 }
19307
19308 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19309 Value *V = Builder.CreateCmp(P0, L, R);
19310 propagateIRFlags(V, E->Scalars, VL0);
19311 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19312 ICmp->setSameSign(/*B=*/false);
19313 // Do not cast for cmps.
19314 VecTy = cast<FixedVectorType>(V->getType());
19315 V = FinalShuffle(V, E);
19316
19317 E->VectorizedValue = V;
19318 ++NumVectorInstructions;
19319 return V;
19320 }
19321 case Instruction::Select: {
19322 setInsertPointAfterBundle(E);
19323
19324 Value *Cond = vectorizeOperand(E, 0);
19325 Value *True = vectorizeOperand(E, 1);
19326 Value *False = vectorizeOperand(E, 2);
19327 if (True->getType() != VecTy || False->getType() != VecTy) {
19328 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19329 getOperandEntry(E, 2)->isGather() ||
19330 MinBWs.contains(getOperandEntry(E, 1)) ||
19331 MinBWs.contains(getOperandEntry(E, 2))) &&
19332 "Expected item in MinBWs.");
19333 if (True->getType() != VecTy)
19334 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19335 if (False->getType() != VecTy)
19336 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19337 }
19338
19339 unsigned CondNumElements = getNumElements(Cond->getType());
19340 unsigned TrueNumElements = getNumElements(True->getType());
19341 assert(TrueNumElements >= CondNumElements &&
19342 TrueNumElements % CondNumElements == 0 &&
19343 "Cannot vectorize Instruction::Select");
19344 assert(TrueNumElements == getNumElements(False->getType()) &&
19345 "Cannot vectorize Instruction::Select");
19346 if (CondNumElements != TrueNumElements) {
19347 // When the return type is i1 but the source is a fixed vector type, we
19348 // need to duplicate the condition value.
19349 Cond = Builder.CreateShuffleVector(
19350 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19351 CondNumElements));
19352 }
19353 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19354 "Cannot vectorize Instruction::Select");
19355 Value *V = Builder.CreateSelect(Cond, True, False);
19356 V = FinalShuffle(V, E);
19357
19358 E->VectorizedValue = V;
19359 ++NumVectorInstructions;
19360 return V;
19361 }
19362 case Instruction::FNeg: {
19363 setInsertPointAfterBundle(E);
19364
19365 Value *Op = vectorizeOperand(E, 0);
19366
19367 Value *V = Builder.CreateUnOp(
19368 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19369 propagateIRFlags(V, E->Scalars, VL0);
19370 if (auto *I = dyn_cast<Instruction>(V))
19371 V = ::propagateMetadata(I, E->Scalars);
19372
19373 V = FinalShuffle(V, E);
19374
19375 E->VectorizedValue = V;
19376 ++NumVectorInstructions;
19377
19378 return V;
19379 }
19380 case Instruction::Freeze: {
19381 setInsertPointAfterBundle(E);
19382
19383 Value *Op = vectorizeOperand(E, 0);
19384
19385 if (Op->getType() != VecTy) {
19386 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19387 MinBWs.contains(getOperandEntry(E, 0))) &&
19388 "Expected item in MinBWs.");
19389 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19390 }
19391 Value *V = Builder.CreateFreeze(Op);
19392 V = FinalShuffle(V, E);
19393
19394 E->VectorizedValue = V;
19395 ++NumVectorInstructions;
19396
19397 return V;
19398 }
19399 case Instruction::Add:
19400 case Instruction::FAdd:
19401 case Instruction::Sub:
19402 case Instruction::FSub:
19403 case Instruction::Mul:
19404 case Instruction::FMul:
19405 case Instruction::UDiv:
19406 case Instruction::SDiv:
19407 case Instruction::FDiv:
19408 case Instruction::URem:
19409 case Instruction::SRem:
19410 case Instruction::FRem:
19411 case Instruction::Shl:
19412 case Instruction::LShr:
19413 case Instruction::AShr:
19414 case Instruction::And:
19415 case Instruction::Or:
19416 case Instruction::Xor: {
19417 setInsertPointAfterBundle(E);
19418
19419 Value *LHS = vectorizeOperand(E, 0);
19420 Value *RHS = vectorizeOperand(E, 1);
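// Note: for an 'and' whose result is later truncated to It->second.first bits,
// a constant operand with at least that many trailing ones cannot change the
// surviving low bits, so the 'and' is dropped and the other operand is reused
// directly. E.g. with a minimized width of 8,
// 'and <4 x i32> %x, splat (i32 255)' is replaced by %x.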
19421 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19422 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19423 ArrayRef<Value *> Ops = E->getOperand(I);
19424 if (all_of(Ops, [&](Value *Op) {
19425 auto *CI = dyn_cast<ConstantInt>(Op);
19426 return CI && CI->getValue().countr_one() >= It->second.first;
19427 })) {
19428 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
19429 E->VectorizedValue = V;
19430 ++NumVectorInstructions;
19431 return V;
19432 }
19433 }
19434 }
19435 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19436 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19437 getOperandEntry(E, 1)->isGather() ||
19438 MinBWs.contains(getOperandEntry(E, 0)) ||
19439 MinBWs.contains(getOperandEntry(E, 1))) &&
19440 "Expected item in MinBWs.");
19441 if (LHS->getType() != VecTy)
19442 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19443 if (RHS->getType() != VecTy)
19444 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19445 }
19446
19447 Value *V = Builder.CreateBinOp(
19448 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19449 RHS);
19450 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19451 if (auto *I = dyn_cast<Instruction>(V)) {
19452 V = ::propagateMetadata(I, E->Scalars);
19453 // Drop nuw flags for abs(sub(commutative), true).
19454 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19455 any_of(E->Scalars, [](Value *V) {
19456 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19457 }))
19458 I->setHasNoUnsignedWrap(/*b=*/false);
19459 }
19460
19461 V = FinalShuffle(V, E);
19462
19463 E->VectorizedValue = V;
19464 ++NumVectorInstructions;
19465
19466 return V;
19467 }
19468 case Instruction::Load: {
19469 // Loads are inserted at the head of the tree because we don't want to
19470 // sink them all the way down past store instructions.
19471 setInsertPointAfterBundle(E);
19472
19473 LoadInst *LI = cast<LoadInst>(VL0);
19474 Instruction *NewLI;
19475 Value *PO = LI->getPointerOperand();
19476 if (E->State == TreeEntry::Vectorize) {
19477 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19478 } else if (E->State == TreeEntry::CompressVectorize) {
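// Note (illustration): a "compressed" load reads one wider, contiguous
// (possibly masked) vector and then shuffles the needed lanes into place.
// E.g. for scalar loads of a[0], a[2] and a[3], LoadVecTy covers a[0..3],
// the mask enables lanes {0,2,3} and the final shufflevector uses
// CompressMask = <0,2,3>.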
19479 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19480 CompressEntryToData.at(E);
19481 Align CommonAlignment = LI->getAlign();
19482 if (IsMasked) {
19483 unsigned VF = getNumElements(LoadVecTy);
19484 SmallVector<Constant *> MaskValues(
19485 VF / getNumElements(LI->getType()),
19486 ConstantInt::getFalse(VecTy->getContext()));
19487 for (int I : CompressMask)
19488 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19489 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19490 assert(SLPReVec && "Only supported by REVEC.");
19491 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19492 }
19493 Constant *MaskValue = ConstantVector::get(MaskValues);
19494 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19495 MaskValue);
19496 } else {
19497 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19498 }
19499 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19500 // TODO: include this cost into CommonCost.
19501 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19502 assert(SLPReVec && "FixedVectorType is not expected.");
19503 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19504 CompressMask);
19505 }
19506 NewLI =
19507 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19508 } else if (E->State == TreeEntry::StridedVectorize) {
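// Note: strided loads are emitted as llvm.experimental.vp.strided.load with a
// byte stride. When the pointer difference between the first and last element
// is known at compile time the stride is a constant, e.g. four i32 loads of
// a[0], a[3], a[6], a[9] give an element stride of 3 and a byte stride of 12;
// otherwise the stride is computed at runtime. The stride is negated for
// reverse-ordered accesses.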
19509 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19510 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19511 PO = IsReverseOrder ? PtrN : Ptr0;
19512 std::optional<int64_t> Diff = getPointersDiff(
19513 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
19514 Type *StrideTy = DL->getIndexType(PO->getType());
19515 Value *StrideVal;
19516 if (Diff) {
19517 int64_t Stride =
19518 *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
19519 StrideVal =
19520 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
19521 DL->getTypeAllocSize(ScalarTy));
19522 } else {
19523 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
19524 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
19525 return cast<LoadInst>(V)->getPointerOperand();
19526 });
19527 OrdersType Order;
19528 std::optional<Value *> Stride =
19529 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
19530 &*Builder.GetInsertPoint());
19531 Value *NewStride =
19532 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
19533 StrideVal = Builder.CreateMul(
19534 NewStride,
19535 ConstantInt::get(
19536 StrideTy,
19537 (IsReverseOrder ? -1 : 1) *
19538 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
19539 }
19540 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19541 auto *Inst = Builder.CreateIntrinsic(
19542 Intrinsic::experimental_vp_strided_load,
19543 {VecTy, PO->getType(), StrideTy},
19544 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
19545 Builder.getInt32(E->Scalars.size())});
19546 Inst->addParamAttr(
19547 /*ArgNo=*/0,
19548 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19549 NewLI = Inst;
19550 } else {
19551 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19552 Value *VecPtr = vectorizeOperand(E, 0);
19553 if (isa<FixedVectorType>(ScalarTy)) {
19554 assert(SLPReVec && "FixedVectorType is not expected.");
19555 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19556 // to expand VecPtr if ScalarTy is a vector type.
19557 unsigned ScalarTyNumElements =
19558 cast<FixedVectorType>(ScalarTy)->getNumElements();
19559 unsigned VecTyNumElements =
19560 cast<FixedVectorType>(VecTy)->getNumElements();
19561 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19562 "Cannot expand getelementptr.");
19563 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19564 SmallVector<Constant *> Indices(VecTyNumElements);
19565 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19566 return Builder.getInt64(I % ScalarTyNumElements);
19567 });
19568 VecPtr = Builder.CreateGEP(
19569 VecTy->getElementType(),
19570 Builder.CreateShuffleVector(
19571 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19572 ConstantVector::get(Indices));
19573 }
19574 // Use the minimum alignment of the gathered loads.
19575 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19576 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19577 }
19578 Value *V = E->State == TreeEntry::CompressVectorize
19579 ? NewLI
19580 : ::propagateMetadata(NewLI, E->Scalars);
19581
19582 V = FinalShuffle(V, E);
19583 E->VectorizedValue = V;
19584 ++NumVectorInstructions;
19585 return V;
19586 }
19587 case Instruction::Store: {
19588 auto *SI = cast<StoreInst>(VL0);
19589
19590 setInsertPointAfterBundle(E);
19591
19592 Value *VecValue = vectorizeOperand(E, 0);
19593 if (VecValue->getType() != VecTy)
19594 VecValue =
19595 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19596 VecValue = FinalShuffle(VecValue, E);
19597
19598 Value *Ptr = SI->getPointerOperand();
19599 Instruction *ST;
19600 if (E->State == TreeEntry::Vectorize) {
19601 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19602 } else {
19603 assert(E->State == TreeEntry::StridedVectorize &&
19604 "Expected either strided or consecutive stores.");
19605 if (!E->ReorderIndices.empty()) {
19606 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19607 Ptr = SI->getPointerOperand();
19608 }
19609 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19610 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19611 auto *Inst = Builder.CreateIntrinsic(
19612 Intrinsic::experimental_vp_strided_store,
19613 {VecTy, Ptr->getType(), StrideTy},
19614 {VecValue, Ptr,
19615 ConstantInt::get(
19616 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19617 Builder.getAllOnesMask(VecTy->getElementCount()),
19618 Builder.getInt32(E->Scalars.size())});
19619 Inst->addParamAttr(
19620 /*ArgNo=*/1,
19621 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19622 ST = Inst;
19623 }
19624
19625 Value *V = ::propagateMetadata(ST, E->Scalars);
19626
19627 E->VectorizedValue = V;
19628 ++NumVectorInstructions;
19629 return V;
19630 }
19631 case Instruction::GetElementPtr: {
19632 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19633 setInsertPointAfterBundle(E);
19634
19635 Value *Op0 = vectorizeOperand(E, 0);
19636
19637 SmallVector<Value *> OpVecs;
19638 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19639 Value *OpVec = vectorizeOperand(E, J);
19640 OpVecs.push_back(OpVec);
19641 }
19642
19643 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19644 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19645 SmallVector<Value *> GEPs;
19646 for (Value *V : E->Scalars) {
19647 if (isa<GetElementPtrInst>(V))
19648 GEPs.push_back(V);
19649 }
19650 V = ::propagateMetadata(I, GEPs);
19651 }
19652
19653 V = FinalShuffle(V, E);
19654
19655 E->VectorizedValue = V;
19656 ++NumVectorInstructions;
19657
19658 return V;
19659 }
19660 case Instruction::Call: {
19661 CallInst *CI = cast<CallInst>(VL0);
19662 setInsertPointAfterBundle(E);
19663
19664 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19665
19666 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19667 CI, ID, VecTy->getNumElements(),
19668 It != MinBWs.end() ? It->second.first : 0, TTI);
19669 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19670 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19671 VecCallCosts.first <= VecCallCosts.second;
19672
19673 Value *ScalarArg = nullptr;
19674 SmallVector<Value *> OpVecs;
19675 SmallVector<Type *, 2> TysForDecl;
19676 // Add return type if intrinsic is overloaded on it.
19677 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19678 TysForDecl.push_back(VecTy);
19679 auto *CEI = cast<CallInst>(VL0);
19680 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19681 // Some intrinsics have scalar arguments. This argument should not be
19682 // vectorized.
19683 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19684 ScalarArg = CEI->getArgOperand(I);
19685 // If we decided to reduce the bitwidth of the abs intrinsic, its second argument
19686 // must be set to false (do not return poison if the value is the signed minimum).
19687 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19688 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19689 ScalarArg = Builder.getFalse();
19690 OpVecs.push_back(ScalarArg);
19691 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19692 TysForDecl.push_back(ScalarArg->getType());
19693 continue;
19694 }
19695
19696 Value *OpVec = vectorizeOperand(E, I);
19697 ScalarArg = CEI->getArgOperand(I);
19698 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19699 ScalarArg->getType()->getScalarType() &&
19700 It == MinBWs.end()) {
19701 auto *CastTy =
19702 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19703 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19704 } else if (It != MinBWs.end()) {
19705 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19706 }
19707 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19708 OpVecs.push_back(OpVec);
19709 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19710 TysForDecl.push_back(OpVec->getType());
19711 }
19712
19713 Function *CF;
19714 if (!UseIntrinsic) {
19715 VFShape Shape =
19716 VFShape::get(CI->getFunctionType(),
19717 ElementCount::getFixed(static_cast<unsigned>(VecTy->getNumElements())),
19718 false /*HasGlobalPred*/);
19719 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19720 } else {
19721 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19722 }
19723
19724 SmallVector<OperandBundleDef, 1> OpBundles;
19725 CI->getOperandBundlesAsDefs(OpBundles);
19726 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19727
19728 propagateIRFlags(V, E->Scalars, VL0);
19729 V = FinalShuffle(V, E);
19730
19731 E->VectorizedValue = V;
19732 ++NumVectorInstructions;
19733 return V;
19734 }
19735 case Instruction::ShuffleVector: {
19736 Value *V;
19737 if (SLPReVec && !E->isAltShuffle()) {
19738 setInsertPointAfterBundle(E);
19739 Value *Src = vectorizeOperand(E, 0);
19740 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19741 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19742 SmallVector<int> NewMask(ThisMask.size());
19743 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19744 return SVSrc->getShuffleMask()[Mask];
19745 });
19746 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19747 SVSrc->getOperand(1), NewMask);
19748 } else {
19749 V = Builder.CreateShuffleVector(Src, ThisMask);
19750 }
19751 propagateIRFlags(V, E->Scalars, VL0);
19752 if (auto *I = dyn_cast<Instruction>(V))
19753 V = ::propagateMetadata(I, E->Scalars);
19754 V = FinalShuffle(V, E);
19755 } else {
19756 assert(E->isAltShuffle() &&
19757 ((Instruction::isBinaryOp(E->getOpcode()) &&
19758 Instruction::isBinaryOp(E->getAltOpcode())) ||
19759 (Instruction::isCast(E->getOpcode()) &&
19760 Instruction::isCast(E->getAltOpcode())) ||
19761 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19762 "Invalid Shuffle Vector Operand");
19763
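// Note (illustration): for an alternate-opcode node both opcodes are applied
// to the whole vector and the per-lane results are blended with a shuffle.
// E.g. scalars {a0+b0, a1-b1, a2+b2, a3-b3} become
//   %v0 = add <4 x i32> %a, %b
//   %v1 = sub <4 x i32> %a, %b
//   %r  = shufflevector %v0, %v1, <0, 5, 2, 7>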
19764 Value *LHS = nullptr, *RHS = nullptr;
19765 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19766 setInsertPointAfterBundle(E);
19767 LHS = vectorizeOperand(E, 0);
19768 RHS = vectorizeOperand(E, 1);
19769 } else {
19770 setInsertPointAfterBundle(E);
19771 LHS = vectorizeOperand(E, 0);
19772 }
19773 if (LHS && RHS &&
19774 ((Instruction::isBinaryOp(E->getOpcode()) &&
19775 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19776 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19777 assert((It != MinBWs.end() ||
19778 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19779 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19780 MinBWs.contains(getOperandEntry(E, 0)) ||
19781 MinBWs.contains(getOperandEntry(E, 1))) &&
19782 "Expected item in MinBWs.");
19783 Type *CastTy = VecTy;
19784 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19785 if (cast<VectorType>(LHS->getType())
19786 ->getElementType()
19787 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19788 ->getElementType()
19789 ->getIntegerBitWidth())
19790 CastTy = RHS->getType();
19791 else
19792 CastTy = LHS->getType();
19793 }
19794 if (LHS->getType() != CastTy)
19795 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19796 if (RHS->getType() != CastTy)
19797 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19798 }
19799
19800 Value *V0, *V1;
19801 if (Instruction::isBinaryOp(E->getOpcode())) {
19802 V0 = Builder.CreateBinOp(
19803 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19804 V1 = Builder.CreateBinOp(
19805 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19806 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19807 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19808 auto *AltCI = cast<CmpInst>(E->getAltOp());
19809 CmpInst::Predicate AltPred = AltCI->getPredicate();
19810 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19811 } else {
19812 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19813 unsigned SrcBWSz = DL->getTypeSizeInBits(
19814 cast<VectorType>(LHS->getType())->getElementType());
19815 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19816 if (BWSz <= SrcBWSz) {
19817 if (BWSz < SrcBWSz)
19818 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
19819 assert(LHS->getType() == VecTy &&
19820 "Expected same type as operand.");
19821 if (auto *I = dyn_cast<Instruction>(LHS))
19822 LHS = ::propagateMetadata(I, E->Scalars);
19823 LHS = FinalShuffle(LHS, E);
19824 E->VectorizedValue = LHS;
19825 ++NumVectorInstructions;
19826 return LHS;
19827 }
19828 }
19829 V0 = Builder.CreateCast(
19830 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
19831 V1 = Builder.CreateCast(
19832 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
19833 }
19834 // Add V0 and V1 to later analysis to try to find and remove matching
19835 // instruction, if any.
19836 for (Value *V : {V0, V1}) {
19837 if (auto *I = dyn_cast<Instruction>(V)) {
19838 GatherShuffleExtractSeq.insert(I);
19839 CSEBlocks.insert(I->getParent());
19840 }
19841 }
19842
19843 // Create shuffle to take alternate operations from the vector.
19844 // Also, gather up main and alt scalar ops to propagate IR flags to
19845 // each vector operation.
19846 ValueList OpScalars, AltScalars;
19847 SmallVector<int> Mask;
19848 E->buildAltOpShuffleMask(
19849 [E, this](Instruction *I) {
19850 assert(E->getMatchingMainOpOrAltOp(I) &&
19851 "Unexpected main/alternate opcode");
19852 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
19853 *TLI);
19854 },
19855 Mask, &OpScalars, &AltScalars);
19856
19857 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
19858 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
19859 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
19860 // Drop nuw flags for abs(sub(commutative), true).
19861 if (auto *I = dyn_cast<Instruction>(Vec);
19862 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
19863 any_of(E->Scalars, [](Value *V) {
19864 if (isa<PoisonValue>(V))
19865 return false;
19866 auto *IV = cast<Instruction>(V);
19867 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19868 }))
19869 I->setHasNoUnsignedWrap(/*b=*/false);
19870 };
19871 DropNuwFlag(V0, E->getOpcode());
19872 DropNuwFlag(V1, E->getAltOpcode());
19873
19874 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19875 assert(SLPReVec && "FixedVectorType is not expected.");
19876 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
19877 }
19878 V = Builder.CreateShuffleVector(V0, V1, Mask);
19879 if (auto *I = dyn_cast<Instruction>(V)) {
19880 V = ::propagateMetadata(I, E->Scalars);
19881 GatherShuffleExtractSeq.insert(I);
19882 CSEBlocks.insert(I->getParent());
19883 }
19884 }
19885
19886 E->VectorizedValue = V;
19887 ++NumVectorInstructions;
19888
19889 return V;
19890 }
19891 default:
19892 llvm_unreachable("unknown inst");
19893 }
19894 return nullptr;
19895}
19896
19897 Value *BoUpSLP::vectorizeTree() {
19898 ExtraValueToDebugLocsMap ExternallyUsedValues;
19899 return vectorizeTree(ExternallyUsedValues);
19900}
19901
19902 Value *BoUpSLP::vectorizeTree(
19903 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
19904 Instruction *ReductionRoot,
19905 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
19906 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
19907 // need to rebuild it.
19908 EntryToLastInstruction.clear();
19909 // All blocks must be scheduled before any instructions are inserted.
19910 for (auto &BSIter : BlocksSchedules)
19911 scheduleBlock(*this, BSIter.second.get());
19912 // Cache last instructions for the nodes to avoid side effects, which may
19913 // appear during vectorization, like extra uses, etc.
19914 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19915 if (TE->isGather())
19916 continue;
19917 (void)getLastInstructionInBundle(TE.get());
19918 }
19919
19920 if (ReductionRoot)
19921 Builder.SetInsertPoint(ReductionRoot->getParent(),
19922 ReductionRoot->getIterator());
19923 else
19924 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
19925
19926 // Vectorize gather operands of the nodes with the external uses only.
19927 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
19928 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19929 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
19930 TE->UserTreeIndex.UserTE->hasState() &&
19931 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
19932 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
19933 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
19934 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
19935 all_of(TE->UserTreeIndex.UserTE->Scalars,
19936 [](Value *V) { return isUsedOutsideBlock(V); })) {
19937 Instruction &LastInst =
19938 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
19939 GatherEntries.emplace_back(TE.get(), &LastInst);
19940 }
19941 }
19942 for (auto &Entry : GatherEntries) {
19943 IRBuilderBase::InsertPointGuard Guard(Builder);
19944 Builder.SetInsertPoint(Entry.second);
19945 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
19946 (void)vectorizeTree(Entry.first);
19947 }
19948 // Emit gathered loads first to emit better code for the users of those
19949 // gathered loads.
19950 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19951 if (GatheredLoadsEntriesFirst.has_value() &&
19952 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
19953 (!TE->isGather() || TE->UserTreeIndex)) {
19954 assert((TE->UserTreeIndex ||
19955 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
19956 "Expected gathered load node.");
19957 (void)vectorizeTree(TE.get());
19958 }
19959 }
19960 (void)vectorizeTree(VectorizableTree[0].get());
19961 // Run through the list of postponed gathers and emit them, replacing the temp
19962 // emitted allocas with actual vector instructions.
19963 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
19964 SmallDenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
19965 for (const TreeEntry *E : PostponedNodes) {
19966 auto *TE = const_cast<TreeEntry *>(E);
19967 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
19968 TE->VectorizedValue = nullptr;
19969 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
19970 // If the user is a PHI node, its vector code has to be inserted right before
19971 // the block terminator. Since the node was delayed, there were some unresolved
19972 // dependencies at the moment when the stub instruction was emitted. If any of
19973 // these dependencies turns out to be an operand of another PHI coming from this
19974 // same block, the position of the stub instruction becomes invalid. This is
19975 // because the source vector that is supposed to feed this gather node was
19976 // inserted at the end of the block [after the stub instruction]. So we need
19977 // to adjust the insertion point again, to the end of the block.
19978 if (isa<PHINode>(UserI)) {
19979 // Insert before all users.
19980 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
19981 for (User *U : PrevVec->users()) {
19982 if (U == UserI)
19983 continue;
19984 auto *UI = dyn_cast<Instruction>(U);
19985 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
19986 continue;
19987 if (UI->comesBefore(InsertPt))
19988 InsertPt = UI;
19989 }
19990 Builder.SetInsertPoint(InsertPt);
19991 } else {
19992 Builder.SetInsertPoint(PrevVec);
19993 }
19994 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
19995 Value *Vec = vectorizeTree(TE);
19996 if (auto *VecI = dyn_cast<Instruction>(Vec);
19997 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
19998 Builder.GetInsertPoint()->comesBefore(VecI))
19999 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20000 Builder.GetInsertPoint());
20001 if (Vec->getType() != PrevVec->getType()) {
20002 assert(Vec->getType()->isIntOrIntVectorTy() &&
20003 PrevVec->getType()->isIntOrIntVectorTy() &&
20004 "Expected integer vector types only.");
20005 std::optional<bool> IsSigned;
20006 for (Value *V : TE->Scalars) {
20007 if (isVectorized(V)) {
20008 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20009 auto It = MinBWs.find(MNTE);
20010 if (It != MinBWs.end()) {
20011 IsSigned = IsSigned.value_or(false) || It->second.second;
20012 if (*IsSigned)
20013 break;
20014 }
20015 }
20016 if (IsSigned.value_or(false))
20017 break;
20018 // Scan through gather nodes.
20019 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20020 auto It = MinBWs.find(BVE);
20021 if (It != MinBWs.end()) {
20022 IsSigned = IsSigned.value_or(false) || It->second.second;
20023 if (*IsSigned)
20024 break;
20025 }
20026 }
20027 if (IsSigned.value_or(false))
20028 break;
20029 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20030 IsSigned =
20031 IsSigned.value_or(false) ||
20032 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20033 continue;
20034 }
20035 if (IsSigned.value_or(false))
20036 break;
20037 }
20038 }
20039 if (IsSigned.value_or(false)) {
20040 // Final attempt - check user node.
20041 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20042 if (It != MinBWs.end())
20043 IsSigned = It->second.second;
20044 }
20045 assert(IsSigned &&
20046 "Expected user node or perfect diamond match in MinBWs.");
20047 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20048 }
20049 PrevVec->replaceAllUsesWith(Vec);
20050 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20051 // Replace the stub vector node, if it was used before for one of the
20052 // buildvector nodes already.
20053 auto It = PostponedValues.find(PrevVec);
20054 if (It != PostponedValues.end()) {
20055 for (TreeEntry *VTE : It->getSecond())
20056 VTE->VectorizedValue = Vec;
20057 }
20058 eraseInstruction(PrevVec);
20059 }
20060
20061 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20062 << " values .\n");
20063
20064 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20065 // Maps vector instruction to original insertelement instruction
20066 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20067 // Maps extract Scalar to the corresponding extractelement instruction in the
20068 // basic block. Only one extractelement per block should be emitted.
20069 SmallDenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20070 ScalarToEEs;
20071 SmallDenseSet<Value *, 4> UsedInserts;
20072 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20073 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20074 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20075 // Extract all of the elements with the external uses.
20076 for (const auto &ExternalUse : ExternalUses) {
20077 Value *Scalar = ExternalUse.Scalar;
20078 llvm::User *User = ExternalUse.User;
20079
20080 // Skip users that we already RAUW. This happens when one instruction
20081 // has multiple uses of the same value.
20082 if (User && !is_contained(Scalar->users(), User))
20083 continue;
20084 const TreeEntry *E = &ExternalUse.E;
20085 assert(E && "Invalid scalar");
20086 assert(!E->isGather() && "Extracting from a gather list");
20087 // Non-instruction pointers are not deleted, just skip them.
20088 if (E->getOpcode() == Instruction::GetElementPtr &&
20089 !isa<GetElementPtrInst>(Scalar))
20090 continue;
20091
20092 Value *Vec = E->VectorizedValue;
20093 assert(Vec && "Can't find vectorizable value");
20094
20095 Value *Lane = Builder.getInt32(ExternalUse.Lane);
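// ExtractAndExtendIfNeeded extracts the requested lane from the vectorized
// value, reusing an already emitted extractelement in the same block when
// possible, and sign-/zero-extends the result if the tree was vectorized at a
// narrower integer width than the original scalar.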
20096 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20097 if (Scalar->getType() != Vec->getType()) {
20098 Value *Ex = nullptr;
20099 Value *ExV = nullptr;
20100 auto *Inst = dyn_cast<Instruction>(Scalar);
20101 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20102 auto It = ScalarToEEs.find(Scalar);
20103 if (It != ScalarToEEs.end()) {
20104 // No need to emit many extracts, just move the only one in the
20105 // current block.
20106 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20107 : Builder.GetInsertBlock());
20108 if (EEIt != It->second.end()) {
20109 Value *PrevV = EEIt->second.first;
20110 if (auto *I = dyn_cast<Instruction>(PrevV);
20111 I && !ReplaceInst &&
20112 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20113 Builder.GetInsertPoint()->comesBefore(I)) {
20114 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20115 Builder.GetInsertPoint());
20116 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20117 CI->moveAfter(I);
20118 }
20119 Ex = PrevV;
20120 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20121 }
20122 }
20123 if (!Ex) {
20124 // "Reuse" the existing extract to improve final codegen.
20125 if (ReplaceInst) {
20126 // Leave the instruction as is, if that gives cheaper extracts and all
20127 // operands are scalar.
20128 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20129 IgnoredExtracts.insert(EE);
20130 Ex = EE;
20131 } else {
20132 auto *CloneInst = Inst->clone();
20133 CloneInst->insertBefore(Inst->getIterator());
20134 if (Inst->hasName())
20135 CloneInst->takeName(Inst);
20136 Ex = CloneInst;
20137 }
20138 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20139 ES && isa<Instruction>(Vec)) {
20140 Value *V = ES->getVectorOperand();
20141 auto *IVec = cast<Instruction>(Vec);
20142 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20143 V = ETEs.front()->VectorizedValue;
20144 if (auto *IV = dyn_cast<Instruction>(V);
20145 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20146 IV->comesBefore(IVec))
20147 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20148 else
20149 Ex = Builder.CreateExtractElement(Vec, Lane);
20150 } else if (auto *VecTy =
20151 dyn_cast<FixedVectorType>(Scalar->getType())) {
20152 assert(SLPReVec && "FixedVectorType is not expected.");
20153 unsigned VecTyNumElements = VecTy->getNumElements();
20154 // When REVEC is enabled, we need to extract a vector.
20155 // Note: The element size of Scalar may be different from the
20156 // element size of Vec.
20157 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20158 ExternalUse.Lane * VecTyNumElements);
20159 } else {
20160 Ex = Builder.CreateExtractElement(Vec, Lane);
20161 }
20162 // If necessary, sign-extend or zero-extend ScalarRoot
20163 // to the larger type.
20164 ExV = Ex;
20165 if (Scalar->getType() != Ex->getType())
20166 ExV = Builder.CreateIntCast(
20167 Ex, Scalar->getType(),
20168 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20169 auto *I = dyn_cast<Instruction>(Ex);
20170 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20171 : &F->getEntryBlock(),
20172 std::make_pair(Ex, ExV));
20173 }
20174 // The then-branch of the previous if may produce constants, since
20175 // operand 0 might be a constant.
20176 if (auto *ExI = dyn_cast<Instruction>(Ex);
20177 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20178 GatherShuffleExtractSeq.insert(ExI);
20179 CSEBlocks.insert(ExI->getParent());
20180 }
20181 return ExV;
20182 }
20183 assert(isa<FixedVectorType>(Scalar->getType()) &&
20184 isa<InsertElementInst>(Scalar) &&
20185 "In-tree scalar of vector type is not insertelement?");
20186 auto *IE = cast<InsertElementInst>(Scalar);
20187 VectorToInsertElement.try_emplace(Vec, IE);
20188 return Vec;
20189 };
20190 // If User == nullptr, the Scalar remains as scalar in vectorized
20191 // instructions or is used as extra arg. Generate ExtractElement instruction
20192 // and update the record for this scalar in ExternallyUsedValues.
20193 if (!User) {
20194 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20195 continue;
20196 assert(
20197 (ExternallyUsedValues.count(Scalar) ||
20198 ExternalUsesWithNonUsers.count(Scalar) ||
20199 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20200 any_of(
20201 Scalar->users(),
20202 [&, TTI = TTI](llvm::User *U) {
20203 if (ExternalUsesAsOriginalScalar.contains(U))
20204 return true;
20205 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20206 return !UseEntries.empty() &&
20207 (E->State == TreeEntry::Vectorize ||
20208 E->State == TreeEntry::StridedVectorize ||
20209 E->State == TreeEntry::CompressVectorize) &&
20210 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20211 return (UseEntry->State == TreeEntry::Vectorize ||
20212 UseEntry->State ==
20213 TreeEntry::StridedVectorize ||
20214 UseEntry->State ==
20215 TreeEntry::CompressVectorize) &&
20216 doesInTreeUserNeedToExtract(
20217 Scalar, getRootEntryInstruction(*UseEntry),
20218 TLI, TTI);
20219 });
20220 })) &&
20221 "Scalar with nullptr User must be registered in "
20222 "ExternallyUsedValues map or remain as scalar in vectorized "
20223 "instructions");
20224 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20225 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20226 if (PHI->getParent()->isLandingPad())
20227 Builder.SetInsertPoint(
20228 PHI->getParent(),
20229 std::next(
20230 PHI->getParent()->getLandingPadInst()->getIterator()));
20231 else
20232 Builder.SetInsertPoint(PHI->getParent(),
20233 PHI->getParent()->getFirstNonPHIIt());
20234 } else {
20235 Builder.SetInsertPoint(VecI->getParent(),
20236 std::next(VecI->getIterator()));
20237 }
20238 } else {
20239 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20240 }
20241 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20242 // Required to update internally referenced instructions.
20243 if (Scalar != NewInst) {
20244 assert((!isa<ExtractElementInst>(Scalar) ||
20245 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20246 "Extractelements should not be replaced.");
20247 Scalar->replaceAllUsesWith(NewInst);
20248 }
20249 continue;
20250 }
20251
20252 if (auto *VU = dyn_cast<InsertElementInst>(User);
20253 VU && VU->getOperand(1) == Scalar) {
20254 // Skip if the scalar is another vector op or Vec is not an instruction.
20255 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20256 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20257 if (!UsedInserts.insert(VU).second)
20258 continue;
20259 // Need to use original vector, if the root is truncated.
20260 auto BWIt = MinBWs.find(E);
20261 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20262 auto *ScalarTy = FTy->getElementType();
20263 auto Key = std::make_pair(Vec, ScalarTy);
20264 auto VecIt = VectorCasts.find(Key);
20265 if (VecIt == VectorCasts.end()) {
20266 IRBuilderBase::InsertPointGuard Guard(Builder);
20267 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20268 if (IVec->getParent()->isLandingPad())
20269 Builder.SetInsertPoint(IVec->getParent(),
20270 std::next(IVec->getParent()
20271 ->getLandingPadInst()
20272 ->getIterator()));
20273 else
20274 Builder.SetInsertPoint(
20275 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20276 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20277 Builder.SetInsertPoint(IVec->getNextNode());
20278 }
20279 Vec = Builder.CreateIntCast(
20280 Vec,
20281 getWidenedType(
20282 ScalarTy,
20283 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20284 BWIt->second.second);
20285 VectorCasts.try_emplace(Key, Vec);
20286 } else {
20287 Vec = VecIt->second;
20288 }
20289 }
20290
20291 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20292 if (InsertIdx) {
20293 auto *It = find_if(
20294 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20295 // Checks if 2 insertelements are from the same buildvector.
20296 InsertElementInst *VecInsert = Data.InsertElements.front();
20297 return areTwoInsertFromSameBuildVector(
20298 VU, VecInsert,
20299 [](InsertElementInst *II) { return II->getOperand(0); });
20300 });
20301 unsigned Idx = *InsertIdx;
20302 if (It == ShuffledInserts.end()) {
20303 (void)ShuffledInserts.emplace_back();
20304 It = std::next(ShuffledInserts.begin(),
20305 ShuffledInserts.size() - 1);
20306 }
20307 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20308 if (Mask.empty())
20309 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20310 Mask[Idx] = ExternalUse.Lane;
20311 It->InsertElements.push_back(cast<InsertElementInst>(User));
20312 continue;
20313 }
20314 }
20315 }
20316 }
20317
20318 // Generate extracts for out-of-tree users.
20319 // Find the insertion point for the extractelement lane.
20320 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20321 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20322 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20323 if (PH->getIncomingValue(I) == Scalar) {
20324 Instruction *IncomingTerminator =
20325 PH->getIncomingBlock(I)->getTerminator();
20326 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20327 Builder.SetInsertPoint(VecI->getParent(),
20328 std::next(VecI->getIterator()));
20329 } else {
20330 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20331 }
20332 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20333 PH->setOperand(I, NewInst);
20334 }
20335 }
20336 } else {
20337 Builder.SetInsertPoint(cast<Instruction>(User));
20338 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20339 User->replaceUsesOfWith(Scalar, NewInst);
20340 }
20341 } else {
20342 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20343 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20344 User->replaceUsesOfWith(Scalar, NewInst);
20345 }
20346
20347 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20348 }
20349
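// CreateShuffle interprets Mask as indexing into the concatenation of V1 and
// V2 (indices >= VF select from V2), splits it into two per-source masks and
// emits the shuffle through ShuffleInstructionBuilder.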
20350 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20351 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20352 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20353 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20354 for (int I = 0, E = Mask.size(); I < E; ++I) {
20355 if (Mask[I] < VF)
20356 CombinedMask1[I] = Mask[I];
20357 else
20358 CombinedMask2[I] = Mask[I] - VF;
20359 }
20360 ShuffleInstructionBuilder ShuffleBuilder(
20361 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20362 ShuffleBuilder.add(V1, CombinedMask1);
20363 if (V2)
20364 ShuffleBuilder.add(V2, CombinedMask2);
20365 return ShuffleBuilder.finalize({}, {}, {});
20366 };
20367
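// ResizeToVF brings Vec to the width of Mask when they differ: if the mask
// refers to lanes beyond its own width, the mask is applied directly with a
// combining shuffle and {Vec, true} is returned; otherwise (unless a single
// mask is requested) Vec is resized with an identity-style mask and
// {Vec, false} is returned.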
20368 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20369 bool ForSingleMask) {
20370 unsigned VF = Mask.size();
20371 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20372 if (VF != VecVF) {
20373 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20374 Vec = CreateShuffle(Vec, nullptr, Mask);
20375 return std::make_pair(Vec, true);
20376 }
20377 if (!ForSingleMask) {
20378 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20379 for (unsigned I = 0; I < VF; ++I) {
20380 if (Mask[I] != PoisonMaskElem)
20381 ResizeMask[Mask[I]] = Mask[I];
20382 }
20383 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20384 }
20385 }
20386
20387 return std::make_pair(Vec, false);
20388 };
20389 // Perform shuffling of the vectorize tree entries for better handling of
20390 // external extracts.
20391 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20392 // Find the first and the last instruction in the list of insertelements.
20393 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20394 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20395 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20396 Builder.SetInsertPoint(LastInsert);
20397 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20398 Value *NewInst = performExtractsShuffleAction<Value>(
20399 MutableArrayRef(Vector.data(), Vector.size()),
20400 FirstInsert->getOperand(0),
20401 [](Value *Vec) {
20402 return cast<VectorType>(Vec->getType())
20403 ->getElementCount()
20404 .getKnownMinValue();
20405 },
20406 ResizeToVF,
20407 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20408 ArrayRef<Value *> Vals) {
20409 assert((Vals.size() == 1 || Vals.size() == 2) &&
20410 "Expected exactly 1 or 2 input values.");
20411 if (Vals.size() == 1) {
20412 // Do not create shuffle if the mask is a simple identity
20413 // non-resizing mask.
20414 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20415 ->getNumElements() ||
20416 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20417 return CreateShuffle(Vals.front(), nullptr, Mask);
20418 return Vals.front();
20419 }
20420 return CreateShuffle(Vals.front() ? Vals.front()
20421 : FirstInsert->getOperand(0),
20422 Vals.back(), Mask);
20423 });
20424 auto It = ShuffledInserts[I].InsertElements.rbegin();
20425 // Rebuild buildvector chain.
20426 InsertElementInst *II = nullptr;
20427 if (It != ShuffledInserts[I].InsertElements.rend())
20428 II = *It;
20429 SmallVector<Instruction *> Inserts;
20430 while (It != ShuffledInserts[I].InsertElements.rend()) {
20431 assert(II && "Must be an insertelement instruction.");
20432 if (*It == II)
20433 ++It;
20434 else
20435 Inserts.push_back(cast<Instruction>(II));
20436 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20437 }
20438 for (Instruction *II : reverse(Inserts)) {
20439 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20440 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20441 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20442 II->moveAfter(NewI);
20443 NewInst = II;
20444 }
20445 LastInsert->replaceAllUsesWith(NewInst);
20446 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20447 IE->replaceUsesOfWith(IE->getOperand(0),
20448 PoisonValue::get(IE->getOperand(0)->getType()));
20449 IE->replaceUsesOfWith(IE->getOperand(1),
20450 PoisonValue::get(IE->getOperand(1)->getType()));
20451 eraseInstruction(IE);
20452 }
20453 CSEBlocks.insert(LastInsert->getParent());
20454 }
20455
20456 SmallVector<Instruction *> RemovedInsts;
20457 // For each vectorized value:
20458 for (auto &TEPtr : VectorizableTree) {
20459 TreeEntry *Entry = TEPtr.get();
20460
20461 // No need to handle users of gathered values.
20462 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20463 continue;
20464
20465 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20466
20467 // For each lane:
20468 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20469 Value *Scalar = Entry->Scalars[Lane];
20470
20471 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20472 !isa<GetElementPtrInst>(Scalar))
20473 continue;
20474 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20475 EE && IgnoredExtracts.contains(EE))
20476 continue;
20477 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20478 continue;
20479#ifndef NDEBUG
20480 Type *Ty = Scalar->getType();
20481 if (!Ty->isVoidTy()) {
20482 for (User *U : Scalar->users()) {
20483 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20484
20485 // It is legal to delete users in the ignorelist.
20486 assert((isVectorized(U) ||
20487 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20488 (isa_and_nonnull<Instruction>(U) &&
20489 isDeleted(cast<Instruction>(U)))) &&
20490 "Deleting out-of-tree value");
20491 }
20492 }
20493#endif
20494 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20495 auto *I = cast<Instruction>(Scalar);
20496 RemovedInsts.push_back(I);
20497 }
20498 }
20499
20500 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20501 // new vector instruction.
20502 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20503 V->mergeDIAssignID(RemovedInsts);
20504
20505 // Clear up reduction references, if any.
20506 if (UserIgnoreList) {
20507 for (Instruction *I : RemovedInsts) {
20508 const TreeEntry *IE = getTreeEntries(I).front();
20509 if (IE->Idx != 0 &&
20510 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20511 (ValueToGatherNodes.lookup(I).contains(
20512 VectorizableTree.front().get()) ||
20513 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20514 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20515 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20516 IE->UserTreeIndex &&
20517 is_contained(VectorizableTree.front()->Scalars, I)) &&
20518 !(GatheredLoadsEntriesFirst.has_value() &&
20519 IE->Idx >= *GatheredLoadsEntriesFirst &&
20520 VectorizableTree.front()->isGather() &&
20521 is_contained(VectorizableTree.front()->Scalars, I)))
20522 continue;
20523 SmallVector<SelectInst *> LogicalOpSelects;
20524 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20525 // Do not replace condition of the logical op in form select <cond>.
20526 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20527 (match(U.getUser(), m_LogicalAnd()) ||
20528 match(U.getUser(), m_LogicalOr())) &&
20529 U.getOperandNo() == 0;
20530 if (IsPoisoningLogicalOp) {
20531 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20532 return false;
20533 }
20534 return UserIgnoreList->contains(U.getUser());
20535 });
20536 // Replace conditions of the poisoning logical ops with the non-poison
20537 // constant value.
20538 for (SelectInst *SI : LogicalOpSelects)
20539 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20540 }
20541 }
20542 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20543 // cache correctness.
20544 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
20545 // - instructions are not deleted until later.
20546 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20547
20548 Builder.ClearInsertionPoint();
20549 InstrElementSize.clear();
20550
20551 const TreeEntry &RootTE = *VectorizableTree.front();
20552 Value *Vec = RootTE.VectorizedValue;
20553 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20554 It != MinBWs.end() &&
20555 ReductionBitWidth != It->second.first) {
20556 IRBuilder<>::InsertPointGuard Guard(Builder);
20557 Builder.SetInsertPoint(ReductionRoot->getParent(),
20558 ReductionRoot->getIterator());
20559 Vec = Builder.CreateIntCast(
20560 Vec,
20561 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20562 cast<VectorType>(Vec->getType())->getElementCount()),
20563 It->second.second);
20564 }
20565 return Vec;
20566}
20567
20569 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20570 << " gather sequences instructions.\n");
20571 // LICM InsertElementInst sequences.
20572 for (Instruction *I : GatherShuffleExtractSeq) {
20573 if (isDeleted(I))
20574 continue;
20575
20576 // Check if this block is inside a loop.
20577 Loop *L = LI->getLoopFor(I->getParent());
20578 if (!L)
20579 continue;
20580
20581 // Check if it has a preheader.
20582 BasicBlock *PreHeader = L->getLoopPreheader();
20583 if (!PreHeader)
20584 continue;
20585
20586 // If the vector or the element that we insert into it are
20587 // instructions that are defined in this basic block then we can't
20588 // hoist this instruction.
20589 if (any_of(I->operands(), [L](Value *V) {
20590 auto *OpI = dyn_cast<Instruction>(V);
20591 return OpI && L->contains(OpI);
20592 }))
20593 continue;
20594
20595 // We can hoist this instruction. Move it to the pre-header.
20596 I->moveBefore(PreHeader->getTerminator()->getIterator());
20597 CSEBlocks.insert(PreHeader);
20598 }
20599
20600 // Make a list of all reachable blocks in our CSE queue.
20601 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20602 CSEWorkList.reserve(CSEBlocks.size());
20603 for (BasicBlock *BB : CSEBlocks)
20604 if (DomTreeNode *N = DT->getNode(BB)) {
20605 assert(DT->isReachableFromEntry(N));
20606 CSEWorkList.push_back(N);
20607 }
20608
20609 // Sort blocks by domination. This ensures we visit a block after all blocks
20610 // dominating it are visited.
20611 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20612 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20613 "Different nodes should have different DFS numbers");
20614 return A->getDFSNumIn() < B->getDFSNumIn();
20615 });
20616
20617 // Less defined shuffles can be replaced by the more defined copies.
20618 // Between two shuffles one is less defined if it has the same vector operands
20619 // and its mask indices are the same as in the first one or are undefs. E.g.
20620 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20621 // poison, <0, 0, 0, 0>.
20622 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20623 Instruction *I2,
20624 SmallVectorImpl<int> &NewMask) {
20625 if (I1->getType() != I2->getType())
20626 return false;
20627 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20628 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20629 if (!SI1 || !SI2)
20630 return I1->isIdenticalTo(I2);
20631 if (SI1->isIdenticalTo(SI2))
20632 return true;
20633 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20634 if (SI1->getOperand(I) != SI2->getOperand(I))
20635 return false;
20636 // Check if the second instruction is more defined than the first one.
20637 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20638 ArrayRef<int> SM1 = SI1->getShuffleMask();
20639 // Count trailing undefs in the mask to check the final number of used
20640 // registers.
20641 unsigned LastUndefsCnt = 0;
20642 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20643 if (SM1[I] == PoisonMaskElem)
20644 ++LastUndefsCnt;
20645 else
20646 LastUndefsCnt = 0;
20647 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20648 NewMask[I] != SM1[I])
20649 return false;
20650 if (NewMask[I] == PoisonMaskElem)
20651 NewMask[I] = SM1[I];
20652 }
20653 // Check if the last undefs actually change the final number of used vector
20654 // registers.
20655 return SM1.size() - LastUndefsCnt > 1 &&
20656 ::getNumberOfParts(*TTI, SI1->getType()) ==
20657 ::getNumberOfParts(
20658 *TTI, getWidenedType(SI1->getType()->getElementType(),
20659 SM1.size() - LastUndefsCnt));
20660 };
20661 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20662 // instructions. TODO: We can further optimize this scan if we split the
20663 // instructions into different buckets based on the insert lane.
20664 SmallVector<Instruction *, 16> Visited;
20665 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20666 assert(*I &&
20667 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20668 "Worklist not sorted properly!");
20669 BasicBlock *BB = (*I)->getBlock();
20670 // For all instructions in blocks containing gather sequences:
20671 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20672 if (isDeleted(&In))
20673 continue;
20674 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20675 !GatherShuffleExtractSeq.contains(&In))
20676 continue;
20677
20678 // Check if we can replace this instruction with any of the
20679 // visited instructions.
20680 bool Replaced = false;
20681 for (Instruction *&V : Visited) {
20682 SmallVector<int> NewMask;
20683 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20684 DT->dominates(V->getParent(), In.getParent())) {
20685 In.replaceAllUsesWith(V);
20686 eraseInstruction(&In);
20687 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20688 if (!NewMask.empty())
20689 SI->setShuffleMask(NewMask);
20690 Replaced = true;
20691 break;
20692 }
20693 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20694 GatherShuffleExtractSeq.contains(V) &&
20695 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20696 DT->dominates(In.getParent(), V->getParent())) {
20697 In.moveAfter(V);
20698 V->replaceAllUsesWith(&In);
20699 eraseInstruction(V);
20700 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20701 if (!NewMask.empty())
20702 SI->setShuffleMask(NewMask);
20703 V = &In;
20704 Replaced = true;
20705 break;
20706 }
20707 }
20708 if (!Replaced) {
20709 assert(!is_contained(Visited, &In));
20710 Visited.push_back(&In);
20711 }
20712 }
20713 }
20714 CSEBlocks.clear();
20715 GatherShuffleExtractSeq.clear();
20716}
20717
20718BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20719 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20720 auto &BundlePtr =
20721 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20722 for (Value *V : VL) {
20723 if (S.isNonSchedulable(V))
20724 continue;
20725 auto *I = cast<Instruction>(V);
20726 if (S.isCopyableElement(V)) {
20727 // Add a copyable element model.
20728 ScheduleCopyableData &SD =
20729 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20730 // Group the instructions to a bundle.
20731 BundlePtr->add(&SD);
20732 continue;
20733 }
20734 ScheduleData *BundleMember = getScheduleData(V);
20735 assert(BundleMember && "no ScheduleData for bundle member "
20736 "(maybe not in same basic block)");
20737 // Group the instructions to a bundle.
20738 BundlePtr->add(BundleMember);
20739 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20740 BundlePtr.get());
20741 }
20742 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20743 return *BundlePtr;
20744}
20745
20746 // Groups the instructions into a bundle (which is then a single scheduling
20747 // entity) and schedules instructions until the bundle gets ready.
20748std::optional<BoUpSLP::ScheduleBundle *>
20749BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20750 const InstructionsState &S,
20751 const EdgeInfo &EI) {
20752 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20753 // instructions.
20754 bool HasCopyables = S.areInstructionsWithCopyableElements();
20755 if (isa<PHINode>(S.getMainOp()) ||
20756 isVectorLikeInstWithConstOps(S.getMainOp()) ||
20757 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
20758 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
20759 return nullptr;
20760
20761 // Initialize the instruction bundle.
20762 Instruction *OldScheduleEnd = ScheduleEnd;
20763 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20764
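// TryScheduleBundleImpl recalculates dependencies for the (extended) scheduling
// region and the new bundle, resets the schedule if required, and then keeps
// scheduling ready instructions until the bundle itself becomes ready (or the
// ready list is exhausted), which proves there are no cyclic dependencies.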
20765 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20766 // Clear deps or recalculate the region, if the memory instruction is a
20767 // copyable element. It may have memory deps, which must be recalculated.
20768 SmallVector<ScheduleData *> ControlDependentMembers;
20769 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20770 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20771 for (ScheduleEntity *SE : Bundle.getBundle()) {
20772 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20773 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20774 BundleMember && BundleMember->hasValidDependencies()) {
20775 BundleMember->clearDirectDependencies();
20776 if (RegionHasStackSave ||
20777 !isGuaranteedToTransferExecutionToSuccessor(
20778 BundleMember->getInst()))
20779 ControlDependentMembers.push_back(BundleMember);
20780 }
20781 continue;
20782 }
20783 auto *SD = cast<ScheduleData>(SE);
20784 for (const Use &U : SD->getInst()->operands()) {
20785 unsigned &NumOps =
20786 UserOpToNumOps
20787 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
20788 .first->getSecond();
20789 ++NumOps;
20790 if (auto *Op = dyn_cast<Instruction>(U.get());
20791 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
20792 *SLP, NumOps)) {
20793 if (ScheduleData *OpSD = getScheduleData(Op)) {
20794 OpSD->clearDirectDependencies();
20795 if (RegionHasStackSave ||
20796 !isGuaranteedToTransferExecutionToSuccessor(Op))
20797 ControlDependentMembers.push_back(OpSD);
20798 }
20799 }
20800 }
20801 }
20802 };
20803 // The scheduling region got new instructions at the lower end (or it is a
20804 // new region for the first bundle). This makes it necessary to
20805 // recalculate all dependencies.
20806 // It is seldom that this needs to be done a second time after adding the
20807 // initial bundle to the region.
20808 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
20809 for_each(ScheduleDataMap, [&](auto &P) {
20810 if (BB != P.first->getParent())
20811 return;
20812 ScheduleData *SD = P.second;
20813 if (isInSchedulingRegion(*SD))
20814 SD->clearDependencies();
20815 });
20816 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
20817 for_each(P.second, [&](ScheduleCopyableData *SD) {
20818 if (isInSchedulingRegion(*SD))
20819 SD->clearDependencies();
20820 });
20821 });
20822 ReSchedule = true;
20823 }
20824 // Check if the bundle data already has dependencies for copyable elements.
20825 // In that case we need to reset the dependencies and recalculate them.
20826 if (Bundle && !Bundle.getBundle().empty()) {
20827 if (S.areInstructionsWithCopyableElements() ||
20828 !ScheduleCopyableDataMap.empty())
20829 CheckIfNeedToClearDeps(Bundle);
20830 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
20831 << BB->getName() << "\n");
20832 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
20833 ControlDependentMembers);
20834 } else if (!ControlDependentMembers.empty()) {
20835 ScheduleBundle Invalid = ScheduleBundle::invalid();
20836 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
20837 ControlDependentMembers);
20838 }
20839
20840 if (ReSchedule) {
20841 resetSchedule();
20842 initialFillReadyList(ReadyInsts);
20843 }
20844
20845 // Now try to schedule the new bundle or (if no bundle) just calculate
20846 // dependencies. As soon as the bundle is "ready" it means that there are no
20847 // cyclic dependencies and we can schedule it. Note that it is important that
20848 // we don't actually "schedule" the bundle yet.
20849 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
20850 !ReadyInsts.empty()) {
20851 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
20852 assert(Picked->isReady() && "must be ready to schedule");
20853 schedule(*SLP, S, EI, Picked, ReadyInsts);
20854 if (Picked == &Bundle)
20855 break;
20856 }
20857 };
20858
20859 // Make sure that the scheduling region contains all
20860 // instructions of the bundle.
20861 for (Value *V : VL) {
20862 if (S.isNonSchedulable(V))
20863 continue;
20864 if (!extendSchedulingRegion(V, S)) {
20865 // If the scheduling region got new instructions at the lower end (or it
20866 // is a new region for the first bundle), all dependencies must be
20867 // recalculated.
20868 // Otherwise the compiler may crash trying to calculate dependencies
20869 // incorrectly and emit instructions in the wrong order during the actual
20870 // scheduling.
20871 ScheduleBundle Invalid = ScheduleBundle::invalid();
20872 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
20873 return std::nullopt;
20874 }
20875 }
20876
20877 bool ReSchedule = false;
20878 for (Value *V : VL) {
20879 if (S.isNonSchedulable(V))
20880 continue;
20881 SmallVector<ScheduleCopyableData *> CopyableData =
20882 getScheduleCopyableData(cast<Instruction>(V));
20883 if (!CopyableData.empty()) {
20884 for (ScheduleCopyableData *SD : CopyableData)
20885 ReadyInsts.remove(SD);
20886 }
20887 ScheduleData *BundleMember = getScheduleData(V);
20888 assert((BundleMember || S.isCopyableElement(V)) &&
20889 "no ScheduleData for bundle member (maybe not in same basic block)");
20890 if (!BundleMember)
20891 continue;
20892
20893 // Make sure we don't leave the pieces of the bundle in the ready list when
20894 // the whole bundle might not be ready.
20895 ReadyInsts.remove(BundleMember);
20896 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
20897 !Bundles.empty()) {
20898 for (ScheduleBundle *B : Bundles)
20899 ReadyInsts.remove(B);
20900 }
20901
20902 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
20903 continue;
20904 // A bundle member was scheduled as a single instruction before and now
20905 // needs to be scheduled as part of the bundle. We just get rid of the
20906 // existing schedule.
20907 // Alternatively, a bundle member had its deps calculated before it became
20908 // a copyable element - we need to reschedule.
20909 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
20910 << " was already scheduled\n");
20911 ReSchedule = true;
20912 }
20913
20914 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
20915 TryScheduleBundleImpl(ReSchedule, Bundle);
20916 if (!Bundle.isReady()) {
20917 for (ScheduleEntity *BD : Bundle.getBundle()) {
20918 // Copyable data scheduling is just removed.
20919 if (isa<ScheduleCopyableData>(BD))
20920 continue;
20921 if (BD->isReady()) {
20922 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
20923 if (Bundles.empty()) {
20924 ReadyInsts.insert(BD);
20925 continue;
20926 }
20927 for (ScheduleBundle *B : Bundles)
20928 if (B->isReady())
20929 ReadyInsts.insert(B);
20930 }
20931 }
20932 ScheduledBundlesList.pop_back();
20933 SmallVector<ScheduleData *> ControlDependentMembers;
20935 for (Value *V : VL) {
20936 if (S.isNonSchedulable(V))
20937 continue;
20938 auto *I = cast<Instruction>(V);
20939 if (S.isCopyableElement(I)) {
20940 // Remove the copyable data from the scheduling region and restore
20941 // previous mappings.
20942 auto KV = std::make_pair(EI, I);
20943 assert(ScheduleCopyableDataMap.contains(KV) &&
20944 "no ScheduleCopyableData for copyable element");
20945 ScheduleCopyableData *SD =
20946 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
20947 ScheduleCopyableDataMapByUsers[I].remove(SD);
20948 if (EI.UserTE) {
20949 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
20950 const auto *It = find(Op, I);
20951 assert(It != Op.end() && "Lane not set");
20952 SmallPtrSet<Instruction *, 4> Visited;
20953 do {
20954 int Lane = std::distance(Op.begin(), It);
20955 assert(Lane >= 0 && "Lane not set");
20956 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
20957 !EI.UserTE->ReorderIndices.empty())
20958 Lane = EI.UserTE->ReorderIndices[Lane];
20959 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
20960 "Couldn't find extract lane");
20961 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
20962 if (!Visited.insert(In).second) {
20963 It = find(make_range(std::next(It), Op.end()), I);
20964 break;
20965 }
20966 ScheduleCopyableDataMapByInstUser
20967 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
20968 .pop_back();
20969 It = find(make_range(std::next(It), Op.end()), I);
20970 } while (It != Op.end());
20971 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
20972 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
20973 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
20974 }
20975 if (ScheduleCopyableDataMapByUsers[I].empty())
20976 ScheduleCopyableDataMapByUsers.erase(I);
20977 ScheduleCopyableDataMap.erase(KV);
20978 // Need to recalculate dependencies for the actual schedule data.
20979 if (ScheduleData *OpSD = getScheduleData(I)) {
20980 OpSD->clearDirectDependencies();
20981 if (RegionHasStackSave ||
20982 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20983 ControlDependentMembers.push_back(OpSD);
20984 }
20985 continue;
20986 }
20987 ScheduledBundles.find(I)->getSecond().pop_back();
20988 }
20989 if (!ControlDependentMembers.empty()) {
20990 ScheduleBundle Invalid = ScheduleBundle::invalid();
20991 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
20992 ControlDependentMembers);
20993 }
20994 return std::nullopt;
20995 }
20996 return &Bundle;
20997}
20998
20999BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21000 // Allocate a new ScheduleData for the instruction.
21001 if (ChunkPos >= ChunkSize) {
21002 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21003 ChunkPos = 0;
21004 }
21005 return &(ScheduleDataChunks.back()[ChunkPos++]);
21006}
21007
21008bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21009 Value *V, const InstructionsState &S) {
21010 Instruction *I = dyn_cast<Instruction>(V);
21011 assert(I && "bundle member must be an instruction");
21012 if (getScheduleData(I))
21013 return true;
21014 if (!ScheduleStart) {
21015 // It's the first instruction in the new region.
21016 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21017 ScheduleStart = I;
21018 ScheduleEnd = I->getNextNode();
21019 assert(ScheduleEnd && "tried to vectorize a terminator?");
21020 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21021 return true;
21022 }
21023 // Search up and down at the same time, because we don't know if the new
21024 // instruction is above or below the existing scheduling region.
21025 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are not
21026 // counted against the budget. Otherwise debug info could affect codegen.
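// For example, with the current region [ScheduleStart, ScheduleEnd): if the
// upward scan reaches I (or the downward scan falls off the end of the block),
// I lies above the region and ScheduleStart is moved up to I; otherwise the
// downward scan reaches I and ScheduleEnd is moved down past I. The walk gives
// up once ScheduleRegionSizeLimit is exceeded.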
21027 BasicBlock::reverse_iterator UpIter =
21028 ++ScheduleStart->getIterator().getReverse();
21029 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21030 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21031 BasicBlock::iterator LowerEnd = BB->end();
21032 auto IsAssumeLikeIntr = [](const Instruction &I) {
21033 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21034 return II->isAssumeLikeIntrinsic();
21035 return false;
21036 };
21037 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21038 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21039 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21040 &*DownIter != I) {
21041 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21042 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21043 return false;
21044 }
21045
21046 ++UpIter;
21047 ++DownIter;
21048
21049 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21050 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21051 }
21052 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21053 assert(I->getParent() == ScheduleStart->getParent() &&
21054 "Instruction is in wrong basic block.");
21055 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21056 ScheduleStart = I;
21057 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21058 << "\n");
21059 return true;
21060 }
21061 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21062 "Expected to reach top of the basic block or instruction down the "
21063 "lower end.");
21064 assert(I->getParent() == ScheduleEnd->getParent() &&
21065 "Instruction is in wrong basic block.");
21066 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21067 nullptr);
21068 ScheduleEnd = I->getNextNode();
21069 assert(ScheduleEnd && "tried to vectorize a terminator?");
21070 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21071 return true;
21072}
21073
21074void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21075 Instruction *ToI,
21076 ScheduleData *PrevLoadStore,
21077 ScheduleData *NextLoadStore) {
21078 ScheduleData *CurrentLoadStore = PrevLoadStore;
21079 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21080 // No need to allocate data for non-schedulable instructions.
21081 if (isa<PHINode>(I))
21082 continue;
21083 ScheduleData *SD = ScheduleDataMap.lookup(I);
21084 if (!SD) {
21085 SD = allocateScheduleDataChunks();
21086 ScheduleDataMap[I] = SD;
21087 }
21088 assert(!isInSchedulingRegion(*SD) &&
21089 "new ScheduleData already in scheduling region");
21090 SD->init(SchedulingRegionID, I);
21091
21092 if (I->mayReadOrWriteMemory() &&
21093 (!isa<IntrinsicInst>(I) ||
21094 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21095 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21096 Intrinsic::pseudoprobe))) {
21097 // Update the linked list of memory accessing instructions.
21098 if (CurrentLoadStore) {
21099 CurrentLoadStore->setNextLoadStore(SD);
21100 } else {
21101 FirstLoadStoreInRegion = SD;
21102 }
21103 CurrentLoadStore = SD;
21104 }
21105
21106 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21107 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21108 RegionHasStackSave = true;
21109 }
21110 if (NextLoadStore) {
21111 if (CurrentLoadStore)
21112 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21113 } else {
21114 LastLoadStoreInRegion = CurrentLoadStore;
21115 }
21116}
21117
21118void BoUpSLP::BlockScheduling::calculateDependencies(
21119 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21120 ArrayRef<ScheduleData *> ControlDeps) {
21121 SmallVector<ScheduleEntity *> WorkList;
21122 auto ProcessNode = [&](ScheduleEntity *SE) {
21123 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21124 if (CD->hasValidDependencies())
21125 return;
21126 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21127 CD->initDependencies();
21128 CD->resetUnscheduledDeps();
21129 const EdgeInfo &EI = CD->getEdgeInfo();
21130 if (EI.UserTE) {
21131 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21132 const auto *It = find(Op, CD->getInst());
21133 assert(It != Op.end() && "Lane not set");
21134 SmallPtrSet<Instruction *, 4> Visited;
21135 do {
21136 int Lane = std::distance(Op.begin(), It);
21137 assert(Lane >= 0 && "Lane not set");
21138 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21139 !EI.UserTE->ReorderIndices.empty())
21140 Lane = EI.UserTE->ReorderIndices[Lane];
21141 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21142 "Couldn't find extract lane");
21143 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21144 if (EI.UserTE->isCopyableElement(In)) {
21145 // We may not have related copyable scheduling data, if the
21146 // instruction is non-schedulable.
21147 if (ScheduleCopyableData *UseSD =
21148 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21149 CD->incDependencies();
21150 if (!UseSD->isScheduled())
21151 CD->incrementUnscheduledDeps(1);
21152 if (!UseSD->hasValidDependencies() ||
21153 (InsertInReadyList && UseSD->isReady()))
21154 WorkList.push_back(UseSD);
21155 }
21156 } else if (Visited.insert(In).second) {
21157 if (ScheduleData *UseSD = getScheduleData(In)) {
21158 CD->incDependencies();
21159 if (!UseSD->isScheduled())
21160 CD->incrementUnscheduledDeps(1);
21161 if (!UseSD->hasValidDependencies() ||
21162 (InsertInReadyList && UseSD->isReady()))
21163 WorkList.push_back(UseSD);
21164 }
21165 }
21166 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21167 } while (It != Op.end());
21168 if (CD->isReady() && CD->getDependencies() == 0 &&
21169 (EI.UserTE->hasState() &&
21170 (EI.UserTE->getMainOp()->getParent() !=
21171 CD->getInst()->getParent() ||
21172 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21173 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21174 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21175 auto *IU = dyn_cast<Instruction>(U);
21176 if (!IU)
21177 return true;
21178 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21179 })))))) {
21180 // If there are no uses in the block, mark the node as having a pseudo-use,
21181 // which cannot be scheduled.
21182 // This prevents incorrect def-use tracking between an external user and the
21183 // actual instruction.
21184 CD->incDependencies();
21185 CD->incrementUnscheduledDeps(1);
21186 }
21187 }
21188 return;
21189 }
21190 auto *BundleMember = cast<ScheduleData>(SE);
21191 if (BundleMember->hasValidDependencies())
21192 return;
21193 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21194 BundleMember->initDependencies();
21195 BundleMember->resetUnscheduledDeps();
21196 // Handle def-use chain dependencies.
21197 SmallDenseMap<User *, unsigned> UserToNumOps;
21198 for (User *U : BundleMember->getInst()->users()) {
21199 if (isa<PHINode>(U))
21200 continue;
21201 if (ScheduleData *UseSD = getScheduleData(U)) {
21202 // The operand is a copyable element - skip.
21203 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21204 ++NumOps;
21205 if (areAllOperandsReplacedByCopyableData(
21206 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21207 continue;
21208 BundleMember->incDependencies();
21209 if (!UseSD->isScheduled())
21210 BundleMember->incrementUnscheduledDeps(1);
21211 if (!UseSD->hasValidDependencies() ||
21212 (InsertInReadyList && UseSD->isReady()))
21213 WorkList.push_back(UseSD);
21214 }
21215 }
21216 for (ScheduleCopyableData *UseSD :
21217 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21218 BundleMember->incDependencies();
21219 if (!UseSD->isScheduled())
21220 BundleMember->incrementUnscheduledDeps(1);
21221 if (!UseSD->hasValidDependencies() ||
21222 (InsertInReadyList && UseSD->isReady()))
21223 WorkList.push_back(UseSD);
21224 }
21225
21226 SmallPtrSet<Instruction *, 4> Visited;
21227 auto MakeControlDependent = [&](Instruction *I) {
21228 // Do not mark control dependent twice.
21229 if (!Visited.insert(I).second)
21230 return;
21231 auto *DepDest = getScheduleData(I);
21232 assert(DepDest && "must be in schedule window");
21233 DepDest->addControlDependency(BundleMember);
21234 BundleMember->incDependencies();
21235 if (!DepDest->isScheduled())
21236 BundleMember->incrementUnscheduledDeps(1);
21237 if (!DepDest->hasValidDependencies() ||
21238 (InsertInReadyList && DepDest->isReady()))
21239 WorkList.push_back(DepDest);
21240 };
21241
21242 // Any instruction which isn't safe to speculate at the beginning of the
21243 // block is control dependent on any early exit or non-willreturn call
21244 // which precedes it.
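// For example, a load that follows a call which may throw or never return must
// not be hoisted above that call, so it is made control dependent on it below.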
21245 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21246 for (Instruction *I = BundleMember->getInst()->getNextNode();
21247 I != ScheduleEnd; I = I->getNextNode()) {
21248 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21249 continue;
21250
21251 // Add the dependency
21252 MakeControlDependent(I);
21253
21254 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21255 // Everything past here must be control dependent on I.
21256 break;
21257 }
21258 }
21259
21260 if (RegionHasStackSave) {
21261 // If we have an inalloca alloca instruction, it needs to be scheduled
21262 // after any preceding stacksave. We also need to prevent any alloca
21263 // from reordering above a preceding stackrestore.
21264 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21265 match(BundleMember->getInst(),
21266 m_Intrinsic<Intrinsic::stackrestore>())) {
21267 for (Instruction *I = BundleMember->getInst()->getNextNode();
21268 I != ScheduleEnd; I = I->getNextNode()) {
21269 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21270 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21271 // Any allocas past here must be control dependent on I, and I
21272 // must be memory dependent on BundleMember->Inst.
21273 break;
21274
21275 if (!isa<AllocaInst>(I))
21276 continue;
21277
21278 // Add the dependency
21279 MakeControlDependent(I);
21280 }
21281 }
21282
21283 // In addition to the cases handled just above, we need to prevent
21284 // allocas and loads/stores from moving below a stacksave or a
21285 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21286 // believed to be merely conservative. Moving loads/stores below a
21287 // stackrestore, however, can lead to incorrect code.
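// For example, a load or store of an alloca created after a stacksave must not
// sink below the matching stackrestore, since the restore may deallocate that
// stack memory.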
21288 if (isa<AllocaInst>(BundleMember->getInst()) ||
21289 BundleMember->getInst()->mayReadOrWriteMemory()) {
21290 for (Instruction *I = BundleMember->getInst()->getNextNode();
21291 I != ScheduleEnd; I = I->getNextNode()) {
21292 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21293 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21294 continue;
21295
21296 // Add the dependency
21297 MakeControlDependent(I);
21298 break;
21299 }
21300 }
21301 }
21302
21303 // Handle the memory dependencies (if any).
21304 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21305 if (!NextLoadStore)
21306 return;
21307 Instruction *SrcInst = BundleMember->getInst();
21308 assert(SrcInst->mayReadOrWriteMemory() &&
21309 "NextLoadStore list for non memory effecting bundle?");
21310 MemoryLocation SrcLoc = getLocation(SrcInst);
21311 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21312 unsigned NumAliased = 0;
21313 unsigned DistToSrc = 1;
21314 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21315
21316 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21317 DepDest = DepDest->getNextLoadStore()) {
21318 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21319
21320 // We have two limits to reduce the complexity:
21321 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21322 // SLP->isAliased (which is the expensive part in this loop).
21323 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21324 // the whole loop (even if the loop is fast, it's quadratic).
21325 // It's important for the loop break condition (see below) to
21326 // check this limit even between two read-only instructions.
21327 if (DistToSrc >= MaxMemDepDistance ||
21328 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21329 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21330 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21331
21332 // We increment the counter only if the locations are aliased
21333 // (instead of counting all alias checks). This gives a better
21334 // balance between reduced runtime and accurate dependencies.
21335 NumAliased++;
21336
21337 DepDest->addMemoryDependency(BundleMember);
21338 BundleMember->incDependencies();
21339 if (!DepDest->isScheduled())
21340 BundleMember->incrementUnscheduledDeps(1);
21341 if (!DepDest->hasValidDependencies() ||
21342 (InsertInReadyList && DepDest->isReady()))
21343 WorkList.push_back(DepDest);
21344 }
21345
21346 // Example, explaining the loop break condition: Let's assume our
21347 // starting instruction is i0 and MaxMemDepDistance = 3.
21348 //
21349 // +--------v--v--v
21350 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21351 // +--------^--^--^
21352 //
21353 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21354 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21355 // Previously we already added dependencies from i3 to i6,i7,i8
21356 // (because of MaxMemDepDistance). As we added a dependency from
21357 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21358 // and we can abort this loop at i6.
21359 if (DistToSrc >= 2 * MaxMemDepDistance)
21360 break;
21361 DistToSrc++;
21362 }
21363 };
21364
21365 assert((Bundle || !ControlDeps.empty()) &&
21366 "expected at least one instruction to schedule");
21367 if (Bundle)
21368 WorkList.push_back(Bundle.getBundle().front());
21369 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21370 SmallPtrSet<ScheduleBundle *, 16> Visited;
21371 while (!WorkList.empty()) {
21372 ScheduleEntity *SD = WorkList.pop_back_val();
21373 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21374 ArrayRef<ScheduleBundle *> Bundles;
21375 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21376 CopyableBundle.push_back(&CD->getBundle());
21377 Bundles = CopyableBundle;
21378 } else {
21379 Bundles = getScheduleBundles(SD->getInst());
21380 }
21381 if (Bundles.empty()) {
21382 if (!SD->hasValidDependencies())
21383 ProcessNode(SD);
21384 if (InsertInReadyList && SD->isReady()) {
21385 ReadyInsts.insert(SD);
21386 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21387 }
21388 continue;
21389 }
21390 for (ScheduleBundle *Bundle : Bundles) {
21391 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21392 continue;
21393 assert(isInSchedulingRegion(*Bundle) &&
21394 "ScheduleData not in scheduling region");
21395 for_each(Bundle->getBundle(), ProcessNode);
21396 }
21397 if (InsertInReadyList && SD->isReady()) {
21398 for (ScheduleBundle *Bundle : Bundles) {
21399 assert(isInSchedulingRegion(*Bundle) &&
21400 "ScheduleData not in scheduling region");
21401 if (!Bundle->isReady())
21402 continue;
21403 ReadyInsts.insert(Bundle);
21404 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21405 << "\n");
21406 }
21407 }
21408 }
21409}
21410
21411void BoUpSLP::BlockScheduling::resetSchedule() {
21412 assert(ScheduleStart &&
21413 "tried to reset schedule on block which has not been scheduled");
21414 for_each(ScheduleDataMap, [&](auto &P) {
21415 if (BB != P.first->getParent())
21416 return;
21417 ScheduleData *SD = P.second;
21418 if (isInSchedulingRegion(*SD)) {
21419 SD->setScheduled(/*Scheduled=*/false);
21420 SD->resetUnscheduledDeps();
21421 }
21422 });
21423 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21424 for_each(P.second, [&](ScheduleCopyableData *SD) {
21425 if (isInSchedulingRegion(*SD)) {
21426 SD->setScheduled(/*Scheduled=*/false);
21427 SD->resetUnscheduledDeps();
21428 }
21429 });
21430 });
21431 for_each(ScheduledBundles, [&](auto &P) {
21432 for_each(P.second, [&](ScheduleBundle *Bundle) {
21433 if (isInSchedulingRegion(*Bundle))
21434 Bundle->setScheduled(/*Scheduled=*/false);
21435 });
21436 });
21437 // Reset schedule data for copyable elements.
21438 for (auto &P : ScheduleCopyableDataMap) {
21439 if (isInSchedulingRegion(*P.second)) {
21440 P.second->setScheduled(/*Scheduled=*/false);
21441 P.second->resetUnscheduledDeps();
21442 }
21443 }
21444 ReadyInsts.clear();
21445}
21446
21447void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21448 if (!BS->ScheduleStart)
21449 return;
21450
21451 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21452
21453 // A key point - if we got here, pre-scheduling was able to find a valid
21454 // scheduling of the sub-graph of the scheduling window which consists
21455 // of all vector bundles and their transitive users. As such, we do not
21456 // need to reschedule anything *outside of* that subgraph.
21457
21458 BS->resetSchedule();
21459
21460 // For the real scheduling we use a more sophisticated ready-list: it is
21461 // sorted by the original instruction location. This lets the final schedule
21462 // be as close as possible to the original instruction order.
21463 // WARNING: If changing this order causes a correctness issue, that means
21464 // there is some missing dependence edge in the schedule data graph.
21465 struct ScheduleDataCompare {
21466 bool operator()(const ScheduleEntity *SD1,
21467 const ScheduleEntity *SD2) const {
21468 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21469 }
21470 };
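// With this comparator, ReadyInsts.begin() yields the ready entity with the
// highest scheduling priority, i.e. the one located lowest in the original
// instruction order, which matches the bottom-up placement before
// LastScheduledInst in the loop below.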
21471 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21472
21473 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21474 // and fill the ready-list with initial instructions.
21475 int Idx = 0;
21476 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21477 I = I->getNextNode()) {
21478 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21479 if (!Bundles.empty()) {
21480 for (ScheduleBundle *Bundle : Bundles) {
21481 Bundle->setSchedulingPriority(Idx++);
21482 if (!Bundle->hasValidDependencies())
21483 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21484 }
21485 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21486 for (ScheduleCopyableData *SD : reverse(SDs)) {
21487 ScheduleBundle &Bundle = SD->getBundle();
21488 Bundle.setSchedulingPriority(Idx++);
21489 if (!Bundle.hasValidDependencies())
21490 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21491 }
21492 continue;
21493 }
21495 BS->getScheduleCopyableDataUsers(I);
21496 if (ScheduleData *SD = BS->getScheduleData(I)) {
21497 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21498 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21499 SDTEs.front()->doesNotNeedToSchedule() ||
21501 "scheduler and vectorizer bundle mismatch");
21502 SD->setSchedulingPriority(Idx++);
21503 if (!SD->hasValidDependencies() &&
21504 (!CopyableData.empty() ||
21505 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21506 assert(TE->isGather() && "expected gather node");
21507 return TE->hasState() && TE->hasCopyableElements() &&
21508 TE->isCopyableElement(I);
21509 }))) {
21510 // Need to calculate deps for these nodes to correctly handle copyable
21511 // dependencies, even if they were cancelled.
21512 // If the copyable bundle was cancelled, the deps were cleared and need to
21513 // be recalculated.
21514 ScheduleBundle Bundle;
21515 Bundle.add(SD);
21516 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21517 }
21518 }
21519 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21520 ScheduleBundle &Bundle = SD->getBundle();
21521 Bundle.setSchedulingPriority(Idx++);
21522 if (!Bundle.hasValidDependencies())
21523 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21524 }
21525 }
21526 BS->initialFillReadyList(ReadyInsts);
21527
21528 Instruction *LastScheduledInst = BS->ScheduleEnd;
21529
21530 // Do the "real" scheduling.
21531 SmallPtrSet<Instruction *, 16> Scheduled;
21532 while (!ReadyInsts.empty()) {
21533 auto *Picked = *ReadyInsts.begin();
21534 ReadyInsts.erase(ReadyInsts.begin());
21535
21536 // Move the scheduled instruction(s) to their dedicated places, if not
21537 // there yet.
21538 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21539 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21540 Instruction *PickedInst = BundleMember->getInst();
21541 // If a copyable must be scheduled as part of something else, skip it.
21542 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21543 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21544 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21545 continue;
21546 if (PickedInst->getNextNode() != LastScheduledInst)
21547 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21548 LastScheduledInst = PickedInst;
21549 }
21550 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21551 LastScheduledInst);
21552 } else {
21553 auto *SD = cast<ScheduleData>(Picked);
21554 Instruction *PickedInst = SD->getInst();
21555 if (PickedInst->getNextNode() != LastScheduledInst)
21556 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21557 LastScheduledInst = PickedInst;
21558 }
21559 auto Invalid = InstructionsState::invalid();
21560 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21561 }
21562
21563 // Check that we didn't break any of our invariants.
21564#ifdef EXPENSIVE_CHECKS
21565 BS->verify();
21566#endif
21567
21568#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21569 // Check that all schedulable entities got scheduled
21570 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21571 I = I->getNextNode()) {
21572 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21573 assert(all_of(Bundles,
21574 [](const ScheduleBundle *Bundle) {
21575 return Bundle->isScheduled();
21576 }) &&
21577 "must be scheduled at this point");
21578 }
21579#endif
21580
21581 // Avoid duplicate scheduling of the block.
21582 BS->ScheduleStart = nullptr;
21583}
21584
21585 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21586 // If V is a store, just return the width of the stored value (or value
21587 // truncated just before storing) without traversing the expression tree.
21588 // This is the common case.
21589 if (auto *Store = dyn_cast<StoreInst>(V))
21590 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21591
21592 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21593 return getVectorElementSize(IEI->getOperand(1));
21594
21595 auto E = InstrElementSize.find(V);
21596 if (E != InstrElementSize.end())
21597 return E->second;
21598
21599 // If V is not a store, we can traverse the expression tree to find loads
21600 // that feed it. The type of the loaded value may indicate a more suitable
21601 // width than V's type. We want to base the vector element size on the width
21602 // of memory operations where possible.
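// For example, if V is an i32 add fed (through a sext) by an i16 load, the
// returned element size is 16 rather than 32, since the memory width is the
// better basis for the vector element size.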
21603 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21604 SmallPtrSet<Instruction *, 16> Visited;
21605 if (auto *I = dyn_cast<Instruction>(V)) {
21606 Worklist.emplace_back(I, I->getParent(), 0);
21607 Visited.insert(I);
21608 }
21609
21610 // Traverse the expression tree in bottom-up order looking for loads. If we
21611 // encounter an instruction we don't yet handle, we give up.
21612 auto Width = 0u;
21613 Value *FirstNonBool = nullptr;
21614 while (!Worklist.empty()) {
21615 auto [I, Parent, Level] = Worklist.pop_back_val();
21616
21617 // We should only be looking at scalar instructions here. If the current
21618 // instruction has a vector type, skip.
21619 auto *Ty = I->getType();
21620 if (isa<VectorType>(Ty))
21621 continue;
21622 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21623 FirstNonBool = I;
21624 if (Level > RecursionMaxDepth)
21625 continue;
21626
21627 // If the current instruction is a load (or extractelement/extractvalue),
21628 // update Width to reflect the width of the loaded or extracted value.
21629 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
21630 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21631
21632 // Otherwise, we need to visit the operands of the instruction. We only
21633 // handle the interesting cases from buildTree here. If an operand is an
21634 // instruction we haven't yet visited and from the same basic block as the
21635 // user or the use is a PHI node, we add it to the worklist.
21636 if (isa<CastInst, PHINode, GetElementPtrInst, CmpInst, SelectInst,
21637 BinaryOperator, UnaryOperator>(I)) {
21638 for (Use &U : I->operands()) {
21639 if (auto *J = dyn_cast<Instruction>(U.get()))
21640 if (Visited.insert(J).second &&
21641 (isa<PHINode>(I) || J->getParent() == Parent)) {
21642 Worklist.emplace_back(J, J->getParent(), Level + 1);
21643 continue;
21644 }
21645 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21646 FirstNonBool = U.get();
21647 }
21648 } else {
21649 break;
21650 }
21651 }
21652
21653 // If we didn't encounter a memory access in the expression tree, or if we
21654 // gave up for some reason, just return the width of V. Otherwise, return the
21655 // maximum width we found.
21656 if (!Width) {
21657 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21658 V = FirstNonBool;
21659 Width = DL->getTypeSizeInBits(V->getType());
21660 }
21661
21662 for (Instruction *I : Visited)
21663 InstrElementSize[I] = Width;
21664
21665 return Width;
21666}
21667
21668bool BoUpSLP::collectValuesToDemote(
21669 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21670 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21671 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21672 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21673 // We can always demote constants.
21674 if (all_of(E.Scalars, IsaPred<Constant>))
21675 return true;
21676
21677 unsigned OrigBitWidth =
21678 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21679 if (OrigBitWidth == BitWidth) {
21680 MaxDepthLevel = 1;
21681 return true;
21682 }
21683
21684 // Check if the node was analyzed already and must keep its original bitwidth.
21685 if (NodesToKeepBWs.contains(E.Idx))
21686 return false;
21687
21688 // If the value is not a vectorized instruction in the expression and not used
21689 // by the insertelement instruction and not used in multiple vector nodes, it
21690 // cannot be demoted.
21691 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21692 if (isa<PoisonValue>(R))
21693 return false;
21694 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21695 });
21696 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21697 if (isa<PoisonValue>(V))
21698 return true;
21699 if (getTreeEntries(V).size() > 1)
21700 return false;
21701 // For the last shuffle of sext/zext with many uses, we need to check the
21702 // extra bit for unsigned values, otherwise we may have incorrect casting for
21703 // reused scalars.
21704 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21705 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21706 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21707 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21708 return true;
21709 }
21710 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21711 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21712 if (IsSignedNode)
21713 ++BitWidth1;
21714 if (auto *I = dyn_cast<Instruction>(V)) {
21715 APInt Mask = DB->getDemandedBits(I);
21716 unsigned BitWidth2 =
21717 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21718 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21719 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21720 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21721 break;
21722 BitWidth2 *= 2;
21723 }
21724 BitWidth1 = std::min(BitWidth1, BitWidth2);
21725 }
21726 BitWidth = std::max(BitWidth, BitWidth1);
21727 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21728 };
21729 auto FinalAnalysis = [&, TTI = TTI]() {
21730 if (!IsProfitableToDemote)
21731 return false;
21732 bool Res = all_of(
21733 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21734 // Demote gathers.
21735 if (Res && E.isGather()) {
21736 if (E.hasState()) {
21737 if (const TreeEntry *SameTE =
21738 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21739 SameTE)
21740 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21741 ToDemote, Visited, NodesToKeepBWs,
21742 MaxDepthLevel, IsProfitableToDemote,
21743 IsTruncRoot)) {
21744 ToDemote.push_back(E.Idx);
21745 return true;
21746 }
21747 }
21748 // Check possible extractelement instructions bases and final vector
21749 // length.
21750 SmallPtrSet<Value *, 4> UniqueBases;
21751 for (Value *V : E.Scalars) {
21752 auto *EE = dyn_cast<ExtractElementInst>(V);
21753 if (!EE)
21754 continue;
21755 UniqueBases.insert(EE->getVectorOperand());
21756 }
21757 const unsigned VF = E.Scalars.size();
21758 Type *OrigScalarTy = E.Scalars.front()->getType();
21759 if (UniqueBases.size() <= 2 ||
21760 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21761 ::getNumberOfParts(
21762 *TTI,
21763 getWidenedType(
21764 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21765 VF))) {
21766 ToDemote.push_back(E.Idx);
21767 return true;
21768 }
21769 }
21770 return Res;
21771 };
21772 if (E.isGather() || !Visited.insert(&E).second ||
21773 any_of(E.Scalars, [&](Value *V) {
21774 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
21775 return isa<InsertElementInst>(U) && !isVectorized(U);
21776 });
21777 }))
21778 return FinalAnalysis();
21779
21780 if (any_of(E.Scalars, [&](Value *V) {
21781 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
21782 return isVectorized(U) ||
21783 (E.Idx == 0 && UserIgnoreList &&
21784 UserIgnoreList->contains(U)) ||
21785 (!isa<CmpInst>(U) && U->getType()->isSized() &&
21786 !U->getType()->isScalableTy() &&
21787 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
21788 }) && !IsPotentiallyTruncated(V, BitWidth);
21789 }))
21790 return false;
21791
21792 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
21793 bool &NeedToExit) {
21794 NeedToExit = false;
21795 unsigned InitLevel = MaxDepthLevel;
21796 for (const TreeEntry *Op : Operands) {
21797 unsigned Level = InitLevel;
21798 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
21799 ToDemote, Visited, NodesToKeepBWs, Level,
21800 IsProfitableToDemote, IsTruncRoot)) {
21801 if (!IsProfitableToDemote)
21802 return false;
21803 NeedToExit = true;
21804 if (!FinalAnalysis())
21805 return false;
21806 continue;
21807 }
21808 MaxDepthLevel = std::max(MaxDepthLevel, Level);
21809 }
21810 return true;
21811 };
21812 auto AttemptCheckBitwidth =
21813 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
21814 // Try all bitwidth < OrigBitWidth.
21815 NeedToExit = false;
21816 unsigned BestFailBitwidth = 0;
21817 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
21818 if (Checker(BitWidth, OrigBitWidth))
21819 return true;
21820 if (BestFailBitwidth == 0 && FinalAnalysis())
21821 BestFailBitwidth = BitWidth;
21822 }
21823 if (BitWidth >= OrigBitWidth) {
21824 if (BestFailBitwidth == 0) {
21825 BitWidth = OrigBitWidth;
21826 return false;
21827 }
21828 MaxDepthLevel = 1;
21829 BitWidth = BestFailBitwidth;
21830 NeedToExit = true;
21831 return true;
21832 }
21833 return false;
21834 };
21835 auto TryProcessInstruction =
21836 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
21837 function_ref<bool(unsigned, unsigned)> Checker = {}) {
21838 if (Operands.empty()) {
21839 if (!IsTruncRoot)
21840 MaxDepthLevel = 1;
21841 for (Value *V : E.Scalars)
21842 (void)IsPotentiallyTruncated(V, BitWidth);
21843 } else {
21844 // Several vectorized uses? Check if we can truncate it; otherwise,
21845 // exit.
21846 if (any_of(E.Scalars, [&](Value *V) {
21847 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
21848 }))
21849 return false;
21850 bool NeedToExit = false;
21851 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
21852 return false;
21853 if (NeedToExit)
21854 return true;
21855 if (!ProcessOperands(Operands, NeedToExit))
21856 return false;
21857 if (NeedToExit)
21858 return true;
21859 }
21860
21861 ++MaxDepthLevel;
21862 // Record the entry that we can demote.
21863 ToDemote.push_back(E.Idx);
21864 return IsProfitableToDemote;
21865 };
21866
21867 if (E.State == TreeEntry::SplitVectorize)
21868 return TryProcessInstruction(
21869 BitWidth,
21870 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
21871 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
21872
21873 switch (E.getOpcode()) {
21874
21875 // We can always demote truncations and extensions. Since truncations can
21876 // seed additional demotion, we save the truncated value.
21877 case Instruction::Trunc:
21878 if (IsProfitableToDemoteRoot)
21879 IsProfitableToDemote = true;
21880 return TryProcessInstruction(BitWidth);
21881 case Instruction::ZExt:
21882 case Instruction::SExt:
21883 IsProfitableToDemote = true;
21884 return TryProcessInstruction(BitWidth);
21885
21886 // We can demote certain binary operations if we can demote both of their
21887 // operands.
21888 case Instruction::Add:
21889 case Instruction::Sub:
21890 case Instruction::Mul:
21891 case Instruction::And:
21892 case Instruction::Or:
21893 case Instruction::Xor: {
21894 return TryProcessInstruction(
21895 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
21896 }
21897 case Instruction::Freeze:
21898 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
21899 case Instruction::Shl: {
21900 // If we are truncating the result of this SHL, and if it's a shift of an
21901 // in-range amount, we can always perform a SHL in a smaller type.
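// For example, (trunc (shl i32 %x, 5) to i16) can instead be computed as an
// i16 shl of the truncated operand, because the shift amount 5 is known to be
// less than 16.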
21902 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
21903 return all_of(E.Scalars, [&](Value *V) {
21904 if (isa<PoisonValue>(V))
21905 return true;
21906 auto *I = cast<Instruction>(V);
21907 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21908 return AmtKnownBits.getMaxValue().ult(BitWidth);
21909 });
21910 };
21911 return TryProcessInstruction(
21912 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
21913 }
21914 case Instruction::LShr: {
21915 // If this is a truncate of a logical shr, we can truncate it to a smaller
21916 // lshr iff we know that the bits we would otherwise be shifting in are
21917 // already zeros.
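// For example, an i32 lshr can be narrowed to i16 when the shift amount is
// known to be less than 16 and the upper 16 bits of the shifted operand are
// known to be zero, so the narrow shift produces the same low 16 bits.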
21918 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21919 return all_of(E.Scalars, [&](Value *V) {
21920 if (isa<PoisonValue>(V))
21921 return true;
21922 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21923 if (E.isCopyableElement(V))
21924 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
21925 auto *I = cast<Instruction>(V);
21926 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21927 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
21928 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
21929 SimplifyQuery(*DL));
21930 });
21931 };
21932 return TryProcessInstruction(
21933 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
21934 LShrChecker);
21935 }
21936 case Instruction::AShr: {
21937 // If this is a truncate of an arithmetic shr, we can truncate it to a
21938 // smaller ashr iff we know that all the bits between the sign bit of the
21939 // original type and the sign bit of the truncated type are copies of the sign bit.
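// For example, an i32 ashr can be narrowed to i16 when the shift amount is
// known to be less than 16 and the operand has more than 32 - 16 = 16 sign
// bits, so the i16 sign bit equals the original sign bit.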
21940 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21941 return all_of(E.Scalars, [&](Value *V) {
21942 if (isa<PoisonValue>(V))
21943 return true;
21944 auto *I = cast<Instruction>(V);
21945 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21946 unsigned ShiftedBits = OrigBitWidth - BitWidth;
21947 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
21948 ShiftedBits <
21949 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
21950 });
21951 };
21952 return TryProcessInstruction(
21953 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
21954 AShrChecker);
21955 }
21956 case Instruction::UDiv:
21957 case Instruction::URem: {
21958 // UDiv and URem can be truncated if all the truncated bits are zero.
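// For example, a udiv of two i32 values can be performed in i16 when the upper
// 16 bits of both operands are known to be zero.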
21959 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21960 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
21961 return all_of(E.Scalars, [&](Value *V) {
21962 auto *I = cast<Instruction>(V);
21963 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21964 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
21965 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
21966 });
21967 };
21968 return TryProcessInstruction(
21969 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
21970 }
21971
21972 // We can demote selects if we can demote their true and false values.
21973 case Instruction::Select: {
21974 return TryProcessInstruction(
21975 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
21976 }
21977
21978 // We can demote phis if we can demote all their incoming operands.
21979 case Instruction::PHI: {
21980 const unsigned NumOps = E.getNumOperands();
21981 SmallVector<const TreeEntry *> Ops(NumOps);
21982 transform(seq<unsigned>(0, NumOps), Ops.begin(),
21983 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
21984
21985 return TryProcessInstruction(BitWidth, Ops);
21986 }
21987
21988 case Instruction::Call: {
21989 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
21990 if (!IC)
21991 break;
21992 Intrinsic::ID ID = IC->getIntrinsicID();
21993 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
21994 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
21995 break;
21996 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
21997 function_ref<bool(unsigned, unsigned)> CallChecker;
21998 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21999 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22000 return all_of(E.Scalars, [&](Value *V) {
22001 auto *I = cast<Instruction>(V);
22002 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22003 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22004 return MaskedValueIsZero(I->getOperand(0), Mask,
22005 SimplifyQuery(*DL)) &&
22006 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22007 }
22008 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22009 "Expected min/max intrinsics only.");
22010 unsigned SignBits = OrigBitWidth - BitWidth;
22011 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22012 unsigned Op0SignBits =
22013 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22014 unsigned Op1SignBits =
22015 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22016 return SignBits <= Op0SignBits &&
22017 ((SignBits != Op0SignBits &&
22018 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22019 MaskedValueIsZero(I->getOperand(0), Mask,
22020 SimplifyQuery(*DL))) &&
22021 SignBits <= Op1SignBits &&
22022 ((SignBits != Op1SignBits &&
22023 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22024 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22025 });
22026 };
22027 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22028 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22029 return all_of(E.Scalars, [&](Value *V) {
22030 auto *I = cast<Instruction>(V);
22031 unsigned SignBits = OrigBitWidth - BitWidth;
22032 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22033 unsigned Op0SignBits =
22034 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22035 return SignBits <= Op0SignBits &&
22036 ((SignBits != Op0SignBits &&
22037 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22038 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22039 });
22040 };
22041 if (ID != Intrinsic::abs) {
22042 Operands.push_back(getOperandEntry(&E, 1));
22043 CallChecker = CompChecker;
22044 } else {
22045 CallChecker = AbsChecker;
22046 }
22047 InstructionCost BestCost =
22048 std::numeric_limits<InstructionCost::CostType>::max();
22049 unsigned BestBitWidth = BitWidth;
22050 unsigned VF = E.Scalars.size();
22051 // Choose the best bitwidth based on cost estimations.
22052 auto Checker = [&](unsigned BitWidth, unsigned) {
22053 unsigned MinBW = PowerOf2Ceil(BitWidth);
22054 SmallVector<Type *> ArgTys =
22055 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22056 auto VecCallCosts = getVectorCallCosts(
22057 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22058 TTI, TLI, ArgTys);
22059 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22060 if (Cost < BestCost) {
22061 BestCost = Cost;
22062 BestBitWidth = BitWidth;
22063 }
22064 return false;
22065 };
22066 [[maybe_unused]] bool NeedToExit;
22067 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22068 BitWidth = BestBitWidth;
22069 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22070 }
22071
22072 // Otherwise, conservatively give up.
22073 default:
22074 break;
22075 }
22076 MaxDepthLevel = 1;
22077 return FinalAnalysis();
22078}
22079
22080static RecurKind getRdxKind(Value *V);
22081
22082 void BoUpSLP::computeMinimumValueSizes() {
22083 // We only attempt to truncate integer expressions.
22084 bool IsStoreOrInsertElt =
22085 VectorizableTree.front()->hasState() &&
22086 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22087 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22088 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22089 ExtraBitWidthNodes.size() <= 1 &&
22090 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22091 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22092 return;
22093
22094 unsigned NodeIdx = 0;
22095 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22096 NodeIdx = 1;
22097
22098 // Ensure the roots of the vectorizable tree don't form a cycle.
22099 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22100 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22101 "Unexpected tree is graph.");
22102
22103 // If the first value node for a store/insertelement is a sext/zext/trunc,
22104 // skip it and resize to the final type.
22105 bool IsTruncRoot = false;
22106 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22107 SmallVector<unsigned> RootDemotes;
22108 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22109 if (NodeIdx != 0 &&
22110 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22111 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22112 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22113 IsTruncRoot = true;
22114 RootDemotes.push_back(NodeIdx);
22115 IsProfitableToDemoteRoot = true;
22116 ++NodeIdx;
22117 }
22118
22119 // The reduction was analyzed already and found not profitable - exit.
22120 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22121 return;
22122
22123 SmallVector<unsigned> ToDemote;
22124 auto ComputeMaxBitWidth =
22125 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22126 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22127 ToDemote.clear();
22128 // If the root is a trunc and the next node is a gather/buildvector, keep the
22129 // trunc in scalars, which is free in most cases.
22130 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22131 !NodesToKeepBWs.contains(E.Idx) &&
22132 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22133 all_of(E.Scalars, [&](Value *V) {
22134 return V->hasOneUse() || isa<Constant>(V) ||
22135 (!V->hasNUsesOrMore(UsesLimit) &&
22136 none_of(V->users(), [&](User *U) {
22137 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22138 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22139 if (TEs.empty() || is_contained(TEs, UserTE))
22140 return false;
22141 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22142 SelectInst>(U) ||
22143 isa<SIToFPInst, UIToFPInst>(U) ||
22144 (UserTE->hasState() &&
22145 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22146 SelectInst>(UserTE->getMainOp()) ||
22147 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22148 return true;
22149 unsigned UserTESz = DL->getTypeSizeInBits(
22150 UserTE->Scalars.front()->getType());
22151 if (all_of(TEs, [&](const TreeEntry *TE) {
22152 auto It = MinBWs.find(TE);
22153 return It != MinBWs.end() &&
22154 It->second.first > UserTESz;
22155 }))
22156 return true;
22157 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22158 }));
22159 })) {
22160 ToDemote.push_back(E.Idx);
22161 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22162 auto It = MinBWs.find(UserTE);
22163 if (It != MinBWs.end())
22164 return It->second.first;
22165 unsigned MaxBitWidth =
22166 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22167 MaxBitWidth = bit_ceil(MaxBitWidth);
22168 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22169 MaxBitWidth = 8;
22170 return MaxBitWidth;
22171 }
22172
22173 if (!E.hasState())
22174 return 0u;
22175
22176 unsigned VF = E.getVectorFactor();
22177 Type *ScalarTy = E.Scalars.front()->getType();
22178 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22179 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22180 if (!TreeRootIT)
22181 return 0u;
22182
22183 if (any_of(E.Scalars,
22184 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22185 return 0u;
22186
22187 unsigned NumParts = ::getNumberOfParts(
22188 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22189
22190 // The maximum bit width required to represent all the values that can be
22191 // demoted without loss of precision. It would be safe to truncate the roots
22192 // of the expression to this width.
22193 unsigned MaxBitWidth = 1u;
22194
22195 // True if the roots can be zero-extended back to their original type,
22196 // rather than sign-extended. We know that if the leading bits are not
22197 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22198 // True.
22199 // Determine if the sign bit of all the roots is known to be zero. If not,
22200 // IsKnownPositive is set to False.
22201 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22202 if (isa<PoisonValue>(R))
22203 return true;
22204 KnownBits Known = computeKnownBits(R, *DL);
22205 return Known.isNonNegative();
22206 });
22207
22208 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22209 E.UserTreeIndex.UserTE->hasState() &&
22210 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22211 MaxBitWidth =
22212 std::min(DL->getTypeSizeInBits(
22213 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22214 DL->getTypeSizeInBits(ScalarTy));
22215
22216 // We first check if all the bits of the roots are demanded. If they're not,
22217 // we can truncate the roots to this narrower type.
22218 for (Value *Root : E.Scalars) {
22219 if (isa<PoisonValue>(Root))
22220 continue;
22221 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22222 TypeSize NumTypeBits =
22223 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22224 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22225 // If we can't prove that the sign bit is zero, we must add one to the
22226 // maximum bit width to account for the unknown sign bit. This preserves
22227 // the existing sign bit so we can safely sign-extend the root back to the
22228 // original type. Otherwise, if we know the sign bit is zero, we will
22229 // zero-extend the root instead.
22230 //
22231 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22232 // one to the maximum bit width will yield a larger-than-necessary
22233 // type. In general, we need to add an extra bit only if we can't
22234 // prove that the upper bit of the original type is equal to the
22235 // upper bit of the proposed smaller type. If these two bits are
22236 // the same (either zero or one) we know that sign-extending from
22237 // the smaller type will result in the same value. Here, since we
22238 // can't yet prove this, we are just making the proposed smaller
22239 // type larger to ensure correctness.
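// For example, an i32 root with 25 known sign bits needs 32 - 25 = 7 bits; if
// the sign bit is not known to be zero, one more bit is added, giving 8.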
22240 if (!IsKnownPositive)
22241 ++BitWidth1;
22242
22243 auto *I = dyn_cast<Instruction>(Root);
22244 if (!I) {
22245 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22246 continue;
22247 }
22248 APInt Mask = DB->getDemandedBits(I);
22249 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22250 MaxBitWidth =
22251 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22252 }
22253
22254 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22255 MaxBitWidth = 8;
22256
22257 // If the original type is large but the reduced type does not improve
22258 // register usage - ignore it.
22259 if (NumParts > 1 &&
22260 NumParts ==
22261 ::getNumberOfParts(
22262 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22263 bit_ceil(MaxBitWidth)),
22264 VF)))
22265 return 0u;
22266
22267 unsigned Opcode = E.getOpcode();
22268 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22269 Opcode == Instruction::SExt ||
22270 Opcode == Instruction::ZExt || NumParts > 1;
22271 // Conservatively determine if we can actually truncate the roots of the
22272 // expression. Collect the values that can be demoted in ToDemote and
22273 // additional roots that require investigating in Roots.
22274 DenseSet<const TreeEntry *> Visited;
22275 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22276 bool NeedToDemote = IsProfitableToDemote;
22277
22278 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22279 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22280 NeedToDemote, IsTruncRoot) ||
22281 (MaxDepthLevel <= Limit &&
22282 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22283 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22284 DL->getTypeSizeInBits(TreeRootIT) /
22285 DL->getTypeSizeInBits(
22286 E.getMainOp()->getOperand(0)->getType()) >
22287 2)))))
22288 return 0u;
22289 // Round MaxBitWidth up to the next power-of-two.
22290 MaxBitWidth = bit_ceil(MaxBitWidth);
22291
22292 return MaxBitWidth;
22293 };
22294
22295 // If we can truncate the root, we must collect additional values that might
22296 // be demoted as a result. That is, those seeded by truncations we will
22297 // modify.
22298 // Add reduction ops sizes, if any.
22299 if (UserIgnoreList &&
22300 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22301 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22302 // x i1> to iN)).
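// For example (illustrative IR, not taken from a specific test):
//   %e = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
// can instead be computed as
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %c to i32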
22303 if (all_of(*UserIgnoreList,
22304 [](Value *V) {
22305 return isa<PoisonValue>(V) ||
22306 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22307 }) &&
22308 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22309 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22310 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22311 Builder.getInt1Ty()) {
22312 ReductionBitWidth = 1;
22313 } else {
22314 for (Value *V : *UserIgnoreList) {
22315 if (isa<PoisonValue>(V))
22316 continue;
22317 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22318 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22319 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22320 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22321 ++BitWidth1;
22322 unsigned BitWidth2 = BitWidth1;
22323 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
22324 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22325 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22326 }
22327 ReductionBitWidth =
22328 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22329 }
22330 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22331 ReductionBitWidth = 8;
22332
22333 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22334 }
22335 }
22336 bool IsTopRoot = NodeIdx == 0;
22337 while (NodeIdx < VectorizableTree.size() &&
22338 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22339 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22340 RootDemotes.push_back(NodeIdx);
22341 ++NodeIdx;
22342 IsTruncRoot = true;
22343 }
22344 bool IsSignedCmp = false;
22345 if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
22346 return match(V, m_SMin(m_Value(), m_Value())) ||
22347 match(V, m_SMax(m_Value(), m_Value()));
22348 }))
22349 IsSignedCmp = true;
22350 while (NodeIdx < VectorizableTree.size()) {
22351 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22352 unsigned Limit = 2;
22353 if (IsTopRoot &&
22354 ReductionBitWidth ==
22355 DL->getTypeSizeInBits(
22356 VectorizableTree.front()->Scalars.front()->getType()))
22357 Limit = 3;
22358 unsigned MaxBitWidth = ComputeMaxBitWidth(
22359 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22360 IsTruncRoot, IsSignedCmp);
22361 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22362 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22363 ReductionBitWidth = bit_ceil(MaxBitWidth);
22364 else if (MaxBitWidth == 0)
22365 ReductionBitWidth = 0;
22366 }
22367
22368 for (unsigned Idx : RootDemotes) {
22369 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22370 uint32_t OrigBitWidth =
22371 DL->getTypeSizeInBits(V->getType()->getScalarType());
22372 if (OrigBitWidth > MaxBitWidth) {
22373 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22374 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22375 }
22376 return false;
22377 }))
22378 ToDemote.push_back(Idx);
22379 }
22380 RootDemotes.clear();
22381 IsTopRoot = false;
22382 IsProfitableToDemoteRoot = true;
22383
22384 if (ExtraBitWidthNodes.empty()) {
22385 NodeIdx = VectorizableTree.size();
22386 } else {
22387 unsigned NewIdx = 0;
22388 do {
22389 NewIdx = *ExtraBitWidthNodes.begin();
22390 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22391 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22392 NodeIdx = NewIdx;
22393 IsTruncRoot =
22394 NodeIdx < VectorizableTree.size() &&
22395 VectorizableTree[NodeIdx]->UserTreeIndex &&
22396 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22397 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22398 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22399 Instruction::Trunc &&
22400 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22401 IsSignedCmp =
22402 NodeIdx < VectorizableTree.size() &&
22403 VectorizableTree[NodeIdx]->UserTreeIndex &&
22404 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22405 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22406 Instruction::ICmp &&
22407 any_of(
22408 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22409 [&](Value *V) {
22410 auto *IC = dyn_cast<ICmpInst>(V);
22411 return IC && (IC->isSigned() ||
22412 !isKnownNonNegative(IC->getOperand(0),
22413 SimplifyQuery(*DL)) ||
22414 !isKnownNonNegative(IC->getOperand(1),
22415 SimplifyQuery(*DL)));
22416 });
22417 }
22418
22419 // If the maximum bit width we compute is less than the width of the roots'
22420 // type, we can proceed with the narrowing. Otherwise, do nothing.
22421 if (MaxBitWidth == 0 ||
22422 MaxBitWidth >=
22423 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22424 ->getBitWidth()) {
22425 if (UserIgnoreList)
22426 AnalyzedMinBWVals.insert_range(TreeRoot);
22427 NodesToKeepBWs.insert_range(ToDemote);
22428 continue;
22429 }
22430
22431 // Finally, map the values we can demote to the maximum bit width we
22432 // computed.
22433 for (unsigned Idx : ToDemote) {
22434 TreeEntry *TE = VectorizableTree[Idx].get();
22435 if (MinBWs.contains(TE))
22436 continue;
22437 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22438 if (isa<PoisonValue>(R))
22439 return false;
22440 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22441 });
22442 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22443 }
22444 }
22445}
22446
22447PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22448 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22449 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22450 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22451 auto *AA = &AM.getResult<AAManager>(F);
22452 auto *LI = &AM.getResult<LoopAnalysis>(F);
22453 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22454 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22455 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22456 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22457
22458 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22459 if (!Changed)
22460 return PreservedAnalyses::all();
22461
22462 PreservedAnalyses PA;
22463 PA.preserveSet<CFGAnalyses>();
22464 return PA;
22465}
22466
22467bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22468 TargetTransformInfo *TTI_,
22469 TargetLibraryInfo *TLI_, AAResults *AA_,
22470 LoopInfo *LI_, DominatorTree *DT_,
22471 AssumptionCache *AC_, DemandedBits *DB_,
22472 OptimizationRemarkEmitter *ORE_) {
22473 if (!RunSLPVectorization)
22474 return false;
22475 SE = SE_;
22476 TTI = TTI_;
22477 TLI = TLI_;
22478 AA = AA_;
22479 LI = LI_;
22480 DT = DT_;
22481 AC = AC_;
22482 DB = DB_;
22483 DL = &F.getDataLayout();
22484
22485 Stores.clear();
22486 GEPs.clear();
22487 bool Changed = false;
22488
22489 // If the target claims to have no vector registers don't attempt
22490 // vectorization.
22491 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22492 LLVM_DEBUG(
22493 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22494 return false;
22495 }
22496
22497 // Don't vectorize when the attribute NoImplicitFloat is used.
22498 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22499 return false;
22500
22501 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22502
22503 // Use the bottom up slp vectorizer to construct chains that start with
22504 // store instructions.
22505 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22506
22507 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22508 // delete instructions.
22509
22510 // Update DFS numbers now so that we can use them for ordering.
22511 DT->updateDFSNumbers();
22512
22513 // Scan the blocks in the function in post order.
22514 for (auto *BB : post_order(&F.getEntryBlock())) {
22515 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
22516 continue;
22517
22518 // Start new block - clear the list of reduction roots.
22519 R.clearReductionData();
22520 collectSeedInstructions(BB);
22521
22522 // Vectorize trees that end at stores.
22523 if (!Stores.empty()) {
22524 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22525 << " underlying objects.\n");
22526 Changed |= vectorizeStoreChains(R);
22527 }
22528
22529 // Vectorize trees that end at reductions.
22530 Changed |= vectorizeChainsInBlock(BB, R);
22531
22532 // Vectorize the index computations of getelementptr instructions. This
22533 // is primarily intended to catch gather-like idioms ending at
22534 // non-consecutive loads.
22535 if (!GEPs.empty()) {
22536 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22537 << " underlying objects.\n");
22538 Changed |= vectorizeGEPIndices(BB, R);
22539 }
22540 }
22541
22542 if (Changed) {
22543 R.optimizeGatherSequence();
22544 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22545 }
22546 return Changed;
22547}
22548
22549std::optional<bool>
22550SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22551 unsigned Idx, unsigned MinVF,
22552 unsigned &Size) {
22553 Size = 0;
22554 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22555 << "\n");
22556 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22557 unsigned VF = Chain.size();
22558
22559 if (!has_single_bit(Sz) ||
22560 !hasFullVectorsOrPowerOf2(
22561 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22562 VF) ||
22563 VF < 2 || VF < MinVF) {
22564 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22565 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22566 // all vector lanes are used.
22567 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22568 return false;
22569 }
22570
22571 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22572 << "\n");
22573
22574 SetVector<Value *> ValOps;
22575 for (Value *V : Chain)
22576 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22577 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
22578 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22579 InstructionsState S = Analysis.buildInstructionsState(
22580 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22581 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22582 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22583 bool IsAllowedSize =
22584 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22585 ValOps.size()) ||
22586 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22587 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22588 (!S.getMainOp()->isSafeToRemove() ||
22589 any_of(ValOps.getArrayRef(),
22590 [&](Value *V) {
22591 return !isa<ExtractElementInst>(V) &&
22592 (V->getNumUses() > Chain.size() ||
22593 any_of(V->users(), [&](User *U) {
22594 return !Stores.contains(U);
22595 }));
22596 }))) ||
22597 (ValOps.size() > Chain.size() / 2 && !S)) {
22598 Size = (!IsAllowedSize && S) ? 1 : 2;
22599 return false;
22600 }
22601 }
22602 if (R.isLoadCombineCandidate(Chain))
22603 return true;
22604 R.buildTree(Chain);
22605 // Check if the tree is tiny and the store itself or its value is not vectorized.
22606 if (R.isTreeTinyAndNotFullyVectorizable()) {
22607 if (R.isGathered(Chain.front()) ||
22608 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22609 return std::nullopt;
22610 Size = R.getCanonicalGraphSize();
22611 return false;
22612 }
22613 if (R.isProfitableToReorder()) {
22614 R.reorderTopToBottom();
22615 R.reorderBottomToTop();
22616 }
22617 R.transformNodes();
22618 R.buildExternalUses();
22619
22620 R.computeMinimumValueSizes();
22621
22622 Size = R.getCanonicalGraphSize();
22623 if (S && S.getOpcode() == Instruction::Load)
22624 Size = 2; // cut off masked gather small trees
22625 InstructionCost Cost = R.getTreeCost();
22626
22627 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22628 if (Cost < -SLPCostThreshold) {
22629 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22630
22631 using namespace ore;
22632
22633 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22634 cast<StoreInst>(Chain[0]))
22635 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22636 << " and with tree size "
22637 << NV("TreeSize", R.getTreeSize()));
22638
22639 R.vectorizeTree();
22640 return true;
22641 }
22642
22643 return false;
22644}
22645
22646/// Checks if the quadratic mean deviation is less than 90% of the mean size.
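/// For example (illustrative): sizes {3, 3, 3} have zero deviation and pass the
/// check, while sizes {2, 16} deviate far from their mean and fail it; entries
/// equal to 1 are ignored entirely.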
22647static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22648 bool First) {
22649 unsigned Num = 0;
22650 uint64_t Sum = std::accumulate(
22651 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22652 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22653 unsigned Size = First ? Val.first : Val.second;
22654 if (Size == 1)
22655 return V;
22656 ++Num;
22657 return V + Size;
22658 });
22659 if (Num == 0)
22660 return true;
22661 uint64_t Mean = Sum / Num;
22662 if (Mean == 0)
22663 return true;
22664 uint64_t Dev = std::accumulate(
22665 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22666 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22667 unsigned P = First ? Val.first : Val.second;
22668 if (P == 1)
22669 return V;
22670 return V + (P - Mean) * (P - Mean);
22671 }) /
22672 Num;
22673 return Dev * 81 / (Mean * Mean) == 0;
22674}
22675
22676namespace {
22677
22678/// A group of stores that we'll try to bundle together using vector ops.
22679/// They are ordered using the signed distance of their address operand to the
22680/// address of this group's BaseInstr.
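/// For example (illustrative): if the BaseInstr stores to %p+2, a store to %p
/// is recorded with distance -2 and a store to %p+3 with distance +1.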
22681class RelatedStoreInsts {
22682public:
22683 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22684 : AllStores(AllStores) {
22685 reset(BaseInstrIdx);
22686 }
22687
22688 void reset(unsigned NewBaseInstr) {
22689 assert(NewBaseInstr < AllStores.size() &&
22690 "Instruction index out of bounds");
22691 BaseInstrIdx = NewBaseInstr;
22692 Instrs.clear();
22693 insertOrLookup(NewBaseInstr, 0);
22694 }
22695
22696 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22697 /// \p PtrDist.
22698 /// Does nothing if there is already a store with that \p PtrDist.
22699 /// \returns The previously associated Instruction index, or std::nullopt
22700 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22701 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22702 return Inserted ? std::nullopt : std::make_optional(It->second);
22703 }
22704
22705 using DistToInstMap = std::map<int64_t, unsigned>;
22706 const DistToInstMap &getStores() const { return Instrs; }
22707
22708 /// If \p SI is related to this group of stores, return the distance of its
22709 /// pointer operand to the one of the group's BaseInstr.
22710 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22711 ScalarEvolution &SE) const {
22712 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22713 return getPointersDiff(
22714 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22715 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22716 /*StrictCheck=*/true);
22717 }
22718
22719 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22720 /// Stores whose index is less than \p MinSafeIdx will be dropped.
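/// For example (illustrative): when rebasing onto a store whose old distance
/// was DistFromCurBase == 2, a surviving store previously recorded at distance
/// 3 is re-inserted at distance 3 - 2 == 1.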
22721 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22722 int64_t DistFromCurBase) {
22723 DistToInstMap PrevSet = std::move(Instrs);
22724 reset(NewBaseInstIdx);
22725
22726 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22727 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22728 // reference.
22729 for (auto [Dist, InstIdx] : PrevSet) {
22730 if (InstIdx >= MinSafeIdx)
22731 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22732 }
22733 }
22734
22735 /// Remove all stores that have been vectorized from this group.
22736 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22737 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22738 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22739 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22740 });
22741
22742 // Get a forward iterator pointing after the last vectorized store and erase
22743 // all stores before it so we don't try to vectorize them again.
22744 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22745 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22746 }
22747
22748private:
22749 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22750 unsigned BaseInstrIdx;
22751
22752 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22753 DistToInstMap Instrs;
22754
22755 /// Reference to all the stores in the BB being analyzed.
22756 ArrayRef<StoreInst *> AllStores;
22757};
22758
22759} // end anonymous namespace
22760
22761bool SLPVectorizerPass::vectorizeStores(
22762 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22763 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
22764 &Visited) {
22765 // We may run into multiple chains that merge into a single chain. We mark the
22766 // stores that we vectorized so that we don't visit the same store twice.
22767 BoUpSLP::ValueSet VectorizedStores;
22768 bool Changed = false;
22769
22770 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
22771 int64_t PrevDist = -1;
22772 BoUpSLP::ValueList Operands;
22773 // Collect the chain into a list.
22774 for (auto [Idx, Data] : enumerate(StoreSeq)) {
22775 auto &[Dist, InstIdx] = Data;
22776 if (Operands.empty() || Dist - PrevDist == 1) {
22777 Operands.push_back(Stores[InstIdx]);
22778 PrevDist = Dist;
22779 if (Idx != StoreSeq.size() - 1)
22780 continue;
22781 }
22782 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
22783 Operands.clear();
22784 Operands.push_back(Stores[InstIdx]);
22785 PrevDist = Dist;
22786 });
22787
22788 if (Operands.size() <= 1 ||
22789 !Visited
22790 .insert({Operands.front(),
22791 cast<StoreInst>(Operands.front())->getValueOperand(),
22792 Operands.back(),
22793 cast<StoreInst>(Operands.back())->getValueOperand(),
22794 Operands.size()})
22795 .second)
22796 continue;
22797
22798 unsigned MaxVecRegSize = R.getMaxVecRegSize();
22799 unsigned EltSize = R.getVectorElementSize(Operands[0]);
22800 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
22801
22802 unsigned MaxVF =
22803 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
22804 auto *Store = cast<StoreInst>(Operands[0]);
22805 Type *StoreTy = Store->getValueOperand()->getType();
22806 Type *ValueTy = StoreTy;
22807 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
22808 ValueTy = Trunc->getSrcTy();
22809 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
22810 // getStoreMinimumVF only support scalar type as arguments. As a result,
22811 // we need to use the element type of StoreTy and ValueTy to retrieve the
22812 // VF and then transform it back.
22813 // Remember: VF is defined as the number we want to vectorize, not the
22814 // number of elements in the final vector.
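// Illustrative example (numbers hypothetical): with REVEC, if StoreTy is
// <4 x i32> and getStoreMinimumVF() returns 8 for the scalar i32 type, then
// MinVF becomes 8 / getNumElements(StoreTy) = 2, i.e. two <4 x i32> stores.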
22815 Type *StoreScalarTy = StoreTy->getScalarType();
22816 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
22817 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
22818 ValueTy->getScalarType()));
22819 MinVF /= getNumElements(StoreTy);
22820 MinVF = std::max<unsigned>(2, MinVF);
22821
22822 if (MaxVF < MinVF) {
22823 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22824 << ") < "
22825 << "MinVF (" << MinVF << ")\n");
22826 continue;
22827 }
22828
22829 unsigned NonPowerOf2VF = 0;
22830 if (VectorizeNonPowerOf2) {
22831 // First try vectorizing with a non-power-of-2 VF. At the moment, only
22832 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
22833 // lanes are used.
22834 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
22835 if (has_single_bit(CandVF + 1)) {
22836 NonPowerOf2VF = CandVF;
22837 assert(NonPowerOf2VF != MaxVF &&
22838 "Non-power-of-2 VF should not be equal to MaxVF");
22839 }
22840 }
22841
22842 // MaxRegVF represents the number of instructions (scalar, or vector in
22843 // case of revec) that can be vectorized to naturally fit in a vector
22844 // register.
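// E.g. (illustrative) with a 128-bit vector register and 32-bit elements,
// MaxElts is 4, so MaxRegVF will not exceed 4 scalar stores per register.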
22845 unsigned MaxRegVF = MaxVF;
22846
22847 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
22848 if (MaxVF < MinVF) {
22849 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22850 << ") < "
22851 << "MinVF (" << MinVF << ")\n");
22852 continue;
22853 }
22854
22855 SmallVector<unsigned> CandidateVFs;
22856 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
22857 VF = divideCeil(VF, 2))
22858 CandidateVFs.push_back(VF);
22859
22860 unsigned End = Operands.size();
22861 unsigned Repeat = 0;
22862 constexpr unsigned MaxAttempts = 4;
22863 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
22864 for (std::pair<unsigned, unsigned> &P : RangeSizes)
22865 P.first = P.second = 1;
22866 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
22867 auto IsNotVectorized = [](bool First,
22868 const std::pair<unsigned, unsigned> &P) {
22869 return First ? P.first > 0 : P.second > 0;
22870 };
22871 auto IsVectorized = [](bool First,
22872 const std::pair<unsigned, unsigned> &P) {
22873 return First ? P.first == 0 : P.second == 0;
22874 };
22875 auto VFIsProfitable = [](bool First, unsigned Size,
22876 const std::pair<unsigned, unsigned> &P) {
22877 return First ? Size >= P.first : Size >= P.second;
22878 };
22879 auto FirstSizeSame = [](unsigned Size,
22880 const std::pair<unsigned, unsigned> &P) {
22881 return Size == P.first;
22882 };
22883 while (true) {
22884 ++Repeat;
22885 bool RepeatChanged = false;
22886 bool AnyProfitableGraph = false;
22887 for (unsigned VF : CandidateVFs) {
22888 AnyProfitableGraph = false;
22889 unsigned FirstUnvecStore =
22890 std::distance(RangeSizes.begin(),
22891 find_if(RangeSizes, std::bind(IsNotVectorized,
22892 VF >= MaxRegVF, _1)));
22893
22894 // Form slices of size VF starting from FirstUnvecStore and try to
22895 // vectorize them.
22896 while (FirstUnvecStore < End) {
22897 unsigned FirstVecStore = std::distance(
22898 RangeSizes.begin(),
22899 find_if(RangeSizes.drop_front(FirstUnvecStore),
22900 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
22901 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
22902 for (unsigned SliceStartIdx = FirstUnvecStore;
22903 SliceStartIdx + VF <= MaxSliceEnd;) {
22904 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
22905 VF >= MaxRegVF)) {
22906 ++SliceStartIdx;
22907 continue;
22908 }
22909 ArrayRef<Value *> Slice =
22910 ArrayRef(Operands).slice(SliceStartIdx, VF);
22911 assert(all_of(Slice,
22912 [&](Value *V) {
22913 return cast<StoreInst>(V)
22914 ->getValueOperand()
22915 ->getType() ==
22916 cast<StoreInst>(Slice.front())
22917 ->getValueOperand()
22918 ->getType();
22919 }) &&
22920 "Expected all operands of same type.");
22921 if (!NonSchedulable.empty()) {
22922 auto [NonSchedSizeMax, NonSchedSizeMin] =
22923 NonSchedulable.lookup(Slice.front());
22924 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
22925 // VF is too ambitious. Try to vectorize another slice before
22926 // trying a smaller VF.
22927 SliceStartIdx += NonSchedSizeMax;
22928 continue;
22929 }
22930 }
22931 unsigned TreeSize;
22932 std::optional<bool> Res =
22933 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
22934 if (!Res) {
22935 // Update the range of non schedulable VFs for slices starting
22936 // at SliceStartIdx.
22937 NonSchedulable
22938 .try_emplace(Slice.front(), std::make_pair(VF, VF))
22939 .first->getSecond()
22940 .second = VF;
22941 } else if (*Res) {
22942 // Mark the vectorized stores so that we don't vectorize them
22943 // again.
22944 VectorizedStores.insert_range(Slice);
22947 AnyProfitableGraph = RepeatChanged = Changed = true;
22948 // If we vectorized initial block, no need to try to vectorize
22949 // it again.
22950 for (std::pair<unsigned, unsigned> &P :
22951 RangeSizes.slice(SliceStartIdx, VF))
22952 P.first = P.second = 0;
22953 if (SliceStartIdx < FirstUnvecStore + MinVF) {
22954 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
22955 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
22956 P.first = P.second = 0;
22957 FirstUnvecStore = SliceStartIdx + VF;
22958 }
22959 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
22960 for (std::pair<unsigned, unsigned> &P :
22961 RangeSizes.slice(SliceStartIdx + VF,
22962 MaxSliceEnd - (SliceStartIdx + VF)))
22963 P.first = P.second = 0;
22964 if (MaxSliceEnd == End)
22965 End = SliceStartIdx;
22966 MaxSliceEnd = SliceStartIdx;
22967 }
22968 SliceStartIdx += VF;
22969 continue;
22970 }
22971 if (VF > 2 && Res &&
22972 !all_of(RangeSizes.slice(SliceStartIdx, VF),
22973 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
22974 _1))) {
22975 SliceStartIdx += VF;
22976 continue;
22977 }
22978 // For very big VFs, check that we are not rebuilding the same trees, just
22979 // with a larger number of elements.
22980 if (VF > MaxRegVF && TreeSize > 1 &&
22981 all_of(RangeSizes.slice(SliceStartIdx, VF),
22982 std::bind(FirstSizeSame, TreeSize, _1))) {
22983 SliceStartIdx += VF;
22984 while (SliceStartIdx != MaxSliceEnd &&
22985 RangeSizes[SliceStartIdx].first == TreeSize)
22986 ++SliceStartIdx;
22987 continue;
22988 }
22989 if (TreeSize > 1) {
22990 for (std::pair<unsigned, unsigned> &P :
22991 RangeSizes.slice(SliceStartIdx, VF)) {
22992 if (VF >= MaxRegVF)
22993 P.second = std::max(P.second, TreeSize);
22994 else
22995 P.first = std::max(P.first, TreeSize);
22996 }
22997 }
22998 ++SliceStartIdx;
22999 AnyProfitableGraph = true;
23000 }
23001 if (FirstUnvecStore >= End)
23002 break;
23003 if (MaxSliceEnd - FirstUnvecStore < VF &&
23004 MaxSliceEnd - FirstUnvecStore >= MinVF)
23005 AnyProfitableGraph = true;
23006 FirstUnvecStore = std::distance(
23007 RangeSizes.begin(),
23008 find_if(RangeSizes.drop_front(MaxSliceEnd),
23009 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23010 }
23011 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23012 break;
23013 }
23014 // All values vectorized - exit.
23015 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23016 return P.first == 0 && P.second == 0;
23017 }))
23018 break;
23019 // Check if tried all attempts or no need for the last attempts at all.
23020 if (Repeat >= MaxAttempts ||
23021 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23022 break;
23023 constexpr unsigned StoresLimit = 64;
23024 const unsigned MaxTotalNum = std::min<unsigned>(
23025 Operands.size(),
23026 static_cast<unsigned>(
23027 End -
23028 std::distance(
23029 RangeSizes.begin(),
23030 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23031 1));
23032 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23033 unsigned Limit =
23034 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23035 CandidateVFs.clear();
23036 if (bit_floor(Limit) == VF)
23037 CandidateVFs.push_back(Limit);
23038 if (VF > MaxTotalNum || VF >= StoresLimit)
23039 break;
23040 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23041 if (P.first != 0)
23042 P.first = std::max(P.second, P.first);
23043 }
23044 // Last attempt to vectorize max number of elements, if all previous
23045 // attempts were unsuccessful because of the cost issues.
23046 CandidateVFs.push_back(VF);
23047 }
23048 }
23049 };
23050
23051 /// Groups of stores to vectorize
23052 SmallVector<RelatedStoreInsts> SortedStores;
23053
23054 // Inserts the specified store SI with the given index Idx to the set of the
23055 // stores. If a store with the same distance is already present - stop
23056 // insertion and try to vectorize the stores collected so far. If some stores
23057 // from this sequence were not vectorized - try to vectorize them together with
23058 // the new store later. But this logic is applied only to the stores that come
23059 // before the previous store with the same distance.
23060 // Example:
23061 // 1. store x, %p
23062 // 2. store y, %p+1
23063 // 3. store z, %p+2
23064 // 4. store a, %p
23065 // 5. store b, %p+3
23066 // - Scan this from the last to first store. The very first bunch of stores is
23067 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23068 // vector).
23069 // - The next store in the list - #1 - has the same distance from store #5 as
23070 // the store #4.
23071 // - Try to vectorize sequence of stores 4,2,3,5.
23072 // - If all these stores are vectorized - just drop them.
23073 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23074 // - Start new stores sequence.
23075 // The new bunch of stores is {1, {1, 0}}.
23076 // - Add the stores from previous sequence, that were not vectorized.
23077 // Here we consider the stores in reverse order relative to how they appear in
23078 // the IR (Stores is already reversed, see the vectorizeStoreChains() function).
23079 // Store #3 can be added -> comes after store #4 with the same distance as
23080 // store #1.
23081 // Store #5 cannot be added - comes before store #4.
23082 // This logic helps to improve compile time: we assume that the stores after
23083 // the previous store with the same distance most likely have memory
23084 // dependencies, so there is no need to waste compile time trying to vectorize them.
23085 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23086 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23087 std::optional<int64_t> PtrDist;
23088 auto *RelatedStores = find_if(
23089 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23090 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23091 return PtrDist.has_value();
23092 });
23093
23094 // We did not find a comparable store, start a new group.
23095 if (RelatedStores == SortedStores.end()) {
23096 SortedStores.emplace_back(Idx, Stores);
23097 return;
23098 }
23099
23100 // If there is already a store in the group with the same PtrDiff, try to
23101 // vectorize the existing instructions before adding the current store.
23102 // Otherwise, insert this store and keep collecting.
23103 if (std::optional<unsigned> PrevInst =
23104 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23105 TryToVectorize(RelatedStores->getStores());
23106 RelatedStores->clearVectorizedStores(VectorizedStores);
23107 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23108 /*NewBaseInstIdx=*/Idx,
23109 /*DistFromCurBase=*/*PtrDist);
23110 }
23111 };
23112 Type *PrevValTy = nullptr;
23113 for (auto [I, SI] : enumerate(Stores)) {
23114 if (R.isDeleted(SI))
23115 continue;
23116 if (!PrevValTy)
23117 PrevValTy = SI->getValueOperand()->getType();
23118 // Check that we do not try to vectorize stores of different types.
23119 if (PrevValTy != SI->getValueOperand()->getType()) {
23120 for (RelatedStoreInsts &StoreSeq : SortedStores)
23121 TryToVectorize(StoreSeq.getStores());
23122 SortedStores.clear();
23123 PrevValTy = SI->getValueOperand()->getType();
23124 }
23125 FillStoresSet(I, SI);
23126 }
23127
23128 // Final vectorization attempt.
23129 for (RelatedStoreInsts &StoreSeq : SortedStores)
23130 TryToVectorize(StoreSeq.getStores());
23131
23132 return Changed;
23133}
23134
23135void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23136 // Initialize the collections. We will make a single pass over the block.
23137 Stores.clear();
23138 GEPs.clear();
23139
23140 // Visit the store and getelementptr instructions in BB and organize them in
23141 // Stores and GEPs according to the underlying objects of their pointer
23142 // operands.
23143 for (Instruction &I : *BB) {
23144 // Ignore store instructions that are volatile or have a pointer operand
23145 // that doesn't point to a scalar type.
23146 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23147 if (!SI->isSimple())
23148 continue;
23149 if (!isValidElementType(SI->getValueOperand()->getType()))
23150 continue;
23151 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23152 }
23153
23154 // Ignore getelementptr instructions that have more than one index, a
23155 // constant index, or a pointer operand that doesn't point to a scalar
23156 // type.
23157 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23158 if (GEP->getNumIndices() != 1)
23159 continue;
23160 Value *Idx = GEP->idx_begin()->get();
23161 if (isa<Constant>(Idx))
23162 continue;
23163 if (!isValidElementType(Idx->getType()))
23164 continue;
23165 if (GEP->getType()->isVectorTy())
23166 continue;
23167 GEPs[GEP->getPointerOperand()].push_back(GEP);
23168 }
23169 }
23170}
23171
23172bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23173 bool MaxVFOnly) {
23174 if (VL.size() < 2)
23175 return false;
23176
23177 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23178 << VL.size() << ".\n");
23179
23180 // Check that all of the parts are instructions of the same type,
23181 // we permit an alternate opcode via InstructionsState.
23182 InstructionsState S = getSameOpcode(VL, *TLI);
23183 if (!S)
23184 return false;
23185
23186 Instruction *I0 = S.getMainOp();
23187 // Make sure invalid types (including vector type) are rejected before
23188 // determining vectorization factor for scalar instructions.
23189 for (Value *V : VL) {
23190 Type *Ty = V->getType();
23191 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
23192 // NOTE: the following will give user internal llvm type name, which may
23193 // not be useful.
23194 R.getORE()->emit([&]() {
23195 std::string TypeStr;
23196 raw_string_ostream OS(TypeStr);
23197 Ty->print(OS);
23198 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23199 << "Cannot SLP vectorize list: type "
23200 << TypeStr + " is unsupported by vectorizer";
23201 });
23202 return false;
23203 }
23204 }
23205
23206 Type *ScalarTy = getValueType(VL[0]);
23207 unsigned Sz = R.getVectorElementSize(I0);
23208 unsigned MinVF = R.getMinVF(Sz);
23209 unsigned MaxVF = std::max<unsigned>(
23210 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23211 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23212 if (MaxVF < 2) {
23213 R.getORE()->emit([&]() {
23214 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23215 << "Cannot SLP vectorize list: vectorization factor "
23216 << "less than 2 is not supported";
23217 });
23218 return false;
23219 }
23220
23221 bool Changed = false;
23222 bool CandidateFound = false;
23223 InstructionCost MinCost = SLPCostThreshold.getValue();
23224
23225 unsigned NextInst = 0, MaxInst = VL.size();
23226 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23227 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23228 // No actual vectorization should happen, if number of parts is the same as
23229 // provided vectorization factor (i.e. the scalar type is used for vector
23230 // code during codegen).
23231 auto *VecTy = getWidenedType(ScalarTy, VF);
23232 if (TTI->getNumberOfParts(VecTy) == VF)
23233 continue;
23234 for (unsigned I = NextInst; I < MaxInst; ++I) {
23235 unsigned ActualVF = std::min(MaxInst - I, VF);
23236
23237 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23238 continue;
23239
23240 if (MaxVFOnly && ActualVF < MaxVF)
23241 break;
23242 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23243 break;
23244
23245 SmallVector<Value *> Ops(ActualVF, nullptr);
23246 unsigned Idx = 0;
23247 for (Value *V : VL.drop_front(I)) {
23248 // Check that a previous iteration of this loop did not delete the
23249 // Value.
23250 if (auto *Inst = dyn_cast<Instruction>(V);
23251 !Inst || !R.isDeleted(Inst)) {
23252 Ops[Idx] = V;
23253 ++Idx;
23254 if (Idx == ActualVF)
23255 break;
23256 }
23257 }
23258 // Not enough vectorizable instructions - exit.
23259 if (Idx != ActualVF)
23260 break;
23261
23262 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23263 << "\n");
23264
23265 R.buildTree(Ops);
23266 if (R.isTreeTinyAndNotFullyVectorizable())
23267 continue;
23268 if (R.isProfitableToReorder()) {
23269 R.reorderTopToBottom();
23270 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23271 }
23272 R.transformNodes();
23273 R.buildExternalUses();
23274
23275 R.computeMinimumValueSizes();
23276 InstructionCost Cost = R.getTreeCost();
23277 CandidateFound = true;
23278 MinCost = std::min(MinCost, Cost);
23279
23280 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23281 << " for VF=" << ActualVF << "\n");
23282 if (Cost < -SLPCostThreshold) {
23283 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23284 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23285 cast<Instruction>(Ops[0]))
23286 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23287 << " and with tree size "
23288 << ore::NV("TreeSize", R.getTreeSize()));
23289
23290 R.vectorizeTree();
23291 // Move to the next bundle.
23292 I += VF - 1;
23293 NextInst = I + 1;
23294 Changed = true;
23295 }
23296 }
23297 }
23298
23299 if (!Changed && CandidateFound) {
23300 R.getORE()->emit([&]() {
23301 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23302 << "List vectorization was possible but not beneficial with cost "
23303 << ore::NV("Cost", MinCost) << " >= "
23304 << ore::NV("Threshold", -SLPCostThreshold);
23305 });
23306 } else if (!Changed) {
23307 R.getORE()->emit([&]() {
23308 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23309 << "Cannot SLP vectorize list: vectorization was impossible"
23310 << " with available vectorization factors";
23311 });
23312 }
23313 return Changed;
23314}
23315
23316namespace {
23317
23318/// Model horizontal reductions.
23319///
23320/// A horizontal reduction is a tree of reduction instructions that has values
23321/// that can be put into a vector as its leaves. For example:
23322///
23323/// mul mul mul mul
23324/// \ / \ /
23325/// + +
23326/// \ /
23327/// +
23328/// This tree has "mul" as its leaf values and "+" as its reduction
23329/// instructions. A reduction can feed into a store or a binary operation
23330/// feeding a phi.
23331/// ...
23332/// \ /
23333/// +
23334/// |
23335/// phi +=
23336///
23337/// Or:
23338/// ...
23339/// \ /
23340/// +
23341/// |
23342/// *p =
23343///
23344class HorizontalReduction {
23345 using ReductionOpsType = SmallVector<Value *, 16>;
23346 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23347 ReductionOpsListType ReductionOps;
23348 /// List of possibly reduced values.
23349 SmallVector<SmallVector<Value *>> ReducedVals;
23350 /// Maps reduced value to the corresponding reduction operation.
23351 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
23352 WeakTrackingVH ReductionRoot;
23353 /// The type of reduction operation.
23354 RecurKind RdxKind;
23355 /// Checks if the optimization of original scalar identity operations on
23356 /// matched horizontal reductions is enabled and allowed.
23357 bool IsSupportedHorRdxIdentityOp = false;
23358 /// The minimum number of the reduced values.
23359 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23360 /// Contains vector values for reduction including their scale factor and
23361 /// signedness.
23362 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23363
23364 static bool isCmpSelMinMax(Instruction *I) {
23365 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23366 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
23367 }
23368
23369 // And/or are potentially poison-safe logical patterns like:
23370 // select x, y, false
23371 // select x, true, y
23372 static bool isBoolLogicOp(Instruction *I) {
23373 return isa<SelectInst>(I) &&
23374 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23375 }
23376
23377 /// Checks if instruction is associative and can be vectorized.
23378 static bool isVectorizable(RecurKind Kind, Instruction *I,
23379 bool TwoElementReduction = false) {
23380 if (Kind == RecurKind::None)
23381 return false;
23382
23383 // Integer ops that map to select instructions or intrinsics are fine.
23384 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23385 isBoolLogicOp(I))
23386 return true;
23387
23388 // No need to check for associativity if there are only 2 reduced values.
23389 if (TwoElementReduction)
23390 return true;
23391
23392 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23393 // FP min/max are associative except for NaN and -0.0. We do not
23394 // have to rule out -0.0 here because the intrinsic semantics do not
23395 // specify a fixed result for it.
23396 return I->getFastMathFlags().noNaNs();
23397 }
23398
23399 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23400 return true;
23401
23402 return I->isAssociative();
23403 }
23404
23405 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23406 // Poison-safe 'or' takes the form: select X, true, Y
23407 // To make that work with the normal operand processing, we skip the
23408 // true value operand.
23409 // TODO: Change the code and data structures to handle this without a hack.
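// E.g. for the poison-safe form %r = select i1 %x, i1 true, i1 %y, operand 1
// is the constant true, so the reduced operand we return instead is %y
// (operand 2).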
23410 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23411 return I->getOperand(2);
23412 return I->getOperand(Index);
23413 }
23414
23415 /// Creates reduction operation with the current opcode.
23416 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23417 Value *RHS, const Twine &Name, bool UseSelect) {
23418 Type *OpTy = LHS->getType();
23419 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23420 switch (Kind) {
23421 case RecurKind::Or: {
23422 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23423 return Builder.CreateSelect(
23424 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23425 RHS, Name);
23426 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23427 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23428 Name);
23429 }
23430 case RecurKind::And: {
23431 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23432 return Builder.CreateSelect(
23433 LHS, RHS,
23434 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
23435 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23436 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23437 Name);
23438 }
23439 case RecurKind::Add:
23440 case RecurKind::Mul:
23441 case RecurKind::Xor:
23442 case RecurKind::FAdd:
23443 case RecurKind::FMul: {
23444 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23445 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23446 Name);
23447 }
23448 case RecurKind::SMax:
23449 case RecurKind::SMin:
23450 case RecurKind::UMax:
23451 case RecurKind::UMin:
23452 if (UseSelect) {
23453 CmpInst::Predicate Pred = getMinMaxReductionPredicate(Kind);
23454 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23455 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
23456 }
23457 [[fallthrough]];
23458 case RecurKind::FMax:
23459 case RecurKind::FMin:
23460 case RecurKind::FMaximum:
23461 case RecurKind::FMinimum:
23462 case RecurKind::FMaximumNum:
23463 case RecurKind::FMinimumNum: {
23464 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(Kind);
23465 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23466 }
23467 default:
23468 llvm_unreachable("Unknown reduction operation.");
23469 }
23470 }
23471
23472 /// Creates reduction operation with the current opcode with the IR flags
23473 /// from \p ReductionOps, dropping nuw/nsw flags.
23474 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23475 Value *RHS, const Twine &Name,
23476 const ReductionOpsListType &ReductionOps) {
23477 bool UseSelect = ReductionOps.size() == 2 ||
23478 // Logical or/and.
23479 (ReductionOps.size() == 1 &&
23480 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23481 assert((!UseSelect || ReductionOps.size() != 2 ||
23482 isa<SelectInst>(ReductionOps[1][0])) &&
23483 "Expected cmp + select pairs for reduction");
23484 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23485 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23486 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23487 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23488 /*IncludeWrapFlags=*/false);
23489 propagateIRFlags(Op, ReductionOps[1], nullptr,
23490 /*IncludeWrapFlags=*/false);
23491 return Op;
23492 }
23493 }
23494 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23495 return Op;
23496 }
23497
23498public:
23499 static RecurKind getRdxKind(Value *V) {
23500 auto *I = dyn_cast<Instruction>(V);
23501 if (!I)
23502 return RecurKind::None;
23503 if (match(I, m_Add(m_Value(), m_Value())))
23504 return RecurKind::Add;
23505 if (match(I, m_Mul(m_Value(), m_Value())))
23506 return RecurKind::Mul;
23507 if (match(I, m_And(m_Value(), m_Value())) ||
23508 match(I, m_LogicalAnd(m_Value(), m_Value())))
23509 return RecurKind::And;
23510 if (match(I, m_Or(m_Value(), m_Value())) ||
23511 match(I, m_LogicalOr(m_Value(), m_Value())))
23512 return RecurKind::Or;
23513 if (match(I, m_Xor(m_Value(), m_Value())))
23514 return RecurKind::Xor;
23515 if (match(I, m_FAdd(m_Value(), m_Value())))
23516 return RecurKind::FAdd;
23517 if (match(I, m_FMul(m_Value(), m_Value())))
23518 return RecurKind::FMul;
23519
23520 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
23521 return RecurKind::FMax;
23522 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
23523 return RecurKind::FMin;
23524
23525 if (match(I, m_FMaximum(m_Value(), m_Value())))
23526 return RecurKind::FMaximum;
23527 if (match(I, m_FMinimum(m_Value(), m_Value())))
23528 return RecurKind::FMinimum;
23529 // This matches either cmp+select or intrinsics. SLP is expected to handle
23530 // either form.
23531 // TODO: If we are canonicalizing to intrinsics, we can remove several
23532 // special-case paths that deal with selects.
23533 if (match(I, m_SMax(m_Value(), m_Value())))
23534 return RecurKind::SMax;
23535 if (match(I, m_SMin(m_Value(), m_Value())))
23536 return RecurKind::SMin;
23537 if (match(I, m_UMax(m_Value(), m_Value())))
23538 return RecurKind::UMax;
23539 if (match(I, m_UMin(m_Value(), m_Value())))
23540 return RecurKind::UMin;
23541
23542 if (auto *Select = dyn_cast<SelectInst>(I)) {
23543 // Try harder: look for min/max pattern based on instructions producing
23544 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23545 // During the intermediate stages of SLP, it's very common to have
23546 // pattern like this (since optimizeGatherSequence is run only once
23547 // at the end):
23548 // %1 = extractelement <2 x i32> %a, i32 0
23549 // %2 = extractelement <2 x i32> %a, i32 1
23550 // %cond = icmp sgt i32 %1, %2
23551 // %3 = extractelement <2 x i32> %a, i32 0
23552 // %4 = extractelement <2 x i32> %a, i32 1
23553 // %select = select i1 %cond, i32 %3, i32 %4
23554 CmpPredicate Pred;
23555 Instruction *L1;
23556 Instruction *L2;
23557
23558 Value *LHS = Select->getTrueValue();
23559 Value *RHS = Select->getFalseValue();
23560 Value *Cond = Select->getCondition();
23561
23562 // TODO: Support inverse predicates.
23563 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23564 if (!isa<ExtractElementInst>(RHS) ||
23565 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23566 return RecurKind::None;
23567 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23568 if (!isa<ExtractElementInst>(LHS) ||
23569 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23570 return RecurKind::None;
23571 } else {
23572 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23573 return RecurKind::None;
23574 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23575 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23576 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23577 return RecurKind::None;
23578 }
23579
23580 switch (Pred) {
23581 default:
23582 return RecurKind::None;
23583 case CmpInst::ICMP_SGT:
23584 case CmpInst::ICMP_SGE:
23585 return RecurKind::SMax;
23586 case CmpInst::ICMP_SLT:
23587 case CmpInst::ICMP_SLE:
23588 return RecurKind::SMin;
23589 case CmpInst::ICMP_UGT:
23590 case CmpInst::ICMP_UGE:
23591 return RecurKind::UMax;
23592 case CmpInst::ICMP_ULT:
23593 case CmpInst::ICMP_ULE:
23594 return RecurKind::UMin;
23595 }
23596 }
23597 return RecurKind::None;
23598 }
23599
23600 /// Get the index of the first operand.
23601 static unsigned getFirstOperandIndex(Instruction *I) {
23602 return isCmpSelMinMax(I) ? 1 : 0;
23603 }
23604
23605private:
23606 /// Total number of operands in the reduction operation.
23607 static unsigned getNumberOfOperands(Instruction *I) {
23608 return isCmpSelMinMax(I) ? 3 : 2;
23609 }
23610
23611 /// Checks if the instruction is in basic block \p BB.
23612 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23613 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23614 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23615 auto *Sel = cast<SelectInst>(I);
23616 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23617 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23618 }
23619 return I->getParent() == BB;
23620 }
23621
23622 /// Expected number of uses for reduction operations/reduced values.
23623 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23624 if (IsCmpSelMinMax) {
23625 // SelectInst must be used twice while the condition op must have single
23626 // use only.
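// E.g. in a min/max reduction chain (illustrative):
//   %c = icmp sgt i32 %a, %b           ; single use, only by %m
//   %m = select i1 %c, i32 %a, i32 %b  ; two uses: the next cmp and select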
23627 if (auto *Sel = dyn_cast<SelectInst>(I))
23628 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23629 return I->hasNUses(2);
23630 }
23631
23632 // Arithmetic reduction operation must be used once only.
23633 return I->hasOneUse();
23634 }
23635
23636 /// Initializes the list of reduction operations.
23637 void initReductionOps(Instruction *I) {
23638 if (isCmpSelMinMax(I))
23639 ReductionOps.assign(2, ReductionOpsType());
23640 else
23641 ReductionOps.assign(1, ReductionOpsType());
23642 }
23643
23644 /// Add all reduction operations for the reduction instruction \p I.
23645 void addReductionOps(Instruction *I) {
23646 if (isCmpSelMinMax(I)) {
23647 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23648 ReductionOps[1].emplace_back(I);
23649 } else {
23650 ReductionOps[0].emplace_back(I);
23651 }
23652 }
23653
23654 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23655 int Sz = Data.size();
23656 auto *I = dyn_cast<Instruction>(Data.front());
23657 return Sz > 1 || isConstant(Data.front()) ||
23658 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23659 }
23660
23661public:
23662 HorizontalReduction() = default;
23663 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23664 : ReductionRoot(I), ReductionLimit(2) {
23665 RdxKind = HorizontalReduction::getRdxKind(I);
23666 ReductionOps.emplace_back().push_back(I);
23667 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23668 for (Value *V : Ops)
23669 ReducedValsToOps[V].push_back(I);
23670 }
23671
23672 bool matchReductionForOperands() const {
23673 // Analyze "regular" integer/FP types for reductions - no target-specific
23674 // types or pointers.
23675 assert(ReductionRoot && "Reduction root is not set!");
23676 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23677 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23678 return Ops.size() == 2;
23679 })))
23680 return false;
23681
23682 return true;
23683 }
23684
23685 /// Try to find a reduction tree.
23686 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23687 ScalarEvolution &SE, const DataLayout &DL,
23688 const TargetLibraryInfo &TLI) {
23689 RdxKind = HorizontalReduction::getRdxKind(Root);
23690 if (!isVectorizable(RdxKind, Root))
23691 return false;
23692
23693 // Analyze "regular" integer/FP types for reductions - no target-specific
23694 // types or pointers.
23695 Type *Ty = Root->getType();
23696 if (!isValidElementType(Ty) || Ty->isPointerTy())
23697 return false;
23698
23699 // Though the ultimate reduction may have multiple uses, its condition must
23700 // have only single use.
23701 if (auto *Sel = dyn_cast<SelectInst>(Root))
23702 if (!Sel->getCondition()->hasOneUse())
23703 return false;
23704
23705 ReductionRoot = Root;
23706
23707 // Iterate through all the operands of the possible reduction tree and
23708 // gather all the reduced values, sorting them by their value id.
23709 BasicBlock *BB = Root->getParent();
23710 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23711 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23712 1, std::make_pair(Root, 0));
23713 // Checks if the operands of the \p TreeN instruction are also reduction
23714 // operations or should be treated as reduced values or an extra argument,
23715 // which is not part of the reduction.
23716 auto CheckOperands = [&](Instruction *TreeN,
23717 SmallVectorImpl<Value *> &PossibleReducedVals,
23718 SmallVectorImpl<Instruction *> &ReductionOps,
23719 unsigned Level) {
23720 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23721 getNumberOfOperands(TreeN)))) {
23722 Value *EdgeVal = getRdxOperand(TreeN, I);
23723 ReducedValsToOps[EdgeVal].push_back(TreeN);
23724 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23725 // If the edge is not an instruction, or it is different from the main
23726 // reduction opcode or has too many uses - possible reduced value.
23727 // Also, do not try to reduce const values, if the operation is not
23728 // foldable.
23729 if (!EdgeInst || Level > RecursionMaxDepth ||
23730 getRdxKind(EdgeInst) != RdxKind ||
23731 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23732 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23733 !isVectorizable(RdxKind, EdgeInst) ||
23734 (R.isAnalyzedReductionRoot(EdgeInst) &&
23735 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23736 PossibleReducedVals.push_back(EdgeVal);
23737 continue;
23738 }
23739 ReductionOps.push_back(EdgeInst);
23740 }
23741 };
23742 // Try to regroup reduced values so that it gets more profitable to try to
23743 // reduce them. Values are grouped by their value ids, instructions - by
23744 // instruction op id and/or alternate op id, plus do extra analysis for
23745 // loads (grouping them by the distance between pointers) and cmp
23746 // instructions (grouping them by the predicate).
23747 SmallMapVector<
23748 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23749 8>
23750 PossibleReducedVals;
23751 initReductionOps(Root);
23752 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23753 SmallSet<size_t, 2> LoadKeyUsed;
23754
23755 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23756 Key = hash_combine(hash_value(LI->getParent()), Key);
23757 Value *Ptr =
23758 getUnderlyingObject(LI->getPointerOperand());
23759 if (!LoadKeyUsed.insert(Key).second) {
23760 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23761 if (LIt != LoadsMap.end()) {
23762 for (LoadInst *RLI : LIt->second) {
23763 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
23764 LI->getType(), LI->getPointerOperand(), DL, SE,
23765 /*StrictCheck=*/true))
23766 return hash_value(RLI->getPointerOperand());
23767 }
23768 for (LoadInst *RLI : LIt->second) {
23769 if (arePointersCompatible(RLI->getPointerOperand(),
23770 LI->getPointerOperand(), TLI)) {
23771 hash_code SubKey = hash_value(RLI->getPointerOperand());
23772 return SubKey;
23773 }
23774 }
23775 if (LIt->second.size() > 2) {
23776 hash_code SubKey =
23777 hash_value(LIt->second.back()->getPointerOperand());
23778 return SubKey;
23779 }
23780 }
23781 }
23782 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
23783 .first->second.push_back(LI);
23784 return hash_value(LI->getPointerOperand());
23785 };
23786
23787 while (!Worklist.empty()) {
23788 auto [TreeN, Level] = Worklist.pop_back_val();
23789 SmallVector<Value *> PossibleRedVals;
23790 SmallVector<Instruction *> PossibleReductionOps;
23791 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23792 addReductionOps(TreeN);
23793 // Add reduction values. The values are sorted for better vectorization
23794 // results.
23795 for (Value *V : PossibleRedVals) {
23796 size_t Key, Idx;
23797 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
23798 /*AllowAlternate=*/false);
23799 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23800 }
23801 for (Instruction *I : reverse(PossibleReductionOps))
23802 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23803 }
23804 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23805 // Sort values by the total number of value kinds so that the reduction starts
23806 // from the longest possible sequences of reduced values.
23807 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23808 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23809 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
23810 for (auto &Slice : PossibleRedVals) {
23811 PossibleRedValsVect.emplace_back();
23812 auto RedValsVect = Slice.second.takeVector();
23813 stable_sort(RedValsVect, llvm::less_second());
23814 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23815 PossibleRedValsVect.back().append(Data.second, Data.first);
23816 }
23817 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23818 return P1.size() > P2.size();
23819 });
23820 int NewIdx = -1;
23821 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
23822 if (NewIdx < 0 ||
23823 (!isGoodForReduction(Data) &&
23824 (!isa<LoadInst>(Data.front()) ||
23825 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
23826 getUnderlyingObject(
23827 cast<LoadInst>(Data.front())->getPointerOperand()) !=
23828 getUnderlyingObject(
23829 cast<LoadInst>(ReducedVals[NewIdx].front())
23830 ->getPointerOperand())))) {
23831 NewIdx = ReducedVals.size();
23832 ReducedVals.emplace_back();
23833 }
23834 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
23835 }
23836 }
23837 // Sort the reduced values by number of same/alternate opcode and/or pointer
23838 // operand.
23839 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
23840 return P1.size() > P2.size();
23841 });
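// E.g. (illustrative): if regrouping produced {4 loads from one base},
// {2 adds} and {1 other value}, the group of four is placed first, so the
// widest reduction attempt is tried before the smaller ones.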
23842 return true;
23843 }
23844
23845 /// Attempt to vectorize the tree found by matchAssociativeReduction.
23846 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23847 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
23848 constexpr unsigned RegMaxNumber = 4;
23849 constexpr unsigned RedValsMaxNumber = 128;
23850 // If there are a sufficient number of reduction values, reduce
23851 // to a nearby power-of-2. We can safely generate oversized
23852 // vectors and rely on the backend to split them to legal sizes.
23853 if (unsigned NumReducedVals = std::accumulate(
23854 ReducedVals.begin(), ReducedVals.end(), 0,
23855 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
23856 if (!isGoodForReduction(Vals))
23857 return Num;
23858 return Num + Vals.size();
23859 });
23860 NumReducedVals < ReductionLimit &&
23861 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
23862 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
23863 })) {
23864 for (ReductionOpsType &RdxOps : ReductionOps)
23865 for (Value *RdxOp : RdxOps)
23866 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
23867 return nullptr;
23868 }
23869
23870 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
23871 TargetFolder(DL));
23872 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
23873
23874 // Track the reduced values in case they are replaced by extractelement
23875 // instructions because of the vectorization.
23876 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
23877 ReducedVals.front().size());
23878
23879 // The compare instruction of a min/max is the insertion point for new
23880 // instructions and may be replaced with a new compare instruction.
23881 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
23882 assert(isa<SelectInst>(RdxRootInst) &&
23883 "Expected min/max reduction to have select root instruction");
23884 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
23885 assert(isa<Instruction>(ScalarCond) &&
23886 "Expected min/max reduction to have compare condition");
23887 return cast<Instruction>(ScalarCond);
23888 };
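// E.g. (illustrative min/max idiom):
//   %c = icmp slt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b   ; reduction root
// The returned insertion point is %c, not %m.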
23889
23890 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
23891 return isBoolLogicOp(cast<Instruction>(V));
23892 });
23893 // Return new VectorizedTree, based on previous value.
23894 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
23895 if (VectorizedTree) {
23896 // Update the final value in the reduction.
23897 Builder.SetCurrentDebugLocation(
23898 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
23899 if (AnyBoolLogicOp) {
23900 auto It = ReducedValsToOps.find(VectorizedTree);
23901 auto It1 = ReducedValsToOps.find(Res);
23902 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
23903 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
23904 (It != ReducedValsToOps.end() &&
23905 any_of(It->getSecond(), [&](Instruction *I) {
23906 return isBoolLogicOp(I) &&
23907 getRdxOperand(I, 0) == VectorizedTree;
23908 }))) {
23909 ;
23910 } else if (isGuaranteedNotToBePoison(Res, AC) ||
23911 (It1 != ReducedValsToOps.end() &&
23912 any_of(It1->getSecond(), [&](Instruction *I) {
23913 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
23914 }))) {
23915 std::swap(VectorizedTree, Res);
23916 } else {
23917 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
23918 }
23919 }
23920
23921 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
23922 ReductionOps);
23923 }
23924 // Initialize the final value in the reduction.
23925 return Res;
23926 };
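// Rough intuition (illustrative): when chaining i1 values of a logical
// and/or reduction, an operand is reused directly only if it is known not to
// be poison or already appears as the first operand of one of the original
// boolean ops; otherwise the operands are swapped or the LHS is frozen so
// poison cannot leak through the new, non-short-circuiting op.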
23927 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
23928 ReductionOps.front().size());
23929 for (ReductionOpsType &RdxOps : ReductionOps)
23930 for (Value *RdxOp : RdxOps) {
23931 if (!RdxOp)
23932 continue;
23933 IgnoreList.insert(RdxOp);
23934 }
23935 // Intersect the fast-math-flags from all reduction operations.
23936 FastMathFlags RdxFMF;
23937 RdxFMF.set();
23938 for (Value *U : IgnoreList)
23939 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
23940 RdxFMF &= FPMO->getFastMathFlags();
23941 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
23942
23943 // Need to track the reduced values, as they may be changed during the
23944 // vectorization of subvectors.
23945 for (ArrayRef<Value *> Candidates : ReducedVals)
23946 for (Value *V : Candidates)
23947 TrackedVals.try_emplace(V, V);
23948
23949 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
23950 Value *V) -> unsigned & {
23951 auto *It = MV.find(V);
23952 assert(It != MV.end() && "Unable to find given key.");
23953 return It->second;
23954 };
23955
23956 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
23957 // List of the values that were reduced in other trees as part of gather
23958 // nodes and thus require an extract if fully vectorized there.
23959 SmallPtrSet<Value *, 4> RequiredExtract;
23960 WeakTrackingVH VectorizedTree = nullptr;
23961 bool CheckForReusedReductionOps = false;
23962 // Try to vectorize elements based on their type.
23963 SmallVector<InstructionsState> States;
23964 for (ArrayRef<Value *> RV : ReducedVals)
23965 States.push_back(getSameOpcode(RV, TLI));
23966 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
23967 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
23968 InstructionsState S = States[I];
23969 SmallVector<Value *> Candidates;
23970 Candidates.reserve(2 * OrigReducedVals.size());
23971 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
23972 for (Value *ReducedVal : OrigReducedVals) {
23973 Value *RdxVal = TrackedVals.at(ReducedVal);
23974 // Check if the reduction value was overridden by an extractelement
23975 // instruction because of the vectorization, and exclude it if it is not
23976 // compatible with the other values.
23977 // Also check if the instruction was folded to a constant/other value.
23978 auto *Inst = dyn_cast<Instruction>(RdxVal);
23979 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
23980 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
23981 (S && !Inst))
23982 continue;
23983 Candidates.push_back(RdxVal);
23984 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
23985 }
23986 bool ShuffledExtracts = false;
23987 // Try to handle shuffled extractelements.
23988 if (S && S.getOpcode() == Instruction::ExtractElement &&
23989 !S.isAltShuffle() && I + 1 < E) {
23990 SmallVector<Value *> CommonCandidates(Candidates);
23991 for (Value *RV : ReducedVals[I + 1]) {
23992 Value *RdxVal = TrackedVals.at(RV);
23993 // Check if the reduction value was overridden by the
23994 // extractelement instruction because of the vectorization, and
23995 // exclude it if it is not compatible with other values.
23996 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
23997 if (!Inst)
23998 continue;
23999 CommonCandidates.push_back(RdxVal);
24000 TrackedToOrig.try_emplace(RdxVal, RV);
24001 }
24002 SmallVector<int> Mask;
24003 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24004 ++I;
24005 Candidates.swap(CommonCandidates);
24006 ShuffledExtracts = true;
24007 }
24008 }
24009
24010 // Emit code for constant values.
24011 if (Candidates.size() > 1 && allConstant(Candidates)) {
24012 Value *Res = Candidates.front();
24013 Value *OrigV = TrackedToOrig.at(Candidates.front());
24014 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24015 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24016 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24017 Value *OrigV = TrackedToOrig.at(VC);
24018 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24019 if (auto *ResI = dyn_cast<Instruction>(Res))
24020 V.analyzedReductionRoot(ResI);
24021 }
24022 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24023 continue;
24024 }
24025
24026 unsigned NumReducedVals = Candidates.size();
24027 if (NumReducedVals < ReductionLimit &&
24028 (NumReducedVals < 2 || !isSplat(Candidates)))
24029 continue;
24030
24031 // Check if we support repeated scalar values processing (optimization of
24032 // original scalar identity operations on matched horizontal reductions).
24033 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24034 RdxKind != RecurKind::FMul &&
24035 RdxKind != RecurKind::FMulAdd;
24036 // Gather same values.
24037 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24038 if (IsSupportedHorRdxIdentityOp)
24039 for (Value *V : Candidates) {
24040 Value *OrigV = TrackedToOrig.at(V);
24041 ++SameValuesCounter.try_emplace(OrigV).first->second;
24042 }
24043 // Used to check if the reduced values are used the same number of times. In
24044 // this case the compiler may produce better code. E.g. if reduced values are
24045 // aabbccdd (8 x values), then the first node of the tree will have a node
24046 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24047 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24048 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
24049 // x abcd) * 2.
24050 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24051 // this analysis, other operations may require an extra estimation of
24052 // the profitability.
24053 bool SameScaleFactor = false;
24054 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24055 SameValuesCounter.size() != Candidates.size();
24056 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24057 if (OptReusedScalars) {
24058 SameScaleFactor =
24059 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24060 RdxKind == RecurKind::Xor) &&
24061 all_of(drop_begin(SameValuesCounter),
24062 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24063 return P.second == SameValuesCounter.front().second;
24064 });
24065 Candidates.resize(SameValuesCounter.size());
24066 transform(SameValuesCounter, Candidates.begin(),
24067 [&](const auto &P) { return TrackedVals.at(P.first); });
24068 NumReducedVals = Candidates.size();
24069 // Have a reduction of the same element.
24070 if (NumReducedVals == 1) {
24071 Value *OrigV = TrackedToOrig.at(Candidates.front());
24072 unsigned Cnt = At(SameValuesCounter, OrigV);
24073 Value *RedVal =
24074 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24075 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24076 VectorizedVals.try_emplace(OrigV, Cnt);
24077 ExternallyUsedValues.insert(OrigV);
24078 continue;
24079 }
24080 }
24081
24082 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24083 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24084 const unsigned MaxElts = std::clamp<unsigned>(
24085 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24086 RegMaxNumber * RedValsMaxNumber);
24087
24088 unsigned ReduxWidth = NumReducedVals;
24089 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24090 unsigned NumParts, NumRegs;
24091 Type *ScalarTy = Candidates.front()->getType();
24092 ReduxWidth =
24093 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24094 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24095 NumParts = ::getNumberOfParts(TTI, Tp);
24096 NumRegs =
24097 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24098 while (NumParts > NumRegs) {
24099 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24100 ReduxWidth = bit_floor(ReduxWidth - 1);
24101 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24102 NumParts = ::getNumberOfParts(TTI, Tp);
24103 NumRegs =
24104 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24105 }
24106 if (NumParts > NumRegs / 2)
24107 ReduxWidth = bit_floor(ReduxWidth);
24108 return ReduxWidth;
24109 };
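// Rough illustration of the logic above: the width is first floored to a
// whole number of full vectors for the element type; while the widened type
// would still be split into more parts than the target has vector registers,
// the width keeps shrinking, and a final adjustment rounds it down to a
// power of two when register pressure is high.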
24110 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24111 ReduxWidth = GetVectorFactor(ReduxWidth);
24112 ReduxWidth = std::min(ReduxWidth, MaxElts);
24113
24114 unsigned Start = 0;
24115 unsigned Pos = Start;
24116 // Restarts vectorization attempt with lower vector factor.
24117 unsigned PrevReduxWidth = ReduxWidth;
24118 bool CheckForReusedReductionOpsLocal = false;
24119 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24120 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24121 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24122 // Check if any of the reduction ops are gathered. If so, it is worth
24123 // trying again with a smaller number of reduction ops.
24124 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24125 }
24126 ++Pos;
24127 if (Pos < NumReducedVals - ReduxWidth + 1)
24128 return IsAnyRedOpGathered;
24129 Pos = Start;
24130 --ReduxWidth;
24131 if (ReduxWidth > 1)
24132 ReduxWidth = GetVectorFactor(ReduxWidth);
24133 return IsAnyRedOpGathered;
24134 };
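// Rough picture (illustrative): with 10 candidates and ReduxWidth 8, window
// positions 0..2 are tried first; once they are exhausted, Pos resets to
// Start and ReduxWidth is lowered to the next full-vector factor, so the
// scan restarts with a narrower reduction.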
24135 bool AnyVectorized = false;
24136 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24137 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24138 ReduxWidth >= ReductionLimit) {
24139 // Dependency in tree of the reduction ops - drop this attempt, try
24140 // later.
24141 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24142 Start == 0) {
24143 CheckForReusedReductionOps = true;
24144 break;
24145 }
24146 PrevReduxWidth = ReduxWidth;
24147 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24148 // Been analyzed already - skip.
24149 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24150 (!has_single_bit(ReduxWidth) &&
24151 (IgnoredCandidates.contains(
24152 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24153 IgnoredCandidates.contains(
24154 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24155 bit_floor(ReduxWidth))))) ||
24156 V.areAnalyzedReductionVals(VL)) {
24157 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24158 continue;
24159 }
24160 // Early exit if any of the reduction values were deleted during
24161 // previous vectorization attempts.
24162 if (any_of(VL, [&V](Value *RedVal) {
24163 auto *RedValI = dyn_cast<Instruction>(RedVal);
24164 if (!RedValI)
24165 return false;
24166 return V.isDeleted(RedValI);
24167 }))
24168 break;
24169 V.buildTree(VL, IgnoreList);
24170 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24171 if (!AdjustReducedVals())
24172 V.analyzedReductionVals(VL);
24173 continue;
24174 }
24175 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24176 if (!AdjustReducedVals())
24177 V.analyzedReductionVals(VL);
24178 continue;
24179 }
24180 V.reorderTopToBottom();
24181 // No need to reorder the root node at all for reassociative reduction.
24182 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24183 VL.front()->getType()->isIntOrIntVectorTy() ||
24184 ReductionLimit > 2);
24185 // Keep extracted other reduction values, if they are used in the
24186 // vectorization trees.
24187 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24188 ExternallyUsedValues);
24189 // The reduction root is used as the insertion point for new
24190 // instructions, so set it as externally used to prevent it from being
24191 // deleted.
24192 LocalExternallyUsedValues.insert(ReductionRoot);
24193 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24194 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24195 continue;
24196 for (Value *V : ReducedVals[Cnt])
24197 if (isa<Instruction>(V))
24198 LocalExternallyUsedValues.insert(TrackedVals[V]);
24199 }
24200 if (!IsSupportedHorRdxIdentityOp) {
24201 // Number of uses of the candidates in the vector of values.
24202 assert(SameValuesCounter.empty() &&
24203 "Reused values counter map is not empty");
24204 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24205 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24206 continue;
24207 Value *V = Candidates[Cnt];
24208 Value *OrigV = TrackedToOrig.at(V);
24209 ++SameValuesCounter.try_emplace(OrigV).first->second;
24210 }
24211 }
24212 V.transformNodes();
24213 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
24214 // Gather externally used values.
24215 SmallPtrSet<Value *, 4> Visited;
24216 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24217 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24218 continue;
24219 Value *RdxVal = Candidates[Cnt];
24220 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24221 RdxVal = It->second;
24222 if (!Visited.insert(RdxVal).second)
24223 continue;
24224 // Check if the scalar was vectorized as part of the vectorization
24225 // tree but not the top node.
24226 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24227 LocalExternallyUsedValues.insert(RdxVal);
24228 continue;
24229 }
24230 Value *OrigV = TrackedToOrig.at(RdxVal);
24231 unsigned NumOps =
24232 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24233 if (NumOps != ReducedValsToOps.at(OrigV).size())
24234 LocalExternallyUsedValues.insert(RdxVal);
24235 }
24236 // Do not need the list of reused scalars in regular mode anymore.
24237 if (!IsSupportedHorRdxIdentityOp)
24238 SameValuesCounter.clear();
24239 for (Value *RdxVal : VL)
24240 if (RequiredExtract.contains(RdxVal))
24241 LocalExternallyUsedValues.insert(RdxVal);
24242 V.buildExternalUses(LocalExternallyUsedValues);
24243
24244 V.computeMinimumValueSizes();
24245
24246 // Estimate cost.
24247 InstructionCost ReductionCost =
24248 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
24249 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24250 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24251 << " for reduction\n");
24252 if (!Cost.isValid())
24253 break;
24254 if (Cost >= -SLPCostThreshold) {
24255 V.getORE()->emit([&]() {
24256 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24257 ReducedValsToOps.at(VL[0]).front())
24258 << "Vectorizing horizontal reduction is possible "
24259 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24260 << " and threshold "
24261 << ore::NV("Threshold", -SLPCostThreshold);
24262 });
24263 if (!AdjustReducedVals()) {
24264 V.analyzedReductionVals(VL);
24265 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24266 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24267 // Add subvectors of VL to the list of the analyzed values.
24268 for (unsigned VF = getFloorFullVectorNumberOfElements(
24269 *TTI, VL.front()->getType(), ReduxWidth - 1);
24270 VF >= ReductionLimit;
24271 VF = getFloorFullVectorNumberOfElements(
24272 *TTI, VL.front()->getType(), VF - 1)) {
24273 if (has_single_bit(VF) &&
24274 V.getCanonicalGraphSize() != V.getTreeSize())
24275 continue;
24276 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24277 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24278 }
24279 }
24280 }
24281 continue;
24282 }
24283
24284 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24285 << Cost << ". (HorRdx)\n");
24286 V.getORE()->emit([&]() {
24287 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24288 ReducedValsToOps.at(VL[0]).front())
24289 << "Vectorized horizontal reduction with cost "
24290 << ore::NV("Cost", Cost) << " and with tree size "
24291 << ore::NV("TreeSize", V.getTreeSize());
24292 });
24293
24294 Builder.setFastMathFlags(RdxFMF);
24295
24296 // Emit a reduction. If the root is a select (min/max idiom), the insert
24297 // point is the compare condition of that select.
24298 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24299 Instruction *InsertPt = RdxRootInst;
24300 if (IsCmpSelMinMax)
24301 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24302
24303 // Vectorize a tree.
24304 Value *VectorizedRoot = V.vectorizeTree(
24305 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24306 // Update TrackedToOrig mapping, since the tracked values might be
24307 // updated.
24308 for (Value *RdxVal : Candidates) {
24309 Value *OrigVal = TrackedToOrig.at(RdxVal);
24310 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24311 if (TransformedRdxVal != RdxVal)
24312 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24313 }
24314
24315 Builder.SetInsertPoint(InsertPt);
24316
24317 // To prevent poison from leaking across what used to be sequential,
24318 // safe, scalar boolean logic operations, the reduction operand must be
24319 // frozen.
24320 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24321 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24322
24323 // Emit code to correctly handle reused reduced values, if required.
24324 if (OptReusedScalars && !SameScaleFactor) {
24325 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24326 SameValuesCounter, TrackedToOrig);
24327 }
24328
24329 Type *ScalarTy = VL.front()->getType();
24330 Type *VecTy = VectorizedRoot->getType();
24331 Type *RedScalarTy = VecTy->getScalarType();
24332 VectorValuesAndScales.emplace_back(
24333 VectorizedRoot,
24334 OptReusedScalars && SameScaleFactor
24335 ? SameValuesCounter.front().second
24336 : 1,
24337 RedScalarTy != ScalarTy->getScalarType()
24338 ? V.isSignedMinBitwidthRootNode()
24339 : true);
24340
24341 // Count vectorized reduced values to exclude them from final reduction.
24342 for (Value *RdxVal : VL) {
24343 Value *OrigV = TrackedToOrig.at(RdxVal);
24344 if (IsSupportedHorRdxIdentityOp) {
24345 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24346 continue;
24347 }
24348 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24349 if (!V.isVectorized(RdxVal))
24350 RequiredExtract.insert(RdxVal);
24351 }
24352 Pos += ReduxWidth;
24353 Start = Pos;
24354 ReduxWidth = NumReducedVals - Pos;
24355 if (ReduxWidth > 1)
24356 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24357 AnyVectorized = true;
24358 }
24359 if (OptReusedScalars && !AnyVectorized) {
24360 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24361 Value *RdxVal = TrackedVals.at(P.first);
24362 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24363 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24364 VectorizedVals.try_emplace(P.first, P.second);
24365 }
24366 continue;
24367 }
24368 }
24369 if (!VectorValuesAndScales.empty())
24370 VectorizedTree = GetNewVectorizedTree(
24371 VectorizedTree,
24372 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24373 if (VectorizedTree) {
24374 // Reorder operands of bool logical op in the natural order to avoid
24375 // possible problem with poison propagation. If not possible to reorder
24376 // (both operands are originally RHS), emit an extra freeze instruction
24377 // for the LHS operand.
24378 // I.e., if we have original code like this:
24379 // RedOp1 = select i1 ?, i1 LHS, i1 false
24380 // RedOp2 = select i1 RHS, i1 ?, i1 false
24381
24382 // Then, we swap LHS/RHS to create a new op that matches the poison
24383 // semantics of the original code.
24384
24385 // If we have original code like this and both values could be poison:
24386 // RedOp1 = select i1 ?, i1 LHS, i1 false
24387 // RedOp2 = select i1 ?, i1 RHS, i1 false
24388
24389 // Then, we must freeze LHS in the new op.
24390 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
24391 Instruction *RedOp1,
24392 Instruction *RedOp2,
24393 bool InitStep) {
24394 if (!AnyBoolLogicOp)
24395 return;
24396 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24397 getRdxOperand(RedOp1, 0) == LHS ||
24398 isGuaranteedNotToBePoison(LHS, AC)))
24399 return;
24400 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24401 getRdxOperand(RedOp2, 0) == RHS ||
24402 isGuaranteedNotToBePoison(RHS, AC))) {
24403 std::swap(LHS, RHS);
24404 return;
24405 }
24406 if (LHS != VectorizedTree)
24407 LHS = Builder.CreateFreeze(LHS);
24408 };
24409 // Finish the reduction.
24410 // Need to add extra arguments and not vectorized possible reduction
24411 // values.
24412 // Try to avoid dependencies between the scalar remainders after
24413 // reductions.
24414 auto FinalGen =
24415 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24416 bool InitStep) {
24417 unsigned Sz = InstVals.size();
24418 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
24419 Sz % 2);
24420 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24421 Instruction *RedOp = InstVals[I + 1].first;
24422 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24423 Value *RdxVal1 = InstVals[I].second;
24424 Value *StableRdxVal1 = RdxVal1;
24425 auto It1 = TrackedVals.find(RdxVal1);
24426 if (It1 != TrackedVals.end())
24427 StableRdxVal1 = It1->second;
24428 Value *RdxVal2 = InstVals[I + 1].second;
24429 Value *StableRdxVal2 = RdxVal2;
24430 auto It2 = TrackedVals.find(RdxVal2);
24431 if (It2 != TrackedVals.end())
24432 StableRdxVal2 = It2->second;
24433 // To prevent poison from leaking across what used to be
24434 // sequential, safe, scalar boolean logic operations, the
24435 // reduction operand must be frozen.
24436 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24437 RedOp, InitStep);
24438 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24439 StableRdxVal2, "op.rdx", ReductionOps);
24440 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24441 }
24442 if (Sz % 2 == 1)
24443 ExtraReds[Sz / 2] = InstVals.back();
24444 return ExtraReds;
24445 };
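// Illustrative run (assumed remainders r0..r4): one FinalGen call yields
// [r0+r1, r2+r3, r4]; the loop below calls it again to get
// [(r0+r1)+(r2+r3), r4] and finally a single value, so the scalar remainder
// tree has logarithmic depth instead of one long dependency chain.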
24446 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24447 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24448 VectorizedTree);
24449 SmallPtrSet<Value *, 8> Visited;
24450 for (ArrayRef<Value *> Candidates : ReducedVals) {
24451 for (Value *RdxVal : Candidates) {
24452 if (!Visited.insert(RdxVal).second)
24453 continue;
24454 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24455 for (Instruction *RedOp :
24456 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24457 ExtraReductions.emplace_back(RedOp, RdxVal);
24458 }
24459 }
24460 // Iterate through all not-vectorized reduction values/extra arguments.
24461 bool InitStep = true;
24462 while (ExtraReductions.size() > 1) {
24463 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24464 FinalGen(ExtraReductions, InitStep);
24465 ExtraReductions.swap(NewReds);
24466 InitStep = false;
24467 }
24468 VectorizedTree = ExtraReductions.front().second;
24469
24470 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24471
24472 // The original scalar reduction is expected to have no remaining
24473 // uses outside the reduction tree itself. Assert that we got this
24474 // correct, replace internal uses with poison, and mark for eventual
24475 // deletion.
24476#ifndef NDEBUG
24477 SmallPtrSet<Value *, 4> IgnoreSet;
24478 for (ArrayRef<Value *> RdxOps : ReductionOps)
24479 IgnoreSet.insert_range(RdxOps);
24480#endif
24481 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24482 for (Value *Ignore : RdxOps) {
24483 if (!Ignore)
24484 continue;
24485#ifndef NDEBUG
24486 for (auto *U : Ignore->users()) {
24487 assert(IgnoreSet.count(U) &&
24488 "All users must be in the reduction ops list.");
24489 }
24490#endif
24491 if (!Ignore->use_empty()) {
24492 Value *P = PoisonValue::get(Ignore->getType());
24493 Ignore->replaceAllUsesWith(P);
24494 }
24495 }
24496 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24497 }
24498 } else if (!CheckForReusedReductionOps) {
24499 for (ReductionOpsType &RdxOps : ReductionOps)
24500 for (Value *RdxOp : RdxOps)
24501 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24502 }
24503 return VectorizedTree;
24504 }
24505
24506private:
24507 /// Creates the reduction from the given \p Vec vector value with the given
24508 /// scale \p Scale and signedness \p IsSigned.
24509 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24510 Value *Vec, unsigned Scale, bool IsSigned,
24511 Type *DestTy) {
24512 Value *Rdx;
24513 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24514 unsigned DestTyNumElements = getNumElements(VecTy);
24515 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24516 Rdx = PoisonValue::get(
24517 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24518 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24519 // Do reduction for each lane.
24520 // e.g., do reduce add for
24521 // VL[0] = <4 x Ty> <a, b, c, d>
24522 // VL[1] = <4 x Ty> <e, f, g, h>
24523 // Lane[0] = <2 x Ty> <a, e>
24524 // Lane[1] = <2 x Ty> <b, f>
24525 // Lane[2] = <2 x Ty> <c, g>
24526 // Lane[3] = <2 x Ty> <d, h>
24527 // result[0] = reduce add Lane[0]
24528 // result[1] = reduce add Lane[1]
24529 // result[2] = reduce add Lane[2]
24530 // result[3] = reduce add Lane[3]
24531 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24532 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24533 Rdx = Builder.CreateInsertElement(
24534 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24535 }
24536 } else {
24537 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24538 }
24539 if (Rdx->getType() != DestTy)
24540 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24541 // Improved analysis for add/fadd/xor reductions with same scale
24542 // factor for all operands of reductions. We can emit scalar ops for
24543 // them instead.
24544 if (Scale > 1)
24545 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24546 return Rdx;
24547 }
24548
24549 /// Calculate the cost of a reduction.
24550 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24551 ArrayRef<Value *> ReducedVals,
24552 bool IsCmpSelMinMax, FastMathFlags FMF,
24553 const BoUpSLP &R) {
24554 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24555 Type *ScalarTy = ReducedVals.front()->getType();
24556 unsigned ReduxWidth = ReducedVals.size();
24557 FixedVectorType *VectorTy = R.getReductionType();
24558 InstructionCost VectorCost = 0, ScalarCost;
24559 // If all of the reduced values are constant, the vector cost is 0, since
24560 // the reduction value can be calculated at compile time.
24561 bool AllConsts = allConstant(ReducedVals);
24562 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24563 InstructionCost Cost = 0;
24564 // Scalar cost is repeated for N-1 elements.
24565 int Cnt = ReducedVals.size();
24566 for (Value *RdxVal : ReducedVals) {
24567 if (Cnt == 1)
24568 break;
24569 --Cnt;
24570 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24571 Cost += GenCostFn();
24572 continue;
24573 }
24574 InstructionCost ScalarCost = 0;
24575 for (User *U : RdxVal->users()) {
24576 auto *RdxOp = cast<Instruction>(U);
24577 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24578 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24579 continue;
24580 }
24581 ScalarCost = InstructionCost::getInvalid();
24582 break;
24583 }
24584 if (ScalarCost.isValid())
24585 Cost += ScalarCost;
24586 else
24587 Cost += GenCostFn();
24588 }
24589 return Cost;
24590 };
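// E.g. (illustrative): reducing 4 scalar values needs 3 scalar reduction ops
// (N - 1), so the loop above sums per-value costs for all but the last one,
// falling back to GenCostFn when a value has too many uses or one of its
// users cannot be costed directly.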
24591 // Require the reduction cost if:
24592 // 1. This type is not a full register type and there is no other vector
24593 // with the same type in the storage (first vector with a small type).
24594 // 2. The storage does not have any vector with full vector use (first
24595 // vector with full register use).
24596 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24597 switch (RdxKind) {
24598 case RecurKind::Add:
24599 case RecurKind::Mul:
24600 case RecurKind::Or:
24601 case RecurKind::And:
24602 case RecurKind::Xor:
24603 case RecurKind::FAdd:
24604 case RecurKind::FMul: {
24605 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24606 if (!AllConsts) {
24607 if (DoesRequireReductionOp) {
24608 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24609 assert(SLPReVec && "FixedVectorType is not expected.");
24610 unsigned ScalarTyNumElements = VecTy->getNumElements();
24611 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24612 VectorCost += TTI->getShuffleCost(
24615 ReducedVals.size()),
24616 VectorTy,
24617 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24618 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24619 FMF, CostKind);
24620 }
24621 VectorCost += TTI->getScalarizationOverhead(
24622 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24623 /*Extract*/ false, TTI::TCK_RecipThroughput);
24624 } else {
24625 Type *RedTy = VectorTy->getElementType();
24626 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24627 std::make_pair(RedTy, true));
24628 if (RType == RedTy) {
24629 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24630 FMF, CostKind);
24631 } else {
24632 VectorCost = TTI->getExtendedReductionCost(
24633 RdxOpcode, !IsSigned, RedTy,
24634 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24635 }
24636 }
24637 } else {
24638 Type *RedTy = VectorTy->getElementType();
24639 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24640 std::make_pair(RedTy, true));
24641 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24642 VectorCost +=
24643 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24644 if (RType != RedTy) {
24645 unsigned Opcode = Instruction::Trunc;
24646 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24647 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24648 VectorCost += TTI->getCastInstrCost(
24649 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24650 }
24651 }
24652 }
24653 ScalarCost = EvaluateScalarCost([&]() {
24654 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24655 });
24656 break;
24657 }
24658 case RecurKind::FMax:
24659 case RecurKind::FMin:
24660 case RecurKind::FMaximum:
24661 case RecurKind::FMinimum:
24662 case RecurKind::SMax:
24663 case RecurKind::SMin:
24664 case RecurKind::UMax:
24665 case RecurKind::UMin: {
24666 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24667 if (!AllConsts) {
24668 if (DoesRequireReductionOp) {
24669 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24670 } else {
24671 // Check if the previous reduction already exists and account for it as a
24672 // series of operations + a single reduction.
24673 Type *RedTy = VectorTy->getElementType();
24674 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24675 std::make_pair(RedTy, true));
24676 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24677 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24678 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24679 if (RType != RedTy) {
24680 unsigned Opcode = Instruction::Trunc;
24681 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24682 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24683 VectorCost += TTI->getCastInstrCost(
24684 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24685 }
24686 }
24687 }
24688 ScalarCost = EvaluateScalarCost([&]() {
24689 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24690 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24691 });
24692 break;
24693 }
24694 default:
24695 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24696 }
24697
24698 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24699 << " for reduction of " << shortBundleName(ReducedVals)
24700 << " (It is a splitting reduction)\n");
24701 return VectorCost - ScalarCost;
24702 }
24703
24704 /// Splits the values, stored in VectorValuesAndScales, into registers/free
24705 /// sub-registers, combines them with the given reduction operation as a
24706 /// vector operation and then performs single (small enough) reduction.
24707 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24708 Type *DestTy) {
24709 Value *ReducedSubTree = nullptr;
24710 // Creates reduction and combines with the previous reduction.
24711 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24712 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24713 if (ReducedSubTree)
24714 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24715 "op.rdx", ReductionOps);
24716 else
24717 ReducedSubTree = Rdx;
24718 };
24719 if (VectorValuesAndScales.size() == 1) {
24720 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24721 CreateSingleOp(Vec, Scale, IsSigned);
24722 return ReducedSubTree;
24723 }
24724 // Scales Vec using the given Cnt scale factor and then combines the result
24725 // with the previous value of VecRes.
24726 Value *VecRes = nullptr;
24727 bool VecResSignedness = false;
24728 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24729 Type *ScalarTy = Vec->getType()->getScalarType();
24730 // Scale Vec using given Cnt scale factor.
24731 if (Cnt > 1) {
24732 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
24733 switch (RdxKind) {
24734 case RecurKind::Add: {
24735 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24736 unsigned VF = getNumElements(Vec->getType());
24737 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
24738 << ". (HorRdx)\n");
24739 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
24740 for (unsigned I : seq<unsigned>(Cnt))
24741 std::iota(std::next(Mask.begin(), VF * I),
24742 std::next(Mask.begin(), VF * (I + 1)), 0);
24743 ++NumVectorInstructions;
24744 Vec = Builder.CreateShuffleVector(Vec, Mask);
24745 break;
24746 }
24747 // res = mul vv, n
24748 if (ScalarTy != DestTy->getScalarType())
24749 Vec = Builder.CreateIntCast(
24750 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24751 IsSigned);
24752 Value *Scale = ConstantVector::getSplat(
24753 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
24754 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24755 << ". (HorRdx)\n");
24756 ++NumVectorInstructions;
24757 Vec = Builder.CreateMul(Vec, Scale);
24758 break;
24759 }
24760 case RecurKind::Xor: {
24761 // res = n % 2 ? 0 : vv
24762 LLVM_DEBUG(dbgs()
24763 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24764 if (Cnt % 2 == 0)
24765 Vec = Constant::getNullValue(Vec->getType());
24766 break;
24767 }
24768 case RecurKind::FAdd: {
24769 // res = fmul v, n
24770 Value *Scale =
24771 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
24772 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24773 << ". (HorRdx)\n");
24774 ++NumVectorInstructions;
24775 Vec = Builder.CreateFMul(Vec, Scale);
24776 break;
24777 }
24778 case RecurKind::And:
24779 case RecurKind::Or:
24780 case RecurKind::SMax:
24781 case RecurKind::SMin:
24782 case RecurKind::UMax:
24783 case RecurKind::UMin:
24784 case RecurKind::FMax:
24785 case RecurKind::FMin:
24786 case RecurKind::FMaximum:
24787 case RecurKind::FMinimum:
24788 // res = vv
24789 break;
24790 case RecurKind::Sub:
24791 case RecurKind::AddChainWithSubs:
24792 case RecurKind::Mul:
24793 case RecurKind::FMul:
24794 case RecurKind::FMulAdd:
24795 case RecurKind::AnyOf:
24796 case RecurKind::FindFirstIVSMin:
24797 case RecurKind::FindFirstIVUMin:
24798 case RecurKind::FindLastIVSMax:
24799 case RecurKind::FindLastIVUMax:
24800 case RecurKind::FMaxNum:
24801 case RecurKind::FMinNum:
24802 case RecurKind::FMaximumNum:
24803 case RecurKind::FMinimumNum:
24804 case RecurKind::None:
24805 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24806 }
24807 }
24808 // Combine Vec with the previous VecOp.
24809 if (!VecRes) {
24810 VecRes = Vec;
24811 VecResSignedness = IsSigned;
24812 } else {
24813 ++NumVectorInstructions;
24814 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
24815 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
24816 // Handle ctpop.
24817 unsigned VecResVF = getNumElements(VecRes->getType());
24818 unsigned VecVF = getNumElements(Vec->getType());
24819 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
24820 std::iota(Mask.begin(), Mask.end(), 0);
24821 // Ensure that VecRes is always larger than Vec
24822 if (VecResVF < VecVF) {
24823 std::swap(VecRes, Vec);
24824 std::swap(VecResVF, VecVF);
24825 }
24826 if (VecResVF != VecVF) {
24827 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
24828 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
24829 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
24830 }
24831 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
24832 return;
24833 }
24834 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
24835 VecRes = Builder.CreateIntCast(
24836 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
24837 VecResSignedness);
24838 if (ScalarTy != DestTy->getScalarType())
24839 Vec = Builder.CreateIntCast(
24840 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24841 IsSigned);
24842 unsigned VecResVF = getNumElements(VecRes->getType());
24843 unsigned VecVF = getNumElements(Vec->getType());
24844 // Ensure that VecRes is always larger than Vec
24845 if (VecResVF < VecVF) {
24846 std::swap(VecRes, Vec);
24847 std::swap(VecResVF, VecVF);
24848 }
24849 // extract + op + insert
24850 Value *Op = VecRes;
24851 if (VecResVF != VecVF)
24852 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
24853 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
24854 if (VecResVF != VecVF)
24855 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
24856 VecRes = Op;
24857 }
24858 };
24859 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
24860 CreateVecOp(Vec, Scale, IsSigned);
24861 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
24862
24863 return ReducedSubTree;
24864 }
24865
24866 /// Emit a horizontal reduction of the vectorized value.
24867 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
24868 const TargetTransformInfo *TTI, Type *DestTy) {
24869 assert(VectorizedValue && "Need to have a vectorized tree node");
24870 assert(RdxKind != RecurKind::FMulAdd &&
24871 "A call to the llvm.fmuladd intrinsic is not handled yet");
24872
24873 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
24874 if (FTy->getScalarType() == Builder.getInt1Ty() &&
24875 RdxKind == RecurKind::Add &&
24876 DestTy->getScalarType() != FTy->getScalarType()) {
24877 // Convert vector_reduce_add(ZExt(<n x i1>)) to
24878 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
24879 Value *V = Builder.CreateBitCast(
24880 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
24881 ++NumVectorInstructions;
24882 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
24883 }
24884 ++NumVectorInstructions;
24885 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
24886 }
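// Illustrative IR for the i1 special case above (names assumed): reducing
// <8 x i1> into an i32 sum becomes
//   %bc = bitcast <8 x i1> %v to i8
//   %ct = call i8 @llvm.ctpop.i8(i8 %bc)
// and the widening cast to the destination type is added by the caller.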
24887
24888 /// Emits optimized code for unique scalar value reused \p Cnt times.
24889 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
24890 unsigned Cnt) {
24891 assert(IsSupportedHorRdxIdentityOp &&
24892 "The optimization of matched scalar identity horizontal reductions "
24893 "must be supported.");
24894 if (Cnt == 1)
24895 return VectorizedValue;
24896 switch (RdxKind) {
24897 case RecurKind::Add: {
24898 // res = mul vv, n
24899 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
24900 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
24901 << VectorizedValue << ". (HorRdx)\n");
24902 return Builder.CreateMul(VectorizedValue, Scale);
24903 }
24904 case RecurKind::Xor: {
24905 // res = n % 2 ? 0 : vv
24906 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
24907 << ". (HorRdx)\n");
24908 if (Cnt % 2 == 0)
24909 return Constant::getNullValue(VectorizedValue->getType());
24910 return VectorizedValue;
24911 }
24912 case RecurKind::FAdd: {
24913 // res = fmul v, n
24914 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
24915 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
24916 << VectorizedValue << ". (HorRdx)\n");
24917 return Builder.CreateFMul(VectorizedValue, Scale);
24918 }
24919 case RecurKind::And:
24920 case RecurKind::Or:
24921 case RecurKind::SMax:
24922 case RecurKind::SMin:
24923 case RecurKind::UMax:
24924 case RecurKind::UMin:
24925 case RecurKind::FMax:
24926 case RecurKind::FMin:
24927 case RecurKind::FMaximum:
24928 case RecurKind::FMinimum:
24929 // res = vv
24930 return VectorizedValue;
24931 case RecurKind::Sub:
24932 case RecurKind::AddChainWithSubs:
24933 case RecurKind::Mul:
24934 case RecurKind::FMul:
24935 case RecurKind::FMulAdd:
24936 case RecurKind::AnyOf:
24937 case RecurKind::FindFirstIVSMin:
24938 case RecurKind::FindFirstIVUMin:
24939 case RecurKind::FindLastIVSMax:
24940 case RecurKind::FindLastIVUMax:
24941 case RecurKind::FMaxNum:
24942 case RecurKind::FMinNum:
24943 case RecurKind::FMaximumNum:
24944 case RecurKind::FMinimumNum:
24945 case RecurKind::None:
24946 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24947 }
24948 return nullptr;
24949 }
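// E.g. (illustrative): a scalar reused 5 times in an add reduction becomes
// "mul %v, 5" and in an fadd reduction "fmul %v, 5.0"; for xor an even
// repeat count folds to zero while an odd count leaves the value unchanged.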
24950
24951 /// Emits actual operation for the scalar identity values, found during
24952 /// horizontal reduction analysis.
24953 Value *
24954 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
24955 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
24956 const DenseMap<Value *, Value *> &TrackedToOrig) {
24957 assert(IsSupportedHorRdxIdentityOp &&
24958 "The optimization of matched scalar identity horizontal reductions "
24959 "must be supported.");
24960 ArrayRef<Value *> VL = R.getRootNodeScalars();
24961 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
24962 if (VTy->getElementType() != VL.front()->getType()) {
24963 VectorizedValue = Builder.CreateIntCast(
24964 VectorizedValue,
24965 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
24966 R.isSignedMinBitwidthRootNode());
24967 }
24968 switch (RdxKind) {
24969 case RecurKind::Add: {
24970 // root = mul prev_root, <1, 1, n, 1>
24971 SmallVector<Constant *> Vals;
24972 for (Value *V : VL) {
24973 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
24974 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
24975 }
24976 auto *Scale = ConstantVector::get(Vals);
24977 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
24978 << VectorizedValue << ". (HorRdx)\n");
24979 return Builder.CreateMul(VectorizedValue, Scale);
24980 }
24981 case RecurKind::And:
24982 case RecurKind::Or:
24983 // No need for multiple or/and(s).
24984 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
24985 << ". (HorRdx)\n");
24986 return VectorizedValue;
24987 case RecurKind::SMax:
24988 case RecurKind::SMin:
24989 case RecurKind::UMax:
24990 case RecurKind::UMin:
24991 case RecurKind::FMax:
24992 case RecurKind::FMin:
24993 case RecurKind::FMaximum:
24994 case RecurKind::FMinimum:
24995 // No need for multiple min/max(s) of the same value.
24996 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
24997 << ". (HorRdx)\n");
24998 return VectorizedValue;
24999 case RecurKind::Xor: {
25000 // Replace values that have an even number of repeats with 0, since
25001 // x xor x = 0.
25002 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25003 // 7>, if the 4th and 6th elements have an even number of repeats.
25004 SmallVector<int> Mask(
25005 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25006 PoisonMaskElem);
25007 std::iota(Mask.begin(), Mask.end(), 0);
25008 bool NeedShuffle = false;
25009 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25010 Value *V = VL[I];
25011 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25012 if (Cnt % 2 == 0) {
25013 Mask[I] = VF;
25014 NeedShuffle = true;
25015 }
25016 }
25017 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25018 : Mask) dbgs()
25019 << I << " ";
25020 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25021 if (NeedShuffle)
25022 VectorizedValue = Builder.CreateShuffleVector(
25023 VectorizedValue,
25024 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25025 return VectorizedValue;
25026 }
25027 case RecurKind::FAdd: {
25028 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25029 SmallVector<Constant *> Vals;
25030 for (Value *V : VL) {
25031 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25032 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25033 }
25034 auto *Scale = ConstantVector::get(Vals);
25035 return Builder.CreateFMul(VectorizedValue, Scale);
25036 }
25037 case RecurKind::Sub:
25038 case RecurKind::AddChainWithSubs:
25039 case RecurKind::Mul:
25040 case RecurKind::FMul:
25041 case RecurKind::FMulAdd:
25042 case RecurKind::AnyOf:
25043 case RecurKind::FindFirstIVSMin:
25044 case RecurKind::FindFirstIVUMin:
25045 case RecurKind::FindLastIVSMax:
25046 case RecurKind::FindLastIVUMax:
25047 case RecurKind::FMaxNum:
25048 case RecurKind::FMinNum:
25049 case RecurKind::FMaximumNum:
25050 case RecurKind::FMinimumNum:
25051 case RecurKind::None:
25052 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25053 }
25054 return nullptr;
25055 }
25056};
25057} // end anonymous namespace
25058
25059/// Gets recurrence kind from the specified value.
25060 static RecurKind getRdxKind(Value *V) {
25061 return HorizontalReduction::getRdxKind(V);
25062}
25063static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25064 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25065 return cast<FixedVectorType>(IE->getType())->getNumElements();
25066
25067 unsigned AggregateSize = 1;
25068 auto *IV = cast<InsertValueInst>(InsertInst);
25069 Type *CurrentType = IV->getType();
25070 do {
25071 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25072 for (auto *Elt : ST->elements())
25073 if (Elt != ST->getElementType(0)) // check homogeneity
25074 return std::nullopt;
25075 AggregateSize *= ST->getNumElements();
25076 CurrentType = ST->getElementType(0);
25077 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25078 AggregateSize *= AT->getNumElements();
25079 CurrentType = AT->getElementType();
25080 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25081 AggregateSize *= VT->getNumElements();
25082 return AggregateSize;
25083 } else if (CurrentType->isSingleValueType()) {
25084 return AggregateSize;
25085 } else {
25086 return std::nullopt;
25087 }
25088 } while (true);
25089}
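// For example, under this scheme {<2 x float>, <2 x float>} and
// [2 x {float, float}] both yield an aggregate size of 4, while a struct
// mixing different element types is rejected as non-homogeneous.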
25090
25091static void findBuildAggregateRec(Instruction *LastInsertInst,
25092 TargetTransformInfo *TTI,
25093 SmallVectorImpl<Value *> &BuildVectorOpds,
25094 SmallVectorImpl<Value *> &InsertElts,
25095 unsigned OperandOffset, const BoUpSLP &R) {
25096 do {
25097 Value *InsertedOperand = LastInsertInst->getOperand(1);
25098 std::optional<unsigned> OperandIndex =
25099 getElementIndex(LastInsertInst, OperandOffset);
25100 if (!OperandIndex || R.isDeleted(LastInsertInst))
25101 return;
25102 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25103 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25104 BuildVectorOpds, InsertElts, *OperandIndex, R);
25105
25106 } else {
25107 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25108 InsertElts[*OperandIndex] = LastInsertInst;
25109 }
25110 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25111 } while (LastInsertInst != nullptr &&
25112 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25113 LastInsertInst->hasOneUse());
25114}
25115
25116/// Recognize construction of vectors like
25117/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25118/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25119/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25120/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25121/// starting from the last insertelement or insertvalue instruction.
25122///
25123/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25124/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25125/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25126///
25127/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25128///
25129/// \return true if it matches.
25130static bool findBuildAggregate(Instruction *LastInsertInst,
25131 TargetTransformInfo *TTI,
25132 SmallVectorImpl<Value *> &BuildVectorOpds,
25133 SmallVectorImpl<Value *> &InsertElts,
25134 const BoUpSLP &R) {
25135
25136 assert((isa<InsertElementInst>(LastInsertInst) ||
25137 isa<InsertValueInst>(LastInsertInst)) &&
25138 "Expected insertelement or insertvalue instruction!");
25139
25140 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25141 "Expected empty result vectors!");
25142
25143 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25144 if (!AggregateSize)
25145 return false;
25146 BuildVectorOpds.resize(*AggregateSize);
25147 InsertElts.resize(*AggregateSize);
25148
25149 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25150 llvm::erase(BuildVectorOpds, nullptr);
25151 llvm::erase(InsertElts, nullptr);
25152 if (BuildVectorOpds.size() >= 2)
25153 return true;
25154
25155 return false;
25156}
25157
25158 /// Try to get a reduction instruction from a phi node.
25159///
25160/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25161/// if they come from either \p ParentBB or a containing loop latch.
25162///
25163/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25164/// if not possible.
25165 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25166 BasicBlock *ParentBB, LoopInfo *LI) {
25167 // There are situations where the reduction value is not dominated by the
25168 // reduction phi. Vectorizing such cases has been reported to cause
25169 // miscompiles. See PR25787.
25170 auto DominatedReduxValue = [&](Value *R) {
25171 return isa<Instruction>(R) &&
25172 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25173 };
25174
25175 Instruction *Rdx = nullptr;
25176
25177 // Return the incoming value if it comes from the same BB as the phi node.
25178 if (P->getIncomingBlock(0) == ParentBB) {
25179 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25180 } else if (P->getIncomingBlock(1) == ParentBB) {
25181 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25182 }
25183
25184 if (Rdx && DominatedReduxValue(Rdx))
25185 return Rdx;
25186
25187 // Otherwise, check whether we have a loop latch to look at.
25188 Loop *BBL = LI->getLoopFor(ParentBB);
25189 if (!BBL)
25190 return nullptr;
25191 BasicBlock *BBLatch = BBL->getLoopLatch();
25192 if (!BBLatch)
25193 return nullptr;
25194
25195 // There is a loop latch, return the incoming value if it comes from
25196 // that. This reduction pattern occasionally turns up.
25197 if (P->getIncomingBlock(0) == BBLatch) {
25198 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25199 } else if (P->getIncomingBlock(1) == BBLatch) {
25200 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25201 }
25202
25203 if (Rdx && DominatedReduxValue(Rdx))
25204 return Rdx;
25205
25206 return nullptr;
25207}
25208
25209static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25210 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25211 return true;
25212 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25213 return true;
25214 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25215 return true;
25216 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25217 return true;
25218 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25219 return true;
25220 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
25221 return true;
25222 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
25223 return true;
25224 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
25225 return true;
25226 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
25227 return true;
25228 return false;
25229}
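// E.g. (illustrative): both a plain binop such as "fadd float %a, %b" and a
// min/max intrinsic such as "call i32 @llvm.smax.i32(i32 %a, i32 %b)" match
// here, with the two operands returned through V0 and V1.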
25230
25231/// We could have an initial reduction that is not an add.
25232/// r *= v1 + v2 + v3 + v4
25233/// In such a case start looking for a tree rooted in the first '+'.
25234/// \Returns the new root if found, which may be nullptr if not an instruction.
25235 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25236 Instruction *Root) {
25237 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25238 isa<IntrinsicInst>(Root)) &&
25239 "Expected binop, select, or intrinsic for reduction matching");
25240 Value *LHS =
25241 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25242 Value *RHS =
25243 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25244 if (LHS == Phi)
25245 return dyn_cast<Instruction>(RHS);
25246 if (RHS == Phi)
25247 return dyn_cast<Instruction>(LHS);
25248 return nullptr;
25249}
25250
25251 /// \returns the first operand of \p I that does not match \p Phi. If the
25252 /// operand is not an instruction, returns nullptr.
25253 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25254 Value *Op0 = nullptr;
25255 Value *Op1 = nullptr;
25256 if (!matchRdxBop(I, Op0, Op1))
25257 return nullptr;
25258 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25259}
25260
25261/// \Returns true if \p I is a candidate instruction for reduction vectorization.
25262 static bool isReductionCandidate(Instruction *I) {
25263 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25264 Value *B0 = nullptr, *B1 = nullptr;
25265 bool IsBinop = matchRdxBop(I, B0, B1);
25266 return IsBinop || IsSelect;
25267}
25268
25269bool SLPVectorizerPass::vectorizeHorReduction(
25270 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25271 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25272 if (!ShouldVectorizeHor)
25273 return false;
25274 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25275
25276 if (Root->getParent() != BB || isa<PHINode>(Root))
25277 return false;
25278
25279 // If we can find a secondary reduction root, use that instead.
25280 auto SelectRoot = [&]() {
25281 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25282 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25283 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25284 return NewRoot;
25285 return Root;
25286 };
25287
25288 // Start the analysis from the Root instruction. If a horizontal reduction is
25289 // found, try to vectorize it. If it is not a horizontal reduction, or
25290 // vectorization is not possible or not effective, and the currently analyzed
25291 // instruction is a binary operation, try to vectorize the operands, using
25292 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25293 // the same procedure considering each operand as a possible root of the
25294 // horizontal reduction.
25295 // Interrupt the process if the Root instruction itself was vectorized or all
25296 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
25297 // If a horizontal reduction was not matched or vectorized, we collect the
25298 // instructions for possible later vectorization attempts.
25299 std::queue<std::pair<Instruction *, unsigned>> Stack;
25300 Stack.emplace(SelectRoot(), 0);
25301 SmallPtrSet<Value *, 8> VisitedInstrs;
25302 bool Res = false;
25303 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25304 if (R.isAnalyzedReductionRoot(Inst))
25305 return nullptr;
25306 if (!isReductionCandidate(Inst))
25307 return nullptr;
25308 HorizontalReduction HorRdx;
25309 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25310 return nullptr;
25311 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
25312 };
25313 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25314 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25315 FutureSeed = getNonPhiOperand(Root, P);
25316 if (!FutureSeed)
25317 return false;
25318 }
25319 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25320 // analysis is done separately.
25321 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25322 PostponedInsts.push_back(FutureSeed);
25323 return true;
25324 };
25325
25326 while (!Stack.empty()) {
25327 Instruction *Inst;
25328 unsigned Level;
25329 std::tie(Inst, Level) = Stack.front();
25330 Stack.pop();
25331 // Do not try to analyze an instruction that has already been vectorized.
25332 // This may happen when we vectorize instruction operands on a previous
25333 // iteration, while the stack was populated before that happened.
25334 if (R.isDeleted(Inst))
25335 continue;
25336 if (Value *VectorizedV = TryToReduce(Inst)) {
25337 Res = true;
25338 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25339 // Try to find another reduction.
25340 Stack.emplace(I, Level);
25341 continue;
25342 }
25343 if (R.isDeleted(Inst))
25344 continue;
25345 } else {
25346 // We could not vectorize `Inst` so try to use it as a future seed.
25347 if (!TryAppendToPostponedInsts(Inst)) {
25348 assert(Stack.empty() && "Expected empty stack");
25349 break;
25350 }
25351 }
25352
25353 // Try to vectorize operands.
25354 // Continue analysis for the instruction from the same basic block only to
25355 // save compile time.
25356 if (++Level < RecursionMaxDepth)
25357 for (auto *Op : Inst->operand_values())
25358 if (VisitedInstrs.insert(Op).second)
25359 if (auto *I = dyn_cast<Instruction>(Op))
25360 // Do not try to vectorize CmpInst operands, this is done
25361 // separately.
25362 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
25363 !R.isDeleted(I) && I->getParent() == BB)
25364 Stack.emplace(I, Level);
25365 }
25366 return Res;
25367}
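// Note: the worklist above is a FIFO queue, so operands are explored roughly
// breadth-first starting from the selected root, restricted to the same basic
// block and to RecursionMaxDepth levels. Every visited instruction that could
// not be turned into a reduction (except compares and insertelement /
// insertvalue, which are analyzed separately) ends up in PostponedInsts so
// that the caller can retry it later via tryToVectorize.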
25368
25369bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25370 if (!I)
25371 return false;
25372
25373 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25374 return false;
25375 // Skip potential FMA candidates.
25376 if ((I->getOpcode() == Instruction::FAdd ||
25377 I->getOpcode() == Instruction::FSub) &&
25378 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25379 .isValid())
25380 return false;
25381
25382 Value *P = I->getParent();
25383
25384 // Vectorize in current basic block only.
25385 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25386 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25387 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25388 R.isDeleted(Op0) || R.isDeleted(Op1))
25389 return false;
25390
25391 // First collect all possible candidates.
25392 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25393 Candidates.emplace_back(Op0, Op1);
25394
25395 auto *A = dyn_cast<BinaryOperator>(Op0);
25396 auto *B = dyn_cast<BinaryOperator>(Op1);
25397 // Try to skip B.
25398 if (A && B && B->hasOneUse()) {
25399 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25400 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25401 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25402 Candidates.emplace_back(A, B0);
25403 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25404 Candidates.emplace_back(A, B1);
25405 }
25406 // Try to skip A.
25407 if (B && A && A->hasOneUse()) {
25408 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25409 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25410 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25411 Candidates.emplace_back(A0, B);
25412 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25413 Candidates.emplace_back(A1, B);
25414 }
25415
25416 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25417 ArrayRef<Value *> Ops) {
25418 if (!isReductionCandidate(Inst))
25419 return false;
25420 Type *Ty = Inst->getType();
25421 if (!isValidElementType(Ty) || Ty->isPointerTy())
25422 return false;
25423 HorizontalReduction HorRdx(Inst, Ops);
25424 if (!HorRdx.matchReductionForOperands())
25425 return false;
25426 // Check the cost of operations.
25427 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25428 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25429 InstructionCost ScalarCost =
25430 TTI.getScalarizationOverhead(
25431 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25432 /*Extract=*/true, CostKind) +
25433 TTI.getInstructionCost(Inst, CostKind);
25434 InstructionCost RedCost;
25435 switch (::getRdxKind(Inst)) {
25436 case RecurKind::Add:
25437 case RecurKind::Mul:
25438 case RecurKind::Or:
25439 case RecurKind::And:
25440 case RecurKind::Xor:
25441 case RecurKind::FAdd:
25442 case RecurKind::FMul: {
25443 FastMathFlags FMF;
25444 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25445 FMF = FPCI->getFastMathFlags();
25446 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25447 CostKind);
25448 break;
25449 }
25450 default:
25451 return false;
25452 }
25453 if (RedCost >= ScalarCost)
25454 return false;
25455
25456 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
25457 };
25458 if (Candidates.size() == 1)
25459 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25460
25461 // We have multiple options. Try to pick the single best.
25462 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25463 if (!BestCandidate)
25464 return false;
25465 return (*BestCandidate == 0 &&
25466 TryToReduce(I, {Candidates[*BestCandidate].first,
25467 Candidates[*BestCandidate].second})) ||
25468 tryToVectorizeList({Candidates[*BestCandidate].first,
25469 Candidates[*BestCandidate].second},
25470 R);
25471}
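// Rough sketch of the pairing logic in tryToVectorize above: for a root like
//   %a = add i32 %x, %y
//   %b = add i32 %z, %w
//   %c = icmp slt i32 %a, %b
// the initial candidate pair is (%a, %b); if %b (resp. %a) has a single use
// and its operands are binary operators in the same block, pairs that skip %b
// (resp. %a) are added as well. findBestRootPair then picks the most
// promising pair to vectorize as a two-element list, and only the original
// (%a, %b) pair is additionally tried as a two-operand reduction.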
25472
25473bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25474 BasicBlock *BB, BoUpSLP &R) {
25475 SmallVector<WeakTrackingVH> PostponedInsts;
25476 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25477 Res |= tryToVectorize(PostponedInsts, R);
25478 return Res;
25479}
25480
25481bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25482 BoUpSLP &R) {
25483 bool Res = false;
25484 for (Value *V : Insts)
25485 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25486 Res |= tryToVectorize(Inst, R);
25487 return Res;
25488}
25489
25490bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25491 BasicBlock *BB, BoUpSLP &R,
25492 bool MaxVFOnly) {
25493 if (!R.canMapToVector(IVI->getType()))
25494 return false;
25495
25496 SmallVector<Value *, 16> BuildVectorOpds;
25497 SmallVector<Value *, 16> BuildVectorInsts;
25498 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25499 return false;
25500
25501 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25502 R.getORE()->emit([&]() {
25503 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25504 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25505 "trying reduction first.";
25506 });
25507 return false;
25508 }
25509 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25510 // An aggregate value is unlikely to be processed in a vector register.
25511 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25512}
25513
25514bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25515 BasicBlock *BB, BoUpSLP &R,
25516 bool MaxVFOnly) {
25517 SmallVector<Value *, 16> BuildVectorInsts;
25518 SmallVector<Value *, 16> BuildVectorOpds;
25519 SmallVector<int> Mask;
25520 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25521 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25522 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25523 return false;
25524
25525 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25526 R.getORE()->emit([&]() {
25527 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25528 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25529 "trying reduction first.";
25530 });
25531 return false;
25532 }
25533 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25534 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25535}
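// Typical buildvector sequence recognized above (an illustrative sketch):
//   %v0 = insertelement <4 x float> poison, float %s0, i64 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i64 1
//   %v2 = insertelement <4 x float> %v1, float %s2, i64 2
//   %v3 = insertelement <4 x float> %v2, float %s3, i64 3
// findBuildAggregate collects the insertelement instructions into
// BuildVectorInsts and the scalars %s0..%s3 into BuildVectorOpds; the inserts
// are then handed to tryToVectorizeList, unless the scalars already form a
// fixed vector shuffle of extractelements/undefs.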
25536
25537template <typename T>
25538static bool tryToVectorizeSequence(
25539 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25540 function_ref<bool(T *, T *)> AreCompatible,
25541 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25542 bool MaxVFOnly, BoUpSLP &R) {
25543 bool Changed = false;
25544 // Sort by type, parent, operands.
25545 stable_sort(Incoming, Comparator);
25546
25547 // Try to vectorize elements based on their type.
25548 SmallVector<T *> Candidates;
25549 SmallVector<T *> VL;
25550 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25551 VL.clear()) {
25552 // Look for the next elements with the same type, parent and operand
25553 // kinds.
25554 auto *I = dyn_cast<Instruction>(*IncIt);
25555 if (!I || R.isDeleted(I)) {
25556 ++IncIt;
25557 continue;
25558 }
25559 auto *SameTypeIt = IncIt;
25560 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25561 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25562 AreCompatible(*SameTypeIt, *IncIt))) {
25563 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25564 ++SameTypeIt;
25565 if (I && !R.isDeleted(I))
25566 VL.push_back(cast<T>(I));
25567 }
25568
25569 // Try to vectorize them.
25570 unsigned NumElts = VL.size();
25571 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25572 << NumElts << ")\n");
25573 // The vectorization is a 3-stage attempt:
25574 // 1. Try to vectorize instructions with the same/alternate opcodes with the
25575 // size of the maximal register first.
25576 // 2. Try to vectorize the remaining instructions with the same type, if
25577 // possible. This may give better vectorization results than trying to
25578 // vectorize only instructions with the same/alternate opcodes.
25579 // 3. Finally, try to vectorize all instructions with the same/alternate
25580 // opcodes only; this may result in some extra final
25581 // vectorization.
25582 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25583 // Success: start over because instructions might have been changed.
25584 Changed = true;
25585 VL.swap(Candidates);
25586 Candidates.clear();
25587 for (T *V : VL) {
25588 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25589 Candidates.push_back(V);
25590 }
25591 } else {
25592 /// \Returns the minimum number of elements that we will attempt to
25593 /// vectorize.
25594 auto GetMinNumElements = [&R](Value *V) {
25595 unsigned EltSize = R.getVectorElementSize(V);
25596 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25597 };
25598 if (NumElts < GetMinNumElements(*IncIt) &&
25599 (Candidates.empty() ||
25600 Candidates.front()->getType() == (*IncIt)->getType())) {
25601 for (T *V : VL) {
25602 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25603 Candidates.push_back(V);
25604 }
25605 }
25606 }
25607 // Final attempt to vectorize instructions with the same types.
25608 if (Candidates.size() > 1 &&
25609 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25610 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25611 // Success: start over because instructions might have been changed.
25612 Changed = true;
25613 } else if (MaxVFOnly) {
25614 // Try to vectorize using small vectors.
25615 SmallVector<T *> VL;
25616 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25617 VL.clear()) {
25618 auto *I = dyn_cast<Instruction>(*It);
25619 if (!I || R.isDeleted(I)) {
25620 ++It;
25621 continue;
25622 }
25623 auto *SameTypeIt = It;
25624 while (SameTypeIt != End &&
25625 (!isa<Instruction>(*SameTypeIt) ||
25626 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25627 AreCompatible(*SameTypeIt, *It))) {
25628 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25629 ++SameTypeIt;
25630 if (I && !R.isDeleted(I))
25631 VL.push_back(cast<T>(I));
25632 }
25633 unsigned NumElts = VL.size();
25634 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25635 /*MaxVFOnly=*/false))
25636 Changed = true;
25637 It = SameTypeIt;
25638 }
25639 }
25640 Candidates.clear();
25641 }
25642
25643 // Start over at the next instruction of a different type (or the end).
25644 IncIt = SameTypeIt;
25645 }
25646 return Changed;
25647}
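// Informal example of the grouping done by tryToVectorizeSequence: assuming
// Incoming is sorted by the comparator as, say,
//   [i32 %p0, i32 %p1, i32 %p2, float %q0, float %q1]
// the compatible i32 run and the float run are tried separately, first with
// the maximal VF only (when MaxVFOnly is set); leftovers of the same type are
// accumulated in Candidates and re-tried in the final attempts, possibly with
// smaller vector factors.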
25648
25649/// Compare two cmp instructions. If IsCompatibility is true, the function
25650/// returns true if the 2 cmps have the same/swapped predicates and compatible
25651/// corresponding operands. If IsCompatibility is false, the function implements
25652/// a strict weak ordering relation between two cmp instructions, returning true
25653/// if the first instruction is "less" than the second, i.e. its predicate is
25654/// less than the predicate of the second or its operand IDs are less than the
25655/// operand IDs of the second cmp instruction.
25656template <bool IsCompatibility>
25657static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25658 const DominatorTree &DT) {
25659 assert(isValidElementType(V->getType()) &&
25660 isValidElementType(V2->getType()) &&
25661 "Expected valid element types only.");
25662 if (V == V2)
25663 return IsCompatibility;
25664 auto *CI1 = cast<CmpInst>(V);
25665 auto *CI2 = cast<CmpInst>(V2);
25666 if (CI1->getOperand(0)->getType()->getTypeID() <
25667 CI2->getOperand(0)->getType()->getTypeID())
25668 return !IsCompatibility;
25669 if (CI1->getOperand(0)->getType()->getTypeID() >
25670 CI2->getOperand(0)->getType()->getTypeID())
25671 return false;
25672 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25673 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25674 return !IsCompatibility;
25675 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25676 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25677 return false;
25678 CmpInst::Predicate Pred1 = CI1->getPredicate();
25679 CmpInst::Predicate Pred2 = CI2->getPredicate();
25680 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
25681 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
25682 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25683 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25684 if (BasePred1 < BasePred2)
25685 return !IsCompatibility;
25686 if (BasePred1 > BasePred2)
25687 return false;
25688 // Compare operands.
25689 bool CI1Preds = Pred1 == BasePred1;
25690 bool CI2Preds = Pred2 == BasePred1;
25691 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25692 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25693 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25694 if (Op1 == Op2)
25695 continue;
25696 if (Op1->getValueID() < Op2->getValueID())
25697 return !IsCompatibility;
25698 if (Op1->getValueID() > Op2->getValueID())
25699 return false;
25700 if (auto *I1 = dyn_cast<Instruction>(Op1))
25701 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25702 if (IsCompatibility) {
25703 if (I1->getParent() != I2->getParent())
25704 return false;
25705 } else {
25706 // Try to compare nodes with same parent.
25707 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25708 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25709 if (!NodeI1)
25710 return NodeI2 != nullptr;
25711 if (!NodeI2)
25712 return false;
25713 assert((NodeI1 == NodeI2) ==
25714 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25715 "Different nodes should have different DFS numbers");
25716 if (NodeI1 != NodeI2)
25717 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25718 }
25719 InstructionsState S = getSameOpcode({I1, I2}, TLI);
25720 if (S && (IsCompatibility || !S.isAltShuffle()))
25721 continue;
25722 if (IsCompatibility)
25723 return false;
25724 if (I1->getOpcode() != I2->getOpcode())
25725 return I1->getOpcode() < I2->getOpcode();
25726 }
25727 }
25728 return IsCompatibility;
25729}
25730
25731template <typename ItT>
25732bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
25733 BasicBlock *BB, BoUpSLP &R) {
25734 bool Changed = false;
25735 // Try to find reductions first.
25736 for (CmpInst *I : CmpInsts) {
25737 if (R.isDeleted(I))
25738 continue;
25739 for (Value *Op : I->operands())
25740 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
25741 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25742 if (R.isDeleted(I))
25743 break;
25744 }
25745 }
25746 // Try to vectorize operands as vector bundles.
25747 for (CmpInst *I : CmpInsts) {
25748 if (R.isDeleted(I))
25749 continue;
25750 Changed |= tryToVectorize(I, R);
25751 }
25752 // Try to vectorize list of compares.
25753 // Sort by type, compare predicate, etc.
25754 auto CompareSorter = [&](Value *V, Value *V2) {
25755 if (V == V2)
25756 return false;
25757 return compareCmp<false>(V, V2, *TLI, *DT);
25758 };
25759
25760 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
25761 if (V1 == V2)
25762 return true;
25763 return compareCmp<true>(V1, V2, *TLI, *DT);
25764 };
25765
25766 SmallVector<Value *> Vals;
25767 for (Instruction *V : CmpInsts)
25768 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
25769 Vals.push_back(V);
25770 if (Vals.size() <= 1)
25771 return Changed;
25772 Changed |= tryToVectorizeSequence<Value>(
25773 Vals, CompareSorter, AreCompatibleCompares,
25774 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
25775 // Exclude possible reductions from other blocks.
25776 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25777 return any_of(V->users(), [V](User *U) {
25778 auto *Select = dyn_cast<SelectInst>(U);
25779 return Select &&
25780 Select->getParent() != cast<Instruction>(V)->getParent();
25781 });
25782 });
25783 if (ArePossiblyReducedInOtherBlock)
25784 return false;
25785 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25786 },
25787 /*MaxVFOnly=*/true, R);
25788 return Changed;
25789}
25790
25791bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25792 BasicBlock *BB, BoUpSLP &R) {
25793 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
25794 "This function only accepts Insert instructions");
25795 bool OpsChanged = false;
25796 SmallVector<WeakTrackingVH> PostponedInsts;
25797 for (auto *I : reverse(Instructions)) {
25798 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
25799 if (R.isDeleted(I) || isa<CmpInst>(I))
25800 continue;
25801 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25802 OpsChanged |=
25803 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
25804 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25805 OpsChanged |=
25806 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
25807 }
25808 // pass2 - try to vectorize reductions only
25809 if (R.isDeleted(I))
25810 continue;
25811 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
25812 if (R.isDeleted(I) || isa<CmpInst>(I))
25813 continue;
25814 // pass3 - try to match and vectorize a buildvector sequence.
25815 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25816 OpsChanged |=
25817 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
25818 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25819 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
25820 /*MaxVFOnly=*/false);
25821 }
25822 }
25823 // Now try to vectorize postponed instructions.
25824 OpsChanged |= tryToVectorize(PostponedInsts, R);
25825
25826 Instructions.clear();
25827 return OpsChanged;
25828}
25829
25830bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
25831 bool Changed = false;
25832 SmallVector<Value *, 4> Incoming;
25833 SmallPtrSet<Value *, 16> VisitedInstrs;
25834 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
25835 // node. This helps to better identify the chains that can be vectorized in
25836 // a more optimal way.
25837 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
25838 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
25839 assert(isValidElementType(V1->getType()) &&
25840 isValidElementType(V2->getType()) &&
25841 "Expected vectorizable types only.");
25842 if (V1 == V2)
25843 return false;
25844 // It is fine to compare type IDs here, since we expect only vectorizable
25845 // types, like ints, floats and pointers; we don't care about other types.
25846 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
25847 return true;
25848 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
25849 return false;
25850 if (V1->getType()->getScalarSizeInBits() <
25851 V2->getType()->getScalarSizeInBits())
25852 return true;
25853 if (V1->getType()->getScalarSizeInBits() >
25854 V2->getType()->getScalarSizeInBits())
25855 return false;
25856 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
25857 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
25858 if (Opcodes1.size() < Opcodes2.size())
25859 return true;
25860 if (Opcodes1.size() > Opcodes2.size())
25861 return false;
25862 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
25863 {
25864 // Instructions come first.
25865 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
25866 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
25867 if (I1 && I2) {
25868 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
25869 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
25870 if (!NodeI1)
25871 return NodeI2 != nullptr;
25872 if (!NodeI2)
25873 return false;
25874 assert((NodeI1 == NodeI2) ==
25875 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25876 "Different nodes should have different DFS numbers");
25877 if (NodeI1 != NodeI2)
25878 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25879 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
25880 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
25881 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
25882 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
25883 if (!E1 || !E2)
25884 continue;
25885
25886 // Sort ExtractElementInsts primarily by their vector operands. Prefer
25887 // program order of the vector operands.
25888 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
25889 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
25890 if (V1 != V2) {
25891 if (V1 && !V2)
25892 return true;
25893 if (!V1 && V2)
25894 return false;
25895 DomTreeNodeBase<BasicBlock> *NodeI1 =
25896 DT->getNode(V1->getParent());
25897 DomTreeNodeBase<BasicBlock> *NodeI2 =
25898 DT->getNode(V2->getParent());
25899 if (!NodeI1)
25900 return NodeI2 != nullptr;
25901 if (!NodeI2)
25902 return false;
25903 assert((NodeI1 == NodeI2) ==
25904 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25905 "Different nodes should have different DFS numbers");
25906 if (NodeI1 != NodeI2)
25907 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25908 return V1->comesBefore(V2);
25909 }
25910 // If we have the same vector operand, try to sort by constant
25911 // index.
25912 std::optional<unsigned> Id1 = getExtractIndex(E1);
25913 std::optional<unsigned> Id2 = getExtractIndex(E2);
25914 // Bring constants to the top
25915 if (Id1 && !Id2)
25916 return true;
25917 if (!Id1 && Id2)
25918 return false;
25919 // First elements come first.
25920 if (Id1 && Id2)
25921 return *Id1 < *Id2;
25922
25923 continue;
25924 }
25925 if (I1->getOpcode() == I2->getOpcode())
25926 continue;
25927 return I1->getOpcode() < I2->getOpcode();
25928 }
25929 if (I1)
25930 return true;
25931 if (I2)
25932 return false;
25933 }
25934 {
25935 // Non-undef constants come next.
25936 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
25937 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
25938 if (C1 && C2)
25939 continue;
25940 if (C1)
25941 return true;
25942 if (C2)
25943 return false;
25944 }
25945 bool U1 = isa<UndefValue>(Opcodes1[I]);
25946 bool U2 = isa<UndefValue>(Opcodes2[I]);
25947 {
25948 // Non-constant non-instructions come next.
25949 if (!U1 && !U2) {
25950 auto ValID1 = Opcodes1[I]->getValueID();
25951 auto ValID2 = Opcodes2[I]->getValueID();
25952 if (ValID1 == ValID2)
25953 continue;
25954 if (ValID1 < ValID2)
25955 return true;
25956 if (ValID1 > ValID2)
25957 return false;
25958 }
25959 if (!U1)
25960 return true;
25961 if (!U2)
25962 return false;
25963 }
25964 // Undefs come last.
25965 assert(U1 && U2 && "The only thing left should be undef & undef.");
25966 }
25967 return false;
25968 };
25969 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
25970 if (V1 == V2)
25971 return true;
25972 if (V1->getType() != V2->getType())
25973 return false;
25974 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
25975 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
25976 if (Opcodes1.size() != Opcodes2.size())
25977 return false;
25978 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
25979 // Undefs are compatible with any other value.
25980 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
25981 continue;
25982 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
25983 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
25984 if (R.isDeleted(I1) || R.isDeleted(I2))
25985 return false;
25986 if (I1->getParent() != I2->getParent())
25987 return false;
25988 if (getSameOpcode({I1, I2}, *TLI))
25989 continue;
25990 return false;
25991 }
25992 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
25993 continue;
25994 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
25995 return false;
25996 }
25997 return true;
25998 };
25999
26000 bool HaveVectorizedPhiNodes = false;
26001 do {
26002 // Collect the incoming values from the PHIs.
26003 Incoming.clear();
26004 for (Instruction &I : *BB) {
26005 auto *P = dyn_cast<PHINode>(&I);
26006 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26007 break;
26008
26009 // No need to analyze deleted, vectorized and non-vectorizable
26010 // instructions.
26011 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26012 isValidElementType(P->getType()))
26013 Incoming.push_back(P);
26014 }
26015
26016 if (Incoming.size() <= 1)
26017 break;
26018
26019 // Find the corresponding non-phi nodes for better matching when trying to
26020 // build the tree.
26021 for (Value *V : Incoming) {
26022 SmallVectorImpl<Value *> &Opcodes =
26023 PHIToOpcodes.try_emplace(V).first->getSecond();
26024 if (!Opcodes.empty())
26025 continue;
26026 SmallVector<Value *, 4> Nodes(1, V);
26027 SmallPtrSet<PHINode *, 4> Visited;
26028 while (!Nodes.empty()) {
26029 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26030 if (!Visited.insert(PHI).second)
26031 continue;
26032 for (Value *V : PHI->incoming_values()) {
26033 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26034 Nodes.push_back(PHI1);
26035 continue;
26036 }
26037 Opcodes.emplace_back(V);
26038 }
26039 }
26040 }
26041
26042 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26043 Incoming, PHICompare, AreCompatiblePHIs,
26044 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26045 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26046 },
26047 /*MaxVFOnly=*/true, R);
26048 Changed |= HaveVectorizedPhiNodes;
26049 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26050 auto *PHI = dyn_cast<PHINode>(P.first);
26051 return !PHI || R.isDeleted(PHI);
26052 }))
26053 PHIToOpcodes.clear();
26054 VisitedInstrs.insert_range(Incoming);
26055 } while (HaveVectorizedPhiNodes);
26056
26057 VisitedInstrs.clear();
26058
26059 InstSetVector PostProcessInserts;
26060 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26061 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26062 // also vectorizes `PostProcessCmps`.
26063 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26064 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26065 if (VectorizeCmps) {
26066 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26067 PostProcessCmps.clear();
26068 }
26069 PostProcessInserts.clear();
26070 return Changed;
26071 };
26072 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26073 auto IsInPostProcessInstrs = [&](Instruction *I) {
26074 if (auto *Cmp = dyn_cast<CmpInst>(I))
26075 return PostProcessCmps.contains(Cmp);
26076 return isa<InsertElementInst, InsertValueInst>(I) &&
26077 PostProcessInserts.contains(I);
26078 };
26079 // Returns true if `I` is an instruction without users, like a terminator, a
26080 // store, or a function call with an ignored return value. Unused non-void
26081 // instructions are ignored, except for CallInst and InvokeInst.
26082 auto HasNoUsers = [](Instruction *I) {
26083 return I->use_empty() &&
26084 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26085 };
26086 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26087 // Skip instructions with a scalable type. The number of elements is unknown
26088 // at compile time for scalable types.
26089 if (isa<ScalableVectorType>(It->getType()))
26090 continue;
26091
26092 // Skip instructions marked for deletion.
26093 if (R.isDeleted(&*It))
26094 continue;
26095 // We may go through BB multiple times, so skip the ones we have already checked.
26096 if (!VisitedInstrs.insert(&*It).second) {
26097 if (HasNoUsers(&*It) &&
26098 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26099 // We would like to start over since some instructions are deleted
26100 // and the iterator may become an invalid value.
26101 Changed = true;
26102 It = BB->begin();
26103 E = BB->end();
26104 }
26105 continue;
26106 }
26107
26108 // Try to vectorize reductions that use PHINodes.
26109 if (PHINode *P = dyn_cast<PHINode>(It)) {
26110 // Check that the PHI is a reduction PHI.
26111 if (P->getNumIncomingValues() == 2) {
26112 // Try to match and vectorize a horizontal reduction.
26113 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26114 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26115 Changed = true;
26116 It = BB->begin();
26117 E = BB->end();
26118 continue;
26119 }
26120 }
26121 // Try to vectorize the incoming values of the PHI, to catch reductions
26122 // that feed into PHIs.
26123 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26124 // Skip if the incoming block is the current BB for now. Also, bypass
26125 // unreachable IR for efficiency and to avoid crashing.
26126 // TODO: Collect the skipped incoming values and try to vectorize them
26127 // after processing BB.
26128 if (BB == P->getIncomingBlock(I) ||
26129 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26130 continue;
26131
26132 // Postponed instructions should not be vectorized here, delay their
26133 // vectorization.
26134 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26135 PI && !IsInPostProcessInstrs(PI)) {
26136 bool Res =
26137 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26138 Changed |= Res;
26139 if (Res && R.isDeleted(P)) {
26140 It = BB->begin();
26141 E = BB->end();
26142 break;
26143 }
26144 }
26145 }
26146 continue;
26147 }
26148
26149 if (HasNoUsers(&*It)) {
26150 bool OpsChanged = false;
26151 auto *SI = dyn_cast<StoreInst>(It);
26152 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26153 if (SI) {
26154 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26155 // Try to vectorize the chain in the store, if this is the only store to
26156 // the address in the block.
26157 // TODO: This is just a temporary solution to save compile time. Need
26158 // to investigate if we can safely turn on slp-vectorize-hor-store
26159 // instead to allow lookup for reduction chains in all non-vectorized
26160 // stores (need to check side effects and compile time).
26161 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26162 SI->getValueOperand()->hasOneUse();
26163 }
26164 if (TryToVectorizeRoot) {
26165 for (auto *V : It->operand_values()) {
26166 // Postponed instructions should not be vectorized here, delay their
26167 // vectorization.
26168 if (auto *VI = dyn_cast<Instruction>(V);
26169 VI && !IsInPostProcessInstrs(VI))
26170 // Try to match and vectorize a horizontal reduction.
26171 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26172 }
26173 }
26174 // Start vectorization of post-process list of instructions from the
26175 // top-tree instructions to try to vectorize as many instructions as
26176 // possible.
26177 OpsChanged |=
26178 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26179 if (OpsChanged) {
26180 // We would like to start over since some instructions are deleted
26181 // and the iterator may become an invalid value.
26182 Changed = true;
26183 It = BB->begin();
26184 E = BB->end();
26185 continue;
26186 }
26187 }
26188
26189 if (isa<InsertElementInst, InsertValueInst>(It))
26190 PostProcessInserts.insert(&*It);
26191 else if (isa<CmpInst>(It))
26192 PostProcessCmps.insert(cast<CmpInst>(&*It));
26193 }
26194
26195 return Changed;
26196}
26197
26198bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26199 auto Changed = false;
26200 for (auto &Entry : GEPs) {
26201 // If the getelementptr list has fewer than two elements, there's nothing
26202 // to do.
26203 if (Entry.second.size() < 2)
26204 continue;
26205
26206 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26207 << Entry.second.size() << ".\n");
26208
26209 // Process the GEP list in chunks suitable for the target's supported
26210 // vector size. If a vector register can't hold 1 element, we are done. We
26211 // are trying to vectorize the index computations, so the maximum number of
26212 // elements is based on the size of the index expression, rather than the
26213 // size of the GEP itself (the target's pointer size).
26214 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26215 return !R.isDeleted(GEP);
26216 });
26217 if (It == Entry.second.end())
26218 continue;
26219 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26220 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26221 if (MaxVecRegSize < EltSize)
26222 continue;
26223
26224 unsigned MaxElts = MaxVecRegSize / EltSize;
26225 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26226 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26227 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26228
26229 // Initialize a set of candidate getelementptrs. Note that we use a
26230 // SetVector here to preserve program order. If the index computations
26231 // are vectorizable and begin with loads, we want to minimize the chance
26232 // of having to reorder them later.
26233 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26234
26235 // Some of the candidates may have already been vectorized after we
26236 // initially collected them, or their index was optimized to a constant value.
26237 // If so, they are marked as deleted, so remove them from the set of
26238 // candidates.
26239 Candidates.remove_if([&R](Value *I) {
26240 return R.isDeleted(cast<Instruction>(I)) ||
26241 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26242 });
26243
26244 // Remove from the set of candidates all pairs of getelementptrs with
26245 // constant differences. Such getelementptrs are likely not good
26246 // candidates for vectorization in a bottom-up phase since one can be
26247 // computed from the other. We also ensure all candidate getelementptr
26248 // indices are unique.
26249 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26250 auto *GEPI = GEPList[I];
26251 if (!Candidates.count(GEPI))
26252 continue;
26253 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26254 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26255 auto *GEPJ = GEPList[J];
26256 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26257 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26258 Candidates.remove(GEPI);
26259 Candidates.remove(GEPJ);
26260 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26261 Candidates.remove(GEPJ);
26262 }
26263 }
26264 }
26265
26266 // We break out of the above computation as soon as we know there are
26267 // fewer than two candidates remaining.
26268 if (Candidates.size() < 2)
26269 continue;
26270
26271 // Add the single, non-constant index of each candidate to the bundle. We
26272 // ensured the indices met these constraints when we originally collected
26273 // the getelementptrs.
26274 SmallVector<Value *, 16> Bundle(Candidates.size());
26275 auto BundleIndex = 0u;
26276 for (auto *V : Candidates) {
26277 auto *GEP = cast<GetElementPtrInst>(V);
26278 auto *GEPIdx = GEP->idx_begin()->get();
26279 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26280 Bundle[BundleIndex++] = GEPIdx;
26281 }
26282
26283 // Try and vectorize the indices. We are currently only interested in
26284 // gather-like cases of the form:
26285 //
26286 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26287 //
26288 // where the loads of "a", the loads of "b", and the subtractions can be
26289 // performed in parallel. It's likely that detecting this pattern in a
26290 // bottom-up phase will be simpler and less costly than building a
26291 // full-blown top-down phase beginning at the consecutive loads.
26292 Changed |= tryToVectorizeList(Bundle, R);
26293 }
26294 }
26295 return Changed;
26296}
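// Sketch of the bundle formed above for the gather-like pattern from the
// comment: given
//   %i0 = sub i64 %a0, %b0
//   %i1 = sub i64 %a1, %b1
//   %g0 = getelementptr inbounds float, ptr %g, i64 %i0
//   %g1 = getelementptr inbounds float, ptr %g, i64 %i1
// the single non-constant indices {%i0, %i1} are bundled and handed to
// tryToVectorizeList, so the subtractions (and the loads feeding them) can be
// vectorized even though the gather loads through the GEPs stay scalar.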
26297
26298bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26299 bool Changed = false;
26300 // Sort by type, base pointers and value operands. Value operands must be
26301 // compatible (have the same opcode, same parent), otherwise it is
26302 // definitely not profitable to try to vectorize them.
26303 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26304 if (V->getValueOperand()->getType()->getTypeID() <
26305 V2->getValueOperand()->getType()->getTypeID())
26306 return true;
26307 if (V->getValueOperand()->getType()->getTypeID() >
26308 V2->getValueOperand()->getType()->getTypeID())
26309 return false;
26310 if (V->getPointerOperandType()->getTypeID() <
26311 V2->getPointerOperandType()->getTypeID())
26312 return true;
26313 if (V->getPointerOperandType()->getTypeID() >
26314 V2->getPointerOperandType()->getTypeID())
26315 return false;
26316 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26317 V2->getValueOperand()->getType()->getScalarSizeInBits())
26318 return true;
26319 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26320 V2->getValueOperand()->getType()->getScalarSizeInBits())
26321 return false;
26322 // UndefValues are compatible with all other values.
26323 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26324 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26325 DomTreeNodeBase<BasicBlock> *NodeI1 =
26326 DT->getNode(I1->getParent());
26327 DomTreeNodeBase<BasicBlock> *NodeI2 =
26328 DT->getNode(I2->getParent());
26329 assert(NodeI1 && "Should only process reachable instructions");
26330 assert(NodeI2 && "Should only process reachable instructions");
26331 assert((NodeI1 == NodeI2) ==
26332 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26333 "Different nodes should have different DFS numbers");
26334 if (NodeI1 != NodeI2)
26335 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26336 return I1->getOpcode() < I2->getOpcode();
26337 }
26338 return V->getValueOperand()->getValueID() <
26339 V2->getValueOperand()->getValueID();
26340 };
26341
26342 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
26343 if (V1 == V2)
26344 return true;
26345 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26346 return false;
26347 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26348 return false;
26349 // Undefs are compatible with any other value.
26350 if (isa<UndefValue>(V1->getValueOperand()) ||
26351 isa<UndefValue>(V2->getValueOperand()))
26352 return true;
26353 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
26354 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26355 if (I1->getParent() != I2->getParent())
26356 return false;
26357 return getSameOpcode({I1, I2}, *TLI).valid();
26358 }
26359 if (isa<Constant>(V1->getValueOperand()) &&
26360 isa<Constant>(V2->getValueOperand()))
26361 return true;
26362 return V1->getValueOperand()->getValueID() ==
26363 V2->getValueOperand()->getValueID();
26364 };
26365
26366 // Attempt to sort and vectorize each of the store-groups.
26367 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26368 for (auto &Pair : Stores) {
26369 if (Pair.second.size() < 2)
26370 continue;
26371
26372 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26373 << Pair.second.size() << ".\n");
26374
26375 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26376 continue;
26377
26378 // Reverse stores to do bottom-to-top analysis. This is important if the
26379 // values are stored to the same addresses several times; in this case we
26380 // need to follow the store order (reversed to meet the memory dependencies).
26381 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26382 Pair.second.rend());
26383 Changed |= tryToVectorizeSequence<StoreInst>(
26384 ReversedStores, StoreSorter, AreCompatibleStores,
26385 [&](ArrayRef<StoreInst *> Candidates, bool) {
26386 return vectorizeStores(Candidates, R, Attempted);
26387 },
26388 /*MaxVFOnly=*/false, R);
26389 }
26390 return Changed;
26391}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:919
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:194
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
Early If Converter
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
Definition: ExpandFp.cpp:597
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1451
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const bool IsAnyPointerUsedOutGraph, const int64_t Diff)
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates a key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the argument types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using Generator or a default shuffle.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
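Several of the static helpers listed above (reorderReuses, reorderOrder, addMask) revolve around composing one shuffle mask or ordering with another. The snippet below is a minimal illustrative sketch of that composition idea only, not the pass's actual addMask implementation; composeMasks is a hypothetical name, and -1 stands for a poison lane.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Compose two masks so that applying the result is equivalent to applying
// Mask first, then SubMask. Negative (poison) lanes stay poison.
static void composeMasks(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
  SmallVector<int> NewMask(SubMask.size());
  for (unsigned I = 0, E = SubMask.size(); I != E; ++I)
    NewMask[I] = SubMask[I] < 0 ? -1 : Mask[SubMask[I]];
  Mask.swap(NewMask);
}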
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another single input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another single input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another single input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
A private abstract base class describing the concept of an individual alias analysis implementation.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1396
void negate()
Negate this APInt in place.
Definition: APInt.h:1468
unsigned logBase2() const
Definition: APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
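The cost model uses APInt bit sets for DemandedElts-style lane masks. Below is a small self-contained sketch of the typical bit-manipulation calls listed above, assuming an 8-lane mask; demandedEltsSketch is an illustrative name only.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void demandedEltsSketch() {
  APInt DemandedElts = APInt::getAllOnes(/*numBits=*/8); // all 8 lanes demanded
  DemandedElts.clearBit(3);                              // lane 3 is not demanded
  assert(!DemandedElts.isAllOnes() && !DemandedElts.isZero());
  assert(DemandedElts.getBitWidth() == 8);
  APInt OneLane = APInt::getOneBitSet(8, /*BitNo=*/2);   // only lane 2 demanded
  assert(OneLane.isPowerOf2());
}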
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:431
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:150
iterator end() const
Definition: ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:206
iterator begin() const
Definition: ArrayRef.h:135
ArrayRef< T > take_back(size_t N=1) const
Return a copy of *this with only the last N elements.
Definition: ArrayRef.h:231
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition: ArrayRef.h:162
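ArrayRef views are used pervasively for scalar bundles (VL) and shuffle masks. A short sketch of the slicing operations listed above, over an assumed local buffer; arrayRefSketch is an illustrative name only.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;

void arrayRefSketch() {
  SmallVector<int, 8> Mask = {0, 1, 2, 3, 4, 5, 6, 7};
  ArrayRef<int> Ref(Mask);
  assert(Ref.front() == 0 && Ref.back() == 7 && !Ref.empty());
  ArrayRef<int> Head = Ref.take_front(4); // {0, 1, 2, 3}
  ArrayRef<int> Tail = Ref.drop_front(6); // {6, 7}
  ArrayRef<int> Mid = Ref.slice(2, 3);    // {2, 3, 4}
  assert(Head.size() == 4 && Tail.size() == 2 && Mid.size() == 3);
}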
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
iterator end()
Definition: BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:172
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
reverse_iterator rend()
Definition: BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
Definition: BasicBlock.cpp:406
size_t size() const
Definition: BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition: BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2010
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1905
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2148
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2004
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1205
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1283
unsigned arg_size() const
Definition: InstrTypes.h:1290
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1506
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2001
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:666
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:829
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:767
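The isCmpSameOrSwapped helper above relies on these predicate transforms. A minimal sketch of how the swapped and inverse predicates relate for a signed comparison; predicateSketch is an illustrative name only.
#include "llvm/IR/InstrTypes.h"
#include <cassert>
using namespace llvm;

void predicateSketch() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;                     // a < b
  assert(CmpInst::getSwappedPredicate(P) == CmpInst::ICMP_SGT); // b > a
  assert(CmpInst::getInversePredicate(P) == CmpInst::ICMP_SGE); // !(a < b)
}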
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2314
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
Definition: Constants.cpp:2694
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:868
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:875
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1474
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:88
A debug info location.
Definition: DebugLoc.h:124
static DebugLoc getUnknown()
Definition: DebugLoc.h:162
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:104
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:203
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
bool erase(const KeyT &Val)
Definition: DenseMap.h:319
unsigned size() const
Definition: DenseMap.h:120
bool empty() const
Definition: DenseMap.h:119
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:173
iterator end()
Definition: DenseMap.h:87
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:221
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:168
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
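DenseMap backs most scalar-to-entry style lookups in this pass. A small sketch of the insertion and lookup calls listed above; the key/value types and the name denseMapSketch are illustrative only.
#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

void denseMapSketch() {
  DenseMap<int, unsigned> LaneOfKey;
  LaneOfKey.try_emplace(42, 0u);        // inserts only if the key is absent
  auto Res = LaneOfKey.insert({7, 1u}); // pair of iterator and "inserted" flag
  assert(Res.second && LaneOfKey.lookup(42) == 0u);
  assert(LaneOfKey.count(7) == 1 && !LaneOfKey.contains(13));
  assert(LaneOfKey.find(7) != LaneOfKey.end());
}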
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:334
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:135
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
void set()
Definition: FMF.h:61
bool allowReassoc() const
Flag queries.
Definition: FMF.h:64
bool allowContract() const
Definition: FMF.h:69
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:857
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:949
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:547
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2559
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:488
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:202
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2637
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2238
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:247
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:862
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1809
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:823
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:815
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:507
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1708
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:196
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1883
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2439
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1437
LLVM_ABI CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:538
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
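The code-generation half of the pass builds its vectors through IRBuilder calls like those above. Below is a hedged sketch of a broadcast built with CreateShuffleVector; broadcastFirstLane is a hypothetical helper, not part of the pass.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Broadcast lane 0 of Vec to VF lanes by shuffling with an all-zero mask.
static Value *broadcastFirstLane(IRBuilderBase &Builder, Value *Vec,
                                 unsigned VF) {
  SmallVector<int> Mask(VF, 0); // <0, 0, ..., 0>
  return Builder.CreateShuffleVector(Vec, Mask, "broadcast");
}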
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:321
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:808
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
bool isShift() const
Definition: Instruction.h:320
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:318
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Definition: Instructions.h:180
Value * getPointerOperand()
Definition: Instructions.h:259
bool isSimple() const
Definition: Instructions.h:251
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:215
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:570
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:48
iterator find(const KeyT &Key)
Definition: MapVector.h:141
bool empty() const
Definition: MapVector.h:75
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:107
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:99
size_type size() const
Definition: MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:79
void clear()
Definition: MapVector.h:84
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
T & front() const
front - Get the first element.
Definition: ArrayRef.h:354
iterator end() const
Definition: ArrayRef.h:348
iterator begin() const
Definition: ArrayRef.h:347
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:381
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:454
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:168
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1885
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:149
void insert_range(Range &&R)
Definition: SetVector.h:193
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition: SetVector.h:93
void clear()
Completely clear the SetVector.
Definition: SetVector.h:284
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:269
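SetVector/SmallSetVector provide the deterministic, duplicate-free worklists this pass depends on. A short sketch of the calls listed above; the element type and the name setVectorSketch are illustrative only.
#include "llvm/ADT/SetVector.h"
#include <cassert>
using namespace llvm;

void setVectorSketch() {
  SmallSetVector<int, 8> Worklist; // unique elements, insertion order preserved
  Worklist.insert(3);
  Worklist.insert(1);
  Worklist.insert(3);              // duplicate, ignored
  assert(Worklist.size() == 2 && Worklist.front() == 3);
  assert(Worklist.contains(1) && !Worklist.contains(7));
}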
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
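These static mask classifiers let the cost model map a concrete mask onto a TTI::ShuffleKind. A small sketch for a 4-element source; the mask values and the name maskKindSketch are examples only.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void maskKindSketch() {
  SmallVector<int, 4> Identity = {0, 1, 2, 3};
  SmallVector<int, 4> Reverse = {3, 2, 1, 0};
  SmallVector<int, 2> Extract = {2, 3};
  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));
  int Index = 0;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                   Index) &&
         Index == 2);
}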
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
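SmallBitVector backs the UseMask/OpcodeMask style bit sets used above. A brief sketch of the query calls, assuming an 8-lane mask; bitVectorSketch is an illustrative name only.
#include "llvm/ADT/SmallBitVector.h"
#include <cassert>
using namespace llvm;

void bitVectorSketch() {
  SmallBitVector UseMask(8); // 8 bits, initially all clear
  UseMask.set(2);
  UseMask.set(5);
  assert(UseMask.any() && !UseMask.all() && !UseMask.none());
  assert(UseMask.count() == 2 && UseMask.test(5));
  assert(UseMask.find_first() == 2 && UseMask.find_next(2) == 5);
}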
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:418
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
iterator end() const
Definition: SmallPtrSet.h:499
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:176
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:227
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
size_type size() const
Definition: SmallSet.h:171
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:969
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
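SmallVector/SmallVectorImpl provide nearly every scratch buffer in this file (orders, masks, operand lists). A tiny sketch of the usual pattern of passing SmallVectorImpl so callers can pick the inline size; appendIota and smallVectorSketch are hypothetical names.
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;

static void appendIota(SmallVectorImpl<int> &Out, int N) {
  Out.reserve(Out.size() + N);
  for (int I = 0; I < N; ++I)
    Out.push_back(I);
}

void smallVectorSketch() {
  SmallVector<int, 16> Order; // stays inline for up to 16 elements
  appendIota(Order, 8);
  assert(Order.size() == 8 && Order.front() == 0 && Order.back() == 7);
}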
An instruction for storing to memory.
Definition: Instructions.h:296
Type * getPointerOperandType() const
Definition: Instructions.h:389
Value * getValueOperand()
Definition: Instructions.h:383
Value * getPointerOperand()
Definition: Instructions.h:386
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:35
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const
Return true if the target supports masked load.
LLVM_ABI bool preferAlternateOpcodeVectorization() const
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
LLVM_ABI bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
LLVM_ABI bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
LLVM_ABI unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
LLVM_ABI InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
LLVM_ABI bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
LLVM_ABI unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI unsigned getNumberOfParts(Type *Tp) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
LLVM_ABI InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
LLVM_ABI InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const
OperandValueKind
Additional information about an operand's possible values.
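The cost model asks TargetTransformInfo questions like those above to decide whether a shuffle-based lowering pays off. Below is a hedged sketch of such a query, following the getShuffleCost signature listed here; reverseCheaperThanBroadcast is a hypothetical helper and assumes a valid TTI for the target.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static bool reverseCheaperThanBroadcast(const TargetTransformInfo &TTI,
                                        FixedVectorType *VecTy) {
  TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;
  // Query the target cost of a reverse shuffle vs. a broadcast of VecTy.
  InstructionCost RevCost = TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                               VecTy, VecTy, /*Mask=*/{}, Kind);
  InstructionCost BcastCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_Broadcast, VecTy, VecTy, /*Mask=*/{}, Kind);
  return RevCost < BcastCost;
}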
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:246
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:296
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:270
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
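Type queries such as these gate which bundles the vectorizer will even consider (see isValidElementType above). A compact sketch; typeSketch is an illustrative name only.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

void typeSketch() {
  LLVMContext Ctx;
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *V4I32 = FixedVectorType::get(I32, /*NumElts=*/4);
  assert(I32->isIntegerTy() && !I32->isVectorTy());
  assert(V4I32->isVectorTy() && V4I32->getScalarType() == I32);
  assert(V4I32->getScalarSizeInBits() == 32); // element width for vector types
}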
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
op_range operands()
Definition: User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:119
op_iterator op_begin()
Definition: User.h:284
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
iterator_range< value_op_iterator > operand_values()
Definition: User.h:316
The Vector Function Database.
Definition: VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:74
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
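A minimal usage sketch (the helper name replaceAndErase is hypothetical, not part of this pass): replaceAllUsesWith rewrites every user of a value in one step, whereas User::replaceUsesOfWith above only rewrites the operand slots of a single user.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Sketch: OldI is about to be replaced by NewV. After RAUW the old
// instruction has no remaining uses and can be removed.
static void replaceAndErase(Instruction *OldI, Value *NewV) {
  OldI->replaceAllUsesWith(NewV);
  OldI->eraseFromParent();
}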
iterator_range< user_iterator > users()
Definition: Value.h:426
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
bool use_empty() const
Definition: Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1098
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:265
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:396
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
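A small sketch of the factory together with the accessors listed nearby (the function name vectorTypeExample is hypothetical):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Sketch: build the fixed-width vector type <4 x i32> and query it back.
static void vectorTypeExample(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  VectorType *VecTy = VectorType::get(I32, ElementCount::getFixed(4));
  unsigned NumElts = VecTy->getElementCount().getFixedValue(); // 4
  Type *EltTy = VecTy->getElementType();                       // i32
  (void)NumElts;
  (void)EltTy;
}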
Type * getElementType() const
Definition: DerivedTypes.h:463
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:205
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:163
void insert_range(Range &&R)
Definition: DenseSet.h:222
size_type size() const
Definition: DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:174
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:76
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:662
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:692
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if any of the given values is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:108
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1572
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
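A sketch of the pattern-matching style used throughout this file (the helper matchShlOr and its bound names are hypothetical): match drives the matcher expressions such as m_Or, m_Shl, m_Value, and m_APInt listed in this block.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Sketch: recognize V == (X << C) | Y; on success X and Y are bound to the
// operands and C to the shift amount. (m_c_Or would also accept the
// commuted operand order.)
static bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  return match(V, m_Or(m_Shl(m_Value(X), m_APInt(C)), m_Value(Y)));
}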
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:862
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
@ GS
Definition: X86.h:213
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
Definition: DenseMapInfo.h:41
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
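A brief sketch of drop_begin (the helper sumAllButFirst is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Sketch: visit every element except the first, e.g. when the first value
// plays a special role (a root) and the rest are processed uniformly.
static int sumAllButFirst(ArrayRef<int> Vals) {
  int Sum = 0;
  for (int V : drop_begin(Vals))
    Sum += V;
  return Sum;
}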
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1313
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
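A brief sketch of zip (the helper dotProduct is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Sketch: walk two sequences in lockstep; zip stops at the shorter range.
static int dotProduct(ArrayRef<int> A, ArrayRef<int> B) {
  int Sum = 0;
  for (auto [X, Y] : zip(A, B))
    Sum += X * Y;
  return Sum;
}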
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1764
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1737
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
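A brief sketch of the range-based predicate wrappers (the helper allHaveOpcode is hypothetical; any_of and none_of, listed further below, work the same way):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Sketch: "is every value an instruction with the given opcode?"
static bool allHaveOpcode(ArrayRef<Value *> VL, unsigned Opcode) {
  return all_of(VL, [Opcode](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && I->getOpcode() == Opcode;
  });
}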
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:137
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1023
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
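A brief sketch of enumerate (the helper firstNegativeLane is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Sketch: pair each element with its index without a manual counter.
static int firstNegativeLane(ArrayRef<int> Scalars) {
  for (auto [Lane, S] : enumerate(Scalars))
    if (S < 0)
      return static_cast<int>(Lane);
  return -1;
}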
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7502
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1723
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2250
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
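A brief sketch of make_early_inc_range combined with isInstructionTriviallyDead from further below (the helper dropDeadInstructions is hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Sketch: erase trivially dead instructions while walking a block; the
// early-increment adaptor advances past I before the body runs, so erasing
// I does not invalidate the traversal.
static void dropDeadInstructions(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}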
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:551
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition: STLExtras.h:2000
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:295
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
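A brief sketch of the power-of-two rounding helpers (the function name powerOfTwoRounding is hypothetical; bit_floor is listed further below):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Sketch: bit_ceil(5u) == 8, bit_floor(5u) == 4, PowerOf2Ceil(5) == 8.
static void powerOfTwoRounding() {
  unsigned Up = llvm::bit_ceil(5u);    // smallest power of two >= 5
  unsigned Down = llvm::bit_floor(5u); // largest power of two <= 5
  uint64_t Up64 = PowerOf2Ceil(5);     // 64-bit helper with the same idea
  (void)Up;
  (void)Down;
  (void)Up64;
}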
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2147
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1987
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
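A brief sketch of the two shuffle-mask builders (the function name shuffleMaskExamples is hypothetical); both only produce index vectors:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

// Sketch:
//   createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4)   -> <0, 2, 4, 6>
//   createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3) -> <0, 0, 1, 1, 2, 2>
static void shuffleMaskExamples() {
  SmallVector<int, 16> Strided = createStrideMask(0, 2, 4);
  SmallVector<int, 16> Replicated = createReplicatedMask(2, 3);
  (void)Strided;
  (void)Replicated;
}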
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1782
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition: Loads.cpp:431
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:288
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition: STLExtras.h:1444
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
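A hedged sketch of how getPointersDiff is typically used (the helper areConsecutiveLoads is hypothetical): a distance of exactly one element means the two accesses are consecutive in memory.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// Sketch: two loads of the same element type are consecutive if their
// pointers differ by exactly one element.
static bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                                ScalarEvolution &SE) {
  Type *ElemTy = A->getType();
  std::optional<int64_t> Diff =
      getPointersDiff(ElemTy, A->getPointerOperand(), ElemTy,
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}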
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition: STLExtras.h:1939
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1393
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1094
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
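A brief sketch of the round-up arithmetic helpers divideCeil (listed above) and alignTo (the function name roundUpExamples is hypothetical):

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Sketch: divideCeil(10, 4) == 3 and alignTo(10, Align(4)) == 12, the next
// multiple of the alignment.
static void roundUpExamples() {
  unsigned Parts = divideCeil(10u, 4u);
  uint64_t Bytes = alignTo(10, Align(4));
  (void)Parts;
  (void)Bytes;
}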
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2049
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:443
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition: STLExtras.h:1454
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts into which the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
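A brief sketch of seq (the helper sumLaneIndices is hypothetical):

#include "llvm/ADT/Sequence.h"
using namespace llvm;

// Sketch: seq(0, NumLanes) visits 0, 1, ..., NumLanes - 1; the upper bound
// is excluded.
static int sumLaneIndices(int NumLanes) {
  int Sum = 0;
  for (int Lane : seq<int>(0, NumLanes))
    Sum += Lane;
  return Sum;
}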
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:595
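A brief sketch of hash_combine (the helper hashEdgeKey is hypothetical), folding several fields into one hash_code much like the DenseMapInfo getHashValue specialization for BoUpSLP::EdgeInfo further below:

#include "llvm/ADT/Hashing.h"
using namespace llvm;

// Sketch: combine a pointer and an index into a single hash_code.
static hash_code hashEdgeKey(const void *UserTE, unsigned EdgeIdx) {
  return hash_combine(UserTE, EdgeIdx);
}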
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2107
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:280
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:469
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2169
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:54
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:217
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:249
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1472
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1481
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.