1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// is found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation-time overhead, but unlike
186// the similar limit for operand ordering, this is used less frequently, so the
187// impact of a higher value is less noticeable.
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 ViewSLPTree("view-slp-tree", cl::Hidden,
203 cl::desc("Display the SLP trees with Graphviz"));
204
206 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
207 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
208
209/// Enables vectorization of copyable elements.
211 "slp-copyable-elements", cl::init(true), cl::Hidden,
212 cl::desc("Try to replace values with the idempotent instructions for "
213 "better vectorization."));
214
215// Limit the number of alias checks. The limit is chosen so that
216// it has no negative effect on the llvm benchmarks.
217static const unsigned AliasedCheckLimit = 10;
218
219// Limit of the number of uses for potentially transformed instructions/values,
220// used in checks to avoid compile-time explosion.
221static constexpr int UsesLimit = 64;
222
223// Another limit for the alias checks: The maximum distance between load/store
224// instructions where alias checks are done.
225// This limit is useful for very large basic blocks.
226static const unsigned MaxMemDepDistance = 160;
227
228/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
229/// regions to be handled.
230static const int MinScheduleRegionSize = 16;
231
232/// Maximum allowed number of operands in the PHI nodes.
233static const unsigned MaxPHINumOperands = 128;
234
235/// Predicate for the element types that the SLP vectorizer supports.
236///
237/// The most important thing to filter here are types which are invalid in LLVM
238/// vectors. We also filter target specific types which have absolutely no
239/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
240/// avoids spending time checking the cost model and realizing that they will
241/// be inevitably scalarized.
242static bool isValidElementType(Type *Ty) {
243 // TODO: Support ScalableVectorType.
244 if (SLPReVec && isa<FixedVectorType>(Ty))
245 Ty = Ty->getScalarType();
246 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
247 !Ty->isPPC_FP128Ty();
248}
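// For illustration: i32, float and pointers pass isValidElementType, while
// x86_fp80 and ppc_fp128 are rejected; with -slp-revec a fixed vector such as
// <4 x i8> is checked via its scalar element type (i8).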
249
250/// Returns the type of the given value/instruction \p V. If it is a store,
251/// returns the type of its value operand; for a Cmp, the type of the compare
252/// operands; and for an insertelement, the type of the inserted operand.
253/// Otherwise, just the type of the value is returned.
254static Type *getValueType(Value *V) {
255 if (auto *SI = dyn_cast<StoreInst>(V))
256 return SI->getValueOperand()->getType();
257 if (auto *CI = dyn_cast<CmpInst>(V))
258 return CI->getOperand(0)->getType();
259 if (auto *IE = dyn_cast<InsertElementInst>(V))
260 return IE->getOperand(1)->getType();
261 return V->getType();
262}
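// For illustration: getValueType of `store i32 %v, ptr %p` is i32, of
// `icmp eq i64 %a, %b` is i64, of `insertelement <4 x float> %vec, float %s,
// i32 0` is float; any other value simply yields its own type.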
263
264/// \returns the number of elements for Ty.
265static unsigned getNumElements(Type *Ty) {
266 assert(!isa<ScalableVectorType>(Ty) &&
267 "ScalableVectorType is not supported.");
268 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
269 return VecTy->getNumElements();
270 return 1;
271}
272
273/// \returns the vector type of ScalarTy based on vectorization factor.
274static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
275 return FixedVectorType::get(ScalarTy->getScalarType(),
276 VF * getNumElements(ScalarTy));
277}
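// For illustration: getWidenedType(i32, 4) is <4 x i32>; under REVEC,
// getWidenedType(<2 x i8>, 4) is <8 x i8>, since the element count is
// VF * getNumElements(ScalarTy).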
278
279/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
280/// which forms a type that \p TTI splits into whole vector types during
281/// legalization.
282static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
283 Type *Ty, unsigned Sz) {
284 if (!isValidElementType(Ty))
285 return bit_ceil(Sz);
286 // Find the number of elements, which forms full vectors.
287 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
288 if (NumParts == 0 || NumParts >= Sz)
289 return bit_ceil(Sz);
290 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
291}
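// Worked example (assuming a hypothetical target with 128-bit vector
// registers): for Ty == i32 and Sz == 6, <6 x i32> legalizes into NumParts == 2
// registers, so the result is bit_ceil(divideCeil(6, 2)) * 2 == 4 * 2 == 8.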
292
293/// Returns the number of elements of the given type \p Ty, not greater than \p
294/// Sz, which forms a type that \p TTI splits into whole vector types during
295/// legalization.
296static unsigned
297getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
298 unsigned Sz) {
299 if (!isValidElementType(Ty))
300 return bit_floor(Sz);
301 // Find the number of elements, which forms full vectors.
302 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
303 if (NumParts == 0 || NumParts >= Sz)
304 return bit_floor(Sz);
305 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
306 if (RegVF > Sz)
307 return bit_floor(Sz);
308 return (Sz / RegVF) * RegVF;
309}
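// Worked example (same hypothetical 128-bit target): for Ty == i32 and Sz == 7,
// NumParts == 2 and RegVF == bit_ceil(divideCeil(7, 2)) == 4, so the result is
// (7 / 4) * 4 == 4, the largest whole-register element count not exceeding Sz.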
310
311static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
312 SmallVectorImpl<int> &Mask) {
313 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
314 // But the element has a different meaning for SLP (scalar) and REVEC
315 // (vector). We need to expand Mask into masks which shufflevector can use
316 // directly.
317 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
318 for (unsigned I : seq<unsigned>(Mask.size()))
319 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
320 I * VecTyNumElements, VecTyNumElements)))
321 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
322 : Mask[I] * VecTyNumElements + J;
323 Mask.swap(NewMask);
324}
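// For illustration: with VecTyNumElements == 2 and Mask == {1, 0},
// transformScalarShuffleIndiciesToVector expands the mask to {2, 3, 0, 1};
// a PoisonMaskElem entry expands to VecTyNumElements poison entries.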
325
326/// \returns the number of groups of shufflevector
327/// A group has the following features:
328/// 1. All values in a group are shufflevectors.
329/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
330/// 3. Together, the masks of the shufflevectors use all of the elements of the source.
331/// e.g., it is 1 group (%0)
332/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
333/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
334/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
335/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
336/// it is 2 groups (%3 and %4)
337/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
340/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
341/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
342/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
343/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
344/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
345/// it is 0 groups
346/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
347/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
348/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
351 if (VL.empty())
352 return 0;
353 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
354 return 0;
355 auto *SV = cast<ShuffleVectorInst>(VL.front());
356 unsigned SVNumElements =
357 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
358 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
359 if (SVNumElements % ShuffleMaskSize != 0)
360 return 0;
361 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
362 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
363 return 0;
364 unsigned NumGroup = 0;
365 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
366 auto *SV = cast<ShuffleVectorInst>(VL[I]);
367 Value *Src = SV->getOperand(0);
368 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
369 SmallBitVector ExpectedIndex(GroupSize);
370 if (!all_of(Group, [&](Value *V) {
371 auto *SV = cast<ShuffleVectorInst>(V);
372 // From the same source.
373 if (SV->getOperand(0) != Src)
374 return false;
375 int Index;
376 if (!SV->isExtractSubvectorMask(Index))
377 return false;
378 ExpectedIndex.set(Index / ShuffleMaskSize);
379 return true;
380 }))
381 return 0;
382 if (!ExpectedIndex.all())
383 return 0;
384 ++NumGroup;
385 }
386 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
387 return NumGroup;
388}
389
390/// \returns a shufflevector mask which is used to vectorize shufflevectors
391/// e.g.,
392/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
393/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
394/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
395/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
396/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
397/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
398/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
399/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
400/// the result is
401/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
402static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
403 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
404 auto *SV = cast<ShuffleVectorInst>(VL.front());
405 unsigned SVNumElements =
406 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
407 SmallVector<int> Mask;
408 unsigned AccumulateLength = 0;
409 for (Value *V : VL) {
410 auto *SV = cast<ShuffleVectorInst>(V);
411 for (int M : SV->getShuffleMask())
412 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
413 : AccumulateLength + M);
414 AccumulateLength += SVNumElements;
415 }
416 return Mask;
417}
418
419/// \returns True if the value is a constant (but not globals/constant
420/// expressions).
421static bool isConstant(Value *V) {
422 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
423}
424
425/// Checks if \p V is one of vector-like instructions, i.e. undef,
426/// insertelement/extractelement with constant indices for fixed vector type or
427/// extractvalue instruction.
428static bool isVectorLikeInstWithConstOps(Value *V) {
429 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
430 !isa<ExtractValueInst, UndefValue>(V))
431 return false;
432 auto *I = dyn_cast<Instruction>(V);
433 if (!I || isa<ExtractValueInst>(I))
434 return true;
435 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
436 return false;
437 if (isa<ExtractElementInst>(I))
438 return isConstant(I->getOperand(1));
439 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
440 return isConstant(I->getOperand(2));
441}
442
443/// Returns power-of-2 number of elements in a single register (part), given the
444/// total number of elements \p Size and number of registers (parts) \p
445/// NumParts.
446static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
447 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
448}
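// For illustration: getPartNumElems(6, 2) == std::min(6u, bit_ceil(3)) == 4.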
449
450/// Returns correct remaining number of elements, considering total amount \p
451/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
452/// and current register (part) \p Part.
453static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
454 unsigned Part) {
455 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
456}
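// For illustration: with Size == 6 and PartNumElems == 4, part 0 holds
// getNumElems(6, 4, 0) == 4 elements and part 1 holds getNumElems(6, 4, 1) == 2.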
457
458#if !defined(NDEBUG)
459/// Print a short descriptor of the instruction bundle suitable for debug output.
460static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
461 std::string Result;
462 raw_string_ostream OS(Result);
463 if (Idx >= 0)
464 OS << "Idx: " << Idx << ", ";
465 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
466 return Result;
467}
468#endif
469
470/// \returns true if all of the instructions in \p VL are in the same block or
471/// false otherwise.
472static bool allSameBlock(ArrayRef<Value *> VL) {
473 auto *It = find_if(VL, IsaPred<Instruction>);
474 if (It == VL.end())
475 return false;
476 Instruction *I0 = cast<Instruction>(*It);
477 if (all_of(VL, isVectorLikeInstWithConstOps))
478 return true;
479
480 BasicBlock *BB = I0->getParent();
481 for (Value *V : iterator_range(It, VL.end())) {
482 if (isa<PoisonValue>(V))
483 continue;
484 auto *II = dyn_cast<Instruction>(V);
485 if (!II)
486 return false;
487
488 if (BB != II->getParent())
489 return false;
490 }
491 return true;
492}
493
494/// \returns True if all of the values in \p VL are constants (but not
495/// globals/constant expressions).
496static bool allConstant(ArrayRef<Value *> VL) {
497 // Constant expressions and globals can't be vectorized like normal integer/FP
498 // constants.
499 return all_of(VL, isConstant);
500}
501
502/// \returns True if all of the values in \p VL are identical or some of them
503/// are UndefValue.
504static bool isSplat(ArrayRef<Value *> VL) {
505 Value *FirstNonUndef = nullptr;
506 for (Value *V : VL) {
507 if (isa<UndefValue>(V))
508 continue;
509 if (!FirstNonUndef) {
510 FirstNonUndef = V;
511 continue;
512 }
513 if (V != FirstNonUndef)
514 return false;
515 }
516 return FirstNonUndef != nullptr;
517}
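// For illustration: isSplat({%x, undef, %x}) is true, isSplat({%x, %y}) is
// false, and isSplat({undef, undef}) is false (no non-undef value is present).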
518
519/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
520/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
521/// patterns that make it effectively commutative (like equality comparisons
522/// with zero).
523/// In most cases, users should not call this function directly (since \p I and
524/// \p ValWithUses are the same). However, when analyzing interchangeable
525/// instructions, we need to use the converted opcode along with the original
526/// uses.
527/// \param I The instruction to check for commutativity
528/// \param ValWithUses The value whose uses are analyzed for special
529/// patterns
530static bool isCommutative(Instruction *I, Value *ValWithUses) {
531 if (auto *Cmp = dyn_cast<CmpInst>(I))
532 return Cmp->isCommutative();
533 if (auto *BO = dyn_cast<BinaryOperator>(I))
534 return BO->isCommutative() ||
535 (BO->getOpcode() == Instruction::Sub &&
536 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
537 all_of(
538 ValWithUses->uses(),
539 [](const Use &U) {
540 // Commutative, if icmp eq/ne sub, 0
541 CmpPredicate Pred;
542 if (match(U.getUser(),
543 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
544 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
545 return true;
546 // Commutative, if abs(sub nsw, true) or abs(sub, false).
547 ConstantInt *Flag;
548 return match(U.getUser(),
549 m_Intrinsic<Intrinsic::abs>(
550 m_Specific(U.get()), m_ConstantInt(Flag))) &&
551 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
552 Flag->isOne());
553 })) ||
554 (BO->getOpcode() == Instruction::FSub &&
555 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
556 all_of(ValWithUses->uses(), [](const Use &U) {
557 return match(U.getUser(),
558 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
559 }));
560 return I->isCommutative();
561}
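// For illustration: isCommutative treats `%d = sub i32 %a, %b` as commutative
// when all of its uses look like `icmp eq i32 %d, 0` or
// `call i32 @llvm.abs.i32(i32 %d, i1 true)`, since swapping the operands only
// negates a value whose sign does not matter to those users.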
562
563/// This is a helper function to check whether \p I is commutative.
564/// This is a convenience wrapper that calls the two-parameter version of
565/// isCommutative with the same instruction for both parameters. This is
566/// the common case where the instruction being checked for commutativity
567/// is the same as the instruction whose uses are analyzed for special
568/// patterns (see the two-parameter version above for details).
569/// \param I The instruction to check for commutativity
570/// \returns true if the instruction is commutative, false otherwise
571static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
572
573/// \returns number of operands of \p I, considering commutativity. Returns 2
574/// for commutative intrinsics.
575/// \param I The instruction to check for commutativity
577 if (isa<IntrinsicInst>(I) && isCommutative(I)) {
578 // IntrinsicInst::isCommutative returns true if swapping the first "two"
579 // arguments to the intrinsic produces the same result.
580 constexpr unsigned IntrinsicNumOperands = 2;
581 return IntrinsicNumOperands;
582 }
583 return I->getNumOperands();
584}
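// For illustration: the helper above returns 2 for a commutative intrinsic
// call such as `call i32 @llvm.smax.i32(i32 %a, i32 %b)`, and plain
// getNumOperands() for a non-commutative instruction.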
585
586template <typename T>
587static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
588 unsigned Offset) {
589 static_assert(std::is_same_v<T, InsertElementInst> ||
590 std::is_same_v<T, ExtractElementInst>,
591 "unsupported T");
592 int Index = Offset;
593 if (const auto *IE = dyn_cast<T>(Inst)) {
594 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
595 if (!VT)
596 return std::nullopt;
597 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
598 if (!CI)
599 return std::nullopt;
600 if (CI->getValue().uge(VT->getNumElements()))
601 return std::nullopt;
602 Index *= VT->getNumElements();
603 Index += CI->getZExtValue();
604 return Index;
605 }
606 return std::nullopt;
607}
608
609/// \returns inserting or extracting index of InsertElement, ExtractElement or
610/// InsertValue instruction, using Offset as base offset for index.
611/// \returns std::nullopt if the index is not an immediate.
612static std::optional<unsigned> getElementIndex(const Value *Inst,
613 unsigned Offset = 0) {
614 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
615 return Index;
616 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
617 return Index;
618
619 int Index = Offset;
620
621 const auto *IV = dyn_cast<InsertValueInst>(Inst);
622 if (!IV)
623 return std::nullopt;
624
625 Type *CurrentType = IV->getType();
626 for (unsigned I : IV->indices()) {
627 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
628 Index *= ST->getNumElements();
629 CurrentType = ST->getElementType(I);
630 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
631 Index *= AT->getNumElements();
632 CurrentType = AT->getElementType();
633 } else {
634 return std::nullopt;
635 }
636 Index += I;
637 }
638 return Index;
639}
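// For illustration: getElementIndex of `insertelement <4 x i32> %v, i32 %s,
// i32 2` is 2, of `insertvalue [2 x [3 x i32]] %agg, i32 %s, 1, 2` is
// 1 * 3 + 2 == 5, and a non-constant index yields std::nullopt.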
640
641/// \returns true if all of the values in \p VL use the same opcode.
642/// For comparison instructions, also checks if predicates match.
643/// PoisonValues are considered matching.
644/// Interchangeable instructions are not considered.
646 auto *It = find_if(VL, IsaPred<Instruction>);
647 if (It == VL.end())
648 return true;
649 Instruction *MainOp = cast<Instruction>(*It);
650 unsigned Opcode = MainOp->getOpcode();
651 bool IsCmpOp = isa<CmpInst>(MainOp);
652 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
653 : CmpInst::BAD_ICMP_PREDICATE;
654 return std::all_of(It, VL.end(), [&](Value *V) {
655 if (auto *CI = dyn_cast<CmpInst>(V))
656 return BasePred == CI->getPredicate();
657 if (auto *I = dyn_cast<Instruction>(V))
658 return I->getOpcode() == Opcode;
659 return isa<PoisonValue>(V);
660 });
661}
662
663namespace {
664/// Specifies the way the mask should be analyzed for undefs/poisonous elements
665/// in the shuffle mask.
666enum class UseMask {
667 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
668 ///< check for the mask elements for the first argument (mask
669 ///< indices are in range [0:VF)).
670 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
671 ///< for the mask elements for the second argument (mask indices
672 ///< are in range [VF:2*VF))
673 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
674 ///< future shuffle elements and mark them as ones as being used
675 ///< in future. Non-undef elements are considered as unused since
676 ///< they're already marked as used in the mask.
677};
678} // namespace
679
680/// Prepares a use bitset for the given mask either for the first argument or
681/// for the second.
682static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
683 UseMask MaskArg) {
684 SmallBitVector UseMask(VF, true);
685 for (auto [Idx, Value] : enumerate(Mask)) {
686 if (Value == PoisonMaskElem) {
687 if (MaskArg == UseMask::UndefsAsMask)
688 UseMask.reset(Idx);
689 continue;
690 }
691 if (MaskArg == UseMask::FirstArg && Value < VF)
692 UseMask.reset(Value);
693 else if (MaskArg == UseMask::SecondArg && Value >= VF)
694 UseMask.reset(Value - VF);
695 }
696 return UseMask;
697}
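// For illustration: buildUseMask with VF == 4 and Mask == {0, 5, poison, 1}
// clears bits 0 and 1 for FirstArg (lanes of the first operand the mask uses),
// clears bit 1 for SecondArg (5 - VF), and for UndefsAsMask clears only the
// bit of the poison position.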
698
699/// Checks if the given value is actually an undefined constant vector.
700/// Also, if the \p UseMask is not empty, tries to check if the non-masked
701/// elements actually mask the insertelement buildvector, if any.
702template <bool IsPoisonOnly = false>
703static SmallBitVector isUndefVector(const Value *V,
704 const SmallBitVector &UseMask = {}) {
705 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
706 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
707 if (isa<T>(V))
708 return Res;
709 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
710 if (!VecTy)
711 return Res.reset();
712 auto *C = dyn_cast<Constant>(V);
713 if (!C) {
714 if (!UseMask.empty()) {
715 const Value *Base = V;
716 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
717 Base = II->getOperand(0);
718 if (isa<T>(II->getOperand(1)))
719 continue;
720 std::optional<unsigned> Idx = getElementIndex(II);
721 if (!Idx) {
722 Res.reset();
723 return Res;
724 }
725 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
726 Res.reset(*Idx);
727 }
728 // TODO: Add analysis for shuffles here too.
729 if (V == Base) {
730 Res.reset();
731 } else {
732 SmallBitVector SubMask(UseMask.size(), false);
733 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
734 }
735 } else {
736 Res.reset();
737 }
738 return Res;
739 }
740 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
741 if (Constant *Elem = C->getAggregateElement(I))
742 if (!isa<T>(Elem) &&
743 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
744 Res.reset(I);
745 }
746 return Res;
747}
748
749/// Checks if the vector of instructions can be represented as a shuffle, like:
750/// %x0 = extractelement <4 x i8> %x, i32 0
751/// %x3 = extractelement <4 x i8> %x, i32 3
752/// %y1 = extractelement <4 x i8> %y, i32 1
753/// %y2 = extractelement <4 x i8> %y, i32 2
754/// %x0x0 = mul i8 %x0, %x0
755/// %x3x3 = mul i8 %x3, %x3
756/// %y1y1 = mul i8 %y1, %y1
757/// %y2y2 = mul i8 %y2, %y2
758/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
759/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
760/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
761/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
762/// ret <4 x i8> %ins4
763/// can be transformed into:
764/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
765/// i32 6>
766/// %2 = mul <4 x i8> %1, %1
767/// ret <4 x i8> %2
768/// Mask will return the Shuffle Mask equivalent to the extracted elements.
769/// TODO: Can we split off and reuse the shuffle mask detection from
770/// ShuffleVectorInst/getShuffleCost?
771static std::optional<TargetTransformInfo::ShuffleKind>
772isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
773 AssumptionCache *AC) {
774 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
775 if (It == VL.end())
776 return std::nullopt;
777 unsigned Size =
778 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
779 auto *EI = dyn_cast<ExtractElementInst>(V);
780 if (!EI)
781 return S;
782 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
783 if (!VTy)
784 return S;
785 return std::max(S, VTy->getNumElements());
786 });
787
788 Value *Vec1 = nullptr;
789 Value *Vec2 = nullptr;
790 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
791 auto *EE = dyn_cast<ExtractElementInst>(V);
792 if (!EE)
793 return false;
794 Value *Vec = EE->getVectorOperand();
795 if (isa<UndefValue>(Vec))
796 return false;
797 return isGuaranteedNotToBePoison(Vec, AC);
798 });
799 enum ShuffleMode { Unknown, Select, Permute };
800 ShuffleMode CommonShuffleMode = Unknown;
801 Mask.assign(VL.size(), PoisonMaskElem);
802 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
803 // Undef can be represented as an undef element in a vector.
804 if (isa<UndefValue>(VL[I]))
805 continue;
806 auto *EI = cast<ExtractElementInst>(VL[I]);
807 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
808 return std::nullopt;
809 auto *Vec = EI->getVectorOperand();
810 // We can extractelement from undef or poison vector.
811 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
812 continue;
813 // All vector operands must have the same number of vector elements.
814 if (isa<UndefValue>(Vec)) {
815 Mask[I] = I;
816 } else {
817 if (isa<UndefValue>(EI->getIndexOperand()))
818 continue;
819 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
820 if (!Idx)
821 return std::nullopt;
822 // Undefined behavior if Idx is negative or >= Size.
823 if (Idx->getValue().uge(Size))
824 continue;
825 unsigned IntIdx = Idx->getValue().getZExtValue();
826 Mask[I] = IntIdx;
827 }
828 if (isUndefVector(Vec).all() && HasNonUndefVec)
829 continue;
830 // For correct shuffling we have to have at most 2 different vector operands
831 // in all extractelement instructions.
832 if (!Vec1 || Vec1 == Vec) {
833 Vec1 = Vec;
834 } else if (!Vec2 || Vec2 == Vec) {
835 Vec2 = Vec;
836 Mask[I] += Size;
837 } else {
838 return std::nullopt;
839 }
840 if (CommonShuffleMode == Permute)
841 continue;
842 // If the extract index is not the same as the operation number, it is a
843 // permutation.
844 if (Mask[I] % Size != I) {
845 CommonShuffleMode = Permute;
846 continue;
847 }
848 CommonShuffleMode = Select;
849 }
850 // If we're not crossing lanes in different vectors, consider it as blending.
851 if (CommonShuffleMode == Select && Vec2)
852 return TargetTransformInfo::SK_Select;
853 // If Vec2 was never used, we have a permutation of a single vector, otherwise
854 // we have a permutation of 2 vectors.
855 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
856 : TargetTransformInfo::SK_PermuteSingleSrc;
857}
858
859/// \returns True if Extract{Value,Element} instruction extracts element Idx.
860static std::optional<unsigned> getExtractIndex(const Instruction *E) {
861 unsigned Opcode = E->getOpcode();
862 assert((Opcode == Instruction::ExtractElement ||
863 Opcode == Instruction::ExtractValue) &&
864 "Expected extractelement or extractvalue instruction.");
865 if (Opcode == Instruction::ExtractElement) {
866 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
867 if (!CI)
868 return std::nullopt;
869 return CI->getZExtValue();
870 }
871 auto *EI = cast<ExtractValueInst>(E);
872 if (EI->getNumIndices() != 1)
873 return std::nullopt;
874 return *EI->idx_begin();
875}
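// For illustration: getExtractIndex of `extractelement <4 x i32> %v, i32 3` is
// 3, of `extractvalue {i32, i64} %agg, 1` is 1; a variable extract index or a
// multi-index extractvalue yields std::nullopt.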
876
877namespace llvm {
878/// Checks if the provided value does not require scheduling. It does not
879/// require scheduling if this is not an instruction or it is an instruction
880/// that does not read/write memory and all operands are either not instructions
881/// or phi nodes or instructions from different blocks.
882static bool areAllOperandsNonInsts(Value *V);
883/// Checks if the provided value does not require scheduling. It does not
884/// require scheduling if this is not an instruction or it is an instruction
885/// that does not read/write memory and all users are phi nodes or instructions
886/// from the different blocks.
887static bool isUsedOutsideBlock(Value *V);
888/// Checks if the specified value does not require scheduling. It does not
889/// require scheduling if all operands and all users do not need to be scheduled
890/// in the current basic block.
891static bool doesNotNeedToBeScheduled(Value *V);
892} // namespace llvm
893
894namespace {
895/// \returns true if \p Opcode is allowed as part of the main/alternate
896/// instruction for SLP vectorization.
897///
898/// Example of unsupported opcode is SDIV that can potentially cause UB if the
899/// "shuffled out" lane would result in division by zero.
900bool isValidForAlternation(unsigned Opcode) {
901 return !Instruction::isIntDivRem(Opcode);
902}
903
904/// Helper class that determines VL can use the same opcode.
905/// Alternate instruction is supported. In addition, it supports interchangeable
906/// instruction. An interchangeable instruction is an instruction that can be
907/// converted to another instruction with same semantics. For example, x << 1 is
908/// equal to x * 2. x * 1 is equal to x | 0.
909class BinOpSameOpcodeHelper {
910 using MaskType = std::uint_fast16_t;
911 /// Sort SupportedOp because it is used by binary_search.
912 constexpr static std::initializer_list<unsigned> SupportedOp = {
913 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
914 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
915 enum : MaskType {
916 ShlBIT = 0b1,
917 AShrBIT = 0b10,
918 MulBIT = 0b100,
919 AddBIT = 0b1000,
920 SubBIT = 0b10000,
921 AndBIT = 0b100000,
922 OrBIT = 0b1000000,
923 XorBIT = 0b10000000,
924 MainOpBIT = 0b100000000,
926 };
927 /// Return a non-nullptr if either operand of I is a ConstantInt.
928 /// The second return value represents the operand position. We check the
929 /// right-hand side first (1). If the right hand side is not a ConstantInt and
930 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
931 /// side (0).
932 static std::pair<ConstantInt *, unsigned>
933 isBinOpWithConstantInt(const Instruction *I) {
934 unsigned Opcode = I->getOpcode();
935 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
936 (void)SupportedOp;
937 auto *BinOp = cast<BinaryOperator>(I);
938 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
939 return {CI, 1};
940 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
941 Opcode == Instruction::AShr)
942 return {nullptr, 0};
943 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
944 return {CI, 0};
945 return {nullptr, 0};
946 }
947 struct InterchangeableInfo {
948 const Instruction *I = nullptr;
949 /// Each set bit represents an opcode that MainOp can be converted to.
950 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
951 MulBIT | AShrBIT | ShlBIT;
952 /// We cannot create an interchangeable instruction that does not exist in
953 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
954 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
955 /// 1]. SeenBefore is used to know what operations have been seen before.
956 MaskType SeenBefore = 0;
957 InterchangeableInfo(const Instruction *I) : I(I) {}
958 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
959 /// instruction. Directly setting the mask will destroy the mask state,
960 /// preventing us from determining which instruction it should convert to.
961 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
962 if (Mask & InterchangeableMask) {
963 SeenBefore |= OpcodeInMaskForm;
964 Mask &= InterchangeableMask;
965 return true;
966 }
967 return false;
968 }
969 bool equal(unsigned Opcode) {
970 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
971 }
972 unsigned getOpcode() const {
973 MaskType Candidate = Mask & SeenBefore;
974 if (Candidate & MainOpBIT)
975 return I->getOpcode();
976 if (Candidate & ShlBIT)
977 return Instruction::Shl;
978 if (Candidate & AShrBIT)
979 return Instruction::AShr;
980 if (Candidate & MulBIT)
981 return Instruction::Mul;
982 if (Candidate & AddBIT)
983 return Instruction::Add;
984 if (Candidate & SubBIT)
985 return Instruction::Sub;
986 if (Candidate & AndBIT)
987 return Instruction::And;
988 if (Candidate & OrBIT)
989 return Instruction::Or;
990 if (Candidate & XorBIT)
991 return Instruction::Xor;
992 llvm_unreachable("Cannot find interchangeable instruction.");
993 }
994
995 /// Return true if the instruction can be converted to \p Opcode.
996 bool hasCandidateOpcode(unsigned Opcode) const {
997 MaskType Candidate = Mask & SeenBefore;
998 switch (Opcode) {
999 case Instruction::Shl:
1000 return Candidate & ShlBIT;
1001 case Instruction::AShr:
1002 return Candidate & AShrBIT;
1003 case Instruction::Mul:
1004 return Candidate & MulBIT;
1005 case Instruction::Add:
1006 return Candidate & AddBIT;
1007 case Instruction::Sub:
1008 return Candidate & SubBIT;
1009 case Instruction::And:
1010 return Candidate & AndBIT;
1011 case Instruction::Or:
1012 return Candidate & OrBIT;
1013 case Instruction::Xor:
1014 return Candidate & XorBIT;
1015 case Instruction::LShr:
1016 case Instruction::FAdd:
1017 case Instruction::FSub:
1018 case Instruction::FMul:
1019 case Instruction::SDiv:
1020 case Instruction::UDiv:
1021 case Instruction::FDiv:
1022 case Instruction::SRem:
1023 case Instruction::URem:
1024 case Instruction::FRem:
1025 return false;
1026 default:
1027 break;
1028 }
1029 llvm_unreachable("Cannot find interchangeable instruction.");
1030 }
1031
1032 SmallVector<Value *> getOperand(const Instruction *To) const {
1033 unsigned ToOpcode = To->getOpcode();
1034 unsigned FromOpcode = I->getOpcode();
1035 if (FromOpcode == ToOpcode)
1036 return SmallVector<Value *>(I->operands());
1037 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1038 auto [CI, Pos] = isBinOpWithConstantInt(I);
1039 const APInt &FromCIValue = CI->getValue();
1040 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1041 APInt ToCIValue;
1042 switch (FromOpcode) {
1043 case Instruction::Shl:
1044 if (ToOpcode == Instruction::Mul) {
1045 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1046 FromCIValue.getZExtValue());
1047 } else {
1048 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1049 ToCIValue = ToOpcode == Instruction::And
1050 ? APInt::getAllOnes(FromCIValueBitWidth)
1051 : APInt::getZero(FromCIValueBitWidth);
1052 }
1053 break;
1054 case Instruction::Mul:
1055 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1056 if (ToOpcode == Instruction::Shl) {
1057 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1058 } else {
1059 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1060 ToCIValue = ToOpcode == Instruction::And
1061 ? APInt::getAllOnes(FromCIValueBitWidth)
1062 : APInt::getZero(FromCIValueBitWidth);
1063 }
1064 break;
1065 case Instruction::Add:
1066 case Instruction::Sub:
1067 if (FromCIValue.isZero()) {
1068 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1069 } else {
1070 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1071 "Cannot convert the instruction.");
1072 ToCIValue = FromCIValue;
1073 ToCIValue.negate();
1074 }
1075 break;
1076 case Instruction::And:
1077 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1078 ToCIValue = ToOpcode == Instruction::Mul
1079 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1080 : APInt::getZero(FromCIValueBitWidth);
1081 break;
1082 default:
1083 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1084 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1085 break;
1086 }
1087 Value *LHS = I->getOperand(1 - Pos);
1088 Constant *RHS =
1089 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1090 // constant + x cannot be -constant - x
1091 // instead, it should be x - -constant
1092 if (Pos == 1 ||
1093 (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
1094 return SmallVector<Value *>({LHS, RHS});
1095 return SmallVector<Value *>({RHS, LHS});
1096 }
1097 };
1098 InterchangeableInfo MainOp;
1099 InterchangeableInfo AltOp;
1100 bool isValidForAlternation(const Instruction *I) const {
1101 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1102 ::isValidForAlternation(I->getOpcode());
1103 }
1104 bool initializeAltOp(const Instruction *I) {
1105 if (AltOp.I)
1106 return true;
1107 if (!isValidForAlternation(I))
1108 return false;
1109 AltOp.I = I;
1110 return true;
1111 }
1112
1113public:
1114 BinOpSameOpcodeHelper(const Instruction *MainOp,
1115 const Instruction *AltOp = nullptr)
1116 : MainOp(MainOp), AltOp(AltOp) {
1117 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1118 }
1119 bool add(const Instruction *I) {
1120 assert(isa<BinaryOperator>(I) &&
1121 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1122 unsigned Opcode = I->getOpcode();
1123 MaskType OpcodeInMaskForm;
1124 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1125 switch (Opcode) {
1126 case Instruction::Shl:
1127 OpcodeInMaskForm = ShlBIT;
1128 break;
1129 case Instruction::AShr:
1130 OpcodeInMaskForm = AShrBIT;
1131 break;
1132 case Instruction::Mul:
1133 OpcodeInMaskForm = MulBIT;
1134 break;
1135 case Instruction::Add:
1136 OpcodeInMaskForm = AddBIT;
1137 break;
1138 case Instruction::Sub:
1139 OpcodeInMaskForm = SubBIT;
1140 break;
1141 case Instruction::And:
1142 OpcodeInMaskForm = AndBIT;
1143 break;
1144 case Instruction::Or:
1145 OpcodeInMaskForm = OrBIT;
1146 break;
1147 case Instruction::Xor:
1148 OpcodeInMaskForm = XorBIT;
1149 break;
1150 default:
1151 return MainOp.equal(Opcode) ||
1152 (initializeAltOp(I) && AltOp.equal(Opcode));
1153 }
1154 MaskType InterchangeableMask = OpcodeInMaskForm;
1155 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1156 if (CI) {
1157 constexpr MaskType CanBeAll =
1158 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1159 const APInt &CIValue = CI->getValue();
1160 switch (Opcode) {
1161 case Instruction::Shl:
1162 if (CIValue.ult(CIValue.getBitWidth()))
1163 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1164 break;
1165 case Instruction::Mul:
1166 if (CIValue.isOne()) {
1167 InterchangeableMask = CanBeAll;
1168 break;
1169 }
1170 if (CIValue.isPowerOf2())
1171 InterchangeableMask = MulBIT | ShlBIT;
1172 break;
1173 case Instruction::Add:
1174 case Instruction::Sub:
1175 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1176 break;
1177 case Instruction::And:
1178 if (CIValue.isAllOnes())
1179 InterchangeableMask = CanBeAll;
1180 break;
1181 default:
1182 if (CIValue.isZero())
1183 InterchangeableMask = CanBeAll;
1184 break;
1185 }
1186 }
1187 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1188 (initializeAltOp(I) &&
1189 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1190 }
1191 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1192 /// Checks if the list of potential opcodes includes \p Opcode.
1193 bool hasCandidateOpcode(unsigned Opcode) const {
1194 return MainOp.hasCandidateOpcode(Opcode);
1195 }
1196 bool hasAltOp() const { return AltOp.I; }
1197 unsigned getAltOpcode() const {
1198 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1199 }
1200 SmallVector<Value *> getOperand(const Instruction *I) const {
1201 return MainOp.getOperand(I);
1202 }
1203};
1204
1205/// Main data required for vectorization of instructions.
1206class InstructionsState {
1207 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1208 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1209 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1210 /// isAltShuffle).
1211 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1212 /// from getMainAltOpsNoStateVL.
1213 /// For those InstructionsState that use alternate instructions, the resulting
1214 /// vectorized output ultimately comes from a shufflevector. For example,
1215 /// given a vector list (VL):
1216 /// VL[0] = add i32 a, e
1217 /// VL[1] = sub i32 b, f
1218 /// VL[2] = add i32 c, g
1219 /// VL[3] = sub i32 d, h
1220 /// The vectorized result would be:
1221 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1222 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1223 /// result = shufflevector <4 x i32> intermediated_0,
1224 /// <4 x i32> intermediated_1,
1225 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1226 /// Since shufflevector is used in the final result, when calculating the cost
1227 /// (getEntryCost), we must account for the usage of shufflevector in
1228 /// GetVectorCost.
1229 Instruction *MainOp = nullptr;
1230 Instruction *AltOp = nullptr;
1231 /// Whether the instruction state represents copyable instructions.
1232 bool HasCopyables = false;
1233
1234public:
1235 Instruction *getMainOp() const {
1236 assert(valid() && "InstructionsState is invalid.");
1237 return MainOp;
1238 }
1239
1240 Instruction *getAltOp() const {
1241 assert(valid() && "InstructionsState is invalid.");
1242 return AltOp;
1243 }
1244
1245 /// The main/alternate opcodes for the list of instructions.
1246 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1247
1248 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1249
1250 /// Some of the instructions in the list have alternate opcodes.
1251 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1252
1253 /// Checks if the instruction matches either the main or alternate opcode.
1254 /// \returns
1255 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1256 /// to it
1257 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1258 /// it
1259 /// - nullptr if \param I cannot be matched or converted to either opcode
1260 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1261 assert(MainOp && "MainOp cannot be nullptr.");
1262 if (I->getOpcode() == MainOp->getOpcode())
1263 return MainOp;
1264 // Prefer AltOp instead of interchangeable instruction of MainOp.
1265 assert(AltOp && "AltOp cannot be nullptr.");
1266 if (I->getOpcode() == AltOp->getOpcode())
1267 return AltOp;
1268 if (!I->isBinaryOp())
1269 return nullptr;
1270 BinOpSameOpcodeHelper Converter(MainOp);
1271 if (!Converter.add(I) || !Converter.add(MainOp))
1272 return nullptr;
1273 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1274 BinOpSameOpcodeHelper AltConverter(AltOp);
1275 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1276 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1277 return AltOp;
1278 }
1279 if (Converter.hasAltOp() && !isAltShuffle())
1280 return nullptr;
1281 return Converter.hasAltOp() ? AltOp : MainOp;
1282 }
1283
1284 /// Checks if main/alt instructions are shift operations.
1285 bool isShiftOp() const {
1286 return getMainOp()->isShift() && getAltOp()->isShift();
1287 }
1288
1289 /// Checks if main/alt instructions are bitwise logic operations.
1290 bool isBitwiseLogicOp() const {
1291 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1292 }
1293
1294 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1295 bool isMulDivLikeOp() const {
1296 constexpr std::array<unsigned, 8> MulDiv = {
1297 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1298 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1299 Instruction::URem, Instruction::FRem};
1300 return is_contained(MulDiv, getOpcode()) &&
1301 is_contained(MulDiv, getAltOpcode());
1302 }
1303
1304 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1305 bool isAddSubLikeOp() const {
1306 constexpr std::array<unsigned, 4> AddSub = {
1307 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1308 Instruction::FSub};
1309 return is_contained(AddSub, getOpcode()) &&
1310 is_contained(AddSub, getAltOpcode());
1311 }
1312
1313 /// Checks if main/alt instructions are cmp operations.
1314 bool isCmpOp() const {
1315 return (getOpcode() == Instruction::ICmp ||
1316 getOpcode() == Instruction::FCmp) &&
1317 getAltOpcode() == getOpcode();
1318 }
1319
1320 /// Checks if the current state is valid, i.e. has non-null MainOp
1321 bool valid() const { return MainOp && AltOp; }
1322
1323 explicit operator bool() const { return valid(); }
1324
1325 InstructionsState() = delete;
1326 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1327 bool HasCopyables = false)
1328 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1329 static InstructionsState invalid() { return {nullptr, nullptr}; }
1330
1331 /// Checks if the value is a copyable element.
1332 bool isCopyableElement(Value *V) const {
1333 assert(valid() && "InstructionsState is invalid.");
1334 if (!HasCopyables)
1335 return false;
1336 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1337 return false;
1338 auto *I = dyn_cast<Instruction>(V);
1339 if (!I)
1340 return !isa<PoisonValue>(V);
1341 if (I->getParent() != MainOp->getParent() &&
1344 return true;
1345 if (I->getOpcode() == MainOp->getOpcode())
1346 return false;
1347 if (!I->isBinaryOp())
1348 return true;
1349 BinOpSameOpcodeHelper Converter(MainOp);
1350 return !Converter.add(I) || !Converter.add(MainOp) ||
1351 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1352 }
1353
1354 /// Checks if the value is non-schedulable.
1355 bool isNonSchedulable(Value *V) const {
1356 assert(valid() && "InstructionsState is invalid.");
1357 auto *I = dyn_cast<Instruction>(V);
1358 if (!HasCopyables)
1359 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1361 // MainOp for copyables is always schedulable, to correctly identify
1362 // non-schedulable copyables.
1363 if (getMainOp() == V)
1364 return false;
1365 if (isCopyableElement(V)) {
1366 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1367 auto *I = dyn_cast<Instruction>(V);
1368 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1370 // If the copyable instruction comes after MainOp
1371 // (non-schedulable, but used in the block) - cannot vectorize
1372 // it, will possibly generate use before def.
1373 !MainOp->comesBefore(I));
1374 };
1375
1376 return IsNonSchedulableCopyableElement(V);
1377 }
1378 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1380 }
1381
1382 /// Checks if the state represents copyable instructions.
1383 bool areInstructionsWithCopyableElements() const {
1384 assert(valid() && "InstructionsState is invalid.");
1385 return HasCopyables;
1386 }
1387};
1388
1389std::pair<Instruction *, SmallVector<Value *>>
1390convertTo(Instruction *I, const InstructionsState &S) {
1391 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1392 assert(SelectedOp && "Cannot convert the instruction.");
1393 if (I->isBinaryOp()) {
1394 BinOpSameOpcodeHelper Converter(I);
1395 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1396 }
1397 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1398}
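// For illustration: if S was formed for {%a = mul i32 %x, 2; %b = shl i32 %y, 1},
// the helper prefers shl as the common opcode, so convertTo(%a, S) returns the
// shl main instruction paired with operands {%x, i32 1}, i.e. %a treated as the
// equivalent %x << 1.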
1399
1400} // end anonymous namespace
1401
1402static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1403 const TargetLibraryInfo &TLI);
1404
1405/// Find an instruction with a specific opcode in VL.
1406/// \param VL Array of values to search through. Must contain only Instructions
1407/// and PoisonValues.
1408/// \param Opcode The instruction opcode to search for
1409/// \returns
1410/// - The first instruction found with matching opcode
1411/// - nullptr if no matching instruction is found
1412static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1413 unsigned Opcode) {
1414 for (Value *V : VL) {
1415 if (isa<PoisonValue>(V))
1416 continue;
1417 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1418 auto *Inst = cast<Instruction>(V);
1419 if (Inst->getOpcode() == Opcode)
1420 return Inst;
1421 }
1422 return nullptr;
1423}
1424
1425/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1426/// compatible instructions or constants, or just some other regular values.
1427static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1428 Value *Op1, const TargetLibraryInfo &TLI) {
1429 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1430 (isConstant(BaseOp1) && isConstant(Op1)) ||
1431 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1432 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1433 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1434 getSameOpcode({BaseOp0, Op0}, TLI) ||
1435 getSameOpcode({BaseOp1, Op1}, TLI);
1436}
1437
1438/// \returns true if a compare instruction \p CI has similar "look" and
1439/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1440/// swapped, false otherwise.
1441static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1442 const TargetLibraryInfo &TLI) {
1443 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1444 "Assessing comparisons of different types?");
1445 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1446 CmpInst::Predicate Pred = CI->getPredicate();
1447 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1448
1449 Value *BaseOp0 = BaseCI->getOperand(0);
1450 Value *BaseOp1 = BaseCI->getOperand(1);
1451 Value *Op0 = CI->getOperand(0);
1452 Value *Op1 = CI->getOperand(1);
1453
1454 return (BasePred == Pred &&
1455 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1456 (BasePred == SwappedPred &&
1457 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1458}
1459
1460/// \returns analysis of the Instructions in \p VL described in
1461/// InstructionsState, the Opcode that we suppose the whole list
1462/// could be vectorized even if its structure is diverse.
1463static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1464 const TargetLibraryInfo &TLI) {
1465 // Make sure these are all Instructions.
1466 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1467 return InstructionsState::invalid();
1468
1469 auto *It = find_if(VL, IsaPred<Instruction>);
1470 if (It == VL.end())
1471 return InstructionsState::invalid();
1472
1473 Instruction *MainOp = cast<Instruction>(*It);
1474 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1475 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1476 (VL.size() == 2 && InstCnt < 2))
1477 return InstructionsState::invalid();
1478
1479 bool IsCastOp = isa<CastInst>(MainOp);
1480 bool IsBinOp = isa<BinaryOperator>(MainOp);
1481 bool IsCmpOp = isa<CmpInst>(MainOp);
1482 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1483 : CmpInst::BAD_ICMP_PREDICATE;
1484 Instruction *AltOp = MainOp;
1485 unsigned Opcode = MainOp->getOpcode();
1486 unsigned AltOpcode = Opcode;
1487
1488 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1489 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1490 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1491 UniquePreds.insert(BasePred);
1492 UniqueNonSwappedPreds.insert(BasePred);
1493 for (Value *V : VL) {
1494 auto *I = dyn_cast<CmpInst>(V);
1495 if (!I)
1496 return false;
1497 CmpInst::Predicate CurrentPred = I->getPredicate();
1498 CmpInst::Predicate SwappedCurrentPred =
1499 CmpInst::getSwappedPredicate(CurrentPred);
1500 UniqueNonSwappedPreds.insert(CurrentPred);
1501 if (!UniquePreds.contains(CurrentPred) &&
1502 !UniquePreds.contains(SwappedCurrentPred))
1503 UniquePreds.insert(CurrentPred);
1504 }
1505 // The total number of predicates is > 2, but if we consider swapped
1506 // predicates to be compatible there are only 2; treat swappable predicates
1507 // as compatible opcodes, not as alternates.
1508 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1509 }();
1510 // Check for one alternate opcode from another BinaryOperator.
1511 // TODO - generalize to support all operators (types, calls etc.).
1512 Intrinsic::ID BaseID = 0;
1513 SmallVector<VFInfo> BaseMappings;
1514 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1515 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1516 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1517 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1518 return InstructionsState::invalid();
1519 }
1520 bool AnyPoison = InstCnt != VL.size();
1521 // Check MainOp too to be sure that it matches the requirements for the
1522 // instructions.
1523 for (Value *V : iterator_range(It, VL.end())) {
1524 auto *I = dyn_cast<Instruction>(V);
1525 if (!I)
1526 continue;
1527
1528 // Cannot combine poison and divisions.
1529 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1530 // intrinsics/functions only.
1531 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1532 return InstructionsState::invalid();
1533 unsigned InstOpcode = I->getOpcode();
1534 if (IsBinOp && isa<BinaryOperator>(I)) {
1535 if (BinOpHelper.add(I))
1536 continue;
1537 } else if (IsCastOp && isa<CastInst>(I)) {
1538 Value *Op0 = MainOp->getOperand(0);
1539 Type *Ty0 = Op0->getType();
1540 Value *Op1 = I->getOperand(0);
1541 Type *Ty1 = Op1->getType();
1542 if (Ty0 == Ty1) {
1543 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1544 continue;
1545 if (Opcode == AltOpcode) {
1546 assert(isValidForAlternation(Opcode) &&
1547 isValidForAlternation(InstOpcode) &&
1548 "Cast isn't safe for alternation, logic needs to be updated!");
1549 AltOpcode = InstOpcode;
1550 AltOp = I;
1551 continue;
1552 }
1553 }
1554 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1555 auto *BaseInst = cast<CmpInst>(MainOp);
1556 Type *Ty0 = BaseInst->getOperand(0)->getType();
1557 Type *Ty1 = Inst->getOperand(0)->getType();
1558 if (Ty0 == Ty1) {
1559 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1560 assert(InstOpcode == AltOpcode &&
1561 "Alternate instructions are only supported by BinaryOperator "
1562 "and CastInst.");
1563 // Check for compatible operands. If the corresponding operands are not
1564 // compatible - need to perform alternate vectorization.
1565 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1566 CmpInst::Predicate SwappedCurrentPred =
1567 CmpInst::getSwappedPredicate(CurrentPred);
1568
1569 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1570 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1571 continue;
1572
1573 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1574 continue;
1575 auto *AltInst = cast<CmpInst>(AltOp);
1576 if (MainOp != AltOp) {
1577 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1578 continue;
1579 } else if (BasePred != CurrentPred) {
1580 assert(
1581 isValidForAlternation(InstOpcode) &&
1582 "CmpInst isn't safe for alternation, logic needs to be updated!");
1583 AltOp = I;
1584 continue;
1585 }
1586 CmpInst::Predicate AltPred = AltInst->getPredicate();
1587 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1588 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1589 continue;
1590 }
1591 } else if (InstOpcode == Opcode) {
1592 assert(InstOpcode == AltOpcode &&
1593 "Alternate instructions are only supported by BinaryOperator and "
1594 "CastInst.");
1595 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1596 if (Gep->getNumOperands() != 2 ||
1597 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1598 return InstructionsState::invalid();
1599 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1601 return InstructionsState::invalid();
1602 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1603 auto *BaseLI = cast<LoadInst>(MainOp);
1604 if (!LI->isSimple() || !BaseLI->isSimple())
1605 return InstructionsState::invalid();
1606 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1607 auto *CallBase = cast<CallInst>(MainOp);
1608 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1609 return InstructionsState::invalid();
1610 if (Call->hasOperandBundles() &&
1611 (!CallBase->hasOperandBundles() ||
1612 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1613 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1614 CallBase->op_begin() +
1615 CallBase->getBundleOperandsStartIndex())))
1616 return InstructionsState::invalid();
1617 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1618 if (ID != BaseID)
1619 return InstructionsState::invalid();
1620 if (!ID) {
1621 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1622 if (Mappings.size() != BaseMappings.size() ||
1623 Mappings.front().ISA != BaseMappings.front().ISA ||
1624 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1625 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1626 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1627 Mappings.front().Shape.Parameters !=
1628 BaseMappings.front().Shape.Parameters)
1629 return InstructionsState::invalid();
1630 }
1631 }
1632 continue;
1633 }
1634 return InstructionsState::invalid();
1635 }
1636
1637 if (IsBinOp) {
1638 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1639 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1640 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1641 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1642 }
1643 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1644 "Incorrect implementation of allSameOpcode.");
1645 InstructionsState S(MainOp, AltOp);
1646 assert(all_of(VL,
1647 [&](Value *V) {
1648 return isa<PoisonValue>(V) ||
1649 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1650 }) &&
1651 "Invalid InstructionsState.");
1652 return S;
1653}
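// [Illustrative note added by the editor; not part of the upstream source.]
// A hypothetical example of what the routine above computes:
//   VL = { %a = add i32 %x, %y
//          %b = add i32 %z, %w
//          %c = sub i32 %p, %q
//          %d = add i32 %r, %s }
// All lanes are BinaryOperators, so BinOpSameOpcodeHelper accepts them; the
// resulting InstructionsState has an 'add' MainOp and a 'sub' AltOp, and
// isAltShuffle() is true because two opcodes are mixed. If every lane were an
// 'add', AltOp would equal MainOp and the bundle would be a plain
// same-opcode node.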
1654
1655/// \returns true if all of the values in \p VL have the same type or false
1656/// otherwise.
1657 static bool allSameType(ArrayRef<Value *> VL) {
1658 Type *Ty = VL.consume_front()->getType();
1659 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1660}
1661
1662/// \returns True if in-tree use also needs extract. This refers to
1663/// possible scalar operand in vectorized instruction.
1664static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1665 TargetLibraryInfo *TLI,
1666 const TargetTransformInfo *TTI) {
1667 if (!UserInst)
1668 return false;
1669 unsigned Opcode = UserInst->getOpcode();
1670 switch (Opcode) {
1671 case Instruction::Load: {
1672 LoadInst *LI = cast<LoadInst>(UserInst);
1673 return (LI->getPointerOperand() == Scalar);
1674 }
1675 case Instruction::Store: {
1676 StoreInst *SI = cast<StoreInst>(UserInst);
1677 return (SI->getPointerOperand() == Scalar);
1678 }
1679 case Instruction::Call: {
1680 CallInst *CI = cast<CallInst>(UserInst);
1681 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1682 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1683 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1684 Arg.value().get() == Scalar;
1685 });
1686 }
1687 default:
1688 return false;
1689 }
1690}
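// [Illustrative note added by the editor; not part of the upstream source.]
// Example with hypothetical values: if Scalar is %ptr and UserInst is
// "store i32 %v, ptr %ptr", the helper returns true, because the vectorized
// store still consumes the address %ptr as a scalar, so it has to be
// extracted (or kept live as a scalar). If Scalar were %v, the stored value,
// it returns false: that operand becomes part of the vector operand instead.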
1691
1692 /// \returns the AA location that is being accessed by the instruction.
1693 static MemoryLocation getLocation(Instruction *I) {
1694 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1695 return MemoryLocation::get(SI);
1696 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1697 return MemoryLocation::get(LI);
1698 return MemoryLocation();
1699}
1700
1701/// \returns True if the instruction is not a volatile or atomic load/store.
1702static bool isSimple(Instruction *I) {
1703 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1704 return LI->isSimple();
1705 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1706 return SI->isSimple();
1707 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1708 return !MI->isVolatile();
1709 return true;
1710}
1711
1712/// Shuffles \p Mask in accordance with the given \p SubMask.
1713/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1714/// one but two input vectors.
1715static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1716 bool ExtendingManyInputs = false) {
1717 if (SubMask.empty())
1718 return;
1719 assert(
1720 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1721 // Check if input scalars were extended to match the size of other node.
1722 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1723 "SubMask with many inputs support must be larger than the mask.");
1724 if (Mask.empty()) {
1725 Mask.append(SubMask.begin(), SubMask.end());
1726 return;
1727 }
1728 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1729 int TermValue = std::min(Mask.size(), SubMask.size());
1730 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1731 if (SubMask[I] == PoisonMaskElem ||
1732 (!ExtendingManyInputs &&
1733 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1734 continue;
1735 NewMask[I] = Mask[SubMask[I]];
1736 }
1737 Mask.swap(NewMask);
1738}
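// [Illustrative note added by the editor; not part of the upstream source.]
// Worked example of the composition above, assuming no element is filtered by
// TermValue: with Mask = {3, 2, 1, 0} and SubMask = {1, 0, 3, 2} the loop
// computes NewMask[I] = Mask[SubMask[I]], giving {2, 3, 0, 1}, i.e. the
// shuffle obtained by applying Mask first and SubMask on top of its result.
// PoisonMaskElem entries in SubMask stay poison in the combined mask.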
1739
1740/// Order may have elements assigned special value (size) which is out of
1741 /// bounds. Such indices only appear in places which correspond to undef values
1742 /// (see canReuseExtract for details) and are used to avoid undef values
1743 /// having an effect on the ordering of the operands.
1744/// The first loop below simply finds all unused indices and then the next loop
1745/// nest assigns these indices for undef values positions.
1746/// As an example below Order has two undef positions and they have assigned
1747/// values 3 and 7 respectively:
1748/// before: 6 9 5 4 9 2 1 0
1749/// after: 6 3 5 4 7 2 1 0
1750 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1751 const size_t Sz = Order.size();
1752 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1753 SmallBitVector MaskedIndices(Sz);
1754 for (unsigned I = 0; I < Sz; ++I) {
1755 if (Order[I] < Sz)
1756 UnusedIndices.reset(Order[I]);
1757 else
1758 MaskedIndices.set(I);
1759 }
1760 if (MaskedIndices.none())
1761 return;
1762 assert(UnusedIndices.count() == MaskedIndices.count() &&
1763 "Non-synced masked/available indices.");
1764 int Idx = UnusedIndices.find_first();
1765 int MIdx = MaskedIndices.find_first();
1766 while (MIdx >= 0) {
1767 assert(Idx >= 0 && "Indices must be synced.");
1768 Order[MIdx] = Idx;
1769 Idx = UnusedIndices.find_next(Idx);
1770 MIdx = MaskedIndices.find_next(MIdx);
1771 }
1772}
1773
1774/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1775/// Opcode1.
1776 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1777 unsigned Opcode0, unsigned Opcode1) {
1778 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1779 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1780 for (unsigned Lane : seq<unsigned>(VL.size())) {
1781 if (isa<PoisonValue>(VL[Lane]))
1782 continue;
1783 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1784 OpcodeMask.set(Lane * ScalarTyNumElements,
1785 Lane * ScalarTyNumElements + ScalarTyNumElements);
1786 }
1787 return OpcodeMask;
1788}
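// [Illustrative note added by the editor; not part of the upstream source.]
// For a hypothetical VL = {add, sub, add, sub} with ScalarTy == i32 (one
// element per lane), Opcode0 == Instruction::Add and Opcode1 ==
// Instruction::Sub, the helper above returns the bitset {0, 1, 0, 1}: only
// lanes whose opcode matches Opcode1 are set. With a 2-element vector
// ScalarTy the same input would set two bits per 'sub' lane:
// {0, 0, 1, 1, 0, 0, 1, 1}.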
1789
1790/// Replicates the given \p Val \p VF times.
1792 unsigned VF) {
1793 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1794 "Expected scalar constants.");
1795 SmallVector<Constant *> NewVal(Val.size() * VF);
1796 for (auto [I, V] : enumerate(Val))
1797 std::fill_n(NewVal.begin() + I * VF, VF, V);
1798 return NewVal;
1799}
1800
1801namespace llvm {
1802
1803 void inversePermutation(ArrayRef<unsigned> Indices,
1804 SmallVectorImpl<int> &Mask) {
1805 Mask.clear();
1806 const unsigned E = Indices.size();
1807 Mask.resize(E, PoisonMaskElem);
1808 for (unsigned I = 0; I < E; ++I)
1809 Mask[Indices[I]] = I;
1810}
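// [Illustrative note added by the editor; not part of the upstream source.]
// Example: for Indices = {2, 0, 1} the loop above sets Mask[2] = 0,
// Mask[0] = 1 and Mask[1] = 2, producing Mask = {1, 2, 0}, i.e. Mask is the
// inverse permutation of Indices.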
1811
1812/// Reorders the list of scalars in accordance with the given \p Mask.
1813 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1814 ArrayRef<int> Mask) {
1815 assert(!Mask.empty() && "Expected non-empty mask.");
1816 SmallVector<Value *> Prev(Scalars.size(),
1817 PoisonValue::get(Scalars.front()->getType()));
1818 Prev.swap(Scalars);
1819 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1820 if (Mask[I] != PoisonMaskElem)
1821 Scalars[Mask[I]] = Prev[I];
1822}
1823
1824/// Checks if the provided value does not require scheduling. It does not
1825/// require scheduling if this is not an instruction or it is an instruction
1826/// that does not read/write memory and all operands are either not instructions
1827/// or phi nodes or instructions from different blocks.
1828 static bool areAllOperandsNonInsts(Value *V) {
1829 auto *I = dyn_cast<Instruction>(V);
1830 if (!I)
1831 return true;
1832 return !mayHaveNonDefUseDependency(*I) &&
1833 all_of(I->operands(), [I](Value *V) {
1834 auto *IO = dyn_cast<Instruction>(V);
1835 if (!IO)
1836 return true;
1837 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1838 });
1839}
1840
1841/// Checks if the provided value does not require scheduling. It does not
1842/// require scheduling if this is not an instruction or it is an instruction
1843/// that does not read/write memory and all users are phi nodes or instructions
1844 /// from different blocks.
1845static bool isUsedOutsideBlock(Value *V) {
1846 auto *I = dyn_cast<Instruction>(V);
1847 if (!I)
1848 return true;
1849 // Limits the number of uses to save compile time.
1850 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1851 all_of(I->users(), [I](User *U) {
1852 auto *IU = dyn_cast<Instruction>(U);
1853 if (!IU)
1854 return true;
1855 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1856 });
1857}
1858
1859/// Checks if the specified value does not require scheduling. It does not
1860/// require scheduling if all operands and all users do not need to be scheduled
1861/// in the current basic block.
1864}
1865
1866/// Checks if the specified array of instructions does not require scheduling.
1867 /// It is so if all instructions either have operands that do not require
1868 /// scheduling or have users that do not require scheduling, since they are
1869 /// phis or belong to other basic blocks.
1870 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1871 return !VL.empty() &&
1872 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1873 }
1874
1875/// Returns true if widened type of \p Ty elements with size \p Sz represents
1876/// full vector type, i.e. adding extra element results in extra parts upon type
1877/// legalization.
1878 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1879 unsigned Sz) {
1880 if (Sz <= 1)
1881 return false;
1882 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1883 return false;
1884 if (has_single_bit(Sz))
1885 return true;
1886 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1887 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1888 Sz % NumParts == 0;
1889}
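// [Illustrative note added by the editor; not part of the upstream source.]
// The arithmetic above on a hypothetical case: Sz = 24 is not a power of two,
// so the helper asks TTI how many parts the widened type splits into. If TTI
// reports NumParts = 3, then 24 % 3 == 0 and 24 / 3 == 8 is a power of two,
// so the 24-element type is treated as filling whole registers and the
// function returns true; if TTI reported NumParts = 5, it would return false.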
1890
1891 /// Returns the number of parts the type \p VecTy will be split into at the
1892 /// codegen phase. If the type is going to be scalarized or does not use whole
1893/// registers, returns 1.
1894static unsigned
1895 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1896 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1897 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1898 if (NumParts == 0 || NumParts >= Limit)
1899 return 1;
1900 unsigned Sz = getNumElements(VecTy);
1901 if (NumParts >= Sz || Sz % NumParts != 0 ||
1902 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1903 return 1;
1904 return NumParts;
1905}
1906
1907namespace slpvectorizer {
1908
1909/// Bottom Up SLP Vectorizer.
1910class BoUpSLP {
1911 class TreeEntry;
1912 class ScheduleEntity;
1913 class ScheduleData;
1914 class ScheduleCopyableData;
1915 class ScheduleBundle;
1918
1919public:
1920 /// Tracks the state we can represent the loads in the given sequence.
1921 enum class LoadsState {
1922 Gather,
1923 Vectorize,
1924 ScatterVectorize,
1925 StridedVectorize,
1926 CompressVectorize,
1927 };
1928
1935
1936 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1937 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1938 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1939 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1940 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1941 AC(AC), DB(DB), DL(DL), ORE(ORE),
1942 Builder(Se->getContext(), TargetFolder(*DL)) {
1943 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1944 // Use the vector register size specified by the target unless overridden
1945 // by a command-line option.
1946 // TODO: It would be better to limit the vectorization factor based on
1947 // data type rather than just register size. For example, x86 AVX has
1948 // 256-bit registers, but it does not support integer operations
1949 // at that width (that requires AVX2).
1950 if (MaxVectorRegSizeOption.getNumOccurrences())
1951 MaxVecRegSize = MaxVectorRegSizeOption;
1952 else
1953 MaxVecRegSize =
1954 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1955 .getFixedValue();
1956
1957 if (MinVectorRegSizeOption.getNumOccurrences())
1958 MinVecRegSize = MinVectorRegSizeOption;
1959 else
1960 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1961 }
1962
1963 /// Vectorize the tree that starts with the elements in \p VL.
1964 /// Returns the vectorized root.
1965 Value *vectorizeTree();
1966
1967 /// Vectorize the tree but with the list of externally used values \p
1968 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1969 /// generated extractvalue instructions.
1970 Value *vectorizeTree(
1971 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1972 Instruction *ReductionRoot = nullptr,
1973 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1974
1975 /// \returns the cost incurred by unwanted spills and fills, caused by
1976 /// holding live values over call sites.
1977 InstructionCost getSpillCost();
1978
1979 /// \returns the vectorization cost of the subtree that starts at \p VL.
1980 /// A negative number means that this is profitable.
1981 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
1982 InstructionCost ReductionCost = TTI::TCC_Free);
1983
1984 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1985 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1986 void buildTree(ArrayRef<Value *> Roots,
1987 const SmallDenseSet<Value *> &UserIgnoreLst);
1988
1989 /// Construct a vectorizable tree that starts at \p Roots.
1990 void buildTree(ArrayRef<Value *> Roots);
1991
1992 /// Return the scalars of the root node.
1993 ArrayRef<Value *> getRootNodeScalars() const {
1994 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1995 return VectorizableTree.front()->Scalars;
1996 }
1997
1998 /// Returns the type/is-signed info for the root node in the graph without
1999 /// casting.
2000 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2001 const TreeEntry &Root = *VectorizableTree.front();
2002 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2003 !Root.Scalars.front()->getType()->isIntegerTy())
2004 return std::nullopt;
2005 auto It = MinBWs.find(&Root);
2006 if (It != MinBWs.end())
2007 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2008 It->second.first),
2009 It->second.second);
2010 if (Root.getOpcode() == Instruction::ZExt ||
2011 Root.getOpcode() == Instruction::SExt)
2012 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2013 Root.getOpcode() == Instruction::SExt);
2014 return std::nullopt;
2015 }
2016
2017 /// Checks if the root graph node can be emitted with narrower bitwidth at
2018 /// codegen and returns it signedness, if so.
2020 return MinBWs.at(VectorizableTree.front().get()).second;
2021 }
2022
2023 /// Returns the reduction type after minbitwidth analysis.
2024 Type *getReductionType() const {
2025 if (ReductionBitWidth == 0 ||
2026 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2027 ReductionBitWidth >=
2028 DL->getTypeSizeInBits(
2029 VectorizableTree.front()->Scalars.front()->getType()))
2030 return getWidenedType(
2031 VectorizableTree.front()->Scalars.front()->getType(),
2032 VectorizableTree.front()->getVectorFactor());
2033 return getWidenedType(
2034 IntegerType::get(
2035 VectorizableTree.front()->Scalars.front()->getContext(),
2036 ReductionBitWidth),
2037 VectorizableTree.front()->getVectorFactor());
2038 }
2039
2040 /// Builds external uses of the vectorized scalars, i.e. the list of
2041 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2042 /// ExternallyUsedValues contains additional list of external uses to handle
2043 /// vectorization of reductions.
2044 void
2045 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2046
2047 /// Transforms graph nodes to target specific representations, if profitable.
2048 void transformNodes();
2049
2050 /// Clear the internal data structures that are created by 'buildTree'.
2051 void deleteTree() {
2052 VectorizableTree.clear();
2053 ScalarToTreeEntries.clear();
2054 OperandsToTreeEntry.clear();
2055 ScalarsInSplitNodes.clear();
2056 MustGather.clear();
2057 NonScheduledFirst.clear();
2058 EntryToLastInstruction.clear();
2059 LoadEntriesToVectorize.clear();
2060 IsGraphTransformMode = false;
2061 GatheredLoadsEntriesFirst.reset();
2062 CompressEntryToData.clear();
2063 ExternalUses.clear();
2064 ExternalUsesAsOriginalScalar.clear();
2065 ExternalUsesWithNonUsers.clear();
2066 for (auto &Iter : BlocksSchedules) {
2067 BlockScheduling *BS = Iter.second.get();
2068 BS->clear();
2069 }
2070 MinBWs.clear();
2071 ReductionBitWidth = 0;
2072 BaseGraphSize = 1;
2073 CastMaxMinBWSizes.reset();
2074 ExtraBitWidthNodes.clear();
2075 InstrElementSize.clear();
2076 UserIgnoreList = nullptr;
2077 PostponedGathers.clear();
2078 ValueToGatherNodes.clear();
2079 }
2080
2081 unsigned getTreeSize() const { return VectorizableTree.size(); }
2082
2083 /// Returns the base graph size, before any transformations.
2084 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2085
2086 /// Perform LICM and CSE on the newly generated gather sequences.
2087 void optimizeGatherSequence();
2088
2089 /// Does this non-empty order represent an identity order? Identity
2090 /// should be represented as an empty order, so this is used to
2091 /// decide if we can canonicalize a computed order. Undef elements
2092 /// (represented as size) are ignored.
2093 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2094 assert(!Order.empty() && "expected non-empty order");
2095 const unsigned Sz = Order.size();
2096 return all_of(enumerate(Order), [&](const auto &P) {
2097 return P.value() == P.index() || P.value() == Sz;
2098 });
2099 }
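// [Illustrative note added by the editor; not part of the upstream source.]
// For example, with Sz == 4 the order {0, 1, 4, 3} is still an identity
// order: the out-of-bounds value 4 (== Sz) marks an undef position and is
// ignored, while every other element equals its index. {1, 0, 2, 3} is not.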
2100
2101 /// Checks if the specified gather tree entry \p TE can be represented as a
2102 /// shuffled vector entry + (possibly) permutation with other gathers. It
2103 /// implements the checks only for possibly ordered scalars (Loads,
2104 /// ExtractElement, ExtractValue), which can be part of the graph.
2105 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2106 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2107 /// node might be ignored.
2108 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2109 bool TopToBottom,
2110 bool IgnoreReorder);
2111
2112 /// Sort loads into increasing pointers offsets to allow greater clustering.
2113 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2114
2115 /// Gets reordering data for the given tree entry. If the entry is vectorized
2116 /// - just return ReorderIndices, otherwise check if the scalars can be
2117 /// reordered and return the most optimal order.
2118 /// \return std::nullopt if ordering is not important, empty order, if
2119 /// identity order is important, or the actual order.
2120 /// \param TopToBottom If true, include the order of vectorized stores and
2121 /// insertelement nodes, otherwise skip them.
2122 /// \param IgnoreReorder true, if the root node order can be ignored.
2123 std::optional<OrdersType>
2124 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2125
2126 /// Checks if it is profitable to reorder the current tree.
2127 /// If the tree does not contain many profitable reordable nodes, better to
2128 /// skip it to save compile time.
2129 bool isProfitableToReorder() const;
2130
2131 /// Reorders the current graph to the most profitable order starting from the
2132 /// root node to the leaf nodes. The best order is chosen only from the nodes
2133 /// of the same size (vectorization factor). Smaller nodes are considered
2134 /// parts of a subgraph with a smaller VF and they are reordered independently.
2135 /// We can do this because we still need to extend smaller nodes to the wider VF
2136 /// and we can merge reordering shuffles with the widening shuffles.
2137 void reorderTopToBottom();
2138
2139 /// Reorders the current graph to the most profitable order starting from
2140 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2141 /// number of reshuffles if the leaf nodes use the same order. In this case we
2142 /// can merge the orders and just shuffle the user node instead of shuffling its
2143 /// operands. Plus, even if the leaf nodes have different orders, it allows
2144 /// sinking reordering in the graph closer to the root node and merging it later
2145 /// during analysis.
2146 void reorderBottomToTop(bool IgnoreReorder = false);
2147
2148 /// \return The vector element size in bits to use when vectorizing the
2149 /// expression tree ending at \p V. If V is a store, the size is the width of
2150 /// the stored value. Otherwise, the size is the width of the largest loaded
2151 /// value reaching V. This method is used by the vectorizer to calculate
2152 /// vectorization factors.
2153 unsigned getVectorElementSize(Value *V);
2154
2155 /// Compute the minimum type sizes required to represent the entries in a
2156 /// vectorizable tree.
2157 void computeMinimumValueSizes();
2158
2159 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2160 unsigned getMaxVecRegSize() const {
2161 return MaxVecRegSize;
2162 }
2163
2164 // \returns minimum vector register size as set by cl::opt.
2165 unsigned getMinVecRegSize() const {
2166 return MinVecRegSize;
2167 }
2168
2169 unsigned getMinVF(unsigned Sz) const {
2170 return std::max(2U, getMinVecRegSize() / Sz);
2171 }
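// [Illustrative note added by the editor; not part of the upstream source.]
// getMinVF() above is simple register arithmetic: assuming a minimum vector
// register width of 128 bits, 32-bit elements give max(2, 128 / 32) = 4
// lanes, while 128-bit elements still give the floor of 2 lanes, since a
// bundle narrower than two lanes is never worth forming.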
2172
2173 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2174 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2175 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2176 return MaxVF ? MaxVF : UINT_MAX;
2177 }
2178
2179 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2180 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2181 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2182 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2183 ///
2184 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2185 unsigned canMapToVector(Type *T) const;
2186
2187 /// \returns True if the VectorizableTree is both tiny and not fully
2188 /// vectorizable. We do not vectorize such trees.
2189 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2190
2191 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2192 /// It may happen, if all gather nodes are loads and they cannot be
2193 /// "clusterized". In this case even subgraphs cannot be vectorized more
2194 /// effectively than the base graph.
2195 bool isTreeNotExtendable() const;
2196
2197 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2198 /// can be load combined in the backend. Load combining may not be allowed in
2199 /// the IR optimizer, so we do not want to alter the pattern. For example,
2200 /// partially transforming a scalar bswap() pattern into vector code is
2201 /// effectively impossible for the backend to undo.
2202 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2203 /// may not be necessary.
2204 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2205
2206 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2207 /// can be load combined in the backend. Load combining may not be allowed in
2208 /// the IR optimizer, so we do not want to alter the pattern. For example,
2209 /// partially transforming a scalar bswap() pattern into vector code is
2210 /// effectively impossible for the backend to undo.
2211 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2212 /// may not be necessary.
2213 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2214
2215 /// Checks if the given array of loads can be represented as a vectorized,
2216 /// scatter or just simple gather.
2217 /// \param VL list of loads.
2218 /// \param VL0 main load value.
2219 /// \param Order returned order of load instructions.
2220 /// \param PointerOps returned list of pointer operands.
2221 /// \param BestVF return best vector factor, if recursive check found better
2222 /// vectorization sequences rather than masked gather.
2223 /// \param TryRecursiveCheck used to check if long masked gather can be
2224 /// represented as a series of loads/insert subvector, if profitable.
2225 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2226 SmallVectorImpl<unsigned> &Order,
2227 SmallVectorImpl<Value *> &PointerOps,
2228 unsigned *BestVF = nullptr,
2229 bool TryRecursiveCheck = true) const;
2230
2231 /// Registers non-vectorizable sequence of loads
2232 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2233 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2234 }
2235
2236 /// Checks if the given loads sequence is known as not vectorizable
2237 template <typename T>
2238 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2239 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2240 }
2241
2243
2244 /// This structure holds any data we need about the edges being traversed
2245 /// during buildTreeRec(). We keep track of:
2246 /// (i) the user TreeEntry index, and
2247 /// (ii) the index of the edge.
2248 struct EdgeInfo {
2249 EdgeInfo() = default;
2250 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2251 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2252 /// The user TreeEntry.
2253 TreeEntry *UserTE = nullptr;
2254 /// The operand index of the use.
2255 unsigned EdgeIdx = UINT_MAX;
2256#ifndef NDEBUG
2257 friend inline raw_ostream &operator<<(raw_ostream &OS,
2258 const BoUpSLP::EdgeInfo &EI) {
2259 EI.dump(OS);
2260 return OS;
2261 }
2262 /// Debug print.
2263 void dump(raw_ostream &OS) const {
2264 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2265 << " EdgeIdx:" << EdgeIdx << "}";
2266 }
2267 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2268#endif
2269 bool operator == (const EdgeInfo &Other) const {
2270 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2271 }
2272
2273 operator bool() const { return UserTE != nullptr; }
2274 };
2275 friend struct DenseMapInfo<EdgeInfo>;
2276
2277 /// A helper class used for scoring candidates for two consecutive lanes.
2278 class LookAheadHeuristics {
2279 const TargetLibraryInfo &TLI;
2280 const DataLayout &DL;
2281 ScalarEvolution &SE;
2282 const BoUpSLP &R;
2283 int NumLanes; // Total number of lanes (aka vectorization factor).
2284 int MaxLevel; // The maximum recursion depth for accumulating score.
2285
2286 public:
2287 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2288 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2289 int MaxLevel)
2290 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2291 MaxLevel(MaxLevel) {}
2292
2293 // The hard-coded scores listed here are not very important, though it shall
2294 // be higher for better matches to improve the resulting cost. When
2295 // computing the scores of matching one sub-tree with another, we are
2296 // basically counting the number of values that are matching. So even if all
2297 // scores are set to 1, we would still get a decent matching result.
2298 // However, sometimes we have to break ties. For example we may have to
2299 // choose between matching loads vs matching opcodes. This is what these
2300 // scores are helping us with: they provide the order of preference. Also,
2301 // this is important if the scalar is externally used or used in another
2302 // tree entry node in the different lane.
2303
2304 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2305 static const int ScoreConsecutiveLoads = 4;
2306 /// The same load multiple times. This should have a better score than
2307 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2308 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
2309 /// a vector load plus 1.0 for a broadcast.
2310 static const int ScoreSplatLoads = 3;
2311 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2312 static const int ScoreReversedLoads = 3;
2313 /// A load candidate for masked gather.
2314 static const int ScoreMaskedGatherCandidate = 1;
2315 /// ExtractElementInst from same vector and consecutive indexes.
2316 static const int ScoreConsecutiveExtracts = 4;
2317 /// ExtractElementInst from same vector and reversed indices.
2318 static const int ScoreReversedExtracts = 3;
2319 /// Constants.
2320 static const int ScoreConstants = 2;
2321 /// Instructions with the same opcode.
2322 static const int ScoreSameOpcode = 2;
2323 /// Instructions with alt opcodes (e.g., add + sub).
2324 static const int ScoreAltOpcodes = 1;
2325 /// Identical instructions (a.k.a. splat or broadcast).
2326 static const int ScoreSplat = 1;
2327 /// Matching with an undef is preferable to failing.
2328 static const int ScoreUndef = 1;
2329 /// Score for failing to find a decent match.
2330 static const int ScoreFail = 0;
2331 /// Score if all users are vectorized.
2332 static const int ScoreAllUserVectorized = 1;
2333
2334 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2335 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2336 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2337 /// MainAltOps.
2338 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2339 ArrayRef<Value *> MainAltOps) const {
2340 if (!isValidElementType(V1->getType()) ||
2341 !isValidElementType(V2->getType()))
2342 return LookAheadHeuristics::ScoreFail;
2343
2344 if (V1 == V2) {
2345 if (isa<LoadInst>(V1)) {
2346 // Returns true if the users of V1 and V2 won't need to be extracted.
2347 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2348 // Bail out if we have too many uses to save compilation time.
2349 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2350 return false;
2351
2352 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2353 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2354 return U == U1 || U == U2 || R.isVectorized(U);
2355 });
2356 };
2357 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2358 };
2359 // A broadcast of a load can be cheaper on some targets.
2360 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2361 ElementCount::getFixed(NumLanes)) &&
2362 ((int)V1->getNumUses() == NumLanes ||
2363 AllUsersAreInternal(V1, V2)))
2364 return LookAheadHeuristics::ScoreSplatLoads;
2365 }
2366 return LookAheadHeuristics::ScoreSplat;
2367
2368
2369 auto CheckSameEntryOrFail = [&]() {
2370 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2371 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2372 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2373 !TEs2.empty() &&
2374 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2375 return LookAheadHeuristics::ScoreSplatLoads;
2376 }
2377 return LookAheadHeuristics::ScoreFail;
2378 };
2379
2380 auto *LI1 = dyn_cast<LoadInst>(V1);
2381 auto *LI2 = dyn_cast<LoadInst>(V2);
2382 if (LI1 && LI2) {
2383 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2384 !LI2->isSimple())
2385 return CheckSameEntryOrFail();
2386
2387 std::optional<int64_t> Dist = getPointersDiff(
2388 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2389 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2390 if (!Dist || *Dist == 0) {
2391 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2392 getUnderlyingObject(LI2->getPointerOperand()) &&
2393 R.TTI->isLegalMaskedGather(
2394 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2395 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2396 return CheckSameEntryOrFail();
2397 }
2398 // The distance is too large - still may be profitable to use masked
2399 // loads/gathers.
2400 if (std::abs(*Dist) > NumLanes / 2)
2401 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2402 // This still will detect consecutive loads, but we might have "holes"
2403 // in some cases. It is ok for non-power-2 vectorization and may produce
2404 // better results. It should not affect current vectorization.
2405 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2406 : LookAheadHeuristics::ScoreReversedLoads;
2407 }
2408
2409 auto *C1 = dyn_cast<Constant>(V1);
2410 auto *C2 = dyn_cast<Constant>(V2);
2411 if (C1 && C2)
2412 return LookAheadHeuristics::ScoreConstants;
2413
2414 // Consider constants and buildvector compatible.
2415 if ((C1 && isa<InsertElementInst>(V2)) ||
2416 (C2 && isa<InsertElementInst>(V1)))
2417 return LookAheadHeuristics::ScoreConstants;
2418
2419 // Extracts from consecutive indexes of the same vector better score as
2420 // the extracts could be optimized away.
2421 Value *EV1;
2422 ConstantInt *Ex1Idx;
2423 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2424 // Undefs are always profitable for extractelements.
2425 // Compiler can easily combine poison and extractelement <non-poison> or
2426 // undef and extractelement <poison>. But combining undef +
2427 // extractelement <non-poison-but-may-produce-poison> requires some
2428 // extra operations.
2429 if (isa<UndefValue>(V2))
2430 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2431 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2432 : LookAheadHeuristics::ScoreSameOpcode;
2433 Value *EV2 = nullptr;
2434 ConstantInt *Ex2Idx = nullptr;
2435 if (match(V2,
2436 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2437 m_Undef())))) {
2438 // Undefs are always profitable for extractelements.
2439 if (!Ex2Idx)
2440 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2441 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2442 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2443 if (EV2 == EV1) {
2444 int Idx1 = Ex1Idx->getZExtValue();
2445 int Idx2 = Ex2Idx->getZExtValue();
2446 int Dist = Idx2 - Idx1;
2447 // The distance is too large - still may be profitable to use
2448 // shuffles.
2449 if (std::abs(Dist) == 0)
2450 return LookAheadHeuristics::ScoreSplat;
2451 if (std::abs(Dist) > NumLanes / 2)
2452 return LookAheadHeuristics::ScoreSameOpcode;
2453 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2454 : LookAheadHeuristics::ScoreReversedExtracts;
2455 }
2456 return LookAheadHeuristics::ScoreAltOpcodes;
2457 }
2458 return CheckSameEntryOrFail();
2459 }
2460
2461 auto *I1 = dyn_cast<Instruction>(V1);
2462 auto *I2 = dyn_cast<Instruction>(V2);
2463 if (I1 && I2) {
2464 if (I1->getParent() != I2->getParent())
2465 return CheckSameEntryOrFail();
2466 SmallVector<Value *, 4> Ops(MainAltOps);
2467 Ops.push_back(I1);
2468 Ops.push_back(I2);
2469 InstructionsState S = getSameOpcode(Ops, TLI);
2470 // Note: Only consider instructions with <= 2 operands to avoid
2471 // complexity explosion.
2472 if (S &&
2473 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2474 !S.isAltShuffle()) &&
2475 all_of(Ops, [&S](Value *V) {
2476 return isa<PoisonValue>(V) ||
2477 cast<Instruction>(V)->getNumOperands() ==
2478 S.getMainOp()->getNumOperands();
2479 }))
2480 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2481 : LookAheadHeuristics::ScoreSameOpcode;
2482 }
2483
2484 if (I1 && isa<PoisonValue>(V2))
2485 return LookAheadHeuristics::ScoreSameOpcode;
2486
2487 if (isa<UndefValue>(V2))
2488 return LookAheadHeuristics::ScoreUndef;
2489
2490 return CheckSameEntryOrFail();
2491 }
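// [Illustrative note added by the editor; not part of the upstream source.]
// Shallow scoring on a few hypothetical candidate pairs (simple loads, same
// block): loads of A[i] and A[i+1] are at pointer distance 1 and score
// ScoreConsecutiveLoads (4); A[i+1] followed by A[i] scores
// ScoreReversedLoads (3); two constants score ScoreConstants (2); an 'add'
// paired with a 'sub' that form a valid alternate bundle scores
// ScoreAltOpcodes (1); a pair with nothing in common falls through to
// CheckSameEntryOrFail(), usually ScoreFail (0).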
2492
2493 /// Go through the operands of \p LHS and \p RHS recursively until
2494 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2495 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2496 /// of \p U1 and \p U2), except at the beginning of the recursion where
2497 /// these are set to nullptr.
2498 ///
2499 /// For example:
2500 /// \verbatim
2501 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2502 /// \ / \ / \ / \ /
2503 /// + + + +
2504 /// G1 G2 G3 G4
2505 /// \endverbatim
2506 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2507 /// each level recursively, accumulating the score. It starts from matching
2508 /// the additions at level 0, then moves on to the loads (level 1). The
2509 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2510 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2511 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2512 /// Please note that the order of the operands does not matter, as we
2513 /// evaluate the score of all profitable combinations of operands. In
2514 /// other words the score of G1 and G4 is the same as G1 and G2. This
2515 /// heuristic is based on ideas described in:
2516 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2517 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2518 /// Luís F. W. Góes
2519 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2520 Instruction *U2, int CurrLevel,
2521 ArrayRef<Value *> MainAltOps) const {
2522
2523 // Get the shallow score of V1 and V2.
2524 int ShallowScoreAtThisLevel =
2525 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2526
2527 // If reached MaxLevel,
2528 // or if V1 and V2 are not instructions,
2529 // or if they are SPLAT,
2530 // or if they are not consecutive,
2531 // or if profitable to vectorize loads or extractelements, early return
2532 // the current cost.
2533 auto *I1 = dyn_cast<Instruction>(LHS);
2534 auto *I2 = dyn_cast<Instruction>(RHS);
2535 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2536 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2537 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2538 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2539 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2540 ShallowScoreAtThisLevel))
2541 return ShallowScoreAtThisLevel;
2542 assert(I1 && I2 && "Should have early exited.");
2543
2544 // Contains the I2 operand indexes that got matched with I1 operands.
2545 SmallSet<unsigned, 4> Op2Used;
2546
2547 // Recursion towards the operands of I1 and I2. We are trying all possible
2548 // operand pairs, and keeping track of the best score.
2549 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2550 OpIdx1 != NumOperands1; ++OpIdx1) {
2551 // Try to pair op1I with the best operand of I2.
2552 int MaxTmpScore = 0;
2553 unsigned MaxOpIdx2 = 0;
2554 bool FoundBest = false;
2555 // If I2 is commutative try all combinations.
2556 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2557 unsigned ToIdx = isCommutative(I2)
2558 ? I2->getNumOperands()
2559 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2560 assert(FromIdx <= ToIdx && "Bad index");
2561 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2562 // Skip operands already paired with OpIdx1.
2563 if (Op2Used.count(OpIdx2))
2564 continue;
2565 // Recursively calculate the cost at each level
2566 int TmpScore =
2567 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2568 I1, I2, CurrLevel + 1, {});
2569 // Look for the best score.
2570 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2571 TmpScore > MaxTmpScore) {
2572 MaxTmpScore = TmpScore;
2573 MaxOpIdx2 = OpIdx2;
2574 FoundBest = true;
2575 }
2576 }
2577 if (FoundBest) {
2578 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2579 Op2Used.insert(MaxOpIdx2);
2580 ShallowScoreAtThisLevel += MaxTmpScore;
2581 }
2582 }
2583 return ShallowScoreAtThisLevel;
2584 }
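// [Illustrative note added by the editor; not part of the upstream source.]
// Tracing the G1/G2 example from the comment above, assuming a recursion
// limit of two levels: the two additions match with ScoreSameOpcode (2); the
// recursion then pairs A[0] with A[1] and B[0] with B[1], each pair being
// consecutive loads worth ScoreConsecutiveLoads (4), for a total of
// 2 + 4 + 4 = 10. Matching G1 with G3 finds no consecutive-load operand pair,
// so its total stays at 2, which is why the G1/G2 pairing is preferred.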
2585 };
2586 /// A helper data structure to hold the operands of a vector of instructions.
2587 /// This supports a fixed vector length for all operand vectors.
2588 class VLOperands {
2589 /// For each operand we need (i) the value, and (ii) the opcode that it
2590 /// would be attached to if the expression was in a left-linearized form.
2591 /// This is required to avoid illegal operand reordering.
2592 /// For example:
2593 /// \verbatim
2594 /// 0 Op1
2595 /// |/
2596 /// Op1 Op2 Linearized + Op2
2597 /// \ / ----------> |/
2598 /// - -
2599 ///
2600 /// Op1 - Op2 (0 + Op1) - Op2
2601 /// \endverbatim
2602 ///
2603 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2604 ///
2605 /// Another way to think of this is to track all the operations across the
2606 /// path from the operand all the way to the root of the tree and to
2607 /// calculate the operation that corresponds to this path. For example, the
2608 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2609 /// corresponding operation is a '-' (which matches the one in the
2610 /// linearized tree, as shown above).
2611 ///
2612 /// For lack of a better term, we refer to this operation as Accumulated
2613 /// Path Operation (APO).
2614 struct OperandData {
2615 OperandData() = default;
2616 OperandData(Value *V, bool APO, bool IsUsed)
2617 : V(V), APO(APO), IsUsed(IsUsed) {}
2618 /// The operand value.
2619 Value *V = nullptr;
2620 /// TreeEntries only allow a single opcode, or an alternate sequence of
2621 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2622 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2623 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2624 /// (e.g., Add/Mul)
2625 bool APO = false;
2626 /// Helper data for the reordering function.
2627 bool IsUsed = false;
2628 };
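// [Illustrative note added by the editor; not part of the upstream source.]
// APO in practice, per the scheme described above: for "%s = sub i32 %a, %b"
// the LHS %a gets APO = false (it sits in an add-like position of the
// linearized form) while the RHS %b gets APO = true (it is attached to the
// inverse operation); for "%t = add i32 %c, %d" both operands get APO =
// false. Reordering is only allowed between operands whose APO flags match,
// which is what the "if (OpAPO != OpIdxAPO) continue;" check in
// getBestOperand() below enforces.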
2629
2630 /// During operand reordering, we are trying to select the operand at lane
2631 /// that matches best with the operand at the neighboring lane. Our
2632 /// selection is based on the type of value we are looking for. For example,
2633 /// if the neighboring lane has a load, we need to look for a load that is
2634 /// accessing a consecutive address. These strategies are summarized in the
2635 /// 'ReorderingMode' enumerator.
2636 enum class ReorderingMode {
2637 Load, ///< Matching loads to consecutive memory addresses
2638 Opcode, ///< Matching instructions based on opcode (same or alternate)
2639 Constant, ///< Matching constants
2640 Splat, ///< Matching the same instruction multiple times (broadcast)
2641 Failed, ///< We failed to create a vectorizable group
2642 };
2643
2645
2646 /// A vector of operand vectors.
2648 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2649 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2650 unsigned ArgSize = 0;
2651
2652 const TargetLibraryInfo &TLI;
2653 const DataLayout &DL;
2654 ScalarEvolution &SE;
2655 const BoUpSLP &R;
2656 const Loop *L = nullptr;
2657
2658 /// \returns the operand data at \p OpIdx and \p Lane.
2659 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2660 return OpsVec[OpIdx][Lane];
2661 }
2662
2663 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2664 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2665 return OpsVec[OpIdx][Lane];
2666 }
2667
2668 /// Clears the used flag for all entries.
2669 void clearUsed() {
2670 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2671 OpIdx != NumOperands; ++OpIdx)
2672 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2673 ++Lane)
2674 OpsVec[OpIdx][Lane].IsUsed = false;
2675 }
2676
2677 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2678 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2679 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2680 }
2681
2682 /// \param Lane lane of the operands under analysis.
2683 /// \param OpIdx operand index in \p Lane lane we're looking the best
2684 /// candidate for.
2685 /// \param Idx operand index of the current candidate value.
2686 /// \returns The additional score due to possible broadcasting of the
2687 /// elements in the lane. It is more profitable to have power-of-2 unique
2688 /// elements in the lane, it will be vectorized with higher probability
2689 /// after removing duplicates. Currently the SLP vectorizer supports only
2690 /// vectorization of the power-of-2 number of unique scalars.
2691 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2692 const SmallBitVector &UsedLanes) const {
2693 Value *IdxLaneV = getData(Idx, Lane).V;
2694 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2695 isa<ExtractElementInst>(IdxLaneV))
2696 return 0;
2697 SmallDenseMap<Value *, unsigned, 4> Uniques;
2698 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2699 if (Ln == Lane)
2700 continue;
2701 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2702 if (!isa<Instruction>(OpIdxLnV))
2703 return 0;
2704 Uniques.try_emplace(OpIdxLnV, Ln);
2705 }
2706 unsigned UniquesCount = Uniques.size();
2707 auto IdxIt = Uniques.find(IdxLaneV);
2708 unsigned UniquesCntWithIdxLaneV =
2709 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2710 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2711 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2712 unsigned UniquesCntWithOpIdxLaneV =
2713 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2714 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2715 return 0;
2716 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2717 UniquesCntWithOpIdxLaneV,
2718 UniquesCntWithOpIdxLaneV -
2719 bit_floor(UniquesCntWithOpIdxLaneV)) -
2720 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2721 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2722 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2723 }
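// [Illustrative note added by the editor; not part of the upstream source.]
// A worked instance of the splat score above, with hypothetical values and 4
// lanes: the other three lanes of operand OpIdx hold %a, %b and %c (3
// uniques). If the value currently at OpIdx in this lane is %a again,
// UniquesCntWithOpIdxLaneV == 3, while a candidate %d that is new to the
// group gives UniquesCntWithIdxLaneV == 4. The result is
// min(bit_ceil(3) - 3, 3 - bit_floor(3)) - (bit_ceil(4) - 4) = min(1, 1) - 0
// = 1: a small bonus for moving the lane towards a power-of-2 number of
// unique scalars.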
2724
2725 /// \param Lane lane of the operands under analysis.
2726 /// \param OpIdx operand index in \p Lane lane we're looking the best
2727 /// candidate for.
2728 /// \param Idx operand index of the current candidate value.
2729 /// \returns The additional score for the scalar which users are all
2730 /// vectorized.
2731 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2732 Value *IdxLaneV = getData(Idx, Lane).V;
2733 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2734 // Do not care about number of uses for vector-like instructions
2735 // (extractelement/extractvalue with constant indices), they are extracts
2736 // themselves and already externally used. Vectorization of such
2737 // instructions does not add extra extractelement instruction, just may
2738 // remove it.
2739 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2740 isVectorLikeInstWithConstOps(OpIdxLaneV))
2741 return LookAheadHeuristics::ScoreAllUserVectorized;
2742 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2743 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2744 return 0;
2745 return R.areAllUsersVectorized(IdxLaneI)
2746 ? LookAheadHeuristics::ScoreAllUserVectorized
2747 : 0;
2748 }
2749
2750 /// Score scaling factor for fully compatible instructions but with
2751 /// different number of external uses. Allows better selection of the
2752 /// instructions with less external uses.
2753 static const int ScoreScaleFactor = 10;
2754
2755 /// \Returns the look-ahead score, which tells us how much the sub-trees
2756 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2757 /// score. This helps break ties in an informed way when we cannot decide on
2758 /// the order of the operands by just considering the immediate
2759 /// predecessors.
2760 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2761 int Lane, unsigned OpIdx, unsigned Idx,
2762 bool &IsUsed, const SmallBitVector &UsedLanes) {
2763 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2764 LookAheadMaxDepth);
2765 // Keep track of the instruction stack as we recurse into the operands
2766 // during the look-ahead score exploration.
2767 int Score =
2768 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2769 /*CurrLevel=*/1, MainAltOps);
2770 if (Score) {
2771 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2772 if (Score <= -SplatScore) {
2773 // Failed score.
2774 Score = 0;
2775 } else {
2776 Score += SplatScore;
2777 // Scale score to see the difference between different operands
2778 // and similar operands but all vectorized/not all vectorized
2779 // uses. It does not affect actual selection of the best
2780 // compatible operand in general, just allows to select the
2781 // operand with all vectorized uses.
2782 Score *= ScoreScaleFactor;
2783 Score += getExternalUseScore(Lane, OpIdx, Idx);
2784 IsUsed = true;
2785 }
2786 }
2787 return Score;
2788 }
2789
2790 /// Best defined scores per lanes between the passes. Used to choose the
2791 /// best operand (with the highest score) between the passes.
2792 /// The key - {Operand Index, Lane}.
2793 /// The value - the best score between the passes for the lane and the
2794 /// operand.
2795 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2796 BestScoresPerLanes;
2797
2798 // Search all operands in Ops[*][Lane] for the one that matches best
2799 // Ops[OpIdx][LastLane] and return its operand index.
2800 // If no good match can be found, return std::nullopt.
2801 std::optional<unsigned>
2802 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2803 ArrayRef<ReorderingMode> ReorderingModes,
2804 ArrayRef<Value *> MainAltOps,
2805 const SmallBitVector &UsedLanes) {
2806 unsigned NumOperands = getNumOperands();
2807
2808 // The operand of the previous lane at OpIdx.
2809 Value *OpLastLane = getData(OpIdx, LastLane).V;
2810
2811 // Our strategy mode for OpIdx.
2812 ReorderingMode RMode = ReorderingModes[OpIdx];
2813 if (RMode == ReorderingMode::Failed)
2814 return std::nullopt;
2815
2816 // The linearized opcode of the operand at OpIdx, Lane.
2817 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2818
2819 // The best operand index and its score.
2820 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2821 // are using the score to differentiate between the two.
2822 struct BestOpData {
2823 std::optional<unsigned> Idx;
2824 unsigned Score = 0;
2825 } BestOp;
2826 BestOp.Score =
2827 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2828 .first->second;
2829
2830 // Track if the operand must be marked as used. If the operand is set to
2831 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2832 // want to reestimate the operands again on the following iterations).
2833 bool IsUsed = RMode == ReorderingMode::Splat ||
2834 RMode == ReorderingMode::Constant ||
2835 RMode == ReorderingMode::Load;
2836 // Iterate through all unused operands and look for the best.
2837 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2838 // Get the operand at Idx and Lane.
2839 OperandData &OpData = getData(Idx, Lane);
2840 Value *Op = OpData.V;
2841 bool OpAPO = OpData.APO;
2842
2843 // Skip already selected operands.
2844 if (OpData.IsUsed)
2845 continue;
2846
2847 // Skip if we are trying to move the operand to a position with a
2848 // different opcode in the linearized tree form. This would break the
2849 // semantics.
2850 if (OpAPO != OpIdxAPO)
2851 continue;
2852
2853 // Look for an operand that matches the current mode.
2854 switch (RMode) {
2855 case ReorderingMode::Load:
2856 case ReorderingMode::Opcode: {
2857 bool LeftToRight = Lane > LastLane;
2858 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2859 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2860 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2861 OpIdx, Idx, IsUsed, UsedLanes);
2862 if (Score > static_cast<int>(BestOp.Score) ||
2863 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2864 Idx == OpIdx)) {
2865 BestOp.Idx = Idx;
2866 BestOp.Score = Score;
2867 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2868 }
2869 break;
2870 }
2871 case ReorderingMode::Constant:
2872 if (isa<Constant>(Op) ||
2873 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2874 BestOp.Idx = Idx;
2875 if (isa<Constant>(Op)) {
2876 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2877 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2878 LookAheadHeuristics::ScoreConstants;
2879 }
2880 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2881 IsUsed = false;
2882 }
2883 break;
2884 case ReorderingMode::Splat:
2885 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2886 IsUsed = Op == OpLastLane;
2887 if (Op == OpLastLane) {
2888 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2889 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2890 LookAheadHeuristics::ScoreSplat;
2891 }
2892 BestOp.Idx = Idx;
2893 }
2894 break;
2895 case ReorderingMode::Failed:
2896 llvm_unreachable("Not expected Failed reordering mode.");
2897 }
2898 }
2899
2900 if (BestOp.Idx) {
2901 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2902 return BestOp.Idx;
2903 }
2904 // If we could not find a good match return std::nullopt.
2905 return std::nullopt;
2906 }
2907
2908 /// Helper for reorderOperandVecs.
2909 /// \returns the lane that we should start reordering from. This is the one
2910 /// which has the least number of operands that can freely move about, or is
2911 /// the least profitable because it already has the most optimal set of operands.
2912 unsigned getBestLaneToStartReordering() const {
2913 unsigned Min = UINT_MAX;
2914 unsigned SameOpNumber = 0;
2915 // std::pair<unsigned, unsigned> is used to implement a simple voting
2916 // algorithm and choose the lane with the least number of operands that
2917 // can freely move about or less profitable because it already has the
2918 // most optimal set of operands. The first unsigned is a counter for
2919 // voting, the second unsigned is the counter of lanes with instructions
2920 // with same/alternate opcodes and same parent basic block.
2921 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2922 // Try to be closer to the original results, if we have multiple lanes
2923 // with same cost. If 2 lanes have the same cost, use the one with the
2924 // highest index.
2925 for (int I = getNumLanes(); I > 0; --I) {
2926 unsigned Lane = I - 1;
2927 OperandsOrderData NumFreeOpsHash =
2928 getMaxNumOperandsThatCanBeReordered(Lane);
2929 // Compare the number of operands that can move and choose the one with
2930 // the least number.
2931 if (NumFreeOpsHash.NumOfAPOs < Min) {
2932 Min = NumFreeOpsHash.NumOfAPOs;
2933 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2934 HashMap.clear();
2935 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2936 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2937 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2938 // Select the most optimal lane in terms of number of operands that
2939 // should be moved around.
2940 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2941 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2942 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2943 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2944 auto [It, Inserted] =
2945 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2946 if (!Inserted)
2947 ++It->second.first;
2948 }
2949 }
2950 // Select the lane with the minimum counter.
2951 unsigned BestLane = 0;
2952 unsigned CntMin = UINT_MAX;
2953 for (const auto &Data : reverse(HashMap)) {
2954 if (Data.second.first < CntMin) {
2955 CntMin = Data.second.first;
2956 BestLane = Data.second.second;
2957 }
2958 }
2959 return BestLane;
2960 }
2961
2962 /// Data structure that helps to reorder operands.
2963 struct OperandsOrderData {
2964 /// The best number of operands with the same APOs, which can be
2965 /// reordered.
2966 unsigned NumOfAPOs = UINT_MAX;
2967 /// Number of operands with the same/alternate instruction opcode and
2968 /// parent.
2969 unsigned NumOpsWithSameOpcodeParent = 0;
2970 /// Hash for the actual operands ordering.
2971 /// Used to count operands, actually their position id and opcode
2972 /// value. It is used in the voting mechanism to find the lane with the
2973 /// least number of operands that can freely move about or less profitable
2974 /// because it already has the most optimal set of operands. Can be
2975 /// replaced with SmallVector<unsigned> instead but hash code is faster
2976 /// and requires less memory.
2977 unsigned Hash = 0;
2978 };
2979 /// \returns the maximum number of operands that are allowed to be reordered
2980 /// for \p Lane and the number of compatible instructions(with the same
2981 /// parent/opcode). This is used as a heuristic for selecting the first lane
2982 /// to start operand reordering.
2983 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2984 unsigned CntTrue = 0;
2985 unsigned NumOperands = getNumOperands();
2986 // Operands with the same APO can be reordered. We therefore need to count
2987 // how many of them we have for each APO, like this: Cnt[APO] = x.
2988 // Since we only have two APOs, namely true and false, we can avoid using
2989 // a map. Instead we can simply count the number of operands that
2990 // correspond to one of them (in this case the 'true' APO), and calculate
2991 // the other by subtracting it from the total number of operands.
2992 // Operands with the same instruction opcode and parent are more
2993 // profitable since we don't need to move them in many cases, with a high
2994 // probability such lane already can be vectorized effectively.
2995 bool AllUndefs = true;
2996 unsigned NumOpsWithSameOpcodeParent = 0;
2997 Instruction *OpcodeI = nullptr;
2998 BasicBlock *Parent = nullptr;
2999 unsigned Hash = 0;
3000 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3001 const OperandData &OpData = getData(OpIdx, Lane);
3002 if (OpData.APO)
3003 ++CntTrue;
3004 // Use Boyer-Moore majority voting for finding the majority opcode and
3005 // the number of times it occurs.
3006 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3007 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3008 I->getParent() != Parent) {
3009 if (NumOpsWithSameOpcodeParent == 0) {
3010 NumOpsWithSameOpcodeParent = 1;
3011 OpcodeI = I;
3012 Parent = I->getParent();
3013 } else {
3014 --NumOpsWithSameOpcodeParent;
3015 }
3016 } else {
3017 ++NumOpsWithSameOpcodeParent;
3018 }
3019 }
3020 Hash = hash_combine(
3021 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3022 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3023 }
3024 if (AllUndefs)
3025 return {};
3026 OperandsOrderData Data;
3027 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3028 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3029 Data.Hash = Hash;
3030 return Data;
3031 }
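// Worked example (hypothetical lane, mirroring the reorder() comment below):
// a lane holding "A[1] = C[1] - B[1]" has operands C[1] (APO == false) and
// B[1] (APO == true), so CntTrue == 1 and NumOfAPOs == max(1, 2 - 1) == 1,
// i.e. only one operand may move freely. A lane holding an add would report
// NumOfAPOs == 2, so the subtraction lane wins the "least movable" vote.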
3032
3033 /// Go through the instructions in VL and append their operands.
3034 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3035 const InstructionsState &S) {
3036 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3037 assert((empty() || all_of(Operands,
3038 [this](const ValueList &VL) {
3039 return VL.size() == getNumLanes();
3040 })) &&
3041 "Expected same number of lanes");
3042 assert(S.valid() && "InstructionsState is invalid.");
3043 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3044 // arguments to the intrinsic produces the same result.
3045 Instruction *MainOp = S.getMainOp();
3046 unsigned NumOperands = MainOp->getNumOperands();
3047 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3048 OpsVec.resize(ArgSize);
3049 unsigned NumLanes = VL.size();
3050 for (OperandDataVec &Ops : OpsVec)
3051 Ops.resize(NumLanes);
3052 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3053 // Our tree has just 3 nodes: the root and two operands.
3054 // It is therefore trivial to get the APO. We only need to check the
3055 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3056 // operand. The LHS operand of both add and sub is never attached to an
3057 // inverse operation in the linearized form, therefore its APO is
3058 // false. The RHS is true only if V is an inverse operation.
3059
3060 // Since operand reordering is performed on groups of commutative
3061 // operations or alternating sequences (e.g., +, -), we can safely tell
3062 // the inverse operations by checking commutativity.
3063 auto *I = dyn_cast<Instruction>(VL[Lane]);
3064 if (!I && isa<PoisonValue>(VL[Lane])) {
3065 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3066 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3067 continue;
3068 }
3069 bool IsInverseOperation = false;
3070 if (S.isCopyableElement(VL[Lane])) {
3071 // The value is a copyable element.
3072 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3073 } else {
3074 assert(I && "Expected instruction");
3075 auto [SelectedOp, Ops] = convertTo(I, S);
3076 // We cannot check commutativity by the converted instruction
3077 // (SelectedOp) because isCommutative also examines def-use
3078 // relationships.
3079 IsInverseOperation = !isCommutative(SelectedOp, I);
3080 }
3081 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3082 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3083 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3084 }
3085 }
3086 }
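// APO sketch for a hypothetical bundle {a0 + b0, a1 - b1}: in linearized form
// the lanes read a0 + b0 and a1 + (-b1), so operand 0 gets APO == false in
// both lanes, while operand 1 gets APO == false in the add lane and APO ==
// true in the sub lane, the only position attached to an inverse operation.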
3087
3088 /// \returns the number of operands.
3089 unsigned getNumOperands() const { return ArgSize; }
3090
3091 /// \returns the number of lanes.
3092 unsigned getNumLanes() const { return OpsVec[0].size(); }
3093
3094 /// \returns the operand value at \p OpIdx and \p Lane.
3095 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3096 return getData(OpIdx, Lane).V;
3097 }
3098
3099 /// \returns true if the data structure is empty.
3100 bool empty() const { return OpsVec.empty(); }
3101
3102 /// Clears the data.
3103 void clear() { OpsVec.clear(); }
3104
3105 /// \returns true if there are enough operands identical to \p Op to fill
3106 /// the whole vector (possibly mixed with constants or loop-invariant values).
3107 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3108 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3109 assert(Op == getValue(OpIdx, Lane) &&
3110 "Op is expected to be getValue(OpIdx, Lane).");
3111 // Small number of loads - try load matching.
3112 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3113 return false;
3114 bool OpAPO = getData(OpIdx, Lane).APO;
3115 bool IsInvariant = L && L->isLoopInvariant(Op);
3116 unsigned Cnt = 0;
3117 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3118 if (Ln == Lane)
3119 continue;
3120 // This is set to true if we found a candidate for broadcast at Lane.
3121 bool FoundCandidate = false;
3122 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3123 OperandData &Data = getData(OpI, Ln);
3124 if (Data.APO != OpAPO || Data.IsUsed)
3125 continue;
3126 Value *OpILane = getValue(OpI, Lane);
3127 bool IsConstantOp = isa<Constant>(OpILane);
3128 // Consider the broadcast candidate if:
3129 // 1. Same value is found in one of the operands.
3130 if (Data.V == Op ||
3131 // 2. The operand in the given lane is not constant but there is a
3132 // constant operand in another lane (which can be moved to the
3133 // given lane). In this case we can represent it as a simple
3134 // permutation of constant and broadcast.
3135 (!IsConstantOp &&
3136 ((Lns > 2 && isa<Constant>(Data.V)) ||
3137 // 2.1. If we have only 2 lanes, we need to check that the value in the
3138 // next lane does not build the same opcode sequence.
3139 (Lns == 2 &&
3140 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3141 isa<Constant>(Data.V)))) ||
3142 // 3. The operand in the current lane is loop invariant (can be
3143 // hoisted out) and another operand is also a loop invariant
3144 // (though not a constant). In this case the whole vector can be
3145 // hoisted out.
3146 // FIXME: need to teach the cost model about this case for better
3147 // estimation.
3148 (IsInvariant && !isa<Constant>(Data.V) &&
3149 !getSameOpcode({Op, Data.V}, TLI) &&
3150 L->isLoopInvariant(Data.V))) {
3151 FoundCandidate = true;
3152 Data.IsUsed = Data.V == Op;
3153 if (Data.V == Op)
3154 ++Cnt;
3155 break;
3156 }
3157 }
3158 if (!FoundCandidate)
3159 return false;
3160 }
3161 return getNumLanes() == 2 || Cnt > 1;
3162 }
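// Broadcast sketch (hypothetical 4-lane bundle): if %x occupies this operand
// position in most lanes and the remaining lanes hold constants that can be
// permuted into it, shouldBroadcast() returns true and the operand is treated
// as a splat of %x mixed with constants rather than a gather of unrelated
// scalars.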
3163
3164 /// Checks if there is at least one operand in a lane other than \p Lane
3165 /// that is compatible with the operand \p Op.
3166 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3167 assert(Op == getValue(OpIdx, Lane) &&
3168 "Op is expected to be getValue(OpIdx, Lane).");
3169 bool OpAPO = getData(OpIdx, Lane).APO;
3170 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3171 if (Ln == Lane)
3172 continue;
3173 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3174 const OperandData &Data = getData(OpI, Ln);
3175 if (Data.APO != OpAPO || Data.IsUsed)
3176 return true;
3177 Value *OpILn = getValue(OpI, Ln);
3178 return (L && L->isLoopInvariant(OpILn)) ||
3179 (getSameOpcode({Op, OpILn}, TLI) &&
3180 allSameBlock({Op, OpILn}));
3181 }))
3182 return true;
3183 }
3184 return false;
3185 }
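// E.g. (sketch): for Op == "%a = load i32, ptr %p" in lane 0, finding a load
// from the same basic block, or a loop-invariant value, among the operands of
// some other lane is enough for canBeVectorized() to return true; if no lane
// offers anything compatible, it returns false.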
3186
3187 public:
3188 /// Initialize with all the operands of the instruction vector \p RootVL.
3189 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3190 const InstructionsState &S, const BoUpSLP &R)
3191 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3192 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3193 // Append all the operands of RootVL.
3194 appendOperands(RootVL, Operands, S);
3195 }
3196
3197 /// \returns a value vector with the operands across all lanes for the
3198 /// operand at \p OpIdx.
3199 ValueList getVL(unsigned OpIdx) const {
3200 ValueList OpVL(OpsVec[OpIdx].size());
3201 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3202 "Expected same num of lanes across all operands");
3203 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3204 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3205 return OpVL;
3206 }
3207
3208 // Performs operand reordering for 2 or more operands.
3209 // The original operands are in OrigOps[OpIdx][Lane].
3210 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3211 void reorder() {
3212 unsigned NumOperands = getNumOperands();
3213 unsigned NumLanes = getNumLanes();
3214 // Each operand has its own mode. We are using this mode to help us select
3215 // the instructions for each lane, so that they match best with the ones
3216 // we have selected so far.
3217 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3218
3219 // This is a greedy single-pass algorithm. We are going over each lane
3220 // once and deciding on the best order right away with no back-tracking.
3221 // However, in order to increase its effectiveness, we start with the lane
3222 // that has operands that can move the least. For example, given the
3223 // following lanes:
3224 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3225 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3226 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3227 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3228 // we will start at Lane 1, since the operands of the subtraction cannot
3229 // be reordered. Then we will visit the rest of the lanes in a circular
3230 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3231
3232 // Find the first lane that we will start our search from.
3233 unsigned FirstLane = getBestLaneToStartReordering();
3234
3235 // Initialize the modes.
3236 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3237 Value *OpLane0 = getValue(OpIdx, FirstLane);
3238 // Keep track if we have instructions with all the same opcode on one
3239 // side.
3240 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3241 // Check if OpLane0 should be broadcast.
3242 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3243 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3244 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3245 else if (isa<LoadInst>(OpILane0))
3246 ReorderingModes[OpIdx] = ReorderingMode::Load;
3247 else
3248 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3249 } else if (isa<Constant>(OpLane0)) {
3250 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3251 } else if (isa<Argument>(OpLane0)) {
3252 // Our best hope is a Splat. It may save some cost in some cases.
3253 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3254 } else {
3255 llvm_unreachable("Unexpected value kind.");
3256 }
3257 }
3258
3259 // Check that we don't have the same operands. There is no need to reorder
3260 // if the operands are just a perfect diamond or shuffled diamond match.
3261 // Skip this only for possible broadcasts or a non-power-of-2 number of
3262 // scalars (just for now).
3263 auto &&SkipReordering = [this]() {
3264 SmallPtrSet<Value *, 4> UniqueValues;
3265 ArrayRef<OperandData> Op0 = OpsVec.front();
3266 for (const OperandData &Data : Op0)
3267 UniqueValues.insert(Data.V);
3268 for (ArrayRef<OperandData> Op :
3269 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3270 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3271 return !UniqueValues.contains(Data.V);
3272 }))
3273 return false;
3274 }
3275 // TODO: Check if we can remove a check for non-power-2 number of
3276 // scalars after full support of non-power-2 vectorization.
3277 return UniqueValues.size() != 2 &&
3278 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3279 UniqueValues.size());
3280 };
3281
3282 // If the initial strategy fails for any of the operand indexes, then we
3283 // perform reordering again in a second pass. This helps avoid assigning
3284 // high priority to the failed strategy, and should improve reordering for
3285 // the non-failed operand indexes.
3286 for (int Pass = 0; Pass != 2; ++Pass) {
3287 // Check if there is no need to reorder the operands since they are a
3288 // perfect or shuffled diamond match.
3289 // Need to do it to avoid extra external use cost counting for
3290 // shuffled matches, which may cause regressions.
3291 if (SkipReordering())
3292 break;
3293 // Skip the second pass if the first pass did not fail.
3294 bool StrategyFailed = false;
3295 // Mark all operand data as free to use.
3296 clearUsed();
3297 // We keep the original operand order for the FirstLane, so reorder the
3298 // rest of the lanes. We are visiting the nodes in a circular fashion,
3299 // using FirstLane as the center point and increasing the radius
3300 // distance.
3301 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3302 for (unsigned I = 0; I < NumOperands; ++I)
3303 MainAltOps[I].push_back(getData(I, FirstLane).V);
3304
3305 SmallBitVector UsedLanes(NumLanes);
3306 UsedLanes.set(FirstLane);
3307 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3308 // Visit the lane on the right and then the lane on the left.
3309 for (int Direction : {+1, -1}) {
3310 int Lane = FirstLane + Direction * Distance;
3311 if (Lane < 0 || Lane >= (int)NumLanes)
3312 continue;
3313 UsedLanes.set(Lane);
3314 int LastLane = Lane - Direction;
3315 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3316 "Out of bounds");
3317 // Look for a good match for each operand.
3318 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3319 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3320 std::optional<unsigned> BestIdx =
3321 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3322 MainAltOps[OpIdx], UsedLanes);
3323 // By not selecting a value, we allow the operands that follow to
3324 // select a better matching value. We will get a non-null value in
3325 // the next run of getBestOperand().
3326 if (BestIdx) {
3327 // Swap the current operand with the one returned by
3328 // getBestOperand().
3329 swap(OpIdx, *BestIdx, Lane);
3330 } else {
3331 // Enable the second pass.
3332 StrategyFailed = true;
3333 }
3334 // Try to get the alternate opcode and follow it during analysis.
3335 if (MainAltOps[OpIdx].size() != 2) {
3336 OperandData &AltOp = getData(OpIdx, Lane);
3337 InstructionsState OpS =
3338 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3339 if (OpS && OpS.isAltShuffle())
3340 MainAltOps[OpIdx].push_back(AltOp.V);
3341 }
3342 }
3343 }
3344 }
3345 // Skip second pass if the strategy did not fail.
3346 if (!StrategyFailed)
3347 break;
3348 }
3349 }
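// Reordering sketch (hypothetical 2-lane bundle of adds):
//   before: Lane 0 operands (a0, b0), Lane 1 operands (b1, a1)
//   after : Lane 0 operands (a0, b0), Lane 1 operands (a1, b1)
// getBestOperand() matches each operand column against the previously visited
// lane, so swapping the operands of lane 1 lines the a-chain and the b-chain
// up into two vectorizable operand vectors.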
3350
3351#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3352 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3353 switch (RMode) {
3354 case ReorderingMode::Load:
3355 return "Load";
3356 case ReorderingMode::Opcode:
3357 return "Opcode";
3358 case ReorderingMode::Constant:
3359 return "Constant";
3360 case ReorderingMode::Splat:
3361 return "Splat";
3362 case ReorderingMode::Failed:
3363 return "Failed";
3364 }
3365 llvm_unreachable("Unimplemented Reordering Type");
3366 }
3367
3368 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3369 raw_ostream &OS) {
3370 return OS << getModeStr(RMode);
3371 }
3372
3373 /// Debug print.
3374 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3375 printMode(RMode, dbgs());
3376 }
3377
3378 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3379 return printMode(RMode, OS);
3380 }
3381
3382 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3383 const unsigned Indent = 2;
3384 unsigned Cnt = 0;
3385 for (const OperandDataVec &OpDataVec : OpsVec) {
3386 OS << "Operand " << Cnt++ << "\n";
3387 for (const OperandData &OpData : OpDataVec) {
3388 OS.indent(Indent) << "{";
3389 if (Value *V = OpData.V)
3390 OS << *V;
3391 else
3392 OS << "null";
3393 OS << ", APO:" << OpData.APO << "}\n";
3394 }
3395 OS << "\n";
3396 }
3397 return OS;
3398 }
3399
3400 /// Debug print.
3401 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3402#endif
3403 };
3404
3405 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3406 /// of the pair with the highest score, deemed to have the best chance to form
3407 /// the root of a profitable tree to vectorize. Return std::nullopt if no
3408 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
3409 /// limit of the cost, considered to be a good enough score.
3410 std::optional<int>
3411 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3412 int Limit = LookAheadHeuristics::ScoreFail) const {
3413 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3414 RootLookAheadMaxDepth);
3415 int BestScore = Limit;
3416 std::optional<int> Index;
3417 for (int I : seq<int>(0, Candidates.size())) {
3418 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3419 Candidates[I].second,
3420 /*U1=*/nullptr, /*U2=*/nullptr,
3421 /*CurrLevel=*/1, {});
3422 if (Score > BestScore) {
3423 BestScore = Score;
3424 Index = I;
3425 }
3426 }
3427 return Index;
3428 }
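// E.g. (sketch, hypothetical values): for Candidates == {(%a, %b),
// (%ld0, %ld1)} where %ld0/%ld1 are consecutive loads and %a/%b are unrelated
// values, the look-ahead scoring favors the load pair, so index 1 is
// returned; if nothing scores above \p Limit, std::nullopt is returned.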
3429
3430 /// Checks if the instruction is marked for deletion.
3431 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3432
3433 /// Removes an instruction from its block and eventually deletes it.
3434 /// It's like Instruction::eraseFromParent() except that the actual deletion
3435 /// is delayed until BoUpSLP is destructed.
3436 void eraseInstruction(Instruction *I) {
3437 DeletedInstructions.insert(I);
3438 }
3439
3440 /// Remove instructions from the parent function and clear the operands of \p
3441 /// DeadVals instructions, marking for deletion trivially dead operands.
3442 template <typename T>
3443 void removeInstructionsAndOperands(
3444 ArrayRef<T *> DeadVals,
3445 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3446 SmallVector<WeakTrackingVH> DeadInsts;
3447 for (T *V : DeadVals) {
3448 auto *I = cast<Instruction>(V);
3449 DeletedInstructions.insert(I);
3450 }
3451 DenseSet<Value *> Processed;
3452 for (T *V : DeadVals) {
3453 if (!V || !Processed.insert(V).second)
3454 continue;
3455 auto *I = cast<Instruction>(V);
3456 salvageDebugInfo(*I);
3457 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3458 for (Use &U : I->operands()) {
3459 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3460 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3461 isInstructionTriviallyDead(OpI, TLI) &&
3462 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3463 return Entry->VectorizedValue == OpI;
3464 })))
3465 DeadInsts.push_back(OpI);
3466 }
3467 I->dropAllReferences();
3468 }
3469 for (T *V : DeadVals) {
3470 auto *I = cast<Instruction>(V);
3471 if (!I->getParent())
3472 continue;
3473 assert((I->use_empty() || all_of(I->uses(),
3474 [&](Use &U) {
3475 return isDeleted(
3476 cast<Instruction>(U.getUser()));
3477 })) &&
3478 "trying to erase instruction with users.");
3479 I->removeFromParent();
3480 SE->forgetValue(I);
3481 }
3482 // Process the dead instruction list until empty.
3483 while (!DeadInsts.empty()) {
3484 Value *V = DeadInsts.pop_back_val();
3485 Instruction *VI = cast_or_null<Instruction>(V);
3486 if (!VI || !VI->getParent())
3487 continue;
3489 "Live instruction found in dead worklist!");
3490 assert(VI->use_empty() && "Instructions with uses are not dead.");
3491
3492 // Don't lose the debug info while deleting the instructions.
3493 salvageDebugInfo(*VI);
3494
3495 // Null out all of the instruction's operands to see if any operand
3496 // becomes dead as we go.
3497 for (Use &OpU : VI->operands()) {
3498 Value *OpV = OpU.get();
3499 if (!OpV)
3500 continue;
3501 OpU.set(nullptr);
3502
3503 if (!OpV->use_empty())
3504 continue;
3505
3506 // If the operand is an instruction that became dead as we nulled out
3507 // the operand, and if it is 'trivially' dead, delete it in a future
3508 // loop iteration.
3509 if (auto *OpI = dyn_cast<Instruction>(OpV))
3510 if (!DeletedInstructions.contains(OpI) &&
3511 (!OpI->getType()->isVectorTy() ||
3512 none_of(VectorValuesAndScales,
3513 [&](const std::tuple<Value *, unsigned, bool> &V) {
3514 return std::get<0>(V) == OpI;
3515 })) &&
3516 isInstructionTriviallyDead(OpI, TLI))
3517 DeadInsts.push_back(OpI);
3518 }
3519
3520 VI->removeFromParent();
3521 eraseInstruction(VI);
3522 SE->forgetValue(VI);
3523 }
3524 }
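// Deletion sketch (hypothetical IR): erasing a vectorized scalar
//   %v = load i32, ptr %gep
// drops the last use of "%gep = getelementptr ...", so %gep becomes trivially
// dead, is pushed onto DeadInsts, and is erased in a later iteration of the
// worklist loop above.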
3525
3526 /// Checks if the instruction was already analyzed for being possible
3527 /// reduction root.
3528 bool isAnalyzedReductionRoot(Instruction *I) const {
3529 return AnalyzedReductionsRoots.count(I);
3530 }
3531 /// Register given instruction as already analyzed for being possible
3532 /// reduction root.
3533 void analyzedReductionRoot(Instruction *I) {
3534 AnalyzedReductionsRoots.insert(I);
3535 }
3536 /// Checks if the provided list of reduced values was checked already for
3537 /// vectorization.
3538 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3539 return AnalyzedReductionVals.contains(hash_value(VL));
3540 }
3541 /// Adds the list of reduced values to list of already checked values for the
3542 /// vectorization.
3543 void analyzedReductionVals(ArrayRef<Value *> VL) {
3544 AnalyzedReductionVals.insert(hash_value(VL));
3545 }
3546 /// Clear the list of the analyzed reduction root instructions.
3547 void clearReductionData() {
3548 AnalyzedReductionsRoots.clear();
3549 AnalyzedReductionVals.clear();
3550 AnalyzedMinBWVals.clear();
3551 }
3552 /// Checks if the given value is gathered in one of the nodes.
3553 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3554 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3555 }
3556 /// Checks if the given value is gathered in one of the nodes.
3557 bool isGathered(const Value *V) const {
3558 return MustGather.contains(V);
3559 }
3560 /// Checks if the specified value was not scheduled.
3561 bool isNotScheduled(const Value *V) const {
3562 return NonScheduledFirst.contains(V);
3563 }
3564
3565 /// Check if the value is vectorized in the tree.
3566 bool isVectorized(const Value *V) const {
3567 assert(V && "V cannot be nullptr.");
3568 return ScalarToTreeEntries.contains(V);
3569 }
3570
3571 ~BoUpSLP();
3572
3573private:
3574 /// Determine if a node \p E can be demoted to a smaller type with a
3575 /// truncation. We collect the entries that will be demoted in ToDemote.
3576 /// \param E Node for analysis
3577 /// \param ToDemote indices of the nodes to be demoted.
3578 bool collectValuesToDemote(
3579 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3580 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3581 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3582 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3583
3584 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3585 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3586 /// they have only one user and are reorderable).
3587 /// \param ReorderableGathers List of all gather nodes that require reordering
3588 /// (e.g., gather of extractelements or partially vectorizable loads).
3589 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3590 /// reordering, subset of \p NonVectorized.
3591 void buildReorderableOperands(
3592 TreeEntry *UserTE,
3593 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3594 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3595 SmallVectorImpl<TreeEntry *> &GatherOps);
3596
3597 /// Checks if the given \p TE is a gather node with clustered reused scalars
3598 /// and reorders it per given \p Mask.
3599 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3600
3601 /// Checks if all users of \p I are the part of the vectorization tree.
3602 bool areAllUsersVectorized(
3603 Instruction *I,
3604 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3605
3606 /// Return information about the vector formed for the specified index
3607 /// of a vector of (the same) instruction.
3608 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3609
3610 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3611 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3612 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3613 return const_cast<TreeEntry *>(
3614 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3615 }
3616
3617 /// Gets the root instruction for the given node. If the node is a strided
3618 /// load/store node with the reverse order, the root instruction is the last
3619 /// one.
3620 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3621
3622 /// \returns Cast context for the given graph node.
3623 TargetTransformInfo::CastContextHint
3624 getCastContextHint(const TreeEntry &TE) const;
3625
3626 /// \returns the cost of the vectorizable entry.
3627 InstructionCost getEntryCost(const TreeEntry *E,
3628 ArrayRef<Value *> VectorizedVals,
3629 SmallPtrSetImpl<Value *> &CheckedExtracts);
3630
3631 /// Checks if it is legal and profitable to build SplitVectorize node for the
3632 /// given \p VL.
3633 /// \param Op1 first homogeneous scalars.
3634 /// \param Op2 second homogeneous scalars.
3635 /// \param ReorderIndices indices to reorder the scalars.
3636 /// \returns true if the node was successfully built.
3637 bool canBuildSplitNode(ArrayRef<Value *> VL,
3638 const InstructionsState &LocalState,
3639 SmallVectorImpl<Value *> &Op1,
3640 SmallVectorImpl<Value *> &Op2,
3641 OrdersType &ReorderIndices) const;
3642
3643 /// This is the recursive part of buildTree.
3644 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3645 unsigned InterleaveFactor = 0);
3646
3647 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3648 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3649 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3650 /// returns false, setting \p CurrentOrder to either an empty vector or a
3651 /// non-identity permutation that allows reusing the extract instructions.
3652 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3653 /// extract order.
3654 bool canReuseExtract(ArrayRef<Value *> VL,
3655 SmallVectorImpl<unsigned> &CurrentOrder,
3656 bool ResizeAllowed = false) const;
3657
3658 /// Vectorize a single entry in the tree.
3659 Value *vectorizeTree(TreeEntry *E);
3660
3661 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3662 /// \p E.
3663 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3664
3665 /// Create a new vector from a list of scalar values. Produces a sequence
3666 /// which exploits values reused across lanes, and arranges the inserts
3667 /// for ease of later optimization.
3668 template <typename BVTy, typename ResTy, typename... Args>
3669 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3670
3671 /// Create a new vector from a list of scalar values. Produces a sequence
3672 /// which exploits values reused across lanes, and arranges the inserts
3673 /// for ease of later optimization.
3674 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3675
3676 /// Returns the instruction in the bundle, which can be used as a base point
3677 /// for scheduling. Usually it is the last instruction in the bundle, except
3678 /// for the case when all operands are external (in this case, it is the first
3679 /// instruction in the list).
3680 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3681
3682 /// Tries to find extractelement instructions with constant indices from fixed
3683 /// vector type and gather such instructions into a bunch, which is highly
3684 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3685 /// was successful, the matched scalars are replaced by poison values in \p VL
3686 /// for future analysis.
3687 std::optional<TargetTransformInfo::ShuffleKind>
3688 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3689 SmallVectorImpl<int> &Mask) const;
3690
3691 /// Tries to find extractelement instructions with constant indices from fixed
3692 /// vector type and gather such instructions into a bunch, which is highly
3693 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3694 /// was successful, the matched scalars are replaced by poison values in \p VL
3695 /// for future analysis.
3696 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3697 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3698 SmallVectorImpl<int> &Mask,
3699 unsigned NumParts) const;
3700
3701 /// Checks if the gathered \p VL can be represented as a single register
3702 /// shuffle(s) of previous tree entries.
3703 /// \param TE Tree entry checked for permutation.
3704 /// \param VL List of scalars (a subset of the TE scalars), checked for
3705 /// permutations. Must form a single-register vector.
3706 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3707 /// commands to build the mask using the original vector value, without
3708 /// relying on the potential reordering.
3709 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3710 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3711 std::optional<TargetTransformInfo::ShuffleKind>
3712 isGatherShuffledSingleRegisterEntry(
3713 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3714 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3715 bool ForOrder);
3716
3717 /// Checks if the gathered \p VL can be represented as multi-register
3718 /// shuffle(s) of previous tree entries.
3719 /// \param TE Tree entry checked for permutation.
3720 /// \param VL List of scalars (a subset of the TE scalars), checked for
3721 /// permutations.
3722 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3723 /// commands to build the mask using the original vector value, without
3724 /// relying on the potential reordering.
3725 /// \returns per-register series of ShuffleKind, if gathered values can be
3726 /// represented as shuffles of previous tree entries. \p Mask is filled with
3727 /// the shuffle mask (also on per-register base).
3728 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3729 isGatherShuffledEntry(
3730 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3731 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3732 unsigned NumParts, bool ForOrder = false);
3733
3734 /// \returns the cost of gathering (inserting) the values in \p VL into a
3735 /// vector.
3736 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3737 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3738 Type *ScalarTy) const;
3739
3740 /// Set the Builder insert point to one after the last instruction in
3741 /// the bundle
3742 void setInsertPointAfterBundle(const TreeEntry *E);
3743
3744 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3745 /// specified, the starting vector value is poison.
3746 Value *
3747 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3748 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3749
3750 /// \returns whether the VectorizableTree is fully vectorizable and will
3751 /// be beneficial even if the tree height is tiny.
3752 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3753
3754 /// Run through the list of all gathered loads in the graph and try to find
3755 /// vector loads/masked gathers instead of regular gathers. Later these loads
3756 /// are reshuffled to build the final gathered nodes.
3757 void tryToVectorizeGatheredLoads(
3758 const SmallMapVector<
3759 std::tuple<BasicBlock *, Value *, Type *>,
3760 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3761 &GatheredLoads);
3762
3763 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3764 /// users of \p TE and collects the stores. It returns the map from the store
3765 /// pointers to the collected stores.
3766 DenseMap<Value *, SmallVector<StoreInst *>>
3767 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3768
3769 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3770 /// stores in \p StoresVec can form a vector instruction. If so it returns
3771 /// true and populates \p ReorderIndices with the shuffle indices of the
3772 /// stores when compared to the sorted vector.
3773 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3774 OrdersType &ReorderIndices) const;
3775
3776 /// Iterates through the users of \p TE, looking for scalar stores that can be
3777 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3778 /// their order and builds an order index vector for each store bundle. It
3779 /// returns all these order vectors found.
3780 /// We run this after the tree has formed, otherwise we may come across user
3781 /// instructions that are not yet in the tree.
3782 SmallVector<OrdersType, 1>
3783 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3784
3785 /// Tries to reorder the gathering node for better vectorization
3786 /// opportunities.
3787 void reorderGatherNode(TreeEntry &TE);
3788
3789 class TreeEntry {
3790 public:
3791 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3792 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3793
3794 /// \returns Common mask for reorder indices and reused scalars.
3795 SmallVector<int> getCommonMask() const {
3796 if (State == TreeEntry::SplitVectorize)
3797 return {};
3798 SmallVector<int> Mask;
3799 inversePermutation(ReorderIndices, Mask);
3800 ::addMask(Mask, ReuseShuffleIndices);
3801 return Mask;
3802 }
3803
3804 /// \returns The mask for split nodes.
3805 SmallVector<int> getSplitMask() const {
3806 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3807 "Expected only split vectorize node.");
3808 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3809 unsigned CommonVF = std::max<unsigned>(
3810 CombinedEntriesWithIndices.back().second,
3811 Scalars.size() - CombinedEntriesWithIndices.back().second);
3812 for (auto [Idx, I] : enumerate(ReorderIndices))
3813 Mask[I] =
3814 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3815 ? CommonVF - CombinedEntriesWithIndices.back().second
3816 : 0);
3817 return Mask;
3818 }
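// Worked example (sketch): with 6 scalars split at offset 2 (the first
// subnode covers 2 scalars, the second one 4) and identity ReorderIndices,
// CommonVF == max(2, 4) == 4 and the returned mask is
//   {0, 1, 4, 5, 6, 7}
// i.e. the second subnode is addressed starting at CommonVF, as in a
// two-source shufflevector.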
3819
3820 /// Updates (reorders) SplitVectorize node according to the given mask \p
3821 /// Mask and order \p MaskOrder.
3822 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3823 ArrayRef<int> MaskOrder);
3824
3825 /// \returns true if the scalars in VL are equal to this entry.
3826 bool isSame(ArrayRef<Value *> VL) const {
3827 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3828 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3829 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3830 return VL.size() == Mask.size() &&
3831 std::equal(VL.begin(), VL.end(), Mask.begin(),
3832 [Scalars](Value *V, int Idx) {
3833 return (isa<UndefValue>(V) &&
3834 Idx == PoisonMaskElem) ||
3835 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3836 });
3837 };
3838 if (!ReorderIndices.empty()) {
3839 // TODO: implement matching if the nodes are just reordered, still can
3840 // treat the vector as the same if the list of scalars matches VL
3841 // directly, without reordering.
3842 SmallVector<int> Mask;
3843 inversePermutation(ReorderIndices, Mask);
3844 if (VL.size() == Scalars.size())
3845 return IsSame(Scalars, Mask);
3846 if (VL.size() == ReuseShuffleIndices.size()) {
3847 ::addMask(Mask, ReuseShuffleIndices);
3848 return IsSame(Scalars, Mask);
3849 }
3850 return false;
3851 }
3852 return IsSame(Scalars, ReuseShuffleIndices);
3853 }
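// E.g. (sketch): with Scalars == {%a, %b}, empty ReorderIndices and
// ReuseShuffleIndices == {0, 1, 0, 1}, isSame({%a, %b, %a, %b}) returns true,
// since every element of VL matches Scalars at the index given by the reuse
// mask.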
3854
3855 /// \returns true if current entry has same operands as \p TE.
3856 bool hasEqualOperands(const TreeEntry &TE) const {
3857 if (TE.getNumOperands() != getNumOperands())
3858 return false;
3859 SmallBitVector Used(getNumOperands());
3860 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3861 unsigned PrevCount = Used.count();
3862 for (unsigned K = 0; K < E; ++K) {
3863 if (Used.test(K))
3864 continue;
3865 if (getOperand(K) == TE.getOperand(I)) {
3866 Used.set(K);
3867 break;
3868 }
3869 }
3870 // Check if we actually found the matching operand.
3871 if (PrevCount == Used.count())
3872 return false;
3873 }
3874 return true;
3875 }
3876
3877 /// \return Final vectorization factor for the node. Defined by the total
3878 /// number of vectorized scalars, including those used several times in the
3879 /// entry and counted in the \a ReuseShuffleIndices, if any.
3880 unsigned getVectorFactor() const {
3881 if (!ReuseShuffleIndices.empty())
3882 return ReuseShuffleIndices.size();
3883 return Scalars.size();
3884 };
3885
3886 /// Checks if the current node is a gather node.
3887 bool isGather() const { return State == NeedToGather; }
3888
3889 /// A vector of scalars.
3890 ValueList Scalars;
3891
3892 /// The Scalars are vectorized into this value. It is initialized to Null.
3893 WeakTrackingVH VectorizedValue = nullptr;
3894
3895 /// Do we need to gather this sequence or vectorize it
3896 /// (either with vector instruction or with scatter/gather
3897 /// intrinsics for store/load)?
3898 enum EntryState {
3899 Vectorize, ///< The node is regularly vectorized.
3900 ScatterVectorize, ///< Masked scatter/gather node.
3901 StridedVectorize, ///< Strided loads (and stores)
3902 CompressVectorize, ///< (Masked) load with compress.
3903 NeedToGather, ///< Gather/buildvector node.
3904 CombinedVectorize, ///< Vectorized node, combined with its user into more
3905 ///< complex node like select/cmp to minmax, mul/add to
3906 ///< fma, etc. Must be used for the following nodes in
3907 ///< the pattern, not the very first one.
3908 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3909 ///< independently and then combines back.
3910 };
3911 EntryState State;
3912
3913 /// List of combined opcodes supported by the vectorizer.
3914 enum CombinedOpcode {
3915 NotCombinedOp = -1,
3916 MinMax = Instruction::OtherOpsEnd + 1,
3917 FMulAdd,
3918 };
3919 CombinedOpcode CombinedOp = NotCombinedOp;
3920
3921 /// Does this sequence require some shuffling?
3922 SmallVector<int, 4> ReuseShuffleIndices;
3923
3924 /// Does this entry require reordering?
3925 SmallVector<unsigned, 4> ReorderIndices;
3926
3927 /// Points back to the VectorizableTree.
3928 ///
3929 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3930 /// to be a pointer and needs to be able to initialize the child iterator.
3931 /// Thus we need a reference back to the container to translate the indices
3932 /// to entries.
3933 VecTreeTy &Container;
3934
3935 /// The TreeEntry index containing the user of this entry.
3936 EdgeInfo UserTreeIndex;
3937
3938 /// The index of this treeEntry in VectorizableTree.
3939 unsigned Idx = 0;
3940
3941 /// For gather/buildvector/alt opcode nodes, which are combined from
3942 /// other nodes as a series of insertvector instructions.
3943 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3944
3945 private:
3946 /// The operands of each instruction in each lane Operands[op_index][lane].
3947 /// Note: This helps avoid the replication of the code that performs the
3948 /// reordering of operands during buildTreeRec() and vectorizeTree().
3949 SmallVector<ValueList, 2> Operands;
3950
3951 /// Copyable elements of the entry node.
3952 SmallPtrSet<const Value *, 4> CopyableElements;
3953
3954 /// MainOp and AltOp are recorded inside. S should be obtained from
3955 /// newTreeEntry.
3956 InstructionsState S = InstructionsState::invalid();
3957
3958 /// Interleaving factor for interleaved loads Vectorize nodes.
3959 unsigned InterleaveFactor = 0;
3960
3961 /// True if the node does not require scheduling.
3962 bool DoesNotNeedToSchedule = false;
3963
3964 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3965 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3966 if (Operands.size() < OpIdx + 1)
3967 Operands.resize(OpIdx + 1);
3968 assert(Operands[OpIdx].empty() && "Already resized?");
3969 assert(OpVL.size() <= Scalars.size() &&
3970 "Number of operands is greater than the number of scalars.");
3971 Operands[OpIdx].resize(OpVL.size());
3972 copy(OpVL, Operands[OpIdx].begin());
3973 }
3974
3975 public:
3976 /// Returns interleave factor for interleave nodes.
3977 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3978 /// Sets interleaving factor for the interleaving nodes.
3979 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3980
3981 /// Marks the node as one that does not require scheduling.
3982 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
3983 /// Returns true if the node is marked as one that does not require
3984 /// scheduling.
3985 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
3986
3987 /// Set this bundle's operands from \p Operands.
3988 void setOperands(ArrayRef<ValueList> Operands) {
3989 for (unsigned I : seq<unsigned>(Operands.size()))
3990 setOperand(I, Operands[I]);
3991 }
3992
3993 /// Reorders operands of the node to the given mask \p Mask.
3994 void reorderOperands(ArrayRef<int> Mask) {
3995 for (ValueList &Operand : Operands)
3996 reorderScalars(Operand, Mask);
3997 }
3998
3999 /// \returns the \p OpIdx operand of this TreeEntry.
4000 ValueList &getOperand(unsigned OpIdx) {
4001 assert(OpIdx < Operands.size() && "Off bounds");
4002 return Operands[OpIdx];
4003 }
4004
4005 /// \returns the \p OpIdx operand of this TreeEntry.
4006 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4007 assert(OpIdx < Operands.size() && "Off bounds");
4008 return Operands[OpIdx];
4009 }
4010
4011 /// \returns the number of operands.
4012 unsigned getNumOperands() const { return Operands.size(); }
4013
4014 /// \return the single \p OpIdx operand.
4015 Value *getSingleOperand(unsigned OpIdx) const {
4016 assert(OpIdx < Operands.size() && "Off bounds");
4017 assert(!Operands[OpIdx].empty() && "No operand available");
4018 return Operands[OpIdx][0];
4019 }
4020
4021 /// Some of the instructions in the list have alternate opcodes.
4022 bool isAltShuffle() const { return S.isAltShuffle(); }
4023
4024 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4025 return S.getMatchingMainOpOrAltOp(I);
4026 }
4027
4028 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4029 /// alternate) opcode as the main operation of the bundle, the key is \p Op.
4030 /// Otherwise the key is the main operation.
4031 Value *isOneOf(Value *Op) const {
4032 auto *I = dyn_cast<Instruction>(Op);
4033 if (I && getMatchingMainOpOrAltOp(I))
4034 return Op;
4035 return S.getMainOp();
4036 }
4037
4038 void setOperations(const InstructionsState &S) {
4039 assert(S && "InstructionsState is invalid.");
4040 this->S = S;
4041 }
4042
4043 Instruction *getMainOp() const { return S.getMainOp(); }
4044
4045 Instruction *getAltOp() const { return S.getAltOp(); }
4046
4047 /// The main/alternate opcodes for the list of instructions.
4048 unsigned getOpcode() const { return S.getOpcode(); }
4049
4050 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4051
4052 bool hasState() const { return S.valid(); }
4053
4054 /// Add \p V to the list of copyable elements.
4055 void addCopyableElement(Value *V) {
4056 assert(S.isCopyableElement(V) && "Not a copyable element.");
4057 CopyableElements.insert(V);
4058 }
4059
4060 /// Returns true if \p V is a copyable element.
4061 bool isCopyableElement(Value *V) const {
4062 return CopyableElements.contains(V);
4063 }
4064
4065 /// Returns true if any scalar in the list is a copyable element.
4066 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4067
4068 /// Returns the state of the operations.
4069 const InstructionsState &getOperations() const { return S; }
4070
4071 /// Returns the lane of \p V within the vector of Scalars, remapped through
4072 /// ReorderIndices and ReuseShuffleIndices when those are not empty.
4073 unsigned findLaneForValue(Value *V) const {
4074 unsigned FoundLane = getVectorFactor();
4075 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4076 std::advance(It, 1)) {
4077 if (*It != V)
4078 continue;
4079 FoundLane = std::distance(Scalars.begin(), It);
4080 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4081 if (!ReorderIndices.empty())
4082 FoundLane = ReorderIndices[FoundLane];
4083 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4084 if (ReuseShuffleIndices.empty())
4085 break;
4086 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4087 RIt != ReuseShuffleIndices.end()) {
4088 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4089 break;
4090 }
4091 }
4092 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4093 return FoundLane;
4094 }
4095
4096 /// Build a shuffle mask for graph entry which represents a merge of main
4097 /// and alternate operations.
4098 void
4099 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4100 SmallVectorImpl<int> &Mask,
4101 SmallVectorImpl<Value *> *OpScalars = nullptr,
4102 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4103
4104 /// Return true if this is a non-power-of-2 node.
4105 bool isNonPowOf2Vec() const {
4106 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4107 return IsNonPowerOf2;
4108 }
4109
4110 /// Return true if this node vectorizes a number of elements which neither
4111 /// fills whole vector registers nor is a power of 2.
4112 bool
4113 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4114 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4115 TTI, getValueType(Scalars.front()), Scalars.size());
4116 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4117 "Reshuffling not supported with non-power-of-2 vectors yet.");
4118 return IsNonPowerOf2;
4119 }
4120
4121 Value *getOrdered(unsigned Idx) const {
4122 assert(isGather() && "Must be used only for buildvectors/gathers.");
4123 if (ReorderIndices.empty())
4124 return Scalars[Idx];
4125 SmallVector<int> Mask;
4126 inversePermutation(ReorderIndices, Mask);
4127 return Scalars[Mask[Idx]];
4128 }
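// E.g. (sketch): with Scalars == {%a, %b, %c} and ReorderIndices == {2, 0, 1},
// inversePermutation() produces Mask == {1, 2, 0}, so getOrdered(0) == %b,
// getOrdered(1) == %c and getOrdered(2) == %a.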
4129
4130#ifndef NDEBUG
4131 /// Debug printer.
4132 LLVM_DUMP_METHOD void dump() const {
4133 dbgs() << Idx << ".\n";
4134 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4135 dbgs() << "Operand " << OpI << ":\n";
4136 for (const Value *V : Operands[OpI])
4137 dbgs().indent(2) << *V << "\n";
4138 }
4139 dbgs() << "Scalars: \n";
4140 for (Value *V : Scalars)
4141 dbgs().indent(2) << *V << "\n";
4142 dbgs() << "State: ";
4143 if (S && hasCopyableElements())
4144 dbgs() << "[[Copyable]] ";
4145 switch (State) {
4146 case Vectorize:
4147 if (InterleaveFactor > 0) {
4148 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4149 << "\n";
4150 } else {
4151 dbgs() << "Vectorize\n";
4152 }
4153 break;
4154 case ScatterVectorize:
4155 dbgs() << "ScatterVectorize\n";
4156 break;
4157 case StridedVectorize:
4158 dbgs() << "StridedVectorize\n";
4159 break;
4160 case CompressVectorize:
4161 dbgs() << "CompressVectorize\n";
4162 break;
4163 case NeedToGather:
4164 dbgs() << "NeedToGather\n";
4165 break;
4166 case CombinedVectorize:
4167 dbgs() << "CombinedVectorize\n";
4168 break;
4169 case SplitVectorize:
4170 dbgs() << "SplitVectorize\n";
4171 break;
4172 }
4173 if (S) {
4174 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4175 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4176 } else {
4177 dbgs() << "MainOp: NULL\n";
4178 dbgs() << "AltOp: NULL\n";
4179 }
4180 dbgs() << "VectorizedValue: ";
4181 if (VectorizedValue)
4182 dbgs() << *VectorizedValue << "\n";
4183 else
4184 dbgs() << "NULL\n";
4185 dbgs() << "ReuseShuffleIndices: ";
4186 if (ReuseShuffleIndices.empty())
4187 dbgs() << "Empty";
4188 else
4189 for (int ReuseIdx : ReuseShuffleIndices)
4190 dbgs() << ReuseIdx << ", ";
4191 dbgs() << "\n";
4192 dbgs() << "ReorderIndices: ";
4193 for (unsigned ReorderIdx : ReorderIndices)
4194 dbgs() << ReorderIdx << ", ";
4195 dbgs() << "\n";
4196 dbgs() << "UserTreeIndex: ";
4197 if (UserTreeIndex)
4198 dbgs() << UserTreeIndex;
4199 else
4200 dbgs() << "<invalid>";
4201 dbgs() << "\n";
4202 if (!CombinedEntriesWithIndices.empty()) {
4203 dbgs() << "Combined entries: ";
4204 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4205 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4206 });
4207 dbgs() << "\n";
4208 }
4209 }
4210#endif
4211 };
4212
4213#ifndef NDEBUG
4214 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4215 InstructionCost VecCost, InstructionCost ScalarCost,
4216 StringRef Banner) const {
4217 dbgs() << "SLP: " << Banner << ":\n";
4218 E->dump();
4219 dbgs() << "SLP: Costs:\n";
4220 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4221 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4222 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4223 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4224 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4225 }
4226#endif
4227
4228 /// Create a new gather TreeEntry
4229 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4230 const InstructionsState &S,
4231 const EdgeInfo &UserTreeIdx,
4232 ArrayRef<int> ReuseShuffleIndices = {}) {
4233 auto Invalid = ScheduleBundle::invalid();
4234 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4235 }
4236
4237 /// Create a new VectorizableTree entry.
4238 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4239 const InstructionsState &S,
4240 const EdgeInfo &UserTreeIdx,
4241 ArrayRef<int> ReuseShuffleIndices = {},
4242 ArrayRef<unsigned> ReorderIndices = {},
4243 unsigned InterleaveFactor = 0) {
4244 TreeEntry::EntryState EntryState =
4245 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4246 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4247 ReuseShuffleIndices, ReorderIndices);
4248 if (E && InterleaveFactor > 0)
4249 E->setInterleave(InterleaveFactor);
4250 return E;
4251 }
4252
4253 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4254 TreeEntry::EntryState EntryState,
4255 ScheduleBundle &Bundle, const InstructionsState &S,
4256 const EdgeInfo &UserTreeIdx,
4257 ArrayRef<int> ReuseShuffleIndices = {},
4258 ArrayRef<unsigned> ReorderIndices = {}) {
4259 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4260 EntryState == TreeEntry::SplitVectorize)) ||
4261 (Bundle && EntryState != TreeEntry::NeedToGather &&
4262 EntryState != TreeEntry::SplitVectorize)) &&
4263 "Need to vectorize gather entry?");
4264 // Gathered loads still gathered? Do not create entry, use the original one.
4265 if (GatheredLoadsEntriesFirst.has_value() &&
4266 EntryState == TreeEntry::NeedToGather && S &&
4267 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4268 !UserTreeIdx.UserTE)
4269 return nullptr;
4270 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4271 TreeEntry *Last = VectorizableTree.back().get();
4272 Last->Idx = VectorizableTree.size() - 1;
4273 Last->State = EntryState;
4274 if (UserTreeIdx.UserTE)
4275 OperandsToTreeEntry.try_emplace(
4276 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4277 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4278 // for non-power-of-two vectors.
4279 assert(
4280 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4281 ReuseShuffleIndices.empty()) &&
4282 "Reshuffling scalars not yet supported for nodes with padding");
4283 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4284 ReuseShuffleIndices.end());
4285 if (ReorderIndices.empty()) {
4286 Last->Scalars.assign(VL.begin(), VL.end());
4287 if (S)
4288 Last->setOperations(S);
4289 } else {
4290 // Reorder scalars and build final mask.
4291 Last->Scalars.assign(VL.size(), nullptr);
4292 transform(ReorderIndices, Last->Scalars.begin(),
4293 [VL](unsigned Idx) -> Value * {
4294 if (Idx >= VL.size())
4295 return UndefValue::get(VL.front()->getType());
4296 return VL[Idx];
4297 });
4298 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4299 if (S)
4300 Last->setOperations(S);
4301 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4302 }
4303 if (EntryState == TreeEntry::SplitVectorize) {
4304 assert(S && "Split nodes must have operations.");
4305 Last->setOperations(S);
4306 SmallPtrSet<Value *, 4> Processed;
4307 for (Value *V : VL) {
4308 auto *I = dyn_cast<Instruction>(V);
4309 if (!I)
4310 continue;
4311 auto It = ScalarsInSplitNodes.find(V);
4312 if (It == ScalarsInSplitNodes.end()) {
4313 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4314 (void)Processed.insert(V);
4315 } else if (Processed.insert(V).second) {
4316 assert(!is_contained(It->getSecond(), Last) &&
4317 "Value already associated with the node.");
4318 It->getSecond().push_back(Last);
4319 }
4320 }
4321 } else if (!Last->isGather()) {
4322 if (isa<PHINode>(S.getMainOp()) ||
4323 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4324 (!S.areInstructionsWithCopyableElements() &&
4325 doesNotNeedToSchedule(VL)) ||
4326 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4327 Last->setDoesNotNeedToSchedule();
4328 SmallPtrSet<Value *, 4> Processed;
4329 for (Value *V : VL) {
4330 if (isa<PoisonValue>(V))
4331 continue;
4332 if (S.isCopyableElement(V)) {
4333 Last->addCopyableElement(V);
4334 continue;
4335 }
4336 auto It = ScalarToTreeEntries.find(V);
4337 if (It == ScalarToTreeEntries.end()) {
4338 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4339 (void)Processed.insert(V);
4340 } else if (Processed.insert(V).second) {
4341 assert(!is_contained(It->getSecond(), Last) &&
4342 "Value already associated with the node.");
4343 It->getSecond().push_back(Last);
4344 }
4345 }
4346 // Update the scheduler bundle to point to this TreeEntry.
4347 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4348 "Bundle and VL out of sync");
4349 if (!Bundle.getBundle().empty()) {
4350#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4351 auto *BundleMember = Bundle.getBundle().begin();
4352 SmallPtrSet<Value *, 4> Processed;
4353 for (Value *V : VL) {
4354 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4355 continue;
4356 ++BundleMember;
4357 }
4358 assert(BundleMember == Bundle.getBundle().end() &&
4359 "Bundle and VL out of sync");
4360#endif
4361 Bundle.setTreeEntry(Last);
4362 }
4363 } else {
4364 // Build a map for gathered scalars to the nodes where they are used.
4365 bool AllConstsOrCasts = true;
4366 for (Value *V : VL) {
4367 if (S && S.areInstructionsWithCopyableElements() &&
4368 S.isCopyableElement(V))
4369 Last->addCopyableElement(V);
4370 if (!isConstant(V)) {
4371 auto *I = dyn_cast<CastInst>(V);
4372 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4373 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4374 !UserTreeIdx.UserTE->isGather())
4375 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4376 }
4377 }
4378 if (AllConstsOrCasts)
4379 CastMaxMinBWSizes =
4380 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4381 MustGather.insert_range(VL);
4382 }
4383
4384 if (UserTreeIdx.UserTE)
4385 Last->UserTreeIndex = UserTreeIdx;
4386 return Last;
4387 }
4388
4389 /// -- Vectorization State --
4390 /// Holds all of the tree entries.
4391 TreeEntry::VecTreeTy VectorizableTree;
4392
4393#ifndef NDEBUG
4394 /// Debug printer.
4395 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4396 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4397 VectorizableTree[Id]->dump();
4398 dbgs() << "\n";
4399 }
4400 }
4401#endif
4402
4403 /// Get list of vector entries, associated with the value \p V.
4404 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4405 assert(V && "V cannot be nullptr.");
4406 auto It = ScalarToTreeEntries.find(V);
4407 if (It == ScalarToTreeEntries.end())
4408 return {};
4409 return It->getSecond();
4410 }
4411
4412 /// Get list of split vector entries, associated with the value \p V.
4413 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4414 assert(V && "V cannot be nullptr.");
4415 auto It = ScalarsInSplitNodes.find(V);
4416 if (It == ScalarsInSplitNodes.end())
4417 return {};
4418 return It->getSecond();
4419 }
4420
4421 /// Returns first vector node for value \p V, matching values \p VL.
4422 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4423 bool SameVF = false) const {
4424 assert(V && "V cannot be nullptr.");
4425 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4426 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4427 return TE;
4428 return nullptr;
4429 }
4430
4431 /// Check that the operand node of the alternate node does not generate a
4432 /// buildvector sequence. If it does, it is probably not worth building an
4433 /// alternate shuffle, since the number of buildvector operands plus the
4434 /// alternate instruction exceeds the number of buildvector instructions.
4435 /// \param S the instructions state of the analyzed values.
4436 /// \param VL list of the instructions with alternate opcodes.
4437 bool areAltOperandsProfitable(const InstructionsState &S,
4438 ArrayRef<Value *> VL) const;
4439
4440 /// Contains all the outputs of legality analysis for a list of values to
4441 /// vectorize.
4442 class ScalarsVectorizationLegality {
4443 InstructionsState S;
4444 bool IsLegal;
4445 bool TryToFindDuplicates;
4446 bool TrySplitVectorize;
4447
4448 public:
4449 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4450 bool TryToFindDuplicates = true,
4451 bool TrySplitVectorize = false)
4452 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4453 TrySplitVectorize(TrySplitVectorize) {
4454 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4455 "Inconsistent state");
4456 }
4457 const InstructionsState &getInstructionsState() const { return S; }
4458 bool isLegal() const { return IsLegal; }
4459 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4460 bool trySplitVectorize() const { return TrySplitVectorize; }
4461 };
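  // Illustrative reading of the flags (not from the original source): a result
  // constructed as ScalarsVectorizationLegality(S, /*IsLegal=*/false,
  // /*TryToFindDuplicates=*/true, /*TrySplitVectorize=*/true) tells the caller
  // that vectorizing VL as a single node is rejected, but deduplication and a
  // split into smaller vectorize nodes may still be attempted.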
4462
4463 /// Checks if the specified list of the instructions/values can be vectorized
4464 /// in general.
4465 ScalarsVectorizationLegality
4466 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4467 const EdgeInfo &UserTreeIdx,
4468 bool TryCopyableElementsVectorization) const;
4469
4470 /// Checks if the specified list of the instructions/values can be vectorized
4471 /// and fills required data before actual scheduling of the instructions.
4472 TreeEntry::EntryState
4473 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
4474 bool IsScatterVectorizeUserTE,
4475 OrdersType &CurrentOrder,
4476 SmallVectorImpl<Value *> &PointerOps);
4477
4478 /// Maps a specific scalar to its tree entry(ies).
4480
4481 /// Maps the operand index and entry to the corresponding tree entry.
4483 OperandsToTreeEntry;
4484
4485 /// Scalars, used in split vectorize nodes.
4487
4488 /// Maps a value to the proposed vectorizable size.
4489 SmallDenseMap<Value *, unsigned> InstrElementSize;
4490
4491 /// A list of scalars that we found that we need to keep as scalars.
4492 ValueSet MustGather;
4493
4494 /// A set of first non-schedulable values.
4495 ValueSet NonScheduledFirst;
4496
4497 /// A map between the vectorized entries and the last instructions in the
4498 /// bundles. The bundles are built in use order, not in the def order of the
4499 /// instructions. So, we cannot rely directly on the last instruction in the
4500 /// bundle being the last instruction in program order during the
4501 /// vectorization process, since the basic blocks are affected; we need to
4502 /// pre-gather these instructions beforehand.
4504
4505 /// List of gather nodes that depend on other gather/vector nodes and should
4506 /// be emitted after the vector instruction emission process to correctly
4507 /// handle the order of the vector instructions and shuffles.
4508 SetVector<const TreeEntry *> PostponedGathers;
4509
4510 using ValueToGatherNodesMap =
4512 ValueToGatherNodesMap ValueToGatherNodes;
4513
4514 /// A list of the load entries (node indices) that can be vectorized using a
4515 /// strided or masked-gather approach, but which we first attempt to
4516 /// represent as contiguous loads.
4517 SetVector<unsigned> LoadEntriesToVectorize;
4518
4519 /// True if the graph node transformation mode is on.
4520 bool IsGraphTransformMode = false;
4521
4522 /// The index of the first gathered load entry in the VectorizeTree.
4523 std::optional<unsigned> GatheredLoadsEntriesFirst;
4524
4525 /// Maps compress entries to their mask data for the final codegen.
4526 SmallDenseMap<const TreeEntry *,
4527 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4528 CompressEntryToData;
4529
4530 /// This POD struct describes one external user in the vectorized tree.
4531 struct ExternalUser {
4532 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4533 : Scalar(S), User(U), E(E), Lane(L) {}
4534
4535 /// Which scalar in our function.
4536 Value *Scalar = nullptr;
4537
4538 /// Which user that uses the scalar.
4539 llvm::User *User = nullptr;
4540
4541 /// Vector node that the value is part of.
4542 const TreeEntry &E;
4543
4544 /// Which lane does the scalar belong to.
4545 unsigned Lane;
4546 };
4547 using UserList = SmallVector<ExternalUser, 16>;
4548
4549 /// Checks if two instructions may access the same memory.
4550 ///
4551 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4552 /// is invariant in the calling loop.
4553 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4554 Instruction *Inst2) {
4555 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4556 // First check if the result is already in the cache.
4557 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4558 auto Res = AliasCache.try_emplace(Key);
4559 if (!Res.second)
4560 return Res.first->second;
4561 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4562 // Store the result in the cache.
4563 Res.first->getSecond() = Aliased;
4564 return Aliased;
4565 }
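  // Illustrative example (not from the original source): for two stores
  //   store i32 %x, ptr %p
  //   store i32 %y, ptr %q
  // the first isAliased(MemoryLocation::get(S1), S1, S2) call issues the
  // BatchAA.getModRefInfo(S2, Loc1) query and caches the result under the key
  // (S1, S2); any later query for the same instruction pair during dependency
  // calculation is answered from AliasCache without consulting AA again.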
4566
4567 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4568
4569 /// Cache for alias results.
4570 /// TODO: consider moving this to the AliasAnalysis itself.
4572
4573 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4574 // globally through SLP because we don't perform any action which
4575 // invalidates capture results.
4576 BatchAAResults BatchAA;
4577
4578 /// Temporary store for deleted instructions. Instructions will be deleted
4579 /// eventually when the BoUpSLP is destructed. The deferral is required to
4580 /// ensure that there are no incorrect collisions in the AliasCache, which
4581 /// can happen if a new instruction is allocated at the same address as a
4582 /// previously deleted instruction.
4583 DenseSet<Instruction *> DeletedInstructions;
4584
4585 /// Set of the instruction, being analyzed already for reductions.
4586 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4587
4588 /// Set of hashes for the list of reduction values already being analyzed.
4589 DenseSet<size_t> AnalyzedReductionVals;
4590
4591 /// Values that have already been analyzed for minimal bitwidth and found
4592 /// to be non-profitable.
4593 DenseSet<Value *> AnalyzedMinBWVals;
4594
4595 /// A list of values that need to be extracted out of the tree.
4596 /// This list holds pairs of (Internal Scalar : External User). External User
4597 /// can be nullptr, it means that this Internal Scalar will be used later,
4598 /// after vectorization.
4599 UserList ExternalUses;
4600
4601 /// A list of GEPs which can be replaced by scalar GEPs instead of
4602 /// extractelement instructions.
4603 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4604
4605 /// A list of scalars to be extracted without a specific user because of too
4606 /// many uses.
4607 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4608
4609 /// Values used only by @llvm.assume calls.
4611
4612 /// Holds all of the instructions that we gathered, shuffle instructions and
4613 /// extractelements.
4614 SetVector<Instruction *> GatherShuffleExtractSeq;
4615
4616 /// A list of blocks that we are going to CSE.
4617 DenseSet<BasicBlock *> CSEBlocks;
4618
4619 /// List of hashes of vectors of loads, which are known to be non-vectorizable.
4620 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4621
4622 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4623 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4624 /// single instruction, while ScheduleBundle represents a batch of
4625 /// instructions that are going to be grouped together. ScheduleCopyableData
4626 /// models an extra user for "copyable" instructions.
4627 class ScheduleEntity {
4628 friend class ScheduleBundle;
4629 friend class ScheduleData;
4630 friend class ScheduleCopyableData;
4631
4632 protected:
4633 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4634 Kind getKind() const { return K; }
4635 ScheduleEntity(Kind K) : K(K) {}
4636
4637 private:
4638 /// Used for getting a "good" final ordering of instructions.
4639 int SchedulingPriority = 0;
4640 /// True if this instruction (or bundle) is scheduled (or considered as
4641 /// scheduled in the dry-run).
4642 bool IsScheduled = false;
4643 /// The kind of the ScheduleEntity.
4644 const Kind K = Kind::ScheduleData;
4645
4646 public:
4647 ScheduleEntity() = delete;
4648 /// Gets/sets the scheduling priority.
4649 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4650 int getSchedulingPriority() const { return SchedulingPriority; }
4651 bool isReady() const {
4652 if (const auto *SD = dyn_cast<ScheduleData>(this))
4653 return SD->isReady();
4654 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4655 return CD->isReady();
4656 return cast<ScheduleBundle>(this)->isReady();
4657 }
4658 /// Returns true if the dependency information has been calculated.
4659 /// Note that dependency validity can vary between instructions within
4660 /// a single bundle.
4661 bool hasValidDependencies() const {
4662 if (const auto *SD = dyn_cast<ScheduleData>(this))
4663 return SD->hasValidDependencies();
4664 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4665 return CD->hasValidDependencies();
4666 return cast<ScheduleBundle>(this)->hasValidDependencies();
4667 }
4668 /// Gets the number of unscheduled dependencies.
4669 int getUnscheduledDeps() const {
4670 if (const auto *SD = dyn_cast<ScheduleData>(this))
4671 return SD->getUnscheduledDeps();
4672 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4673 return CD->getUnscheduledDeps();
4674 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4675 }
4676 /// Increments the number of unscheduled dependencies.
4677 int incrementUnscheduledDeps(int Incr) {
4678 if (auto *SD = dyn_cast<ScheduleData>(this))
4679 return SD->incrementUnscheduledDeps(Incr);
4680 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4681 }
4682 /// Gets the number of dependencies.
4683 int getDependencies() const {
4684 if (const auto *SD = dyn_cast<ScheduleData>(this))
4685 return SD->getDependencies();
4686 return cast<ScheduleCopyableData>(this)->getDependencies();
4687 }
4688 /// Gets the instruction.
4689 Instruction *getInst() const {
4690 if (const auto *SD = dyn_cast<ScheduleData>(this))
4691 return SD->getInst();
4692 return cast<ScheduleCopyableData>(this)->getInst();
4693 }
4694
4695 /// Gets/sets if the bundle is scheduled.
4696 bool isScheduled() const { return IsScheduled; }
4697 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4698
4699 static bool classof(const ScheduleEntity *) { return true; }
4700
4701#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4702 void dump(raw_ostream &OS) const {
4703 if (const auto *SD = dyn_cast<ScheduleData>(this))
4704 return SD->dump(OS);
4705 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4706 return CD->dump(OS);
4707 return cast<ScheduleBundle>(this)->dump(OS);
4708 }
4709
4710 LLVM_DUMP_METHOD void dump() const {
4711 dump(dbgs());
4712 dbgs() << '\n';
4713 }
4714#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4715 };
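  // Note: ScheduleEntity is an LLVM-style RTTI base. isa/dyn_cast/cast resolve
  // through the Kind tag and the classof() hooks of the concrete subclasses,
  // which is how the accessors above dispatch to ScheduleData,
  // ScheduleCopyableData or ScheduleBundle.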
4716
4717#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4719 const BoUpSLP::ScheduleEntity &SE) {
4720 SE.dump(OS);
4721 return OS;
4722 }
4723#endif
4724
4725 /// Contains all scheduling relevant data for an instruction.
4726 /// A ScheduleData either represents a single instruction or a member of an
4727 /// instruction bundle (= a group of instructions which is combined into a
4728 /// vector instruction).
4729 class ScheduleData final : public ScheduleEntity {
4730 public:
4731 // The initial value for the dependency counters. It means that the
4732 // dependencies are not calculated yet.
4733 enum { InvalidDeps = -1 };
4734
4735 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4736 static bool classof(const ScheduleEntity *Entity) {
4737 return Entity->getKind() == Kind::ScheduleData;
4738 }
4739
4740 void init(int BlockSchedulingRegionID, Instruction *I) {
4741 NextLoadStore = nullptr;
4742 IsScheduled = false;
4743 SchedulingRegionID = BlockSchedulingRegionID;
4744 clearDependencies();
4745 Inst = I;
4746 }
4747
4748 /// Verify basic self consistency properties
4749 void verify() {
4750 if (hasValidDependencies()) {
4751 assert(UnscheduledDeps <= Dependencies && "invariant");
4752 } else {
4753 assert(UnscheduledDeps == Dependencies && "invariant");
4754 }
4755
4756 if (IsScheduled) {
4757 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4758 "unexpected scheduled state");
4759 }
4760 }
4761
4762 /// Returns true if the dependency information has been calculated.
4763 /// Note that dependency validity can vary between instructions within
4764 /// a single bundle.
4765 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4766
4767 /// Returns true if it is ready for scheduling, i.e. it has no more
4768 /// unscheduled depending instructions/bundles.
4769 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4770
4771 /// Modifies the number of unscheduled dependencies for this instruction,
4772 /// and returns the number of remaining dependencies for the containing
4773 /// bundle.
4774 int incrementUnscheduledDeps(int Incr) {
4775 assert(hasValidDependencies() &&
4776 "increment of unscheduled deps would be meaningless");
4777 UnscheduledDeps += Incr;
4778 return UnscheduledDeps;
4779 }
4780
4781 /// Sets the number of unscheduled dependencies to the number of
4782 /// dependencies.
4783 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4784
4785 /// Clears all dependency information.
4786 void clearDependencies() {
4787 clearDirectDependencies();
4788 MemoryDependencies.clear();
4789 ControlDependencies.clear();
4790 }
4791
4792 /// Clears only the direct dependencies, leaving control and memory
4793 /// dependencies intact.
4794 /// Required for copyable elements to correctly handle control/memory deps
4795 /// and avoid extra recalculation of such deps.
4796 void clearDirectDependencies() {
4797 Dependencies = InvalidDeps;
4798 resetUnscheduledDeps();
4799 IsScheduled = false;
4800 }
4801
4802 /// Gets the number of unscheduled dependencies.
4803 int getUnscheduledDeps() const { return UnscheduledDeps; }
4804 /// Gets the number of dependencies.
4805 int getDependencies() const { return Dependencies; }
4806 /// Initializes the number of dependencies.
4807 void initDependencies() { Dependencies = 0; }
4808 /// Increments the number of dependencies.
4809 void incDependencies() { Dependencies++; }
4810
4811 /// Gets scheduling region ID.
4812 int getSchedulingRegionID() const { return SchedulingRegionID; }
4813
4814 /// Gets the instruction.
4815 Instruction *getInst() const { return Inst; }
4816
4817 /// Gets the list of memory dependencies.
4818 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4819 return MemoryDependencies;
4820 }
4821 /// Adds a memory dependency.
4822 void addMemoryDependency(ScheduleData *Dep) {
4823 MemoryDependencies.push_back(Dep);
4824 }
4825 /// Gets the list of control dependencies.
4826 ArrayRef<ScheduleData *> getControlDependencies() const {
4827 return ControlDependencies;
4828 }
4829 /// Adds a control dependency.
4830 void addControlDependency(ScheduleData *Dep) {
4831 ControlDependencies.push_back(Dep);
4832 }
4833 /// Gets/sets the next load/store instruction in the block.
4834 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4835 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4836
4837 void dump(raw_ostream &OS) const { OS << *Inst; }
4838
4839 LLVM_DUMP_METHOD void dump() const {
4840 dump(dbgs());
4841 dbgs() << '\n';
4842 }
4843
4844 private:
4845 Instruction *Inst = nullptr;
4846
4847 /// Single linked list of all memory instructions (e.g. load, store, call)
4848 /// in the block - until the end of the scheduling region.
4849 ScheduleData *NextLoadStore = nullptr;
4850
4851 /// The dependent memory instructions.
4852 /// This list is derived on demand in calculateDependencies().
4853 SmallVector<ScheduleData *> MemoryDependencies;
4854
4855 /// List of instructions which this instruction could be control dependent
4856 /// on. Allowing such nodes to be scheduled below this one could introduce
4857 /// a runtime fault which didn't exist in the original program.
4858 /// E.g., this is a load or udiv following a readonly call which infinitely loops.
4859 SmallVector<ScheduleData *> ControlDependencies;
4860
4861 /// This ScheduleData is in the current scheduling region if this matches
4862 /// the current SchedulingRegionID of BlockScheduling.
4863 int SchedulingRegionID = 0;
4864
4865 /// The number of dependencies. Consists of the number of users of the
4866 /// instruction plus the number of dependent memory instructions (if any).
4867 /// This value is calculated on demand.
4868 /// If InvalidDeps, the number of dependencies is not calculated yet.
4869 int Dependencies = InvalidDeps;
4870
4871 /// The number of dependencies minus the number of dependencies of scheduled
4872 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4873 /// for scheduling.
4874 /// Note that this is negative as long as Dependencies is not calculated.
4875 int UnscheduledDeps = InvalidDeps;
4876 };
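  // Illustrative sketch of the bookkeeping above, assuming all involved
  // instructions are inside the scheduling region: for
  //   %a = add i32 %x, %y
  // with two in-region users, Dependencies is set to 2 once the dependencies
  // are calculated. Each time one of those users is scheduled,
  // incrementUnscheduledDeps(-1) is invoked for %a, and %a becomes ready for
  // scheduling once UnscheduledDeps reaches 0.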
4877
4878#ifndef NDEBUG
4880 const BoUpSLP::ScheduleData &SD) {
4881 SD.dump(OS);
4882 return OS;
4883 }
4884#endif
4885
4886 class ScheduleBundle final : public ScheduleEntity {
4887 /// The schedule data for the instructions in the bundle.
4889 /// True if this bundle is valid.
4890 bool IsValid = true;
4891 /// The TreeEntry that this instruction corresponds to.
4892 TreeEntry *TE = nullptr;
4893 ScheduleBundle(bool IsValid)
4894 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4895
4896 public:
4897 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4898 static bool classof(const ScheduleEntity *Entity) {
4899 return Entity->getKind() == Kind::ScheduleBundle;
4900 }
4901
4902 /// Verify basic self consistency properties
4903 void verify() const {
4904 for (const ScheduleEntity *SD : Bundle) {
4905 if (SD->hasValidDependencies()) {
4906 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4907 "invariant");
4908 } else {
4909 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4910 "invariant");
4911 }
4912
4913 if (isScheduled()) {
4914 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4915 "unexpected scheduled state");
4916 }
4917 }
4918 }
4919
4920 /// Returns the number of unscheduled dependencies in the bundle.
4921 int unscheduledDepsInBundle() const {
4922 assert(*this && "bundle must not be empty");
4923 int Sum = 0;
4924 for (const ScheduleEntity *BundleMember : Bundle) {
4925 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4926 return ScheduleData::InvalidDeps;
4927 Sum += BundleMember->getUnscheduledDeps();
4928 }
4929 return Sum;
4930 }
4931
4932 /// Returns true if the dependency information has been calculated.
4933 /// Note that dependency validity can vary between instructions within
4934 /// a single bundle.
4935 bool hasValidDependencies() const {
4936 return all_of(Bundle, [](const ScheduleEntity *SD) {
4937 return SD->hasValidDependencies();
4938 });
4939 }
4940
4941 /// Returns true if it is ready for scheduling, i.e. it has no more
4942 /// unscheduled depending instructions/bundles.
4943 bool isReady() const {
4944 assert(*this && "bundle must not be empty");
4945 return unscheduledDepsInBundle() == 0 && !isScheduled();
4946 }
4947
4948 /// Returns the bundle of scheduling data, associated with the current
4949 /// instruction.
4950 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
4951 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
4952 /// Adds an instruction to the bundle.
4953 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4954
4955 /// Gets/sets the associated tree entry.
4956 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4957 TreeEntry *getTreeEntry() const { return TE; }
4958
4959 static ScheduleBundle invalid() { return {false}; }
4960
4961 operator bool() const { return IsValid; }
4962
4963#ifndef NDEBUG
4964 void dump(raw_ostream &OS) const {
4965 if (!*this) {
4966 OS << "[]";
4967 return;
4968 }
4969 OS << '[';
4970 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
4971 if (isa<ScheduleCopyableData>(SD))
4972 OS << "<Copyable>";
4973 OS << *SD->getInst();
4974 });
4975 OS << ']';
4976 }
4977
4978 LLVM_DUMP_METHOD void dump() const {
4979 dump(dbgs());
4980 dbgs() << '\n';
4981 }
4982#endif // NDEBUG
4983 };
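  // Illustrative example: for a bundle [%a, %b] where the entity for %a still
  // reports one unscheduled dependency and %b reports none,
  // unscheduledDepsInBundle() returns 1 and isReady() is false; the bundle
  // becomes ready only when every member has zero unscheduled dependencies.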
4984
4985#ifndef NDEBUG
4987 const BoUpSLP::ScheduleBundle &Bundle) {
4988 Bundle.dump(OS);
4989 return OS;
4990 }
4991#endif
4992
4993 /// Contains all scheduling-relevant data for a copyable instruction.
4994 /// It models the virtual instructions that are supposed to replace the
4995 /// original instructions. E.g., if instruction %0 = load is part of the
4996 /// bundle [%0, %1], where %1 = add, then the ScheduleCopyableData models the
4997 /// virtual instruction %virt = add %0, 0.
4998 class ScheduleCopyableData final : public ScheduleEntity {
4999 /// The source schedule data for the instruction.
5000 Instruction *Inst = nullptr;
5001 /// The edge information for the instruction.
5002 const EdgeInfo EI;
5003 /// This ScheduleData is in the current scheduling region if this matches
5004 /// the current SchedulingRegionID of BlockScheduling.
5005 int SchedulingRegionID = 0;
5006 /// Bundle, this data is part of.
5007 ScheduleBundle &Bundle;
5008
5009 public:
5010 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5011 const EdgeInfo &EI, ScheduleBundle &Bundle)
5012 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5013 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5014 static bool classof(const ScheduleEntity *Entity) {
5015 return Entity->getKind() == Kind::ScheduleCopyableData;
5016 }
5017
5018 /// Verify basic self consistency properties
5019 void verify() {
5020 if (hasValidDependencies()) {
5021 assert(UnscheduledDeps <= Dependencies && "invariant");
5022 } else {
5023 assert(UnscheduledDeps == Dependencies && "invariant");
5024 }
5025
5026 if (IsScheduled) {
5027 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5028 "unexpected scheduled state");
5029 }
5030 }
5031
5032 /// Returns true if the dependency information has been calculated.
5033 /// Note that dependency validity can vary between instructions within
5034 /// a single bundle.
5035 bool hasValidDependencies() const {
5036 return Dependencies != ScheduleData::InvalidDeps;
5037 }
5038
5039 /// Returns true if it is ready for scheduling, i.e. it has no more
5040 /// unscheduled depending instructions/bundles.
5041 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5042
5043 /// Modifies the number of unscheduled dependencies for this instruction,
5044 /// and returns the number of remaining dependencies for the containing
5045 /// bundle.
5046 int incrementUnscheduledDeps(int Incr) {
5047 assert(hasValidDependencies() &&
5048 "increment of unscheduled deps would be meaningless");
5049 UnscheduledDeps += Incr;
5050 assert(UnscheduledDeps >= 0 && "invariant");
5051 return UnscheduledDeps;
5052 }
5053
5054 /// Sets the number of unscheduled dependencies to the number of
5055 /// dependencies.
5056 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5057
5058 /// Gets the number of unscheduled dependencies.
5059 int getUnscheduledDeps() const { return UnscheduledDeps; }
5060 /// Gets the number of dependencies.
5061 int getDependencies() const { return Dependencies; }
5062 /// Initializes the number of dependencies.
5063 void initDependencies() { Dependencies = 0; }
5064 /// Increments the number of dependencies.
5065 void incDependencies() { Dependencies++; }
5066
5067 /// Gets scheduling region ID.
5068 int getSchedulingRegionID() const { return SchedulingRegionID; }
5069
5070 /// Gets the instruction.
5071 Instruction *getInst() const { return Inst; }
5072
5073 /// Clears all dependency information.
5074 void clearDependencies() {
5075 Dependencies = ScheduleData::InvalidDeps;
5076 UnscheduledDeps = ScheduleData::InvalidDeps;
5077 IsScheduled = false;
5078 }
5079
5080 /// Gets the edge information.
5081 const EdgeInfo &getEdgeInfo() const { return EI; }
5082
5083 /// Gets the bundle.
5084 ScheduleBundle &getBundle() { return Bundle; }
5085 const ScheduleBundle &getBundle() const { return Bundle; }
5086
5087#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5088 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5089
5090 LLVM_DUMP_METHOD void dump() const {
5091 dump(dbgs());
5092 dbgs() << '\n';
5093 }
5094#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5095
5096 private:
5097 /// The number of dependencies. These nodes always have only a single
5098 /// dependency; InvalidDeps means it has not been calculated yet.
5099 int Dependencies = ScheduleData::InvalidDeps;
5100
5101 /// The number of dependencies minus the number of dependencies of scheduled
5102 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5103 /// for scheduling.
5104 /// Note that this is negative as long as Dependencies is not calculated.
5105 int UnscheduledDeps = ScheduleData::InvalidDeps;
5106 };
5107
5108#ifndef NDEBUG
5109 friend inline raw_ostream &
5110 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5111 SD.dump(OS);
5112 return OS;
5113 }
5114#endif
5115
5116 friend struct GraphTraits<BoUpSLP *>;
5117 friend struct DOTGraphTraits<BoUpSLP *>;
5118
5119 /// Contains all scheduling data for a basic block.
5120 /// It does not schedule instructions which are not memory read/write
5121 /// instructions and whose operands are either constants, arguments, phis,
5122 /// or instructions from other blocks, or whose users are phis or live in
5123 /// other blocks. The resulting vector instructions can be placed at the
5124 /// beginning of the basic block without scheduling (if the operands do not
5125 /// need to be scheduled) or at the end of the block (if the users are
5126 /// outside of the block). This saves some compile time and memory used by
5127 /// the compiler.
5128 /// ScheduleData is assigned to each instruction between the boundaries of
5129 /// the tree entry, even to those which are not part of the graph. It is
5130 /// required to correctly follow the dependencies between the instructions
5131 /// and to schedule them correctly. ScheduleData is not allocated for
5132 /// instructions which do not require scheduling, like phis, nodes with
5133 /// extractelements/insertelements only, or nodes whose instructions have
5134 /// uses/operands outside of the block.
5135 struct BlockScheduling {
5136 BlockScheduling(BasicBlock *BB)
5137 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5138
5139 void clear() {
5140 ScheduledBundles.clear();
5141 ScheduledBundlesList.clear();
5142 ScheduleCopyableDataMap.clear();
5143 ScheduleCopyableDataMapByInst.clear();
5144 ScheduleCopyableDataMapByInstUser.clear();
5145 ScheduleCopyableDataMapByUsers.clear();
5146 ReadyInsts.clear();
5147 ScheduleStart = nullptr;
5148 ScheduleEnd = nullptr;
5149 FirstLoadStoreInRegion = nullptr;
5150 LastLoadStoreInRegion = nullptr;
5151 RegionHasStackSave = false;
5152
5153 // Reduce the maximum schedule region size by the size of the
5154 // previous scheduling run.
5155 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5156 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5157 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5158 ScheduleRegionSize = 0;
5159
5160 // Make a new scheduling region, i.e. all existing ScheduleData is not
5161 // in the new region yet.
5162 ++SchedulingRegionID;
5163 }
5164
5165 ScheduleData *getScheduleData(Instruction *I) {
5166 if (!I)
5167 return nullptr;
5168 if (BB != I->getParent())
5169 // Avoid lookup if can't possibly be in map.
5170 return nullptr;
5171 ScheduleData *SD = ScheduleDataMap.lookup(I);
5172 if (SD && isInSchedulingRegion(*SD))
5173 return SD;
5174 return nullptr;
5175 }
5176
5177 ScheduleData *getScheduleData(Value *V) {
5178 return getScheduleData(dyn_cast<Instruction>(V));
5179 }
5180
5181 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5182 /// operand number) and value.
5183 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5184 const Value *V) const {
5185 if (ScheduleCopyableDataMap.empty())
5186 return nullptr;
5187 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5188 if (It == ScheduleCopyableDataMap.end())
5189 return nullptr;
5190 ScheduleCopyableData *SD = It->getSecond().get();
5191 if (!isInSchedulingRegion(*SD))
5192 return nullptr;
5193 return SD;
5194 }
5195
5196 /// Returns the ScheduleCopyableData for the given user \p User, operand
5197 /// number and operand \p V.
5199 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5200 const Value *V) {
5201 if (ScheduleCopyableDataMapByInstUser.empty())
5202 return {};
5203 const auto It = ScheduleCopyableDataMapByInstUser.find(
5204 std::make_pair(std::make_pair(User, OperandIdx), V));
5205 if (It == ScheduleCopyableDataMapByInstUser.end())
5206 return {};
5208 for (ScheduleCopyableData *SD : It->getSecond()) {
5209 if (isInSchedulingRegion(*SD))
5210 Res.push_back(SD);
5211 }
5212 return Res;
5213 }
5214
5215 /// Returns true if all operands of the given instruction \p User are
5216 /// replaced by copyable data.
5217 /// \param User The user instruction.
5218 /// \param Op The operand, which might be replaced by the copyable data.
5219 /// \param SLP The SLP tree.
5220 /// \param NumOps The number of operands used. If the instruction uses the
5221 /// same operand several times, check for the first use, then the second,
5222 /// etc.
5223 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5224 Instruction *Op, BoUpSLP &SLP,
5225 unsigned NumOps) const {
5226 assert(NumOps > 0 && "No operands");
5227 if (ScheduleCopyableDataMap.empty())
5228 return false;
5229 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5231 for (const Use &U : User->operands()) {
5232 if (U.get() != Op)
5233 continue;
5234 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5235 if (Entries.empty())
5236 return false;
5237 // Check all tree entries, if they have operands replaced by copyable
5238 // data.
5239 for (TreeEntry *TE : Entries) {
5240 // Check if the user is commutative.
5241 // Commutative users are handled later, as their operands can be
5242 // reordered.
5243 // The same applies even to non-commutative cmps, because we can
5244 // potentially invert their predicate and, thus, reorder the operands.
5245 bool IsCommutativeUser =
5246 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5247 EdgeInfo EI(TE, U.getOperandNo());
5248 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5249 unsigned &OpCnt =
5250 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5251 if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
5252 return false;
5253 // Found copyable operand - continue.
5254 ++OpCnt;
5255 continue;
5256 }
5257 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5258 .first->getSecond();
5259 }
5260 }
5261 // Check the commutative/cmp entries.
5262 if (!PotentiallyReorderedEntriesCount.empty()) {
5263 for (auto &P : PotentiallyReorderedEntriesCount) {
5264 auto *It = find(P.first->Scalars, User);
5265 assert(It != P.first->Scalars.end() &&
5266 "User is not in the tree entry");
5267 int Lane = std::distance(P.first->Scalars.begin(), It);
5268 assert(Lane >= 0 && "Lane is not found");
5269 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5270 Lane = P.first->ReorderIndices[Lane];
5271 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5272 "Couldn't find extract lane");
5273 SmallVector<unsigned> OpIndices;
5274 for (unsigned OpIdx :
5276 P.first->getMainOp()))) {
5277 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5278 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5279 --P.getSecond();
5280 }
5281 }
5282 return all_of(PotentiallyReorderedEntriesCount,
5283 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5284 return P.second == NumOps - 1;
5285 });
5286 }
5287 return true;
5288 }
5289
5291 getScheduleCopyableData(const Instruction *I) const {
5292 if (ScheduleCopyableDataMapByInst.empty())
5293 return {};
5294 const auto It = ScheduleCopyableDataMapByInst.find(I);
5295 if (It == ScheduleCopyableDataMapByInst.end())
5296 return {};
5298 for (ScheduleCopyableData *SD : It->getSecond()) {
5299 if (isInSchedulingRegion(*SD))
5300 Res.push_back(SD);
5301 }
5302 return Res;
5303 }
5304
5306 getScheduleCopyableDataUsers(const Instruction *User) const {
5307 if (ScheduleCopyableDataMapByUsers.empty())
5308 return {};
5309 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5310 if (It == ScheduleCopyableDataMapByUsers.end())
5311 return {};
5313 for (ScheduleCopyableData *SD : It->getSecond()) {
5314 if (isInSchedulingRegion(*SD))
5315 Res.push_back(SD);
5316 }
5317 return Res;
5318 }
5319
5320 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5321 Instruction *I,
5322 int SchedulingRegionID,
5323 ScheduleBundle &Bundle) {
5324 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5325 ScheduleCopyableData *CD =
5326 ScheduleCopyableDataMap
5327 .try_emplace(std::make_pair(EI, I),
5328 std::make_unique<ScheduleCopyableData>(
5329 SchedulingRegionID, I, EI, Bundle))
5330 .first->getSecond()
5331 .get();
5332 ScheduleCopyableDataMapByInst[I].push_back(CD);
5333 if (EI.UserTE) {
5334 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5335 const auto *It = find(Op, I);
5336 assert(It != Op.end() && "Lane not set");
5338 do {
5339 int Lane = std::distance(Op.begin(), It);
5340 assert(Lane >= 0 && "Lane not set");
5341 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5342 !EI.UserTE->ReorderIndices.empty())
5343 Lane = EI.UserTE->ReorderIndices[Lane];
5344 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5345 "Couldn't find extract lane");
5346 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5347 if (!Visited.insert(In).second) {
5348 It = find(make_range(std::next(It), Op.end()), I);
5349 continue;
5350 }
5351 ScheduleCopyableDataMapByInstUser
5352 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5353 .first->getSecond()
5354 .push_back(CD);
5355 ScheduleCopyableDataMapByUsers.try_emplace(I)
5356 .first->getSecond()
5357 .insert(CD);
5358 // Remove extra deps for users that become non-immediate users of the
5359 // instruction. This may happen if a chain of the same copyable elements
5360 // appears in the tree.
5361 if (In == I) {
5362 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5363 if (ScheduleCopyableData *UserCD =
5364 getScheduleCopyableData(UserEI, In))
5365 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5366 }
5367 It = find(make_range(std::next(It), Op.end()), I);
5368 } while (It != Op.end());
5369 } else {
5370 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5371 CD);
5372 }
5373 return *CD;
5374 }
5375
5376 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5377 auto *I = dyn_cast<Instruction>(V);
5378 if (!I)
5379 return {};
5380 auto It = ScheduledBundles.find(I);
5381 if (It == ScheduledBundles.end())
5382 return {};
5383 return It->getSecond();
5384 }
5385
5386 /// Returns true if the entity is in the scheduling region.
5387 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5388 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5389 return Data->getSchedulingRegionID() == SchedulingRegionID;
5390 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5391 return CD->getSchedulingRegionID() == SchedulingRegionID;
5392 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5393 [&](const ScheduleEntity *BundleMember) {
5394 return isInSchedulingRegion(*BundleMember);
5395 });
5396 }
5397
5398 /// Marks an instruction as scheduled and puts all dependent ready
5399 /// instructions into the ready-list.
5400 template <typename ReadyListType>
5401 void schedule(const BoUpSLP &R, const InstructionsState &S,
5402 const EdgeInfo &EI, ScheduleEntity *Data,
5403 ReadyListType &ReadyList) {
5404 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5406 // Handle the def-use chain dependencies.
5407
5408 // Decrement the unscheduled counter and insert to ready list if ready.
5409 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5410 if ((IsControl || Data->hasValidDependencies()) &&
5411 Data->incrementUnscheduledDeps(-1) == 0) {
5412 // There are no more unscheduled dependencies after
5413 // decrementing, so we can put the dependent instruction
5414 // into the ready list.
5415 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5417 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5418 CopyableBundle.push_back(&CD->getBundle());
5419 Bundles = CopyableBundle;
5420 } else {
5421 Bundles = getScheduleBundles(Data->getInst());
5422 }
5423 if (!Bundles.empty()) {
5424 for (ScheduleBundle *Bundle : Bundles) {
5425 if (Bundle->unscheduledDepsInBundle() == 0) {
5426 assert(!Bundle->isScheduled() &&
5427 "already scheduled bundle gets ready");
5428 ReadyList.insert(Bundle);
5430 << "SLP: gets ready: " << *Bundle << "\n");
5431 }
5432 }
5433 return;
5434 }
5435 assert(!Data->isScheduled() &&
5436 "already scheduled bundle gets ready");
5437 assert(!isa<ScheduleCopyableData>(Data) &&
5438 "Expected non-copyable data");
5439 ReadyList.insert(Data);
5440 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5441 }
5442 };
5443
5444 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5445 Instruction *I) {
5446 if (!ScheduleCopyableDataMap.empty()) {
5448 getScheduleCopyableData(User, OpIdx, I);
5449 for (ScheduleCopyableData *CD : CopyableData)
5450 DecrUnsched(CD, /*IsControl=*/false);
5451 if (!CopyableData.empty())
5452 return;
5453 }
5454 if (ScheduleData *OpSD = getScheduleData(I))
5455 DecrUnsched(OpSD, /*IsControl=*/false);
5456 };
5457
5458 // If BundleMember is a vector bundle, its operands may have been
5459 // reordered during buildTree(). We therefore need to get its operands
5460 // through the TreeEntry.
5461 if (!Bundles.empty()) {
5462 auto *In = BundleMember->getInst();
5463 // Count uses of each instruction operand.
5465 unsigned TotalOpCount = 0;
5466 if (isa<ScheduleCopyableData>(BundleMember)) {
5467 // Copyable data is used only once (uses itself).
5468 TotalOpCount = OperandsUses[In] = 1;
5469 } else {
5470 for (const Use &U : In->operands()) {
5471 if (auto *I = dyn_cast<Instruction>(U.get())) {
5472 auto Res = OperandsUses.try_emplace(I, 0);
5473 ++Res.first->getSecond();
5474 ++TotalOpCount;
5475 }
5476 }
5477 }
5478 // Decrement the unscheduled counter and insert to ready list if
5479 // ready.
5480 auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
5481 unsigned OpIdx) {
5482 if (!ScheduleCopyableDataMap.empty()) {
5483 const EdgeInfo EI = {UserTE, OpIdx};
5484 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
5485 DecrUnsched(CD, /*IsControl=*/false);
5486 return;
5487 }
5488 }
5489 auto It = OperandsUses.find(I);
5490 assert(It != OperandsUses.end() && "Operand not found");
5491 if (It->second > 0) {
5492 --It->getSecond();
5493 assert(TotalOpCount > 0 && "No more operands to decrement");
5494 --TotalOpCount;
5495 if (ScheduleData *OpSD = getScheduleData(I))
5496 DecrUnsched(OpSD, /*IsControl=*/false);
5497 }
5498 };
5499
5500 for (ScheduleBundle *Bundle : Bundles) {
5501 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5502 break;
5503 // Need to search for the lane since the tree entry can be
5504 // reordered.
5505 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5506 find(Bundle->getTreeEntry()->Scalars, In));
5507 assert(Lane >= 0 && "Lane not set");
5508 if (isa<StoreInst>(In) &&
5509 !Bundle->getTreeEntry()->ReorderIndices.empty())
5510 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5511 assert(Lane < static_cast<int>(
5512 Bundle->getTreeEntry()->Scalars.size()) &&
5513 "Couldn't find extract lane");
5514
5515 // Since vectorization tree is being built recursively this
5516 // assertion ensures that the tree entry has all operands set before
5517 // reaching this code. Couple of exceptions known at the moment are
5518 // extracts where their second (immediate) operand is not added.
5519 // Since immediates do not affect scheduler behavior this is
5520 // considered okay.
5521 assert(In &&
5522 (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
5523 In->getNumOperands() ==
5524 Bundle->getTreeEntry()->getNumOperands() ||
5525 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5526 "Missed TreeEntry operands?");
5527
5528 for (unsigned OpIdx :
5529 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5530 if (auto *I = dyn_cast<Instruction>(
5531 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5532 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
5533 << "\n");
5534 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
5535 }
5536 }
5537 } else {
5538 // If BundleMember is a stand-alone instruction, no operand reordering
5539 // has taken place, so we directly access its operands.
5540 for (Use &U : BundleMember->getInst()->operands()) {
5541 if (auto *I = dyn_cast<Instruction>(U.get())) {
5543 << "SLP: check for readiness (def): " << *I << "\n");
5544 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5545 }
5546 }
5547 }
5548 // Handle the memory dependencies.
5549 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5550 if (!SD)
5551 return;
5553 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5554 if (!VisitedMemory.insert(MemoryDep).second)
5555 continue;
5556 // There are no more unscheduled dependencies after decrementing,
5557 // so we can put the dependent instruction into the ready list.
5558 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5559 << *MemoryDep << "\n");
5560 DecrUnsched(MemoryDep);
5561 }
5562 // Handle the control dependencies.
5564 for (ScheduleData *Dep : SD->getControlDependencies()) {
5565 if (!VisitedControl.insert(Dep).second)
5566 continue;
5567 // There are no more unscheduled dependencies after decrementing,
5568 // so we can put the dependent instruction into the ready list.
5570 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5571 DecrUnsched(Dep, /*IsControl=*/true);
5572 }
5573 };
5574 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5575 SD->setScheduled(/*Scheduled=*/true);
5576 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5577 ProcessBundleMember(SD, {});
5578 } else {
5579 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5580 Bundle.setScheduled(/*Scheduled=*/true);
5581 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5582 auto AreAllBundlesScheduled =
5583 [&](const ScheduleEntity *SD,
5584 ArrayRef<ScheduleBundle *> SDBundles) {
5585 if (isa<ScheduleCopyableData>(SD))
5586 return true;
5587 return !SDBundles.empty() &&
5588 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5589 return SDBundle->isScheduled();
5590 });
5591 };
5592 for (ScheduleEntity *SD : Bundle.getBundle()) {
5594 if (!isa<ScheduleCopyableData>(SD))
5595 SDBundles = getScheduleBundles(SD->getInst());
5596 if (AreAllBundlesScheduled(SD, SDBundles)) {
5597 SD->setScheduled(/*Scheduled=*/true);
5598 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5599 : SDBundles);
5600 }
5601 }
5602 }
5603 }
5604
5605 /// Verify basic self consistency properties of the data structure.
5606 void verify() {
5607 if (!ScheduleStart)
5608 return;
5609
5610 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5611 ScheduleStart->comesBefore(ScheduleEnd) &&
5612 "Not a valid scheduling region?");
5613
5614 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5615 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5616 if (!Bundles.empty()) {
5617 for (ScheduleBundle *Bundle : Bundles) {
5618 assert(isInSchedulingRegion(*Bundle) &&
5619 "primary schedule data not in window?");
5620 Bundle->verify();
5621 }
5622 continue;
5623 }
5624 auto *SD = getScheduleData(I);
5625 if (!SD)
5626 continue;
5627 assert(isInSchedulingRegion(*SD) &&
5628 "primary schedule data not in window?");
5629 SD->verify();
5630 }
5631
5632 assert(all_of(ReadyInsts,
5633 [](const ScheduleEntity *Bundle) {
5634 return Bundle->isReady();
5635 }) &&
5636 "item in ready list not ready?");
5637 }
5638
5639 /// Put all instructions into the ReadyList which are ready for scheduling.
5640 template <typename ReadyListType>
5641 void initialFillReadyList(ReadyListType &ReadyList) {
5643 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5644 ScheduleData *SD = getScheduleData(I);
5645 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5646 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5647 !Bundles.empty()) {
5648 for (ScheduleBundle *Bundle : Bundles) {
5649 if (!Visited.insert(Bundle).second)
5650 continue;
5651 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5652 ReadyList.insert(Bundle);
5653 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5654 << *Bundle << "\n");
5655 }
5656 }
5657 continue;
5658 }
5659 ReadyList.insert(SD);
5661 << "SLP: initially in ready list: " << *SD << "\n");
5662 }
5663 }
5664 }
5665
5666 /// Build a bundle from the ScheduleData nodes corresponding to the
5667 /// scalar instruction for each lane.
5668 /// \param VL The list of scalar instructions.
5669 /// \param S The state of the instructions.
5670 /// \param EI The edge in the SLP graph or the user node/operand number.
5671 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5672 const InstructionsState &S, const EdgeInfo &EI);
5673
5674 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5675 /// cyclic dependencies. This is only a dry-run, no instructions are
5676 /// actually moved at this stage.
5677 /// \returns the scheduling bundle. The returned Optional value is not
5678 /// std::nullopt if \p VL is allowed to be scheduled.
5679 std::optional<ScheduleBundle *>
5680 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5681 const InstructionsState &S, const EdgeInfo &EI);
5682
5683 /// Allocates schedule data chunk.
5684 ScheduleData *allocateScheduleDataChunks();
5685
5686 /// Extends the scheduling region so that V is inside the region.
5687 /// \returns true if the region size is within the limit.
5688 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5689
5690 /// Initialize the ScheduleData structures for new instructions in the
5691 /// scheduling region.
5692 void initScheduleData(Instruction *FromI, Instruction *ToI,
5693 ScheduleData *PrevLoadStore,
5694 ScheduleData *NextLoadStore);
5695
5696 /// Updates the dependency information of a bundle and of all instructions/
5697 /// bundles which depend on the original bundle.
5698 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5699 BoUpSLP *SLP,
5700 ArrayRef<ScheduleData *> ControlDeps = {});
5701
5702 /// Sets all instructions in the scheduling region to un-scheduled.
5703 void resetSchedule();
5704
5705 BasicBlock *BB;
5706
5707 /// Simple memory allocation for ScheduleData.
5709
5710 /// The size of a ScheduleData array in ScheduleDataChunks.
5711 int ChunkSize;
5712
5713 /// The allocator position in the current chunk, which is the last entry
5714 /// of ScheduleDataChunks.
5715 int ChunkPos;
5716
5717 /// Attaches ScheduleData to Instruction.
5718 /// Note that the mapping survives during all vectorization iterations, i.e.
5719 /// ScheduleData structures are recycled.
5721
5722 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5723 /// number) and the operand instruction, represented as a copyable element.
5725 std::unique_ptr<ScheduleCopyableData>>
5726 ScheduleCopyableDataMap;
5727
5728 /// Represents the mapping between an instruction and all related
5729 /// ScheduleCopyableData (for all uses in the tree represented as copyable
5730 /// elements). The SLP tree may contain several representations of the same
5731 /// instruction.
5733 ScheduleCopyableDataMapByInst;
5734
5735 /// Represents the mapping between a user value and operand number, the
5736 /// operand value, and all related ScheduleCopyableData. The relation is 1:n,
5737 /// because the same user may reference the same operand in different tree
5738 /// entries and the operand may be modeled by different copyable data elements.
5741 ScheduleCopyableDataMapByInstUser;
5742
5743 /// Represents the mapping between an instruction and all related
5744 /// ScheduleCopyableData. It maps the actual instruction to the last
5745 /// copyable data element in the chain. E.g., if
5746 /// the graph models the following instructions:
5747 /// %0 = non-add instruction ...
5748 /// ...
5749 /// %4 = add %3, 1
5750 /// %5 = add %4, 1
5751 /// %6 = insertelement poison, %0, 0
5752 /// %7 = insertelement %6, %5, 1
5753 /// And the graph is modeled as:
5754 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5755 /// -> [1, 0] -> [%1, 0]
5756 ///
5757 /// this map will map %0 only to the copyable element <1>, which is the last
5758 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5759 /// keep the map to <0>, not %0.
5760 SmallDenseMap<const Instruction *,
5762 ScheduleCopyableDataMapByUsers;
5763
5764 /// Attaches ScheduleBundle to Instruction.
5766 ScheduledBundles;
5767 /// The list of ScheduleBundles.
5768 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5769
5770 /// The ready-list for scheduling (only used for the dry-run).
5771 SetVector<ScheduleEntity *> ReadyInsts;
5772
5773 /// The first instruction of the scheduling region.
5774 Instruction *ScheduleStart = nullptr;
5775
5776 /// The first instruction _after_ the scheduling region.
5777 Instruction *ScheduleEnd = nullptr;
5778
5779 /// The first memory accessing instruction in the scheduling region
5780 /// (can be null).
5781 ScheduleData *FirstLoadStoreInRegion = nullptr;
5782
5783 /// The last memory accessing instruction in the scheduling region
5784 /// (can be null).
5785 ScheduleData *LastLoadStoreInRegion = nullptr;
5786
5787 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5788 /// region? Used to optimize the dependence calculation for the
5789 /// common case where there isn't.
5790 bool RegionHasStackSave = false;
5791
5792 /// The current size of the scheduling region.
5793 int ScheduleRegionSize = 0;
5794
5795 /// The maximum size allowed for the scheduling region.
5796 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5797
5798 /// The ID of the scheduling region. For a new vectorization iteration this
5799 /// is incremented which "removes" all ScheduleData from the region.
5800 /// Make sure that the initial SchedulingRegionID is greater than the
5801 /// initial SchedulingRegionID in ScheduleData (which is 0).
5802 int SchedulingRegionID = 1;
5803 };
5804
5805 /// Attaches the BlockScheduling structures to basic blocks.
5807
5808 /// Performs the "real" scheduling. Done before vectorization is actually
5809 /// performed in a basic block.
5810 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5811
5812 /// List of users to ignore during scheduling and that don't need extracting.
5813 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5814
5815 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5816 /// sorted SmallVectors of unsigned.
5817 struct OrdersTypeDenseMapInfo {
5818 static OrdersType getEmptyKey() {
5819 OrdersType V;
5820 V.push_back(~1U);
5821 return V;
5822 }
5823
5824 static OrdersType getTombstoneKey() {
5825 OrdersType V;
5826 V.push_back(~2U);
5827 return V;
5828 }
5829
5830 static unsigned getHashValue(const OrdersType &V) {
5831 return static_cast<unsigned>(hash_combine_range(V));
5832 }
5833
5834 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5835 return LHS == RHS;
5836 }
5837 };
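  // Hypothetical usage sketch (not from the original source): a container
  // keyed by orders could be declared as
  // DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>. The ~1U and ~2U
  // sentinel vectors above serve as the empty and tombstone keys and can never
  // collide with a real permutation, whose elements are valid lane indices.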
5838
5839 // Analysis and block reference.
5840 Function *F;
5841 ScalarEvolution *SE;
5843 TargetLibraryInfo *TLI;
5844 LoopInfo *LI;
5845 DominatorTree *DT;
5846 AssumptionCache *AC;
5847 DemandedBits *DB;
5848 const DataLayout *DL;
5850
5851 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5852 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5853
5854 /// Instruction builder to construct the vectorized tree.
5856
5857 /// A map of scalar integer values to the smallest bit width with which they
5858 /// can legally be represented. The values map to (width, signed) pairs,
5859 /// where "width" indicates the minimum bit width and "signed" is True if the
5860 /// value must be signed-extended, rather than zero-extended, back to its
5861 /// original width.
5863
5864 /// Final size of the reduced vector, if the current graph represents the
5865 /// input for the reduction and it was possible to narrow the size of the
5866 /// reduction.
5867 unsigned ReductionBitWidth = 0;
5868
5869 /// Canonical graph size before the transformations.
5870 unsigned BaseGraphSize = 1;
5871
5872 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5873 /// type sizes, used in the tree.
5874 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5875
5876 /// Indices of the vectorized nodes which are supposed to be the roots of
5877 /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
5878 DenseSet<unsigned> ExtraBitWidthNodes;
5879};
5880
5881} // end namespace slpvectorizer
5882
5883template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
5887 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
5888 SecondInfo::getEmptyKey());
5889 }
5890
5892 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
5893 SecondInfo::getTombstoneKey());
5894 }
5895
5896 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
5897 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
5898 SecondInfo::getHashValue(Val.EdgeIdx));
5899 }
5900
5901 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
5902 const BoUpSLP::EdgeInfo &RHS) {
5903 return LHS == RHS;
5904 }
5905};
5906
5907template <> struct GraphTraits<BoUpSLP *> {
5908 using TreeEntry = BoUpSLP::TreeEntry;
5909
5910 /// NodeRef has to be a pointer per the GraphWriter.
5912
5914
5915 /// Add the VectorizableTree to the index iterator to be able to return
5916 /// TreeEntry pointers.
5917 struct ChildIteratorType
5918 : public iterator_adaptor_base<
5919 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5921
5923 ContainerTy &VT)
5924 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
5925
5926 NodeRef operator*() { return I->UserTE; }
5927 };
5928
5930 return R.VectorizableTree[0].get();
5931 }
5932
5933 static ChildIteratorType child_begin(NodeRef N) {
5934 return {&N->UserTreeIndex, N->Container};
5935 }
5936
5937 static ChildIteratorType child_end(NodeRef N) {
5938 return {&N->UserTreeIndex + 1, N->Container};
5939 }
5940
5941 /// For the node iterator we just need to turn the TreeEntry iterator into a
5942 /// TreeEntry* iterator so that it dereferences to NodeRef.
5943 class nodes_iterator {
5945 ItTy It;
5946
5947 public:
5948 nodes_iterator(const ItTy &It2) : It(It2) {}
5949 NodeRef operator*() { return It->get(); }
5950 nodes_iterator operator++() {
5951 ++It;
5952 return *this;
5953 }
5954 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
5955 };
5956
5957 static nodes_iterator nodes_begin(BoUpSLP *R) {
5958 return nodes_iterator(R->VectorizableTree.begin());
5959 }
5960
5961 static nodes_iterator nodes_end(BoUpSLP *R) {
5962 return nodes_iterator(R->VectorizableTree.end());
5963 }
5964
5965 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
5966};
5967
5968template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
5969 using TreeEntry = BoUpSLP::TreeEntry;
5970
5971 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
5972
5973 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
5974 std::string Str;
5976 OS << Entry->Idx << ".\n";
5977 if (isSplat(Entry->Scalars))
5978 OS << "<splat> ";
5979 for (auto *V : Entry->Scalars) {
5980 OS << *V;
5981 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
5982 return EU.Scalar == V;
5983 }))
5984 OS << " <extract>";
5985 OS << "\n";
5986 }
5987 return Str;
5988 }
5989
5990 static std::string getNodeAttributes(const TreeEntry *Entry,
5991 const BoUpSLP *) {
5992 if (Entry->isGather())
5993 return "color=red";
5994 if (Entry->State == TreeEntry::ScatterVectorize ||
5995 Entry->State == TreeEntry::StridedVectorize ||
5996 Entry->State == TreeEntry::CompressVectorize)
5997 return "color=blue";
5998 return "";
5999 }
6000};
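// With the GraphTraits and DOTGraphTraits specializations above, the SLP graph
// can be emitted through the generic graph utilities, e.g. (hypothetically)
// llvm::WriteGraph(OS, &R) or llvm::ViewGraph(&R, "slp") for a BoUpSLP instance
// R; gather nodes are drawn red and scatter/strided/compress nodes blue, per
// getNodeAttributes().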
6001
6002} // end namespace llvm
6003
6006 for (auto *I : DeletedInstructions) {
6007 if (!I->getParent()) {
6008 // Temporarily insert the instruction back to erase it from its parent
6009 // and from memory later.
6010 if (isa<PHINode>(I))
6011 // Phi nodes must be the very first instructions in the block.
6012 I->insertBefore(F->getEntryBlock(),
6013 F->getEntryBlock().getFirstNonPHIIt());
6014 else
6015 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6016 continue;
6017 }
6018 for (Use &U : I->operands()) {
6019 auto *Op = dyn_cast<Instruction>(U.get());
6020 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
 6021 wouldInstructionBeTriviallyDead(Op, TLI))
6022 DeadInsts.emplace_back(Op);
6023 }
6024 I->dropAllReferences();
6025 }
6026 for (auto *I : DeletedInstructions) {
6027 assert(I->use_empty() &&
6028 "trying to erase instruction with users.");
6029 I->eraseFromParent();
6030 }
6031
 6032 // Clean up any dead scalar code feeding the vectorized instructions.
 6033 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6034
6035#ifdef EXPENSIVE_CHECKS
6036 // If we could guarantee that this call is not extremely slow, we could
6037 // remove the ifdef limitation (see PR47712).
6038 assert(!verifyFunction(*F, &dbgs()));
6039#endif
6040}
6041
6042/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6043/// contains the original mask for the scalars reused in the node. The
6044/// procedure transforms this mask in accordance with the given \p Mask.
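/// A small worked example (illustrative values only): with
/// Reuses = {1, 2, 0, 3} and Mask = {2, 0, 1, 3}, the original element at
/// position I is moved to position Mask[I], so the updated mask becomes
/// Reuses = {2, 0, 1, 3}.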
6046 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6047 "Expected non-empty mask.");
6048 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6049 Prev.swap(Reuses);
6050 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6051 if (Mask[I] != PoisonMaskElem)
6052 Reuses[Mask[I]] = Prev[I];
6053}
6054
6055/// Reorders the given \p Order according to the given \p Mask. \p Order is
6056/// the original order of the scalars. The procedure transforms the provided order
6057/// in accordance with the given \p Mask. If the resulting \p Order is just an
6058/// identity order, \p Order is cleared.
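/// A small worked example (illustrative values, BottomOrder == false): with
/// an empty \p Order and Mask = {2, 3, 0, 1}, the resulting \p Order is
/// {2, 3, 0, 1}; if the combined permutation turns out to be an identity,
/// \p Order is cleared instead.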
6060 bool BottomOrder = false) {
6061 assert(!Mask.empty() && "Expected non-empty mask.");
6062 unsigned Sz = Mask.size();
6063 if (BottomOrder) {
6064 SmallVector<unsigned> PrevOrder;
6065 if (Order.empty()) {
6066 PrevOrder.resize(Sz);
6067 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6068 } else {
6069 PrevOrder.swap(Order);
6070 }
6071 Order.assign(Sz, Sz);
6072 for (unsigned I = 0; I < Sz; ++I)
6073 if (Mask[I] != PoisonMaskElem)
6074 Order[I] = PrevOrder[Mask[I]];
6075 if (all_of(enumerate(Order), [&](const auto &Data) {
6076 return Data.value() == Sz || Data.index() == Data.value();
6077 })) {
6078 Order.clear();
6079 return;
6080 }
6081 fixupOrderingIndices(Order);
6082 return;
6083 }
6084 SmallVector<int> MaskOrder;
6085 if (Order.empty()) {
6086 MaskOrder.resize(Sz);
6087 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6088 } else {
6089 inversePermutation(Order, MaskOrder);
6090 }
6091 reorderReuses(MaskOrder, Mask);
6092 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6093 Order.clear();
6094 return;
6095 }
6096 Order.assign(Sz, Sz);
6097 for (unsigned I = 0; I < Sz; ++I)
6098 if (MaskOrder[I] != PoisonMaskElem)
6099 Order[MaskOrder[I]] = I;
6100 fixupOrderingIndices(Order);
6101}
6102
6103std::optional<BoUpSLP::OrdersType>
6104BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6105 bool TopToBottom, bool IgnoreReorder) {
6106 assert(TE.isGather() && "Expected gather node only.");
6107 // Try to find subvector extract/insert patterns and reorder only such
6108 // patterns.
6109 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6110 Type *ScalarTy = GatheredScalars.front()->getType();
6111 size_t NumScalars = GatheredScalars.size();
6112 if (!isValidElementType(ScalarTy))
6113 return std::nullopt;
6114 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6115 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6116 SmallVector<int> ExtractMask;
6117 SmallVector<int> Mask;
6120 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6122 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6123 /*ForOrder=*/true);
6124 // No shuffled operands - ignore.
6125 if (GatherShuffles.empty() && ExtractShuffles.empty())
6126 return std::nullopt;
6127 OrdersType CurrentOrder(NumScalars, NumScalars);
6128 if (GatherShuffles.size() == 1 &&
6129 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6130 Entries.front().front()->isSame(TE.Scalars)) {
 6131 // If the node is fully matched and the whole tree is being rotated, there is
 6132 // no need to consider the matching order; the whole tree is rotated instead.
6133 if (TopToBottom)
6134 return std::nullopt;
6135 // No need to keep the order for the same user node.
6136 if (Entries.front().front()->UserTreeIndex.UserTE ==
6137 TE.UserTreeIndex.UserTE)
6138 return std::nullopt;
6139 // No need to keep the order for the matched root node, if it can be freely
6140 // reordered.
6141 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6142 return std::nullopt;
 6143 // If only 2 elements are shuffled and the matching node has reversed reuses,
 6144 // there is no need to account for the order; both orders work fine.
6145 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6146 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6147 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6148 [](const auto &P) {
6149 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6150 }))
6151 return std::nullopt;
6152
6153 // Perfect match in the graph, will reuse the previously vectorized
6154 // node. Cost is 0.
6155 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6156 return CurrentOrder;
6157 }
6158 auto IsSplatMask = [](ArrayRef<int> Mask) {
6159 int SingleElt = PoisonMaskElem;
6160 return all_of(Mask, [&](int I) {
6161 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6162 SingleElt = I;
6163 return I == PoisonMaskElem || I == SingleElt;
6164 });
6165 };
6166 // Exclusive broadcast mask - ignore.
6167 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6168 (Entries.size() != 1 ||
6169 Entries.front().front()->ReorderIndices.empty())) ||
6170 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6171 return std::nullopt;
6172 SmallBitVector ShuffledSubMasks(NumParts);
6173 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6174 ArrayRef<int> Mask, int PartSz, int NumParts,
6175 function_ref<unsigned(unsigned)> GetVF) {
6176 for (int I : seq<int>(0, NumParts)) {
6177 if (ShuffledSubMasks.test(I))
6178 continue;
6179 const int VF = GetVF(I);
6180 if (VF == 0)
6181 continue;
6182 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6183 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6184 // Shuffle of at least 2 vectors - ignore.
6185 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6186 llvm::fill(Slice, NumScalars);
6187 ShuffledSubMasks.set(I);
6188 continue;
6189 }
 6190 // Try to include as many elements from the mask as possible.
6191 int FirstMin = INT_MAX;
6192 int SecondVecFound = false;
6193 for (int K : seq<int>(Limit)) {
6194 int Idx = Mask[I * PartSz + K];
6195 if (Idx == PoisonMaskElem) {
6196 Value *V = GatheredScalars[I * PartSz + K];
6197 if (isConstant(V) && !isa<PoisonValue>(V)) {
6198 SecondVecFound = true;
6199 break;
6200 }
6201 continue;
6202 }
6203 if (Idx < VF) {
6204 if (FirstMin > Idx)
6205 FirstMin = Idx;
6206 } else {
6207 SecondVecFound = true;
6208 break;
6209 }
6210 }
6211 FirstMin = (FirstMin / PartSz) * PartSz;
6212 // Shuffle of at least 2 vectors - ignore.
6213 if (SecondVecFound) {
6214 llvm::fill(Slice, NumScalars);
6215 ShuffledSubMasks.set(I);
6216 continue;
6217 }
6218 for (int K : seq<int>(Limit)) {
6219 int Idx = Mask[I * PartSz + K];
6220 if (Idx == PoisonMaskElem)
6221 continue;
6222 Idx -= FirstMin;
6223 if (Idx >= PartSz) {
6224 SecondVecFound = true;
6225 break;
6226 }
6227 if (CurrentOrder[I * PartSz + Idx] >
6228 static_cast<unsigned>(I * PartSz + K) &&
6229 CurrentOrder[I * PartSz + Idx] !=
6230 static_cast<unsigned>(I * PartSz + Idx))
6231 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6232 }
6233 // Shuffle of at least 2 vectors - ignore.
6234 if (SecondVecFound) {
6235 llvm::fill(Slice, NumScalars);
6236 ShuffledSubMasks.set(I);
6237 continue;
6238 }
6239 }
6240 };
6241 int PartSz = getPartNumElems(NumScalars, NumParts);
6242 if (!ExtractShuffles.empty())
6243 TransformMaskToOrder(
6244 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6245 if (!ExtractShuffles[I])
6246 return 0U;
6247 unsigned VF = 0;
6248 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6249 for (unsigned Idx : seq<unsigned>(Sz)) {
6250 int K = I * PartSz + Idx;
6251 if (ExtractMask[K] == PoisonMaskElem)
6252 continue;
6253 if (!TE.ReuseShuffleIndices.empty())
6254 K = TE.ReuseShuffleIndices[K];
6255 if (K == PoisonMaskElem)
6256 continue;
6257 if (!TE.ReorderIndices.empty())
6258 K = std::distance(TE.ReorderIndices.begin(),
6259 find(TE.ReorderIndices, K));
6260 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6261 if (!EI)
6262 continue;
6263 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6264 ->getElementCount()
6265 .getKnownMinValue());
6266 }
6267 return VF;
6268 });
6269 // Check special corner case - single shuffle of the same entry.
6270 if (GatherShuffles.size() == 1 && NumParts != 1) {
6271 if (ShuffledSubMasks.any())
6272 return std::nullopt;
6273 PartSz = NumScalars;
6274 NumParts = 1;
6275 }
6276 if (!Entries.empty())
6277 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6278 if (!GatherShuffles[I])
6279 return 0U;
6280 return std::max(Entries[I].front()->getVectorFactor(),
6281 Entries[I].back()->getVectorFactor());
6282 });
6283 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6284 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6285 return std::nullopt;
6286 return std::move(CurrentOrder);
6287}
6288
6289static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6290 const TargetLibraryInfo &TLI,
6291 bool CompareOpcodes = true) {
6294 return false;
6295 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6296 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6297 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6298 (!GEP2 || GEP2->getNumOperands() == 2) &&
6299 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6300 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6301 !CompareOpcodes ||
6302 (GEP1 && GEP2 &&
6303 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6304}
6305
6306/// Calculates minimal alignment as a common alignment.
6307template <typename T>
6309 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6310 for (Value *V : VL)
6311 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6312 return CommonAlignment;
6313}
6314
6315/// Check if \p Order represents reverse order.
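/// E.g. Order = {3, 2, 1, 0} is a reverse order. Entries equal to the order
/// size (unused lanes) are ignored, so {4, 2, 1, 0} with size 4 is also
/// treated as reversed.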
6317 assert(!Order.empty() &&
6318 "Order is empty. Please check it before using isReverseOrder.");
6319 unsigned Sz = Order.size();
6320 return all_of(enumerate(Order), [&](const auto &Pair) {
6321 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6322 });
6323}
6324
6325/// Checks if the provided list of pointers \p Pointers represents the strided
6326/// pointers for type ElemTy. If they are not, std::nullopt is returned.
6327/// Otherwise, if \p Inst is not specified, a just-initialized optional value is
6328/// returned to show that the pointers represent strided pointers. If \p Inst is
6329/// specified, the runtime stride is materialized before the given \p Inst.
6330/// \returns std::nullopt if the pointers are not pointers with a runtime
6331/// stride; otherwise nullptr or the actual stride value.
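/// For example (an illustrative sketch): loads from base + i * %s (in
/// elements) for i = 0..3, where %s is only known at runtime, are recognized
/// as strided; the element stride %s is returned and, if \p Inst is provided,
/// it is materialized right before \p Inst.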
6332static std::optional<Value *>
6334 const DataLayout &DL, ScalarEvolution &SE,
6335 SmallVectorImpl<unsigned> &SortedIndices,
6336 Instruction *Inst = nullptr) {
6338 const SCEV *PtrSCEVLowest = nullptr;
6339 const SCEV *PtrSCEVHighest = nullptr;
6340 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6341 // addresses).
6342 for (Value *Ptr : PointerOps) {
6343 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6344 if (!PtrSCEV)
6345 return std::nullopt;
6346 SCEVs.push_back(PtrSCEV);
6347 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6348 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6349 continue;
6350 }
6351 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6352 if (isa<SCEVCouldNotCompute>(Diff))
6353 return std::nullopt;
6354 if (Diff->isNonConstantNegative()) {
6355 PtrSCEVLowest = PtrSCEV;
6356 continue;
6357 }
6358 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6359 if (isa<SCEVCouldNotCompute>(Diff1))
6360 return std::nullopt;
6361 if (Diff1->isNonConstantNegative()) {
6362 PtrSCEVHighest = PtrSCEV;
6363 continue;
6364 }
6365 }
6366 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6367 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6368 if (isa<SCEVCouldNotCompute>(Dist))
6369 return std::nullopt;
6370 int Size = DL.getTypeStoreSize(ElemTy);
6371 auto TryGetStride = [&](const SCEV *Dist,
6372 const SCEV *Multiplier) -> const SCEV * {
6373 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6374 if (M->getOperand(0) == Multiplier)
6375 return M->getOperand(1);
6376 if (M->getOperand(1) == Multiplier)
6377 return M->getOperand(0);
6378 return nullptr;
6379 }
6380 if (Multiplier == Dist)
6381 return SE.getConstant(Dist->getType(), 1);
6382 return SE.getUDivExactExpr(Dist, Multiplier);
6383 };
 6384 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6385 const SCEV *Stride = nullptr;
6386 if (Size != 1 || SCEVs.size() > 2) {
6387 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6388 Stride = TryGetStride(Dist, Sz);
6389 if (!Stride)
6390 return std::nullopt;
6391 }
6392 if (!Stride || isa<SCEVConstant>(Stride))
6393 return std::nullopt;
6394 // Iterate through all pointers and check if all distances are
 6395 // unique multiples of Stride.
6396 using DistOrdPair = std::pair<int64_t, int>;
6397 auto Compare = llvm::less_first();
6398 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6399 int Cnt = 0;
6400 bool IsConsecutive = true;
6401 for (const SCEV *PtrSCEV : SCEVs) {
6402 unsigned Dist = 0;
6403 if (PtrSCEV != PtrSCEVLowest) {
6404 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6405 const SCEV *Coeff = TryGetStride(Diff, Stride);
6406 if (!Coeff)
6407 return std::nullopt;
6408 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6409 if (!SC || isa<SCEVCouldNotCompute>(SC))
6410 return std::nullopt;
6411 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6412 SE.getMulExpr(Stride, SC)))
6413 ->isZero())
6414 return std::nullopt;
6415 Dist = SC->getAPInt().getZExtValue();
6416 }
6417 // If the strides are not the same or repeated, we can't vectorize.
6418 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6419 return std::nullopt;
6420 auto Res = Offsets.emplace(Dist, Cnt);
6421 if (!Res.second)
6422 return std::nullopt;
6423 // Consecutive order if the inserted element is the last one.
6424 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6425 ++Cnt;
6426 }
6427 if (Offsets.size() != SCEVs.size())
6428 return std::nullopt;
6429 SortedIndices.clear();
6430 if (!IsConsecutive) {
6431 // Fill SortedIndices array only if it is non-consecutive.
6432 SortedIndices.resize(PointerOps.size());
6433 Cnt = 0;
6434 for (const std::pair<int64_t, int> &Pair : Offsets) {
6435 SortedIndices[Cnt] = Pair.second;
6436 ++Cnt;
6437 }
6438 }
6439 if (!Inst)
6440 return nullptr;
6441 SCEVExpander Expander(SE, DL, "strided-load-vec");
6442 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
6443}
6444
6445static std::pair<InstructionCost, InstructionCost>
6447 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6448 Type *ScalarTy, VectorType *VecTy);
6449
6450/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6451/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
6452/// insert-subvector pattern.
6453static InstructionCost
6455 VectorType *Tp, ArrayRef<int> Mask = {},
6457 int Index = 0, VectorType *SubTp = nullptr,
6459 VectorType *DstTy = Tp;
6460 if (!Mask.empty())
6461 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6462
6463 if (Kind != TTI::SK_PermuteTwoSrc)
6464 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6465 Args);
6466 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6467 int NumSubElts;
6469 Mask, NumSrcElts, NumSubElts, Index)) {
6470 if (Index + NumSubElts > NumSrcElts &&
6471 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6472 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6474 }
6475 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6476 Args);
6477}
6478
6479/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6480/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6481/// instead of a scalar.
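/// E.g. (an illustrative sketch of the REVEC case): with
/// ScalarTy == <2 x float> and Ty == <8 x float>, DemandedElts has 4 bits and
/// every demanded position is costed as an insert/extract of a <2 x float>
/// subvector at offsets 0, 2, 4 and 6 instead of as scalar
/// insertelement/extractelement costs.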
6482static InstructionCost
6484 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6485 bool Extract, TTI::TargetCostKind CostKind,
6486 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6487 assert(!isa<ScalableVectorType>(Ty) &&
6488 "ScalableVectorType is not supported.");
6489 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6490 getNumElements(Ty) &&
6491 "Incorrect usage.");
6492 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6493 assert(SLPReVec && "Only supported by REVEC.");
6494 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6495 // of CreateInsertElement.
6496 unsigned ScalarTyNumElements = VecTy->getNumElements();
6498 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6499 if (!DemandedElts[I])
6500 continue;
6501 if (Insert)
6503 I * ScalarTyNumElements, VecTy);
6504 if (Extract)
6506 I * ScalarTyNumElements, VecTy);
6507 }
6508 return Cost;
6509 }
6510 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6511 CostKind, ForPoisonSrc, VL);
6512}
6513
6514/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6515/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6517 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6518 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6519 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6520 if (Opcode == Instruction::ExtractElement) {
6521 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6522 assert(SLPReVec && "Only supported by REVEC.");
6523 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6525 cast<VectorType>(Val), {}, CostKind,
6526 Index * VecTy->getNumElements(), VecTy);
6527 }
6528 }
6529 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6530 ScalarUserAndIdx);
6531}
6532
6533/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6534/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6536 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6537 VectorType *VecTy, unsigned Index,
6539 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6540 assert(SLPReVec && "Only supported by REVEC.");
6541 auto *SubTp =
6542 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6544 Index * ScalarTy->getNumElements(), SubTp) +
6546 CostKind);
6547 }
6548 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6549}
6550
6551/// Creates subvector insert. Generates shuffle using \p Generator or
6552/// using default shuffle.
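/// E.g. (illustrative values): inserting a <2 x i32> value \p V into an
/// <8 x i32> vector \p Vec at \p Index 4 first widens \p V with the mask
/// {0, 1, poison, poison, poison, poison, poison, poison} and then blends the
/// two vectors with the mask {0, 1, 2, 3, 8, 9, 6, 7}; if \p Generator is
/// provided, it is called with \p Vec, \p V and that final mask instead.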
6554 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6555 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6556 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6557 return Vec;
6558 const unsigned SubVecVF = getNumElements(V->getType());
 6559 // Create a shuffle; insertvector requires that the index be a multiple of
 6560 // the subvector length.
6561 const unsigned VecVF = getNumElements(Vec->getType());
6563 if (isa<PoisonValue>(Vec)) {
6564 auto *Begin = std::next(Mask.begin(), Index);
6565 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6566 Vec = Builder.CreateShuffleVector(V, Mask);
6567 return Vec;
6568 }
6569 std::iota(Mask.begin(), Mask.end(), 0);
6570 std::iota(std::next(Mask.begin(), Index),
6571 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6572 if (Generator)
6573 return Generator(Vec, V, Mask);
6574 // 1. Resize V to the size of Vec.
6575 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6576 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6577 V = Builder.CreateShuffleVector(V, ResizeMask);
6578 // 2. Insert V into Vec.
6579 return Builder.CreateShuffleVector(Vec, V, Mask);
6580}
6581
6582/// Generates subvector extract using \p Generator or using default shuffle.
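/// E.g. (illustrative values): extracting SubVecVF == 4 elements starting at
/// Index == 8 emits a shufflevector with the mask {8, 9, 10, 11}.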
6584 unsigned SubVecVF, unsigned Index) {
6585 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6586 std::iota(Mask.begin(), Mask.end(), Index);
6587 return Builder.CreateShuffleVector(Vec, Mask);
6588}
6589
6590/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6591/// with \p Order.
6592/// \return true if the mask represents strided access, false otherwise.
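/// E.g. (illustrative values): pointers at element offsets {0, 2, 4, 6} give
/// CompressMask = {0, 2, 4, 6} and return true (stride 2), while offsets
/// {0, 1, 3, 6} give CompressMask = {0, 1, 3, 6} and return false.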
6594 ArrayRef<unsigned> Order, Type *ScalarTy,
6595 const DataLayout &DL, ScalarEvolution &SE,
6596 SmallVectorImpl<int> &CompressMask) {
6597 const unsigned Sz = PointerOps.size();
6598 CompressMask.assign(Sz, PoisonMaskElem);
 6599 // The first element is always set.
6600 CompressMask[0] = 0;
6601 // Check if the mask represents strided access.
6602 std::optional<unsigned> Stride = 0;
6603 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6604 for (unsigned I : seq<unsigned>(1, Sz)) {
6605 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6606 std::optional<int64_t> OptPos =
6607 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6608 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6609 return false;
6610 unsigned Pos = static_cast<unsigned>(*OptPos);
6611 CompressMask[I] = Pos;
6612 if (!Stride)
6613 continue;
6614 if (*Stride == 0) {
6615 *Stride = Pos;
6616 continue;
6617 }
6618 if (Pos != *Stride * I)
6619 Stride.reset();
6620 }
6621 return Stride.has_value();
6622}
6623
6624/// Checks if the \p VL can be transformed to a (masked)load + compress or
6625/// (masked) interleaved load.
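/// E.g. (an illustrative sketch): four i32 loads at element offsets
/// {0, 1, 3, 4} from a common base may be turned into a single (masked) load
/// of <5 x i32> followed by a shuffle with CompressMask = {0, 1, 3, 4}, if
/// the cost model prefers that over gathering the scalars.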
6627 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6630 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6631 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6632 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6633 VectorType *&LoadVecTy) {
6634 InterleaveFactor = 0;
6635 Type *ScalarTy = VL.front()->getType();
6636 const size_t Sz = VL.size();
6637 auto *VecTy = getWidenedType(ScalarTy, Sz);
6639 SmallVector<int> Mask;
6640 if (!Order.empty())
6641 inversePermutation(Order, Mask);
6642 // Check external uses.
6643 for (const auto [I, V] : enumerate(VL)) {
6644 if (AreAllUsersVectorized(V))
6645 continue;
6646 InstructionCost ExtractCost =
6647 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6648 Mask.empty() ? I : Mask[I]);
6649 InstructionCost ScalarCost =
6650 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6651 if (ExtractCost <= ScalarCost)
6652 return false;
6653 }
6654 Value *Ptr0;
6655 Value *PtrN;
6656 if (Order.empty()) {
6657 Ptr0 = PointerOps.front();
6658 PtrN = PointerOps.back();
6659 } else {
6660 Ptr0 = PointerOps[Order.front()];
6661 PtrN = PointerOps[Order.back()];
6662 }
6663 std::optional<int64_t> Diff =
6664 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6665 if (!Diff)
6666 return false;
6667 const size_t MaxRegSize =
6669 .getFixedValue();
6670 // Check for very large distances between elements.
6671 if (*Diff / Sz >= MaxRegSize / 8)
6672 return false;
6673 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6674 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6675 Align CommonAlignment = LI->getAlign();
6676 IsMasked = !isSafeToLoadUnconditionally(
6677 Ptr0, LoadVecTy, CommonAlignment, DL,
6678 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6679 &TLI);
6680 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6681 LI->getPointerAddressSpace()))
6682 return false;
6683 // TODO: perform the analysis of each scalar load for better
6684 // safe-load-unconditionally analysis.
6685 bool IsStrided =
6686 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6687 assert(CompressMask.size() >= 2 && "At least two elements are required");
6688 SmallVector<Value *> OrderedPointerOps(PointerOps);
6689 if (!Order.empty())
6690 reorderScalars(OrderedPointerOps, Mask);
6691 auto [ScalarGEPCost, VectorGEPCost] =
6692 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6693 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6694 // The cost of scalar loads.
6695 InstructionCost ScalarLoadsCost =
6696 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6697 [&](InstructionCost C, Value *V) {
6698 return C + TTI.getInstructionCost(cast<Instruction>(V),
6699 CostKind);
6700 }) +
6701 ScalarGEPCost;
6702 APInt DemandedElts = APInt::getAllOnes(Sz);
6703 InstructionCost GatherCost =
6704 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6705 /*Insert=*/true,
6706 /*Extract=*/false, CostKind) +
6707 ScalarLoadsCost;
6708 InstructionCost LoadCost = 0;
6709 if (IsMasked) {
6710 LoadCost =
6711 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6712 LI->getPointerAddressSpace(), CostKind);
6713 } else {
6714 LoadCost =
6715 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6716 LI->getPointerAddressSpace(), CostKind);
6717 }
6718 if (IsStrided && !IsMasked && Order.empty()) {
6719 // Check for potential segmented(interleaved) loads.
6720 VectorType *AlignedLoadVecTy = getWidenedType(
6721 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6722 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6723 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6724 &TLI))
6725 AlignedLoadVecTy = LoadVecTy;
6726 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6727 CommonAlignment,
6728 LI->getPointerAddressSpace())) {
6729 InstructionCost InterleavedCost =
6730 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6731 Instruction::Load, AlignedLoadVecTy,
6732 CompressMask[1], {}, CommonAlignment,
6733 LI->getPointerAddressSpace(), CostKind, IsMasked);
6734 if (InterleavedCost < GatherCost) {
6735 InterleaveFactor = CompressMask[1];
6736 LoadVecTy = AlignedLoadVecTy;
6737 return true;
6738 }
6739 }
6740 }
6741 InstructionCost CompressCost = ::getShuffleCost(
6742 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6743 if (!Order.empty()) {
6744 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6745 for (unsigned I : seq<unsigned>(Sz)) {
6746 NewMask[I] = CompressMask[Mask[I]];
6747 }
6748 CompressMask.swap(NewMask);
6749 }
6750 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6751 return TotalVecCost < GatherCost;
6752}
6753
6754/// Checks if the \p VL can be transformed to a (masked)load + compress or
6755/// (masked) interleaved load.
6756static bool
6759 const DataLayout &DL, ScalarEvolution &SE,
6760 AssumptionCache &AC, const DominatorTree &DT,
6761 const TargetLibraryInfo &TLI,
6762 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6763 bool IsMasked;
6764 unsigned InterleaveFactor;
6765 SmallVector<int> CompressMask;
6766 VectorType *LoadVecTy;
6767 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6768 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6769 CompressMask, LoadVecTy);
6770}
6771
6772/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6773/// PointerOps:
6774/// 1. Target with strided load support is detected.
6775/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6776/// potential stride <= MaxProfitableLoadStride and the potential stride is
6777/// power-of-2 (to avoid perf regressions for the very small number of loads)
6778/// and max distance > number of loads, or potential stride is -1.
6779/// 3. The loads are ordered, or number of unordered loads <=
6780/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6781/// to avoid extra costs for very expensive shuffles).
6782/// 4. Any pointer operand is an instruction with users outside of the
6783/// current graph (for masked gathers, extra extractelement instructions
6784/// might be required).
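/// E.g. (an illustrative sketch): loads of A[0], A[5], A[10], A[15], A[20]
/// share a constant stride of 5 elements and, subject to the limits above,
/// may be emitted as a single strided load if the target reports
/// isLegalStridedLoadStore for the widened type.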
6786 ArrayRef<unsigned> Order,
6787 const TargetTransformInfo &TTI, const DataLayout &DL,
6788 ScalarEvolution &SE,
6789 const bool IsAnyPointerUsedOutGraph,
6790 const int64_t Diff) {
6791 const size_t Sz = VL.size();
6792 const uint64_t AbsoluteDiff = std::abs(Diff);
6793 Type *ScalarTy = VL.front()->getType();
6794 auto *VecTy = getWidenedType(ScalarTy, Sz);
6795 if (IsAnyPointerUsedOutGraph ||
6796 (AbsoluteDiff > Sz &&
6798 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6799 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6800 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6801 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6802 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6803 return false;
6804 Align Alignment =
6805 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
6806 ->getAlign();
6807 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6808 return false;
6809 Value *Ptr0;
6810 Value *PtrN;
6811 if (Order.empty()) {
6812 Ptr0 = PointerOps.front();
6813 PtrN = PointerOps.back();
6814 } else {
6815 Ptr0 = PointerOps[Order.front()];
6816 PtrN = PointerOps[Order.back()];
6817 }
6818 // Iterate through all pointers and check if all distances are
 6819 // unique multiples of Stride.
6821 for (Value *Ptr : PointerOps) {
6822 int64_t Dist = 0;
6823 if (Ptr == PtrN)
6824 Dist = Diff;
6825 else if (Ptr != Ptr0)
6826 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6827 // If the strides are not the same or repeated, we can't
6828 // vectorize.
6829 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6830 break;
6831 }
6832 if (Dists.size() == Sz)
6833 return true;
6834 }
6835 return false;
6836}
6837
6841 SmallVectorImpl<Value *> &PointerOps,
6842 unsigned *BestVF, bool TryRecursiveCheck) const {
6843 // Check that a vectorized load would load the same memory as a scalar
6844 // load. For example, we don't want to vectorize loads that are smaller
6845 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6846 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6847 // from such a struct, we read/write packed bits disagreeing with the
6848 // unvectorized version.
6849 if (BestVF)
6850 *BestVF = 0;
6852 return LoadsState::Gather;
6853 Type *ScalarTy = VL0->getType();
6854
6855 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6856 return LoadsState::Gather;
6857
6858 // Make sure all loads in the bundle are simple - we can't vectorize
6859 // atomic or volatile loads.
6860 PointerOps.clear();
6861 const size_t Sz = VL.size();
6862 PointerOps.resize(Sz);
6863 auto *POIter = PointerOps.begin();
6864 for (Value *V : VL) {
6865 auto *L = dyn_cast<LoadInst>(V);
6866 if (!L || !L->isSimple())
6867 return LoadsState::Gather;
6868 *POIter = L->getPointerOperand();
6869 ++POIter;
6870 }
6871
6872 Order.clear();
6873 // Check the order of pointer operands or that all pointers are the same.
6874 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6875
6876 auto *VecTy = getWidenedType(ScalarTy, Sz);
6877 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6878 if (!IsSorted) {
6879 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
6880 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
6881 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
6883 }
6884
6885 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6886 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6887 return LoadsState::Gather;
6888
6889 if (!all_of(PointerOps, [&](Value *P) {
6890 return arePointersCompatible(P, PointerOps.front(), *TLI);
6891 }))
6892 return LoadsState::Gather;
6893
6894 } else {
6895 Value *Ptr0;
6896 Value *PtrN;
6897 if (Order.empty()) {
6898 Ptr0 = PointerOps.front();
6899 PtrN = PointerOps.back();
6900 } else {
6901 Ptr0 = PointerOps[Order.front()];
6902 PtrN = PointerOps[Order.back()];
6903 }
6904 std::optional<int64_t> Diff =
6905 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6906 // Check that the sorted loads are consecutive.
6907 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6908 return LoadsState::Vectorize;
6909 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
6910 *TLI, [&](Value *V) {
6911 return areAllUsersVectorized(
6912 cast<Instruction>(V), UserIgnoreList);
6913 }))
6915 // Simple check if not a strided access - clear order.
6916 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
6917 // Try to generate strided load node.
6918 auto IsAnyPointerUsedOutGraph =
6919 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
6920 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6921 return !isVectorized(U) && !MustGather.contains(U);
6922 });
6923 });
6924 if (IsPossibleStrided &&
6925 isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
6926 IsAnyPointerUsedOutGraph, *Diff))
6928 }
6929 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6930 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6931 return LoadsState::Gather;
 6932 // Compare the cost of loads + shuffles with the cost of strided/masked
 6933 // gather loads. Returns true if the vectorized + shuffles representation
 6934 // is better than just a gather.
6935 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6936 unsigned *BestVF,
6937 bool ProfitableGatherPointers) {
6938 if (BestVF)
6939 *BestVF = 0;
6940 // Compare masked gather cost and loads + insert subvector costs.
6942 auto [ScalarGEPCost, VectorGEPCost] =
6943 getGEPCosts(TTI, PointerOps, PointerOps.front(),
6944 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6945 // Estimate the cost of masked gather GEP. If not a splat, roughly
6946 // estimate as a buildvector, otherwise estimate as splat.
6947 APInt DemandedElts = APInt::getAllOnes(Sz);
6948 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6949 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
6950 if (static_cast<unsigned>(count_if(
6951 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6952 any_of(PointerOps, [&](Value *V) {
6953 return getUnderlyingObject(V) !=
6954 getUnderlyingObject(PointerOps.front());
6955 }))
6956 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
6957 DemandedElts, /*Insert=*/true,
6958 /*Extract=*/false, CostKind);
6959 else
6960 VectorGEPCost +=
6962 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
6963 /*Insert=*/true, /*Extract=*/false, CostKind) +
6965 // The cost of scalar loads.
6966 InstructionCost ScalarLoadsCost =
6967 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6968 [&](InstructionCost C, Value *V) {
6969 return C + TTI.getInstructionCost(
6970 cast<Instruction>(V), CostKind);
6971 }) +
6972 ScalarGEPCost;
6973 // The cost of masked gather.
6974 InstructionCost MaskedGatherCost =
6976 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
6977 /*VariableMask=*/false, CommonAlignment, CostKind) +
6978 (ProfitableGatherPointers ? 0 : VectorGEPCost);
6979 InstructionCost GatherCost =
6980 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6981 /*Insert=*/true,
6982 /*Extract=*/false, CostKind) +
6983 ScalarLoadsCost;
 6984 // The list of loads is small, or a partial check is already being performed -
 6985 // directly compare the masked gather cost and the gather cost.
6986 constexpr unsigned ListLimit = 4;
6987 if (!TryRecursiveCheck || VL.size() < ListLimit)
6988 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
6989
6990 // FIXME: The following code has not been updated for non-power-of-2
6991 // vectors (and not whole registers). The splitting logic here does not
6992 // cover the original vector if the vector factor is not a power of two.
6993 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
6994 return false;
6995
6996 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
6997 unsigned MinVF = getMinVF(2 * Sz);
6998 DemandedElts.clearAllBits();
6999 // Iterate through possible vectorization factors and check if vectorized +
7000 // shuffles is better than just gather.
7001 for (unsigned VF =
7002 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7003 VF >= MinVF;
7004 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7006 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7007 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7009 SmallVector<Value *> PointerOps;
7010 LoadsState LS =
7011 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
7012 /*TryRecursiveCheck=*/false);
7013 // Check that the sorted loads are consecutive.
7014 if (LS == LoadsState::Gather) {
7015 if (BestVF) {
7016 DemandedElts.setAllBits();
7017 break;
7018 }
7019 DemandedElts.setBits(Cnt, Cnt + VF);
7020 continue;
7021 }
 7022 // If reordering is needed - consider it as a high-cost masked gather for now.
7023 if ((LS == LoadsState::Vectorize ||
7026 !Order.empty() && !isReverseOrder(Order))
7028 States.push_back(LS);
7029 }
7030 if (DemandedElts.isAllOnes())
7031 // All loads gathered - try smaller VF.
7032 continue;
 7033 // Can be vectorized later as a series of loads/insertelements.
7034 InstructionCost VecLdCost = 0;
7035 if (!DemandedElts.isZero()) {
7036 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7037 /*Insert=*/true,
7038 /*Extract=*/false, CostKind) +
7039 ScalarGEPCost;
7040 for (unsigned Idx : seq<unsigned>(VL.size()))
7041 if (DemandedElts[Idx])
7042 VecLdCost +=
7043 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7044 }
7045 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7046 for (auto [I, LS] : enumerate(States)) {
7047 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7048 InstructionCost VectorGEPCost =
7049 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7050 ? 0
7051 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7052 LI0->getPointerOperand(),
7053 Instruction::GetElementPtr, CostKind, ScalarTy,
7054 SubVecTy)
7055 .second;
7056 if (LS == LoadsState::ScatterVectorize) {
7057 if (static_cast<unsigned>(
7058 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7059 PointerOps.size() - 1 ||
7060 any_of(PointerOps, [&](Value *V) {
7061 return getUnderlyingObject(V) !=
7062 getUnderlyingObject(PointerOps.front());
7063 }))
7064 VectorGEPCost += getScalarizationOverhead(
7065 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7066 /*Insert=*/true, /*Extract=*/false, CostKind);
7067 else
7068 VectorGEPCost +=
7070 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7071 /*Insert=*/true, /*Extract=*/false, CostKind) +
7072 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7073 CostKind);
7074 }
7075 switch (LS) {
7077 VecLdCost +=
7078 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7079 LI0->getPointerAddressSpace(), CostKind,
7081 VectorGEPCost;
7082 break;
7084 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7085 LI0->getPointerOperand(),
7086 /*VariableMask=*/false,
7087 CommonAlignment, CostKind) +
7088 VectorGEPCost;
7089 break;
7091 VecLdCost += TTI.getMaskedMemoryOpCost(
7092 Instruction::Load, SubVecTy, CommonAlignment,
7093 LI0->getPointerAddressSpace(), CostKind) +
7094 VectorGEPCost +
7096 {}, CostKind);
7097 break;
7099 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7100 LI0->getPointerOperand(),
7101 /*VariableMask=*/false,
7102 CommonAlignment, CostKind) +
7103 VectorGEPCost;
7104 break;
7105 case LoadsState::Gather:
7106 // Gathers are already calculated - ignore.
7107 continue;
7108 }
7109 SmallVector<int> ShuffleMask(VL.size());
7110 for (int Idx : seq<int>(0, VL.size()))
7111 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7112 if (I > 0)
7113 VecLdCost +=
7114 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7115 CostKind, I * VF, SubVecTy);
7116 }
 7117 // If the masked gather cost is higher, it is better to vectorize, so
 7118 // consider it as a gather node. It will be estimated more precisely
 7119 // later.
7120 if (MaskedGatherCost >= VecLdCost &&
7121 VecLdCost - GatherCost < -SLPCostThreshold) {
7122 if (BestVF)
7123 *BestVF = VF;
7124 return true;
7125 }
7126 }
7127 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7128 };
7129 // TODO: need to improve analysis of the pointers, if not all of them are
7130 // GEPs or have > 2 operands, we end up with a gather node, which just
7131 // increases the cost.
7132 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7133 bool ProfitableGatherPointers =
7134 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7135 return L->isLoopInvariant(V);
7136 })) <= Sz / 2;
7137 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7138 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7139 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7140 (GEP && GEP->getNumOperands() == 2 &&
7141 isa<Constant, Instruction>(GEP->getOperand(1)));
7142 })) {
7143 // Check if potential masked gather can be represented as series
7144 // of loads + insertsubvectors.
 7145 // If the masked gather cost is higher, it is better to vectorize, so
 7146 // consider it as a gather node. It will be estimated more precisely
 7147 // later.
7148 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7149 ProfitableGatherPointers))
7151 }
7152
7153 return LoadsState::Gather;
7154}
7155
7157 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7158 const DataLayout &DL, ScalarEvolution &SE,
7159 SmallVectorImpl<unsigned> &SortedIndices) {
7160 assert(
7161 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7162 "Expected list of pointer operands.");
 7163 // Build a map from bases to vectors of (Ptr, Offset, OrigIdx) tuples, insert
 7164 // each Ptr into it, sort, and return the sorted indices so that related
 7165 // values end up next to one another.
7167 std::pair<BasicBlock *, Value *>,
7169 Bases;
7170 Bases
7171 .try_emplace(std::make_pair(
7173 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7174
7175 SortedIndices.clear();
7176 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7177 auto Key = std::make_pair(BBs[Cnt + 1],
7179 bool Found = any_of(Bases.try_emplace(Key).first->second,
7180 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7181 std::optional<int64_t> Diff =
7182 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7183 ElemTy, Ptr, DL, SE,
7184 /*StrictCheck=*/true);
7185 if (!Diff)
7186 return false;
7187
7188 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7189 return true;
7190 });
7191
7192 if (!Found) {
7193 // If we haven't found enough to usefully cluster, return early.
7194 if (Bases.size() > VL.size() / 2 - 1)
7195 return false;
7196
7197 // Not found already - add a new Base
7198 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7199 }
7200 }
7201
7202 if (Bases.size() == VL.size())
7203 return false;
7204
7205 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7206 Bases.front().second.size() == VL.size()))
7207 return false;
7208
 7209 // For each of the bases, sort the pointers by Offset and check if any of the
 7210 // bases become consecutively allocated.
7211 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7212 SmallPtrSet<Value *, 13> FirstPointers;
7213 SmallPtrSet<Value *, 13> SecondPointers;
7214 Value *P1 = Ptr1;
7215 Value *P2 = Ptr2;
7216 unsigned Depth = 0;
7217 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7218 if (P1 == P2 || Depth > RecursionMaxDepth)
7219 return false;
7220 FirstPointers.insert(P1);
7221 SecondPointers.insert(P2);
7222 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7223 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7224 ++Depth;
7225 }
7226 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7227 "Unable to find matching root.");
7228 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7229 };
7230 for (auto &Base : Bases) {
7231 for (auto &Vec : Base.second) {
7232 if (Vec.size() > 1) {
7234 int64_t InitialOffset = std::get<1>(Vec[0]);
7235 bool AnyConsecutive =
7236 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7237 return std::get<1>(P.value()) ==
7238 int64_t(P.index()) + InitialOffset;
7239 });
 7240 // Fill the SortedIndices array only if it looks worthwhile to sort the
 7241 // ptrs.
7242 if (!AnyConsecutive)
7243 return false;
7244 }
7245 }
7246 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7247 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7248 });
7249 }
7250
7251 for (auto &T : Bases)
7252 for (const auto &Vec : T.second)
7253 for (const auto &P : Vec)
7254 SortedIndices.push_back(std::get<2>(P));
7255
7256 assert(SortedIndices.size() == VL.size() &&
7257 "Expected SortedIndices to be the size of VL");
7258 return true;
7259}
7260
7261std::optional<BoUpSLP::OrdersType>
7262BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7263 assert(TE.isGather() && "Expected gather node only.");
7264 Type *ScalarTy = TE.Scalars[0]->getType();
7265
7267 Ptrs.reserve(TE.Scalars.size());
7269 BBs.reserve(TE.Scalars.size());
7270 for (Value *V : TE.Scalars) {
7271 auto *L = dyn_cast<LoadInst>(V);
7272 if (!L || !L->isSimple())
7273 return std::nullopt;
7274 Ptrs.push_back(L->getPointerOperand());
7275 BBs.push_back(L->getParent());
7276 }
7277
7278 BoUpSLP::OrdersType Order;
7279 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7280 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7281 return std::move(Order);
7282 return std::nullopt;
7283}
7284
7285/// Check if two insertelement instructions are from the same buildvector.
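/// E.g. (illustrative IR): for the chain
///   %bv0 = insertelement <4 x float> poison, float %a, i64 0
///   %bv1 = insertelement <4 x float> %bv0, float %b, i64 1
/// the two inserts belong to the same buildvector sequence, provided the
/// intermediate insert has a single use and no lane index is reused.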
7288 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
 7289 // Instructions must be from the same basic block.
7290 if (VU->getParent() != V->getParent())
7291 return false;
7292 // Checks if 2 insertelements are from the same buildvector.
7293 if (VU->getType() != V->getType())
7294 return false;
 7295 // Inserts with multiple uses are separate nodes.
7296 if (!VU->hasOneUse() && !V->hasOneUse())
7297 return false;
7298 auto *IE1 = VU;
7299 auto *IE2 = V;
7300 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7301 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7302 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7303 return false;
7304 // Go through the vector operand of insertelement instructions trying to find
7305 // either VU as the original vector for IE2 or V as the original vector for
7306 // IE1.
7307 SmallBitVector ReusedIdx(
7308 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7309 bool IsReusedIdx = false;
7310 do {
7311 if (IE2 == VU && !IE1)
7312 return VU->hasOneUse();
7313 if (IE1 == V && !IE2)
7314 return V->hasOneUse();
7315 if (IE1 && IE1 != V) {
7316 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7317 IsReusedIdx |= ReusedIdx.test(Idx1);
7318 ReusedIdx.set(Idx1);
7319 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7320 IE1 = nullptr;
7321 else
7322 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7323 }
7324 if (IE2 && IE2 != VU) {
7325 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7326 IsReusedIdx |= ReusedIdx.test(Idx2);
7327 ReusedIdx.set(Idx2);
7328 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7329 IE2 = nullptr;
7330 else
7331 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7332 }
7333 } while (!IsReusedIdx && (IE1 || IE2));
7334 return false;
7335}
7336
7337/// Checks if the specified instruction \p I is an alternate operation for
7338/// the given \p MainOp and \p AltOp instructions.
7339static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7340 Instruction *AltOp,
7341 const TargetLibraryInfo &TLI);
7342
7343std::optional<BoUpSLP::OrdersType>
7344BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7345 bool IgnoreReorder) {
 7346 // No need to reorder if we need to shuffle reuses - the node still needs to
 7347 // be shuffled anyway.
7348 if (!TE.ReuseShuffleIndices.empty()) {
7349 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7350 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7351 "Reshuffling scalars not yet supported for nodes with padding");
7352
7353 if (isSplat(TE.Scalars))
7354 return std::nullopt;
7355 // Check if reuse shuffle indices can be improved by reordering.
 7356 // For this, check that the reuse mask is "clustered", i.e. each scalar value
 7357 // is used once in each submask of size <number_of_scalars>.
7358 // Example: 4 scalar values.
7359 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7360 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7361 // element 3 is used twice in the second submask.
7362 unsigned Sz = TE.Scalars.size();
7363 if (TE.isGather()) {
7364 if (std::optional<OrdersType> CurrentOrder =
7365 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7366 SmallVector<int> Mask;
7367 fixupOrderingIndices(*CurrentOrder);
7368 inversePermutation(*CurrentOrder, Mask);
7369 ::addMask(Mask, TE.ReuseShuffleIndices);
7370 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7371 unsigned Sz = TE.Scalars.size();
7372 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7373 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7374 if (Idx != PoisonMaskElem)
7375 Res[Idx + K * Sz] = I + K * Sz;
7376 }
7377 return std::move(Res);
7378 }
7379 }
7380 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7381 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7382 2 * TE.getVectorFactor())) == 1)
7383 return std::nullopt;
7384 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7385 return std::nullopt;
7386 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7387 Sz)) {
7388 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7389 if (TE.ReorderIndices.empty())
7390 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7391 else
7392 inversePermutation(TE.ReorderIndices, ReorderMask);
7393 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7394 unsigned VF = ReorderMask.size();
7395 OrdersType ResOrder(VF, VF);
7396 unsigned NumParts = divideCeil(VF, Sz);
7397 SmallBitVector UsedVals(NumParts);
7398 for (unsigned I = 0; I < VF; I += Sz) {
7399 int Val = PoisonMaskElem;
7400 unsigned UndefCnt = 0;
7401 unsigned Limit = std::min(Sz, VF - I);
7402 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7403 [&](int Idx) {
7404 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7405 Val = Idx;
7406 if (Idx == PoisonMaskElem)
7407 ++UndefCnt;
7408 return Idx != PoisonMaskElem && Idx != Val;
7409 }) ||
7410 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7411 UndefCnt > Sz / 2)
7412 return std::nullopt;
7413 UsedVals.set(Val);
7414 for (unsigned K = 0; K < NumParts; ++K) {
7415 unsigned Idx = Val + Sz * K;
7416 if (Idx < VF && I + K < VF)
7417 ResOrder[Idx] = I + K;
7418 }
7419 }
7420 return std::move(ResOrder);
7421 }
7422 unsigned VF = TE.getVectorFactor();
 7423 // Try to build the correct order for extractelement instructions.
7424 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7425 TE.ReuseShuffleIndices.end());
7426 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7427 all_of(TE.Scalars, [Sz](Value *V) {
7428 if (isa<PoisonValue>(V))
7429 return true;
7430 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7431 return Idx && *Idx < Sz;
7432 })) {
7433 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7434 "by BinaryOperator and CastInst.");
7435 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7436 if (TE.ReorderIndices.empty())
7437 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7438 else
7439 inversePermutation(TE.ReorderIndices, ReorderMask);
7440 for (unsigned I = 0; I < VF; ++I) {
7441 int &Idx = ReusedMask[I];
7442 if (Idx == PoisonMaskElem)
7443 continue;
7444 Value *V = TE.Scalars[ReorderMask[Idx]];
7445 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7446 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7447 }
7448 }
 7449 // Build the order of VF size; the reuses shuffles need to be reordered, as
 7450 // they are always of VF size.
7451 OrdersType ResOrder(VF);
7452 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7453 auto *It = ResOrder.begin();
7454 for (unsigned K = 0; K < VF; K += Sz) {
7455 OrdersType CurrentOrder(TE.ReorderIndices);
7456 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7457 if (SubMask.front() == PoisonMaskElem)
7458 std::iota(SubMask.begin(), SubMask.end(), 0);
7459 reorderOrder(CurrentOrder, SubMask);
7460 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7461 std::advance(It, Sz);
7462 }
7463 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7464 return Data.index() == Data.value();
7465 }))
7466 return std::nullopt; // No need to reorder.
7467 return std::move(ResOrder);
7468 }
7469 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7470 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7471 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7472 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7473 return std::nullopt;
7474 if (TE.State == TreeEntry::SplitVectorize ||
7475 ((TE.State == TreeEntry::Vectorize ||
7476 TE.State == TreeEntry::StridedVectorize ||
7477 TE.State == TreeEntry::CompressVectorize) &&
7478 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7479 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7480 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7481 "Alternate instructions are only supported by "
7482 "BinaryOperator and CastInst.");
7483 return TE.ReorderIndices;
7484 }
7485 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7486 TE.isAltShuffle()) {
7487 assert(TE.ReuseShuffleIndices.empty() &&
7488 "ReuseShuffleIndices should be "
7489 "empty for alternate instructions.");
7490 SmallVector<int> Mask;
7491 TE.buildAltOpShuffleMask(
7492 [&](Instruction *I) {
7493 assert(TE.getMatchingMainOpOrAltOp(I) &&
7494 "Unexpected main/alternate opcode");
7495 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7496 },
7497 Mask);
7498 const int VF = TE.getVectorFactor();
7499 OrdersType ResOrder(VF, VF);
7500 for (unsigned I : seq<unsigned>(VF)) {
7501 if (Mask[I] == PoisonMaskElem)
7502 continue;
7503 ResOrder[Mask[I] % VF] = I;
7504 }
7505 return std::move(ResOrder);
7506 }
7507 if (!TE.ReorderIndices.empty())
7508 return TE.ReorderIndices;
7509 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7510 if (!TE.ReorderIndices.empty())
7511 return TE.ReorderIndices;
7512
7513 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7514 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7515 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7516 continue;
7517 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7518 if (!II)
7519 continue;
7520 Instruction *BVHead = nullptr;
7521 BasicBlock *BB = II->getParent();
7522 while (II && II->hasOneUse() && II->getParent() == BB) {
7523 BVHead = II;
7524 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7525 }
7526 I = BVHead;
7527 }
7528
7529 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7530 assert(BB1 != BB2 && "Expected different basic blocks.");
7531 if (!DT->isReachableFromEntry(BB1))
7532 return false;
7533 if (!DT->isReachableFromEntry(BB2))
7534 return true;
7535 auto *NodeA = DT->getNode(BB1);
7536 auto *NodeB = DT->getNode(BB2);
7537 assert(NodeA && "Should only process reachable instructions");
7538 assert(NodeB && "Should only process reachable instructions");
7539 assert((NodeA == NodeB) ==
7540 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7541 "Different nodes should have different DFS numbers");
7542 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7543 };
7544 auto PHICompare = [&](unsigned I1, unsigned I2) {
7545 Value *V1 = TE.Scalars[I1];
7546 Value *V2 = TE.Scalars[I2];
7547 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7548 return false;
7549 if (isa<PoisonValue>(V1))
7550 return true;
7551 if (isa<PoisonValue>(V2))
7552 return false;
7553 if (V1->getNumUses() < V2->getNumUses())
7554 return true;
7555 if (V1->getNumUses() > V2->getNumUses())
7556 return false;
7557 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7558 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7559 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7560 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7561 FirstUserOfPhi2->getParent());
7562 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7563 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7564 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7565 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7566 if (IE1 && !IE2)
7567 return true;
7568 if (!IE1 && IE2)
7569 return false;
7570 if (IE1 && IE2) {
7571 if (UserBVHead[I1] && !UserBVHead[I2])
7572 return true;
7573 if (!UserBVHead[I1])
7574 return false;
7575 if (UserBVHead[I1] == UserBVHead[I2])
7576 return getElementIndex(IE1) < getElementIndex(IE2);
7577 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7578 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7579 UserBVHead[I2]->getParent());
7580 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7581 }
7582 if (EE1 && !EE2)
7583 return true;
7584 if (!EE1 && EE2)
7585 return false;
7586 if (EE1 && EE2) {
7587 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7588 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7589 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7590 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7591 if (!Inst2 && !P2)
7592 return Inst1 || P1;
7593 if (EE1->getOperand(0) == EE2->getOperand(0))
7594 return getElementIndex(EE1) < getElementIndex(EE2);
7595 if (!Inst1 && Inst2)
7596 return false;
7597 if (Inst1 && Inst2) {
7598 if (Inst1->getParent() != Inst2->getParent())
7599 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7600 return Inst1->comesBefore(Inst2);
7601 }
7602 if (!P1 && P2)
7603 return false;
7604 assert(P1 && P2 &&
7605 "Expected either instructions or arguments vector operands.");
7606 return P1->getArgNo() < P2->getArgNo();
7607 }
7608 return false;
7609 };
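// PHICompare above sorts the lanes so that poison values come first, then
// values with fewer uses, with remaining ties broken by where the first user
// appears (build-vector heads, extractelement sources, or basic-block DFS
// order), so the following stable_sort groups related users together.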
7610 OrdersType Phis(TE.Scalars.size());
7611 std::iota(Phis.begin(), Phis.end(), 0);
7612 stable_sort(Phis, PHICompare);
7613 if (isIdentityOrder(Phis))
7614 return std::nullopt; // No need to reorder.
7615 return std::move(Phis);
7616 }
7617 if (TE.isGather() &&
7618 (!TE.hasState() || !TE.isAltShuffle() ||
7619 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7620 allSameType(TE.Scalars)) {
7621 // TODO: add analysis of other gather nodes with extractelement
7622 // instructions and other values/instructions, not only undefs.
7623 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7624 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7625 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7626 all_of(TE.Scalars, [](Value *V) {
7627 auto *EE = dyn_cast<ExtractElementInst>(V);
7628 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7629 })) {
7630 // Check that gather of extractelements can be represented as
7631 // just a shuffle of a single vector.
7632 OrdersType CurrentOrder;
7633 bool Reuse =
7634 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7635 if (Reuse || !CurrentOrder.empty())
7636 return std::move(CurrentOrder);
7637 }
7638 // If the gather node is <undef, v, .., poison> and
7639 // insertelement poison, v, 0 [+ permute]
7640 // is cheaper than
7641 // insertelement poison, v, n - try to reorder.
7642 // If rotating the whole graph, exclude the permute cost, the whole graph
7643 // might be transformed.
7644 int Sz = TE.Scalars.size();
7645 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7646 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7647 const auto *It = find_if_not(TE.Scalars, isConstant);
7648 if (It == TE.Scalars.begin())
7649 return OrdersType();
7650 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7651 if (It != TE.Scalars.end()) {
7652 OrdersType Order(Sz, Sz);
7653 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7654 Order[Idx] = 0;
7655 fixupOrderingIndices(Order);
7656 SmallVector<int> Mask;
7657 inversePermutation(Order, Mask);
7658 InstructionCost PermuteCost =
7659 TopToBottom
7660 ? 0
 7661 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
 7662 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7663 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7664 PoisonValue::get(Ty), *It);
7665 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7666 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7667 PoisonValue::get(Ty), *It);
7668 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7669 OrdersType Order(Sz, Sz);
7670 Order[Idx] = 0;
7671 return std::move(Order);
7672 }
7673 }
7674 }
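// Illustrative example: for Scalars == <undef, v, undef, undef> (Sz == 4, the
// only non-constant at Idx == 1), the candidate order moves v to lane 0, so
// the node becomes "insertelement poison, v, 0" plus an optional permute; the
// order is kept only if that is cheaper than inserting directly at lane 1.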
7675 if (isSplat(TE.Scalars))
7676 return std::nullopt;
7677 if (TE.Scalars.size() >= 3)
7678 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7679 return Order;
 7680 // Check if we can include the order of vectorized loads. For masked gathers do
7681 // extra analysis later, so include such nodes into a special list.
7682 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7683 SmallVector<Value *> PointerOps;
7684 OrdersType CurrentOrder;
7685 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7686 CurrentOrder, PointerOps);
 7687 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
 7688 Res == LoadsState::CompressVectorize)
 7689 return std::move(CurrentOrder);
7690 }
7691 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
 7692 // has been audited for correctness with non-power-of-two vectors.
7693 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7694 if (std::optional<OrdersType> CurrentOrder =
7695 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7696 return CurrentOrder;
7697 }
7698 return std::nullopt;
7699}
7700
7701/// Checks if the given mask is a "clustered" mask with the same clusters of
7702/// size \p Sz, which are not identity submasks.
7703static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
 7704 unsigned Sz) {
7705 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7706 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7707 return false;
7708 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7709 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7710 if (Cluster != FirstCluster)
7711 return false;
7712 }
7713 return true;
7714}
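// Example (illustrative): with Sz == 2, the mask {2, 3, 2, 3} is made of
// identical non-identity clusters and the function returns true, while
// {0, 1, 0, 1} (identity first cluster) and {2, 3, 3, 2} (different clusters)
// both return false.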
7715
7716void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7717 // Reorder reuses mask.
7718 reorderReuses(TE.ReuseShuffleIndices, Mask);
7719 const unsigned Sz = TE.Scalars.size();
 7720 // For vectorized and non-clustered reused scalars no need to do anything else.
7721 if (!TE.isGather() ||
 7722 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
 7723 Sz) ||
7724 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7725 return;
7726 SmallVector<int> NewMask;
7727 inversePermutation(TE.ReorderIndices, NewMask);
7728 addMask(NewMask, TE.ReuseShuffleIndices);
7729 // Clear reorder since it is going to be applied to the new mask.
7730 TE.ReorderIndices.clear();
7731 // Try to improve gathered nodes with clustered reuses, if possible.
7732 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7733 SmallVector<unsigned> NewOrder(Slice);
7734 inversePermutation(NewOrder, NewMask);
7735 reorderScalars(TE.Scalars, NewMask);
7736 // Fill the reuses mask with the identity submasks.
7737 for (auto *It = TE.ReuseShuffleIndices.begin(),
7738 *End = TE.ReuseShuffleIndices.end();
7739 It != End; std::advance(It, Sz))
7740 std::iota(It, std::next(It, Sz), 0);
7741}
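// Net effect on a gather node with clustered reuses: the gathered scalars are
// permuted so that the repeated reuse clusters become identity clusters
// (0, 1, ..., Sz - 1 repeated), and ReorderIndices is cleared because it has
// been folded into the new scalar order.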
7742
7743static void combineOrders(MutableArrayRef<unsigned> Order,
 7744 ArrayRef<unsigned> SecondaryOrder) {
7745 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7746 "Expected same size of orders");
7747 size_t Sz = Order.size();
7748 SmallBitVector UsedIndices(Sz);
7749 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7750 if (Order[Idx] != Sz)
7751 UsedIndices.set(Order[Idx]);
7752 }
7753 if (SecondaryOrder.empty()) {
7754 for (unsigned Idx : seq<unsigned>(0, Sz))
7755 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7756 Order[Idx] = Idx;
7757 } else {
7758 for (unsigned Idx : seq<unsigned>(0, Sz))
7759 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7760 !UsedIndices.test(SecondaryOrder[Idx]))
7761 Order[Idx] = SecondaryOrder[Idx];
7762 }
7763}
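// Example (illustrative): for Order == {3, S, S, 0}, where S marks an unset
// slot (== Sz), and an empty SecondaryOrder, slots 1 and 2 are not yet used as
// destinations, so they are filled with their own indices, giving {3, 1, 2, 0}.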
7764
7765bool BoUpSLP::isProfitableToReorder() const {
 7766 constexpr unsigned TinyVF = 2;
7767 constexpr unsigned TinyTree = 10;
7768 constexpr unsigned PhiOpsLimit = 12;
7769 constexpr unsigned GatherLoadsLimit = 2;
7770 if (VectorizableTree.size() <= TinyTree)
7771 return true;
7772 if (VectorizableTree.front()->hasState() &&
7773 !VectorizableTree.front()->isGather() &&
7774 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7775 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7776 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7777 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7778 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7779 VectorizableTree.front()->ReorderIndices.empty()) {
 7780 // Check if the tree has only a single store and a single (unordered) load
 7781 // node; other nodes are phis or geps/binops combined with phis, and/or a
 7782 // single gather load node.
7783 if (VectorizableTree.front()->hasState() &&
7784 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7785 VectorizableTree.front()->Scalars.size() == TinyVF &&
7786 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7787 return false;
 7788 // Single node which requires reordering - skip.
7789 if (VectorizableTree.front()->hasState() &&
7790 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7791 VectorizableTree.front()->ReorderIndices.empty()) {
7792 const unsigned ReorderedSplitsCnt =
7793 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7794 return TE->State == TreeEntry::SplitVectorize &&
7795 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7796 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7797 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7798 });
7799 if (ReorderedSplitsCnt <= 1 &&
7800 static_cast<unsigned>(count_if(
7801 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7802 return ((!TE->isGather() &&
7803 (TE->ReorderIndices.empty() ||
7804 (TE->UserTreeIndex.UserTE &&
7805 TE->UserTreeIndex.UserTE->State ==
7806 TreeEntry::Vectorize &&
7807 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7808 .empty()))) ||
7809 (TE->isGather() && TE->ReorderIndices.empty() &&
7810 (!TE->hasState() || TE->isAltShuffle() ||
7811 TE->getOpcode() == Instruction::Load ||
7812 TE->getOpcode() == Instruction::ZExt ||
7813 TE->getOpcode() == Instruction::SExt))) &&
7814 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7815 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7816 return !isConstant(V) && isVectorized(V);
7817 }));
7818 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7819 return false;
7820 }
7821 bool HasPhis = false;
7822 bool HasLoad = true;
7823 unsigned GatherLoads = 0;
7824 for (const std::unique_ptr<TreeEntry> &TE :
7825 ArrayRef(VectorizableTree).drop_front()) {
7826 if (TE->State == TreeEntry::SplitVectorize)
7827 continue;
7828 if (!TE->hasState()) {
7829 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7830 all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
7831 continue;
7832 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7833 any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
7834 continue;
7835 return true;
7836 }
7837 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7838 if (!TE->isGather()) {
7839 HasLoad = false;
7840 continue;
7841 }
7842 if (HasLoad)
7843 return true;
7844 ++GatherLoads;
7845 if (GatherLoads >= GatherLoadsLimit)
7846 return true;
7847 }
7848 if (TE->getOpcode() == Instruction::GetElementPtr ||
7849 Instruction::isBinaryOp(TE->getOpcode()))
7850 continue;
7851 if (TE->getOpcode() != Instruction::PHI &&
7852 (!TE->hasCopyableElements() ||
7853 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
7854 TE->Scalars.size() / 2))
7855 return true;
7856 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7857 TE->getNumOperands() > PhiOpsLimit)
7858 return false;
7859 HasPhis = true;
7860 }
7861 return !HasPhis;
7862 }
7863 return true;
7864}
7865
7866void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7867 ArrayRef<int> MaskOrder) {
7868 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7869 SmallVector<int> NewMask(getVectorFactor());
7870 SmallVector<int> NewMaskOrder(getVectorFactor());
7871 std::iota(NewMask.begin(), NewMask.end(), 0);
7872 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7873 if (Idx == 0) {
7874 copy(Mask, NewMask.begin());
7875 copy(MaskOrder, NewMaskOrder.begin());
7876 } else {
7877 assert(Idx == 1 && "Expected either 0 or 1 index.");
7878 unsigned Offset = CombinedEntriesWithIndices.back().second;
7879 for (unsigned I : seq<unsigned>(Mask.size())) {
7880 NewMask[I + Offset] = Mask[I] + Offset;
7881 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7882 }
7883 }
7884 reorderScalars(Scalars, NewMask);
7885 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
7886 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
7887 ReorderIndices.clear();
7888}
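// Example (illustrative): for a split node with vector factor 8 whose second
// half starts at offset 4, reordering operand Idx == 1 with Mask == {1, 0, 3, 2}
// leaves the first half untouched and swaps pairs in the second half, i.e.
// NewMask == {0, 1, 2, 3, 5, 4, 7, 6}.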
7889
7890void BoUpSLP::reorderTopToBottom() {
 7891 // Maps VF to the graph nodes.
 7892 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
 7893 // ExtractElement gather nodes which can be vectorized and need to handle
 7894 // their ordering.
 7895 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7896
 7897 // Phi nodes can have preferred ordering based on their result users.
 7898 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7899
 7900 // AltShuffles can also have a preferred ordering that leads to fewer
 7901 // instructions, e.g., the addsub instruction in x86.
 7902 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7903
 7904 // Maps a TreeEntry to the reorder indices of external users.
 7905 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
 7906 ExternalUserReorderMap;
7907 // Find all reorderable nodes with the given VF.
 7908 // Currently these are vectorized stores, loads, extracts + some gathering of
7909 // extracts.
7910 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7911 const std::unique_ptr<TreeEntry> &TE) {
7912 // Look for external users that will probably be vectorized.
7913 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7914 findExternalStoreUsersReorderIndices(TE.get());
7915 if (!ExternalUserReorderIndices.empty()) {
7916 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7917 ExternalUserReorderMap.try_emplace(TE.get(),
7918 std::move(ExternalUserReorderIndices));
7919 }
7920
7921 // Patterns like [fadd,fsub] can be combined into a single instruction in
7922 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7923 // to take into account their order when looking for the most used order.
7924 if (TE->hasState() && TE->isAltShuffle() &&
7925 TE->State != TreeEntry::SplitVectorize) {
7926 Type *ScalarTy = TE->Scalars[0]->getType();
7927 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
7928 unsigned Opcode0 = TE->getOpcode();
7929 unsigned Opcode1 = TE->getAltOpcode();
7930 SmallBitVector OpcodeMask(
7931 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
7932 // If this pattern is supported by the target then we consider the order.
7933 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7934 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7935 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
7936 }
7937 // TODO: Check the reverse order too.
7938 }
7939
7940 bool IgnoreReorder =
7941 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7942 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7943 VectorizableTree.front()->getOpcode() == Instruction::Store);
7944 if (std::optional<OrdersType> CurrentOrder =
7945 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
7946 // Do not include ordering for nodes used in the alt opcode vectorization,
 7947 // better to reorder them during the bottom-to-top stage. If we follow the order
7948 // here, it causes reordering of the whole graph though actually it is
7949 // profitable just to reorder the subgraph that starts from the alternate
 7950 // opcode vectorization node. Such nodes already end up with the shuffle
7951 // instruction and it is just enough to change this shuffle rather than
7952 // rotate the scalars for the whole graph.
7953 unsigned Cnt = 0;
7954 const TreeEntry *UserTE = TE.get();
7955 while (UserTE && Cnt < RecursionMaxDepth) {
7956 if (!UserTE->UserTreeIndex)
7957 break;
7958 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7959 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
7960 UserTE->UserTreeIndex.UserTE->Idx != 0)
7961 return;
7962 UserTE = UserTE->UserTreeIndex.UserTE;
7963 ++Cnt;
7964 }
7965 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7966 if (!(TE->State == TreeEntry::Vectorize ||
7967 TE->State == TreeEntry::StridedVectorize ||
7968 TE->State == TreeEntry::SplitVectorize ||
7969 TE->State == TreeEntry::CompressVectorize) ||
7970 !TE->ReuseShuffleIndices.empty())
7971 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
7972 if (TE->State == TreeEntry::Vectorize &&
7973 TE->getOpcode() == Instruction::PHI)
7974 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
7975 }
7976 });
7977
7978 // Reorder the graph nodes according to their vectorization factor.
7979 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
7980 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
7981 auto It = VFToOrderedEntries.find(VF);
7982 if (It == VFToOrderedEntries.end())
7983 continue;
 7984 // Try to find the most profitable order. We are just looking for the most
7985 // used order and reorder scalar elements in the nodes according to this
7986 // mostly used order.
7987 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
7988 // Delete VF entry upon exit.
7989 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
7990
7991 // All operands are reordered and used only in this node - propagate the
7992 // most used order to the user node.
 7993 MapVector<OrdersType, unsigned,
 7994 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
 7995 OrdersUses;
7996 for (const TreeEntry *OpTE : OrderedEntries) {
 7997 // No need to reorder these nodes, still need to extend and to use shuffle,
7998 // just need to merge reordering shuffle and the reuse shuffle.
7999 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8000 OpTE->State != TreeEntry::SplitVectorize)
8001 continue;
8002 // Count number of orders uses.
8003 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8004 &PhisToOrders]() -> const OrdersType & {
8005 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8006 auto It = GathersToOrders.find(OpTE);
8007 if (It != GathersToOrders.end())
8008 return It->second;
8009 }
8010 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8011 auto It = AltShufflesToOrders.find(OpTE);
8012 if (It != AltShufflesToOrders.end())
8013 return It->second;
8014 }
8015 if (OpTE->State == TreeEntry::Vectorize &&
8016 OpTE->getOpcode() == Instruction::PHI) {
8017 auto It = PhisToOrders.find(OpTE);
8018 if (It != PhisToOrders.end())
8019 return It->second;
8020 }
8021 return OpTE->ReorderIndices;
8022 }();
8023 // First consider the order of the external scalar users.
8024 auto It = ExternalUserReorderMap.find(OpTE);
8025 if (It != ExternalUserReorderMap.end()) {
8026 const auto &ExternalUserReorderIndices = It->second;
8027 // If the OpTE vector factor != number of scalars - use natural order,
8028 // it is an attempt to reorder node with reused scalars but with
8029 // external uses.
8030 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8031 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8032 ExternalUserReorderIndices.size();
8033 } else {
8034 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8035 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8036 }
8037 // No other useful reorder data in this entry.
8038 if (Order.empty())
8039 continue;
8040 }
8041 // Stores actually store the mask, not the order, need to invert.
8042 if (OpTE->State == TreeEntry::Vectorize &&
8043 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8044 assert(!OpTE->isAltShuffle() &&
8045 "Alternate instructions are only supported by BinaryOperator "
8046 "and CastInst.");
8047 SmallVector<int> Mask;
8048 inversePermutation(Order, Mask);
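 // E.g., if the recorded "order" for the store node is really the mask
 // {2, 0, 1, 3}, its inverse permutation {1, 2, 0, 3} is the actual lane
 // order that gets counted below (illustrative values).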
8049 unsigned E = Order.size();
8050 OrdersType CurrentOrder(E, E);
8051 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8052 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8053 });
8054 fixupOrderingIndices(CurrentOrder);
8055 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8056 } else {
8057 ++OrdersUses.try_emplace(Order, 0).first->second;
8058 }
8059 }
8060 if (OrdersUses.empty())
8061 continue;
8062 // Choose the most used order.
8063 unsigned IdentityCnt = 0;
8064 unsigned FilledIdentityCnt = 0;
8065 OrdersType IdentityOrder(VF, VF);
8066 for (auto &Pair : OrdersUses) {
8067 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8068 if (!Pair.first.empty())
8069 FilledIdentityCnt += Pair.second;
8070 IdentityCnt += Pair.second;
8071 combineOrders(IdentityOrder, Pair.first);
8072 }
8073 }
8074 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8075 unsigned Cnt = IdentityCnt;
8076 for (auto &Pair : OrdersUses) {
 8077 // Prefer identity order. But if a filled identity (non-empty order) is
 8078 // found with the same number of uses as the new candidate order, we can
 8079 // choose this candidate order.
8080 if (Cnt < Pair.second ||
8081 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8082 Cnt == Pair.second && !BestOrder.empty() &&
8083 isIdentityOrder(BestOrder))) {
8084 combineOrders(Pair.first, BestOrder);
8085 BestOrder = Pair.first;
8086 Cnt = Pair.second;
8087 } else {
8088 combineOrders(BestOrder, Pair.first);
8089 }
8090 }
8091 // Set order of the user node.
8092 if (isIdentityOrder(BestOrder))
8093 continue;
8094 fixupOrderingIndices(BestOrder);
8095 SmallVector<int> Mask;
8096 inversePermutation(BestOrder, Mask);
8097 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8098 unsigned E = BestOrder.size();
8099 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8100 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8101 });
8102 // Do an actual reordering, if profitable.
8103 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8104 // Just do the reordering for the nodes with the given VF.
8105 if (TE->Scalars.size() != VF) {
8106 if (TE->ReuseShuffleIndices.size() == VF) {
8107 assert(TE->State != TreeEntry::SplitVectorize &&
8108 "Split vectorized not expected.");
8109 // Need to reorder the reuses masks of the operands with smaller VF to
8110 // be able to find the match between the graph nodes and scalar
8111 // operands of the given node during vectorization/cost estimation.
8112 assert(
8113 (!TE->UserTreeIndex ||
8114 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8115 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8116 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8117 "All users must be of VF size.");
8118 if (SLPReVec) {
8119 assert(SLPReVec && "Only supported by REVEC.");
8120 // ShuffleVectorInst does not do reorderOperands (and it should not
8121 // because ShuffleVectorInst supports only a limited set of
8122 // patterns). Only do reorderNodeWithReuses if the user is not
8123 // ShuffleVectorInst.
8124 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8125 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8126 continue;
8127 }
8128 // Update ordering of the operands with the smaller VF than the given
8129 // one.
8130 reorderNodeWithReuses(*TE, Mask);
8131 // Update orders in user split vectorize nodes.
8132 if (TE->UserTreeIndex &&
8133 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8134 TE->UserTreeIndex.UserTE->reorderSplitNode(
8135 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8136 }
8137 continue;
8138 }
8139 if ((TE->State == TreeEntry::SplitVectorize &&
8140 TE->ReuseShuffleIndices.empty()) ||
8141 ((TE->State == TreeEntry::Vectorize ||
8142 TE->State == TreeEntry::StridedVectorize ||
8143 TE->State == TreeEntry::CompressVectorize) &&
 8144 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
 8145 InsertElementInst>(TE->getMainOp()) ||
8146 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8147 assert(
8148 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8149 TE->ReuseShuffleIndices.empty())) &&
8150 "Alternate instructions are only supported by BinaryOperator "
8151 "and CastInst.");
8152 // Build correct orders for extract{element,value}, loads,
8153 // stores and alternate (split) nodes.
8154 reorderOrder(TE->ReorderIndices, Mask);
8155 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8156 TE->reorderOperands(Mask);
8157 } else {
8158 // Reorder the node and its operands.
8159 TE->reorderOperands(Mask);
8160 assert(TE->ReorderIndices.empty() &&
8161 "Expected empty reorder sequence.");
8162 reorderScalars(TE->Scalars, Mask);
8163 }
8164 if (!TE->ReuseShuffleIndices.empty()) {
8165 // Apply reversed order to keep the original ordering of the reused
8166 // elements to avoid extra reorder indices shuffling.
8167 OrdersType CurrentOrder;
8168 reorderOrder(CurrentOrder, MaskOrder);
8169 SmallVector<int> NewReuses;
8170 inversePermutation(CurrentOrder, NewReuses);
8171 addMask(NewReuses, TE->ReuseShuffleIndices);
8172 TE->ReuseShuffleIndices.swap(NewReuses);
8173 } else if (TE->UserTreeIndex &&
8174 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8175 // Update orders in user split vectorize nodes.
8176 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8177 Mask, MaskOrder);
8178 }
8179 }
8180}
8181
8182void BoUpSLP::buildReorderableOperands(
8183 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8184 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8185 SmallVectorImpl<TreeEntry *> &GatherOps) {
8186 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8187 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8188 return OpData.first == I &&
8189 (OpData.second->State == TreeEntry::Vectorize ||
8190 OpData.second->State == TreeEntry::StridedVectorize ||
8191 OpData.second->State == TreeEntry::CompressVectorize ||
8192 OpData.second->State == TreeEntry::SplitVectorize);
8193 }))
8194 continue;
8195 // Do not request operands, if they do not exist.
8196 if (UserTE->hasState()) {
8197 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8198 UserTE->getOpcode() == Instruction::ExtractValue)
8199 continue;
8200 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8201 continue;
8202 if (UserTE->getOpcode() == Instruction::Store &&
8203 UserTE->State == TreeEntry::Vectorize && I == 1)
8204 continue;
8205 if (UserTE->getOpcode() == Instruction::Load &&
8206 (UserTE->State == TreeEntry::Vectorize ||
8207 UserTE->State == TreeEntry::StridedVectorize ||
8208 UserTE->State == TreeEntry::CompressVectorize))
8209 continue;
8210 }
8211 TreeEntry *TE = getOperandEntry(UserTE, I);
8212 assert(TE && "Expected operand entry.");
8213 if (!TE->isGather()) {
8214 // Add the node to the list of the ordered nodes with the identity
8215 // order.
8216 Edges.emplace_back(I, TE);
8217 // Add ScatterVectorize nodes to the list of operands, where just
8218 // reordering of the scalars is required. Similar to the gathers, so
8219 // simply add to the list of gathered ops.
8220 // If there are reused scalars, process this node as a regular vectorize
8221 // node, just reorder reuses mask.
8222 if (TE->State == TreeEntry::ScatterVectorize &&
8223 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8224 GatherOps.push_back(TE);
8225 continue;
8226 }
8227 if (ReorderableGathers.contains(TE))
8228 GatherOps.push_back(TE);
8229 }
8230}
8231
8232void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8233 struct TreeEntryCompare {
8234 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8235 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8236 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8237 return LHS->Idx < RHS->Idx;
8238 }
8239 };
 8240 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
 8241 DenseSet<const TreeEntry *> GathersToOrders;
8242 // Find all reorderable leaf nodes with the given VF.
 8243 // Currently these are vectorized loads, extracts without alternate operands +
 8244 // some gathering of extracts.
 8245 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8246 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8247 if (TE->State != TreeEntry::Vectorize &&
8248 TE->State != TreeEntry::StridedVectorize &&
8249 TE->State != TreeEntry::CompressVectorize &&
8250 TE->State != TreeEntry::SplitVectorize)
8251 NonVectorized.insert(TE.get());
8252 if (std::optional<OrdersType> CurrentOrder =
8253 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8254 Queue.push(TE.get());
8255 if (!(TE->State == TreeEntry::Vectorize ||
8256 TE->State == TreeEntry::StridedVectorize ||
8257 TE->State == TreeEntry::CompressVectorize ||
8258 TE->State == TreeEntry::SplitVectorize) ||
8259 !TE->ReuseShuffleIndices.empty())
8260 GathersToOrders.insert(TE.get());
8261 }
8262 }
8263
8264 // 1. Propagate order to the graph nodes, which use only reordered nodes.
 8265 // I.e., if the node has operands that are reordered, try to make at least
8266 // one operand order in the natural order and reorder others + reorder the
8267 // user node itself.
8268 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8269 while (!Queue.empty()) {
8270 // 1. Filter out only reordered nodes.
8271 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8272 TreeEntry *TE = Queue.top();
8273 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8274 Queue.pop();
8275 SmallVector<TreeEntry *> OrderedOps(1, TE);
8276 while (!Queue.empty()) {
8277 TE = Queue.top();
8278 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8279 break;
8280 Queue.pop();
8281 OrderedOps.push_back(TE);
8282 }
8283 for (TreeEntry *TE : OrderedOps) {
8284 if (!(TE->State == TreeEntry::Vectorize ||
8285 TE->State == TreeEntry::StridedVectorize ||
8286 TE->State == TreeEntry::CompressVectorize ||
8287 TE->State == TreeEntry::SplitVectorize ||
8288 (TE->isGather() && GathersToOrders.contains(TE))) ||
8289 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8290 !Visited.insert(TE).second)
8291 continue;
 8292 // Build a map between user nodes and their operand order to speed up
8293 // search. The graph currently does not provide this dependency directly.
8294 Users.first = TE->UserTreeIndex.UserTE;
8295 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8296 }
8297 if (Users.first) {
8298 auto &Data = Users;
8299 if (Data.first->State == TreeEntry::SplitVectorize) {
8300 assert(
8301 Data.second.size() <= 2 &&
8302 "Expected not greater than 2 operands for split vectorize node.");
8303 if (any_of(Data.second,
8304 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8305 continue;
8306 // Update orders in user split vectorize nodes.
8307 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8308 "Expected exactly 2 entries.");
8309 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8310 TreeEntry &OpTE = *VectorizableTree[P.first];
8311 OrdersType Order = OpTE.ReorderIndices;
8312 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8313 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8314 continue;
8315 const auto BestOrder =
8316 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8317 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8318 continue;
8319 Order = *BestOrder;
8320 }
8321 fixupOrderingIndices(Order);
8322 SmallVector<int> Mask;
8323 inversePermutation(Order, Mask);
8324 const unsigned E = Order.size();
8325 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8326 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8327 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8328 });
8329 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8330 // Clear ordering of the operand.
8331 if (!OpTE.ReorderIndices.empty()) {
8332 OpTE.ReorderIndices.clear();
8333 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8334 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8335 } else {
8336 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8337 reorderScalars(OpTE.Scalars, Mask);
8338 }
8339 }
8340 if (Data.first->ReuseShuffleIndices.empty() &&
8341 !Data.first->ReorderIndices.empty()) {
8342 // Insert user node to the list to try to sink reordering deeper in
8343 // the graph.
8344 Queue.push(Data.first);
8345 }
8346 continue;
8347 }
8348 // Check that operands are used only in the User node.
8349 SmallVector<TreeEntry *> GatherOps;
8350 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8351 GatherOps);
8352 // All operands are reordered and used only in this node - propagate the
8353 // most used order to the user node.
 8354 MapVector<OrdersType, unsigned,
 8355 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
 8356 OrdersUses;
8357 // Do the analysis for each tree entry only once, otherwise the order of
 8358 // the same node may be considered several times, though it might not be
 8359 // profitable.
 8360 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
 8361 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8362 for (const auto &Op : Data.second) {
8363 TreeEntry *OpTE = Op.second;
8364 if (!VisitedOps.insert(OpTE).second)
8365 continue;
8366 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8367 continue;
8368 const auto Order = [&]() -> const OrdersType {
8369 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8370 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8371 IgnoreReorder)
8372 .value_or(OrdersType(1));
8373 return OpTE->ReorderIndices;
8374 }();
8375 // The order is partially ordered, skip it in favor of fully non-ordered
8376 // orders.
8377 if (Order.size() == 1)
8378 continue;
8379
 8380 // Check that the reordering does not increase the number of shuffles, i.e.
 8381 // same-values nodes have the same parents or their parents have the same parents.
8382 if (!Order.empty() && !isIdentityOrder(Order)) {
8383 Value *Root = OpTE->hasState()
8384 ? OpTE->getMainOp()
8385 : *find_if_not(OpTE->Scalars, isConstant);
8386 auto GetSameNodesUsers = [&](Value *Root) {
 8387 SmallSetVector<TreeEntry *, 4> Res;
 8388 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8389 if (TE != OpTE && TE->UserTreeIndex &&
8390 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8391 TE->Scalars.size() == OpTE->Scalars.size() &&
8392 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8393 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8394 Res.insert(TE->UserTreeIndex.UserTE);
8395 }
8396 for (const TreeEntry *TE : getTreeEntries(Root)) {
8397 if (TE != OpTE && TE->UserTreeIndex &&
8398 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8399 TE->Scalars.size() == OpTE->Scalars.size() &&
8400 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8401 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8402 Res.insert(TE->UserTreeIndex.UserTE);
8403 }
8404 return Res.takeVector();
8405 };
8406 auto GetNumOperands = [](const TreeEntry *TE) {
8407 if (TE->State == TreeEntry::SplitVectorize)
8408 return TE->getNumOperands();
8409 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8410 return CI->arg_size();
8411 return TE->getNumOperands();
8412 };
8413 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8414 const TreeEntry *TE) {
 8415 Intrinsic::ID ID = Intrinsic::not_intrinsic;
 8416 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
 8417 ID = getVectorIntrinsicIDForCall(CI, TLI);
 8418 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
 8419 if (ID != Intrinsic::not_intrinsic &&
 8420 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
 8421 continue;
8422 const TreeEntry *Op = getOperandEntry(TE, Idx);
8423 if (Op->isGather() && Op->hasState()) {
8424 const TreeEntry *VecOp =
8425 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8426 if (VecOp)
8427 Op = VecOp;
8428 }
8429 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8430 return false;
8431 }
8432 return true;
8433 };
8434 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8435 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8436 if (!RevisitedOps.insert(UTE).second)
8437 return false;
8438 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8439 !UTE->ReuseShuffleIndices.empty() ||
8440 (UTE->UserTreeIndex &&
8441 UTE->UserTreeIndex.UserTE == Data.first) ||
8442 (Data.first->UserTreeIndex &&
8443 Data.first->UserTreeIndex.UserTE == UTE) ||
8444 (IgnoreReorder && UTE->UserTreeIndex &&
8445 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8446 NodeShouldBeReorderedWithOperands(UTE);
8447 }))
8448 continue;
8449 for (TreeEntry *UTE : Users) {
 8450 Intrinsic::ID ID = Intrinsic::not_intrinsic;
 8451 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
 8452 ID = getVectorIntrinsicIDForCall(CI, TLI);
 8453 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
 8454 if (ID != Intrinsic::not_intrinsic &&
 8455 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
 8456 continue;
8457 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8458 Visited.erase(Op);
8459 Queue.push(const_cast<TreeEntry *>(Op));
8460 }
8461 }
8462 }
8463 unsigned NumOps = count_if(
8464 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8465 return P.second == OpTE;
8466 });
8467 // Stores actually store the mask, not the order, need to invert.
8468 if (OpTE->State == TreeEntry::Vectorize &&
8469 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8470 assert(!OpTE->isAltShuffle() &&
8471 "Alternate instructions are only supported by BinaryOperator "
8472 "and CastInst.");
8473 SmallVector<int> Mask;
8474 inversePermutation(Order, Mask);
8475 unsigned E = Order.size();
8476 OrdersType CurrentOrder(E, E);
8477 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8478 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8479 });
8480 fixupOrderingIndices(CurrentOrder);
8481 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8482 } else {
8483 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8484 }
8485 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8486 const auto AllowsReordering = [&](const TreeEntry *TE) {
8487 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8488 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8489 (IgnoreReorder && TE->Idx == 0))
8490 return true;
8491 if (TE->isGather()) {
8492 if (GathersToOrders.contains(TE))
8493 return !getReorderingData(*TE, /*TopToBottom=*/false,
8494 IgnoreReorder)
8495 .value_or(OrdersType(1))
8496 .empty();
8497 return true;
8498 }
8499 return false;
8500 };
8501 if (OpTE->UserTreeIndex) {
8502 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8503 if (!VisitedUsers.insert(UserTE).second)
8504 continue;
8505 // May reorder user node if it requires reordering, has reused
8506 // scalars, is an alternate op vectorize node or its op nodes require
8507 // reordering.
8508 if (AllowsReordering(UserTE))
8509 continue;
8510 // Check if users allow reordering.
8511 // Currently look up just 1 level of operands to avoid increase of
8512 // the compile time.
8513 // Profitable to reorder if definitely more operands allow
8514 // reordering rather than those with natural order.
 8515 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
 8516 if (static_cast<unsigned>(count_if(
8517 Ops, [UserTE, &AllowsReordering](
8518 const std::pair<unsigned, TreeEntry *> &Op) {
8519 return AllowsReordering(Op.second) &&
8520 Op.second->UserTreeIndex.UserTE == UserTE;
8521 })) <= Ops.size() / 2)
8522 ++Res.first->second;
8523 }
8524 }
8525 if (OrdersUses.empty()) {
8526 Visited.insert_range(llvm::make_second_range(Data.second));
8527 continue;
8528 }
8529 // Choose the most used order.
8530 unsigned IdentityCnt = 0;
8531 unsigned VF = Data.second.front().second->getVectorFactor();
8532 OrdersType IdentityOrder(VF, VF);
8533 for (auto &Pair : OrdersUses) {
8534 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8535 IdentityCnt += Pair.second;
8536 combineOrders(IdentityOrder, Pair.first);
8537 }
8538 }
8539 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8540 unsigned Cnt = IdentityCnt;
8541 for (auto &Pair : OrdersUses) {
 8542 // Prefer identity order. But if a filled identity (non-empty order) is
 8543 // found with the same number of uses as the new candidate order, we can
 8544 // choose this candidate order.
8545 if (Cnt < Pair.second) {
8546 combineOrders(Pair.first, BestOrder);
8547 BestOrder = Pair.first;
8548 Cnt = Pair.second;
8549 } else {
8550 combineOrders(BestOrder, Pair.first);
8551 }
8552 }
8553 // Set order of the user node.
8554 if (isIdentityOrder(BestOrder)) {
8555 Visited.insert_range(llvm::make_second_range(Data.second));
8556 continue;
8557 }
8558 fixupOrderingIndices(BestOrder);
8559 // Erase operands from OrderedEntries list and adjust their orders.
8560 VisitedOps.clear();
8561 SmallVector<int> Mask;
8562 inversePermutation(BestOrder, Mask);
8563 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8564 unsigned E = BestOrder.size();
8565 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8566 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8567 });
8568 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8569 TreeEntry *TE = Op.second;
8570 if (!VisitedOps.insert(TE).second)
8571 continue;
8572 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8573 reorderNodeWithReuses(*TE, Mask);
8574 continue;
8575 }
8576 // Gathers are processed separately.
8577 if (TE->State != TreeEntry::Vectorize &&
8578 TE->State != TreeEntry::StridedVectorize &&
8579 TE->State != TreeEntry::CompressVectorize &&
8580 TE->State != TreeEntry::SplitVectorize &&
8581 (TE->State != TreeEntry::ScatterVectorize ||
8582 TE->ReorderIndices.empty()))
8583 continue;
8584 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8585 TE->ReorderIndices.empty()) &&
8586 "Non-matching sizes of user/operand entries.");
8587 reorderOrder(TE->ReorderIndices, Mask);
8588 if (IgnoreReorder && TE == VectorizableTree.front().get())
8589 IgnoreReorder = false;
8590 }
8591 // For gathers just need to reorder its scalars.
8592 for (TreeEntry *Gather : GatherOps) {
8593 assert(Gather->ReorderIndices.empty() &&
8594 "Unexpected reordering of gathers.");
8595 if (!Gather->ReuseShuffleIndices.empty()) {
8596 // Just reorder reuses indices.
8597 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8598 continue;
8599 }
8600 reorderScalars(Gather->Scalars, Mask);
8601 Visited.insert(Gather);
8602 }
8603 // Reorder operands of the user node and set the ordering for the user
8604 // node itself.
8605 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8606 return TE.isAltShuffle() &&
8607 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8608 TE.ReorderIndices.empty());
8609 };
8610 if (Data.first->State != TreeEntry::Vectorize ||
8611 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8612 Data.first->getMainOp()) ||
8613 IsNotProfitableAltCodeNode(*Data.first))
8614 Data.first->reorderOperands(Mask);
8615 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8616 IsNotProfitableAltCodeNode(*Data.first) ||
8617 Data.first->State == TreeEntry::StridedVectorize ||
8618 Data.first->State == TreeEntry::CompressVectorize) {
8619 reorderScalars(Data.first->Scalars, Mask);
8620 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8621 /*BottomOrder=*/true);
8622 if (Data.first->ReuseShuffleIndices.empty() &&
8623 !Data.first->ReorderIndices.empty() &&
8624 !IsNotProfitableAltCodeNode(*Data.first)) {
8625 // Insert user node to the list to try to sink reordering deeper in
8626 // the graph.
8627 Queue.push(Data.first);
8628 }
8629 } else {
8630 reorderOrder(Data.first->ReorderIndices, Mask);
8631 }
8632 }
8633 }
8634 // If the reordering is unnecessary, just remove the reorder.
8635 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8636 VectorizableTree.front()->ReuseShuffleIndices.empty())
8637 VectorizableTree.front()->ReorderIndices.clear();
8638}
8639
8640Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8641 if (Entry.hasState() &&
8642 (Entry.getOpcode() == Instruction::Store ||
8643 Entry.getOpcode() == Instruction::Load) &&
8644 Entry.State == TreeEntry::StridedVectorize &&
8645 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8646 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8647 return dyn_cast<Instruction>(Entry.Scalars.front());
8648}
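// E.g., for a strided load/store entry with ReorderIndices == {3, 2, 1, 0}
// (a reversed order), the root instruction is taken from lane 3
// (ReorderIndices.front()) rather than lane 0.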
8649
8650void BoUpSLP::buildExternalUses(
 8651 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8652 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8653 DenseMap<Value *, unsigned> ScalarToExtUses;
8654 SmallPtrSet<Value *, 4> ExternalUsers;
8655 // Collect the values that we need to extract from the tree.
8656 for (auto &TEPtr : VectorizableTree) {
8657 TreeEntry *Entry = TEPtr.get();
8658
8659 // No need to handle users of gathered values.
8660 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8661 continue;
8662
8663 // For each lane:
8664 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8665 Value *Scalar = Entry->Scalars[Lane];
8666 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8667 continue;
8668
8669 // All uses must be replaced already? No need to do it again.
8670 auto It = ScalarToExtUses.find(Scalar);
8671 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8672 continue;
8673
8674 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8675 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8676 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8677 << " from " << *Scalar << "for many users.\n");
8678 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8679 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8680 ExternalUsesWithNonUsers.insert(Scalar);
8681 continue;
8682 }
8683
8684 // Check if the scalar is externally used as an extra arg.
8685 const auto ExtI = ExternallyUsedValues.find(Scalar);
8686 if (ExtI != ExternallyUsedValues.end()) {
8687 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8688 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8689 << FoundLane << " from " << *Scalar << ".\n");
8690 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8691 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8692 continue;
8693 }
8694 for (User *U : Scalar->users()) {
8695 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8696
8697 Instruction *UserInst = dyn_cast<Instruction>(U);
8698 if (!UserInst || isDeleted(UserInst))
8699 continue;
8700
8701 // Ignore users in the user ignore list.
8702 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8703 continue;
8704
8705 // Skip in-tree scalars that become vectors
8706 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8707 !UseEntries.empty()) {
8708 // Some in-tree scalars will remain as scalar in vectorized
8709 // instructions. If that is the case, the one in FoundLane will
8710 // be used.
8711 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8712 isa<LoadInst, StoreInst>(UserInst)) ||
8713 isa<CallInst>(UserInst)) ||
8714 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8715 return UseEntry->State == TreeEntry::ScatterVectorize ||
 8716 !doesInTreeUserNeedToExtract(
 8717 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8718 TTI);
8719 })) {
8720 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8721 << ".\n");
8722 assert(none_of(UseEntries,
8723 [](TreeEntry *UseEntry) {
8724 return UseEntry->isGather();
8725 }) &&
8726 "Bad state");
8727 continue;
8728 }
8729 U = nullptr;
8730 if (It != ScalarToExtUses.end()) {
8731 ExternalUses[It->second].User = nullptr;
8732 break;
8733 }
8734 }
8735
8736 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8737 U = nullptr;
8738 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8739 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8740 << " from lane " << FoundLane << " from " << *Scalar
8741 << ".\n");
8742 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8743 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8744 ExternalUsesWithNonUsers.insert(Scalar);
8745 if (!U)
8746 break;
8747 }
8748 }
8749 }
8750}
8751
8752SmallVector<SmallVector<StoreInst *>>
8753BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
 8754 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
 8755 SmallVector<StoreInst *>, 8>
 8756 PtrToStoresMap;
8757 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8758 Value *V = TE->Scalars[Lane];
8759 // Don't iterate over the users of constant data.
8760 if (!isa<Instruction>(V))
8761 continue;
8762 // To save compilation time we don't visit if we have too many users.
8763 if (V->hasNUsesOrMore(UsesLimit))
8764 break;
8765
8766 // Collect stores per pointer object.
8767 for (User *U : V->users()) {
8768 auto *SI = dyn_cast<StoreInst>(U);
8769 // Test whether we can handle the store. V might be a global, which could
8770 // be used in a different function.
8771 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8772 !isValidElementType(SI->getValueOperand()->getType()))
8773 continue;
 8774 // Skip entry if already vectorized.
8775 if (isVectorized(U))
8776 continue;
8777
8778 Value *Ptr =
8779 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8780 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8781 SI->getValueOperand()->getType(), Ptr}];
8782 // For now just keep one store per pointer object per lane.
8783 // TODO: Extend this to support multiple stores per pointer per lane
8784 if (StoresVec.size() > Lane)
8785 continue;
8786 if (!StoresVec.empty()) {
8787 std::optional<int64_t> Diff = getPointersDiff(
8788 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8789 SI->getValueOperand()->getType(),
8790 StoresVec.front()->getPointerOperand(), *DL, *SE,
8791 /*StrictCheck=*/true);
8792 // We failed to compare the pointers so just abandon this store.
8793 if (!Diff)
8794 continue;
8795 }
8796 StoresVec.push_back(SI);
8797 }
8798 }
8799 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8800 unsigned I = 0;
8801 for (auto &P : PtrToStoresMap) {
8802 Res[I].swap(P.second);
8803 ++I;
8804 }
8805 return Res;
8806}
8807
8808bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8809 OrdersType &ReorderIndices) const {
 8810 // We check whether the stores in StoresVec can form a vector by sorting them
8811 // and checking whether they are consecutive.
8812
8813 // To avoid calling getPointersDiff() while sorting we create a vector of
8814 // pairs {store, offset from first} and sort this instead.
 8815 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
 8816 StoreInst *S0 = StoresVec[0];
8817 StoreOffsetVec.emplace_back(0, 0);
8818 Type *S0Ty = S0->getValueOperand()->getType();
8819 Value *S0Ptr = S0->getPointerOperand();
8820 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8821 StoreInst *SI = StoresVec[Idx];
8822 std::optional<int64_t> Diff =
8823 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8824 SI->getPointerOperand(), *DL, *SE,
8825 /*StrictCheck=*/true);
8826 StoreOffsetVec.emplace_back(*Diff, Idx);
8827 }
8828
8829 // Check if the stores are consecutive by checking if their difference is 1.
8830 if (StoreOffsetVec.size() != StoresVec.size())
8831 return false;
8832 sort(StoreOffsetVec, llvm::less_first());
8833 unsigned Idx = 0;
8834 int64_t PrevDist = 0;
8835 for (const auto &P : StoreOffsetVec) {
8836 if (Idx > 0 && P.first != PrevDist + 1)
8837 return false;
8838 PrevDist = P.first;
8839 ++Idx;
8840 }
8841
8842 // Calculate the shuffle indices according to their offset against the sorted
8843 // StoreOffsetVec.
8844 ReorderIndices.assign(StoresVec.size(), 0);
8845 bool IsIdentity = true;
8846 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8847 ReorderIndices[P.second] = I;
8848 IsIdentity &= P.second == I;
8849 }
8850 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8851 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8852 // same convention here.
8853 if (IsIdentity)
8854 ReorderIndices.clear();
8855
8856 return true;
8857}
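// Example (illustrative): for four stores whose offsets from StoresVec[0] are
// {0, 2, 1, 3}, the sorted offsets 0..3 are consecutive, so the stores can
// form a vector and ReorderIndices becomes {0, 2, 1, 3}; offsets such as
// {0, 2, 4, 6} are rejected because they are not consecutive.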
8858
8859#ifndef NDEBUG
8860LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
 8861 for (unsigned Idx : Order)
8862 dbgs() << Idx << ", ";
8863 dbgs() << "\n";
8864}
8865#endif
8866
8867SmallVector<BoUpSLP::OrdersType, 1>
8868BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8869 unsigned NumLanes = TE->Scalars.size();
8870
8871 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8872
8873 // Holds the reorder indices for each candidate store vector that is a user of
8874 // the current TreeEntry.
8875 SmallVector<OrdersType, 1> ExternalReorderIndices;
8876
8877 // Now inspect the stores collected per pointer and look for vectorization
8878 // candidates. For each candidate calculate the reorder index vector and push
8879 // it into `ExternalReorderIndices`
8880 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8881 // If we have fewer than NumLanes stores, then we can't form a vector.
8882 if (StoresVec.size() != NumLanes)
8883 continue;
8884
8885 // If the stores are not consecutive then abandon this StoresVec.
8886 OrdersType ReorderIndices;
8887 if (!canFormVector(StoresVec, ReorderIndices))
8888 continue;
8889
8890 // We now know that the scalars in StoresVec can form a vector instruction,
8891 // so set the reorder indices.
8892 ExternalReorderIndices.push_back(ReorderIndices);
8893 }
8894 return ExternalReorderIndices;
8895}
8896
8897void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
 8898 const SmallDenseSet<Value *> &UserIgnoreLst) {
8899 deleteTree();
8900 UserIgnoreList = &UserIgnoreLst;
8901 if (!allSameType(Roots))
8902 return;
8903 buildTreeRec(Roots, 0, EdgeInfo());
8904}
8905
8906void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
 8907 deleteTree();
8908 if (!allSameType(Roots))
8909 return;
8910 buildTreeRec(Roots, 0, EdgeInfo());
8911}
8912
 8913/// Tries to find a subvector of loads and builds a new vector of only loads
 8914/// if it can be profitable.
8915static void gatherPossiblyVectorizableLoads(
 8916 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
 8917 ScalarEvolution &SE, const TargetTransformInfo &TTI,
 8918 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8919 bool AddNew = true) {
8920 if (VL.empty())
8921 return;
8922 Type *ScalarTy = getValueType(VL.front());
8923 if (!isValidElementType(ScalarTy))
8924 return;
 8925 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
 8926 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8927 for (Value *V : VL) {
8928 auto *LI = dyn_cast<LoadInst>(V);
8929 if (!LI)
8930 continue;
8931 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
8932 continue;
8933 bool IsFound = false;
8934 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
8935 assert(LI->getParent() == Data.front().first->getParent() &&
8936 LI->getType() == Data.front().first->getType() &&
8937 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8938 getUnderlyingObject(Data.front().first->getPointerOperand(),
 8939 RecursionMaxDepth) &&
 8940 "Expected loads with the same type, same parent and same "
8941 "underlying pointer.");
8942 std::optional<int64_t> Dist = getPointersDiff(
8943 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
8944 Data.front().first->getPointerOperand(), DL, SE,
8945 /*StrictCheck=*/true);
8946 if (!Dist)
8947 continue;
8948 auto It = Map.find(*Dist);
8949 if (It != Map.end() && It->second != LI)
8950 continue;
8951 if (It == Map.end()) {
8952 Data.emplace_back(LI, *Dist);
8953 Map.try_emplace(*Dist, LI);
8954 }
8955 IsFound = true;
8956 break;
8957 }
8958 if (!IsFound) {
8959 ClusteredLoads.emplace_back().emplace_back(LI, 0);
8960 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
8961 }
8962 }
8963 auto FindMatchingLoads =
 8964 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
 8965 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
 8966 &GatheredLoads,
8967 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
8968 int64_t &Offset, unsigned &Start) {
8969 if (Loads.empty())
8970 return GatheredLoads.end();
8971 LoadInst *LI = Loads.front().first;
8972 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
8973 if (Idx < Start)
8974 continue;
8975 ToAdd.clear();
8976 if (LI->getParent() != Data.front().first->getParent() ||
8977 LI->getType() != Data.front().first->getType())
8978 continue;
8979 std::optional<int64_t> Dist =
 8980 getPointersDiff(LI->getType(), LI->getPointerOperand(),
 8981 Data.front().first->getType(),
8982 Data.front().first->getPointerOperand(), DL, SE,
8983 /*StrictCheck=*/true);
8984 if (!Dist)
8985 continue;
8986 SmallSet<int64_t, 4> DataDists;
 8987 SmallPtrSet<LoadInst *, 4> DataLoads;
 8988 for (std::pair<LoadInst *, int64_t> P : Data) {
8989 DataDists.insert(P.second);
8990 DataLoads.insert(P.first);
8991 }
8992 // Found matching gathered loads - check if all loads are unique or
8993 // can be effectively vectorized.
8994 unsigned NumUniques = 0;
8995 for (auto [Cnt, Pair] : enumerate(Loads)) {
8996 bool Used = DataLoads.contains(Pair.first);
8997 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
8998 ++NumUniques;
8999 ToAdd.insert(Cnt);
9000 } else if (Used) {
9001 Repeated.insert(Cnt);
9002 }
9003 }
9004 if (NumUniques > 0 &&
9005 (Loads.size() == NumUniques ||
9006 (Loads.size() - NumUniques >= 2 &&
9007 Loads.size() - NumUniques >= Loads.size() / 2 &&
9008 (has_single_bit(Data.size() + NumUniques) ||
9009 bit_ceil(Data.size()) <
9010 bit_ceil(Data.size() + NumUniques))))) {
9011 Offset = *Dist;
9012 Start = Idx + 1;
9013 return std::next(GatheredLoads.begin(), Idx);
9014 }
9015 }
9016 ToAdd.clear();
9017 return GatheredLoads.end();
9018 };
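// The merge heuristic above folds a candidate cluster into an existing group
// either when all of its loads are new to the group, or when at least two of
// its loads (and at least half of them) already overlap the group and the new
// loads make the group size a power of two or push it past the next
// power-of-two boundary. Illustrative example: a group of 6 loads and a
// 4-load cluster sharing 2 of them contributes 2 new loads, reaching 8 in
// total, so the cluster is merged.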
9019 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9020 unsigned Start = 0;
9021 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9022 int64_t Offset = 0;
9023 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9024 Offset, Start);
9025 while (It != GatheredLoads.end()) {
9026 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9027 for (unsigned Idx : LocalToAdd)
9028 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9029 ToAdd.insert_range(LocalToAdd);
9030 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9031 Start);
9032 }
9033 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9034 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9035 })) {
9036 auto AddNewLoads =
9037 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9038 for (unsigned Idx : seq<unsigned>(Data.size())) {
9039 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9040 continue;
9041 Loads.push_back(Data[Idx]);
9042 }
9043 };
9044 if (!AddNew) {
9045 LoadInst *LI = Data.front().first;
9046 It = find_if(
9047 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9048 return PD.front().first->getParent() == LI->getParent() &&
9049 PD.front().first->getType() == LI->getType();
9050 });
9051 while (It != GatheredLoads.end()) {
9052 AddNewLoads(*It);
9053 It = std::find_if(
9054 std::next(It), GatheredLoads.end(),
9055 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9056 return PD.front().first->getParent() == LI->getParent() &&
9057 PD.front().first->getType() == LI->getType();
9058 });
9059 }
9060 }
9061 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9062 AddNewLoads(GatheredLoads.emplace_back());
9063 }
9064 }
9065}
9066
9067void BoUpSLP::tryToVectorizeGatheredLoads(
9068 const SmallMapVector<
9069 std::tuple<BasicBlock *, Value *, Type *>,
9070 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9071 &GatheredLoads) {
9072 GatheredLoadsEntriesFirst = VectorizableTree.size();
9073
9074 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9075 LoadEntriesToVectorize.size());
9076 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9077 Set.insert_range(VectorizableTree[Idx]->Scalars);
9078
9079 // Sort loads by distance.
9080 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9081 const std::pair<LoadInst *, int64_t> &L2) {
9082 return L1.second > L2.second;
9083 };
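// Illustrative note (hypothetical distances): with this comparator a cluster
// with distances {0, 3, 1, 2} is visited as {3, 2, 1, 0}, i.e. from the
// largest offset down to the base load, which is the order the
// consecutive-distance scan below relies on.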
9084
9085 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9086 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9087 Loads.size());
9088 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9089 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9090 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9091 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9092 };
9093
9094 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9095 BoUpSLP::ValueSet &VectorizedLoads,
9096 SmallVectorImpl<LoadInst *> &NonVectorized,
9097 bool Final, unsigned MaxVF) {
9098 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9099 unsigned StartIdx = 0;
9100 SmallVector<int> CandidateVFs;
9101 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9102 CandidateVFs.push_back(MaxVF);
9103 for (int NumElts = getFloorFullVectorNumberOfElements(
9104 *TTI, Loads.front()->getType(), MaxVF);
9105 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9106 *TTI, Loads.front()->getType(), NumElts - 1)) {
9107 CandidateVFs.push_back(NumElts);
9108 if (VectorizeNonPowerOf2 && NumElts > 2)
9109 CandidateVFs.push_back(NumElts - 1);
9110 }
9111
9112 if (Final && CandidateVFs.empty())
9113 return Results;
9114
9115 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9116 for (unsigned NumElts : CandidateVFs) {
9117 if (Final && NumElts > BestVF)
9118 continue;
9119 SmallVector<unsigned> MaskedGatherVectorized;
9120 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9121 ++Cnt) {
9122 ArrayRef<LoadInst *> Slice =
9123 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9124 if (VectorizedLoads.count(Slice.front()) ||
9125 VectorizedLoads.count(Slice.back()) ||
9126 areKnownNonVectorizableLoads(Slice))
9127 continue;
9128 // Check if it is profitable to try vectorizing gathered loads. It is
9129 // profitable if we have more than 3 consecutive loads or if we have
9130 // less but all users are vectorized or deleted.
9131 bool AllowToVectorize = false;
9132 // Check if it is profitable to vectorize 2-elements loads.
9133 if (NumElts == 2) {
9134 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9135 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9136 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9137 for (LoadInst *LI : Slice) {
9138 // If single use/user - allow to vectorize.
9139 if (LI->hasOneUse())
9140 continue;
9141 // 1. Check if number of uses equals number of users.
9142 // 2. All users are deleted.
9143 // 3. The load broadcasts are not allowed or the load is not
9144 // broadcasted.
9145 if (static_cast<unsigned int>(std::distance(
9146 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9147 return false;
9148 if (!IsLegalBroadcastLoad)
9149 continue;
9150 if (LI->hasNUsesOrMore(UsesLimit))
9151 return false;
9152 for (User *U : LI->users()) {
9153 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9154 continue;
9155 for (const TreeEntry *UTE : getTreeEntries(U)) {
9156 for (int I : seq<int>(UTE->getNumOperands())) {
9157 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9158 return V == LI || isa<PoisonValue>(V);
9159 }))
9160 // Found legal broadcast - do not vectorize.
9161 return false;
9162 }
9163 }
9164 }
9165 }
9166 return true;
9167 };
9168 AllowToVectorize = CheckIfAllowed(Slice);
9169 } else {
9170 AllowToVectorize =
9171 (NumElts >= 3 ||
9172 any_of(ValueToGatherNodes.at(Slice.front()),
9173 [=](const TreeEntry *TE) {
9174 return TE->Scalars.size() == 2 &&
9175 ((TE->Scalars.front() == Slice.front() &&
9176 TE->Scalars.back() == Slice.back()) ||
9177 (TE->Scalars.front() == Slice.back() &&
9178 TE->Scalars.back() == Slice.front()));
9179 })) &&
9180 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9181 Slice.size());
9182 }
9183 if (AllowToVectorize) {
9184 SmallVector<Value *> PointerOps;
9185 OrdersType CurrentOrder;
9186 // Try to build vector load.
9187 ArrayRef<Value *> Values(
9188 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9189 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9190 PointerOps, &BestVF);
9191 if (LS != LoadsState::Gather ||
9192 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9193 if (LS == LoadsState::ScatterVectorize) {
9194 if (MaskedGatherVectorized.empty() ||
9195 Cnt >= MaskedGatherVectorized.back() + NumElts)
9196 MaskedGatherVectorized.push_back(Cnt);
9197 continue;
9198 }
9199 if (LS != LoadsState::Gather) {
9200 Results.emplace_back(Values, LS);
9201 VectorizedLoads.insert_range(Slice);
9202 // If we vectorized initial block, no need to try to vectorize it
9203 // again.
9204 if (Cnt == StartIdx)
9205 StartIdx += NumElts;
9206 }
9207 // Check if the whole array was vectorized already - exit.
9208 if (StartIdx >= Loads.size())
9209 break;
9210 // Erase last masked gather candidate, if another candidate within
9211 // the range is found to be better.
9212 if (!MaskedGatherVectorized.empty() &&
9213 Cnt < MaskedGatherVectorized.back() + NumElts)
9214 MaskedGatherVectorized.pop_back();
9215 Cnt += NumElts - 1;
9216 continue;
9217 }
9218 }
9219 if (!AllowToVectorize || BestVF == 0)
9220 registerNonVectorizableLoads(Slice);
9221 }
9222 // Mark masked gathers candidates as vectorized, if any.
9223 for (unsigned Cnt : MaskedGatherVectorized) {
9224 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9225 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9226 ArrayRef<Value *> Values(
9227 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9228 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9229 VectorizedLoads.insert_range(Slice);
9230 // If we vectorized initial block, no need to try to vectorize it again.
9231 if (Cnt == StartIdx)
9232 StartIdx += NumElts;
9233 }
9234 }
9235 for (LoadInst *LI : Loads) {
9236 if (!VectorizedLoads.contains(LI))
9237 NonVectorized.push_back(LI);
9238 }
9239 return Results;
9240 };
9241 auto ProcessGatheredLoads =
9242 [&, &TTI = *TTI](
9243 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9244 bool Final = false) {
9245 SmallVector<LoadInst *> NonVectorized;
9246 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9247 GatheredLoads) {
9248 if (LoadsDists.size() <= 1) {
9249 NonVectorized.push_back(LoadsDists.back().first);
9250 continue;
9251 }
9252 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9253 LoadsDists);
9254 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9255 stable_sort(LocalLoadsDists, LoadSorter);
9256 SmallVector<LoadInst *> Loads;
9257 unsigned MaxConsecutiveDistance = 0;
9258 unsigned CurrentConsecutiveDist = 1;
9259 int64_t LastDist = LocalLoadsDists.front().second;
9260 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9261 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9262 if (isVectorized(L.first))
9263 continue;
9264 assert(LastDist >= L.second &&
9265 "Expected first distance always not less than second");
9266 if (static_cast<uint64_t>(LastDist - L.second) ==
9267 CurrentConsecutiveDist) {
9268 ++CurrentConsecutiveDist;
9269 MaxConsecutiveDistance =
9270 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9271 Loads.push_back(L.first);
9272 continue;
9273 }
9274 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9275 !Loads.empty())
9276 Loads.pop_back();
9277 CurrentConsecutiveDist = 1;
9278 LastDist = L.second;
9279 Loads.push_back(L.first);
9280 }
9281 if (Loads.size() <= 1)
9282 continue;
9283 if (AllowMaskedGather)
9284 MaxConsecutiveDistance = Loads.size();
9285 else if (MaxConsecutiveDistance < 2)
9286 continue;
9287 BoUpSLP::ValueSet VectorizedLoads;
9288 SmallVector<LoadInst *> SortedNonVectorized;
9289 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9290 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9291 Final, MaxConsecutiveDistance);
9292 if (!Results.empty() && !SortedNonVectorized.empty() &&
9293 OriginalLoads.size() == Loads.size() &&
9294 MaxConsecutiveDistance == Loads.size() &&
9295 any_of(Results,
9296 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9297 return P.second == LoadsState::ScatterVectorize;
9298 })) {
9299 VectorizedLoads.clear();
9300 SmallVector<LoadInst *> UnsortedNonVectorized;
9301 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9302 UnsortedResults =
9303 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9304 UnsortedNonVectorized, Final,
9305 OriginalLoads.size());
9306 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9307 SortedNonVectorized.swap(UnsortedNonVectorized);
9308 Results.swap(UnsortedResults);
9309 }
9310 }
9311 for (auto [Slice, _] : Results) {
9312 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9313 << Slice.size() << ")\n");
9314 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9315 for (Value *L : Slice)
9316 if (!isVectorized(L))
9317 SortedNonVectorized.push_back(cast<LoadInst>(L));
9318 continue;
9319 }
9320
9321 // Select maximum VF as a maximum of user gathered nodes and
9322 // distance between scalar loads in these nodes.
9323 unsigned MaxVF = Slice.size();
9324 unsigned UserMaxVF = 0;
9325 unsigned InterleaveFactor = 0;
9326 if (MaxVF == 2) {
9327 UserMaxVF = MaxVF;
9328 } else {
9329 // Found distance between segments of the interleaved loads.
9330 std::optional<unsigned> InterleavedLoadsDistance = 0;
9331 unsigned Order = 0;
9332 std::optional<unsigned> CommonVF = 0;
9333 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9334 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9335 for (auto [Idx, V] : enumerate(Slice)) {
9336 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9337 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9338 unsigned Pos =
9339 EntryToPosition.try_emplace(E, Idx).first->second;
9340 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9341 if (CommonVF) {
9342 if (*CommonVF == 0) {
9343 CommonVF = E->Scalars.size();
9344 continue;
9345 }
9346 if (*CommonVF != E->Scalars.size())
9347 CommonVF.reset();
9348 }
9349 // Check if the load is the part of the interleaved load.
9350 if (Pos != Idx && InterleavedLoadsDistance) {
9351 if (!DeinterleavedNodes.contains(E) &&
9352 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9353 if (isa<Constant>(V))
9354 return false;
9355 if (isVectorized(V))
9356 return true;
9357 const auto &Nodes = ValueToGatherNodes.at(V);
9358 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9359 !is_contained(Slice, V);
9360 })) {
9361 InterleavedLoadsDistance.reset();
9362 continue;
9363 }
9364 DeinterleavedNodes.insert(E);
9365 if (*InterleavedLoadsDistance == 0) {
9366 InterleavedLoadsDistance = Idx - Pos;
9367 continue;
9368 }
9369 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9370 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9371 InterleavedLoadsDistance.reset();
9372 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9373 }
9374 }
9375 }
9376 DeinterleavedNodes.clear();
9377 // Check if the large load represents interleaved load operation.
9378 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9379 CommonVF.value_or(0) != 0) {
9380 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9381 unsigned VF = *CommonVF;
9382 OrdersType Order;
9383 SmallVector<Value *> PointerOps;
9384 // Segmented load detected - vectorize at maximum vector factor.
9385 if (InterleaveFactor <= Slice.size() &&
9386 TTI.isLegalInterleavedAccessType(
9387 getWidenedType(Slice.front()->getType(), VF),
9388 InterleaveFactor,
9389 cast<LoadInst>(Slice.front())->getAlign(),
9390 cast<LoadInst>(Slice.front())
9391 ->getPointerAddressSpace()) &&
9392 canVectorizeLoads(Slice, Slice.front(), Order,
9393 PointerOps) == LoadsState::Vectorize) {
9394 UserMaxVF = InterleaveFactor * VF;
9395 } else {
9396 InterleaveFactor = 0;
9397 }
9398 }
9399 // Cannot represent the loads as consecutive vectorizable nodes -
9400 // just exit.
9401 unsigned ConsecutiveNodesSize = 0;
9402 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9403 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9404 [&, Slice = Slice](const auto &P) {
9405 const auto *It = find_if(Slice, [&](Value *V) {
9406 return std::get<1>(P).contains(V);
9407 });
9408 if (It == Slice.end())
9409 return false;
9410 const TreeEntry &TE =
9411 *VectorizableTree[std::get<0>(P)];
9412 ArrayRef<Value *> VL = TE.Scalars;
9413 OrdersType Order;
9414 SmallVector<Value *> PointerOps;
9415 LoadsState State = canVectorizeLoads(
9416 VL, VL.front(), Order, PointerOps);
9417 if (State == LoadsState::ScatterVectorize ||
9418 State == LoadsState::StridedVectorize)
9419 return false;
9420 ConsecutiveNodesSize += VL.size();
9421 size_t Start = std::distance(Slice.begin(), It);
9422 size_t Sz = Slice.size() - Start;
9423 return Sz < VL.size() ||
9424 Slice.slice(Start, VL.size()) != VL;
9425 }))
9426 continue;
9427 // Try to build long masked gather loads.
9428 UserMaxVF = bit_ceil(UserMaxVF);
9429 if (InterleaveFactor == 0 &&
9430 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9431 [&, Slice = Slice](unsigned Idx) {
9432 OrdersType Order;
9433 SmallVector<Value *> PointerOps;
9434 return canVectorizeLoads(
9435 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9436 Slice[Idx * UserMaxVF], Order,
9437 PointerOps) ==
9438 LoadsState::ScatterVectorize;
9439 }))
9440 UserMaxVF = MaxVF;
9441 if (Slice.size() != ConsecutiveNodesSize)
9442 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9443 }
9444 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9445 bool IsVectorized = true;
9446 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9447 ArrayRef<Value *> SubSlice =
9448 Slice.slice(I, std::min(VF, E - I));
9449 if (isVectorized(SubSlice.front()))
9450 continue;
9451 // Check if the subslice matches a to-be-vectorized entry that is not
9452 // equal to this entry.
9453 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9454 [&](const auto &P) {
9455 return !SubSlice.equals(
9456 VectorizableTree[std::get<0>(P)]
9457 ->Scalars) &&
9458 set_is_subset(SubSlice, std::get<1>(P));
9459 }))
9460 continue;
9461 unsigned Sz = VectorizableTree.size();
9462 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9463 if (Sz == VectorizableTree.size()) {
9464 IsVectorized = false;
9465 // Try non-interleaved vectorization with smaller vector
9466 // factor.
9467 if (InterleaveFactor > 0) {
9468 VF = 2 * (MaxVF / InterleaveFactor);
9469 InterleaveFactor = 0;
9470 }
9471 continue;
9472 }
9473 }
9474 if (IsVectorized)
9475 break;
9476 }
9477 }
9478 NonVectorized.append(SortedNonVectorized);
9479 }
9480 return NonVectorized;
9481 };
9482 for (const auto &GLs : GatheredLoads) {
9483 const auto &Ref = GLs.second;
9484 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9485 if (!Ref.empty() && !NonVectorized.empty() &&
9486 std::accumulate(
9487 Ref.begin(), Ref.end(), 0u,
9488 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9489 -> unsigned { return S + LoadsDists.size(); }) !=
9490 NonVectorized.size() &&
9491 IsMaskedGatherSupported(NonVectorized)) {
9492 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9493 FinalGatheredLoads;
9494 for (LoadInst *LI : NonVectorized) {
9495 // Reinsert non-vectorized loads to other list of loads with the same
9496 // base pointers.
9497 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9498 FinalGatheredLoads,
9499 /*AddNew=*/false);
9500 }
9501 // Final attempt to vectorize non-vectorized loads.
9502 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9503 }
9504 }
9505 // Try to vectorize postponed load entries, previously marked as gathered.
9506 for (unsigned Idx : LoadEntriesToVectorize) {
9507 const TreeEntry &E = *VectorizableTree[Idx];
9508 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9509 // Avoid reordering, if possible.
9510 if (!E.ReorderIndices.empty()) {
9511 // Build a mask out of the reorder indices and reorder scalars per this
9512 // mask.
9513 SmallVector<int> ReorderMask;
9514 inversePermutation(E.ReorderIndices, ReorderMask);
9515 reorderScalars(GatheredScalars, ReorderMask);
9516 }
9517 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9518 }
9519 // If no new entries were created, there are no gathered-load entries to be
9520 // handled.
9521 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9522 VectorizableTree.size())
9523 GatheredLoadsEntriesFirst.reset();
9524}
9525
9526/// Generates key/subkey pair for the given value to provide effective sorting
9527/// of the values and better detection of the vectorizable values sequences. The
9528/// keys/subkeys can be used for better sorting of the values themselves (keys)
9529/// and in values subgroups (subkeys).
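/// A minimal illustrative call (hypothetical call site, not from this file),
/// where the subkey generator simply hashes the load's pointer operand:
/// \code
///   auto [Key, SubKey] = generateKeySubkey(
///       V, TLI,
///       [](size_t, LoadInst *LI) { return hash_value(LI->getPointerOperand()); },
///       /*AllowAlternate=*/false);
/// \endcode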
9530static std::pair<size_t, size_t> generateKeySubkey(
9531 Value *V, const TargetLibraryInfo *TLI,
9532 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9533 bool AllowAlternate) {
9534 hash_code Key = hash_value(V->getValueID() + 2);
9535 hash_code SubKey = hash_value(0);
9536 // Sort the loads by the distance between the pointers.
9537 if (auto *LI = dyn_cast<LoadInst>(V)) {
9538 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9539 if (LI->isSimple())
9540 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9541 else
9542 Key = SubKey = hash_value(LI);
9543 } else if (isVectorLikeInstWithConstOps(V)) {
9544 // Sort extracts by the vector operands.
9545 if (isa<ExtractElementInst, UndefValue>(V))
9546 Key = hash_value(Value::UndefValueVal + 1);
9547 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9548 if (!isUndefVector(EI->getVectorOperand()).all() &&
9549 !isa<UndefValue>(EI->getIndexOperand()))
9550 SubKey = hash_value(EI->getVectorOperand());
9551 }
9552 } else if (auto *I = dyn_cast<Instruction>(V)) {
9553 // Sort other instructions just by the opcodes except for CMPInst.
9554 // For CMP also sort by the predicate kind.
9555 if ((isa<BinaryOperator, CastInst>(I)) &&
9556 isValidForAlternation(I->getOpcode())) {
9557 if (AllowAlternate)
9558 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9559 else
9560 Key = hash_combine(hash_value(I->getOpcode()), Key);
9561 SubKey = hash_combine(
9562 hash_value(I->getOpcode()), hash_value(I->getType()),
9563 hash_value(isa<BinaryOperator>(I)
9564 ? I->getType()
9565 : cast<CastInst>(I)->getOperand(0)->getType()));
9566 // For casts, look through the only operand to improve compile time.
9567 if (isa<CastInst>(I)) {
9568 std::pair<size_t, size_t> OpVals =
9569 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9570 /*AllowAlternate=*/true);
9571 Key = hash_combine(OpVals.first, Key);
9572 SubKey = hash_combine(OpVals.first, SubKey);
9573 }
9574 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9575 CmpInst::Predicate Pred = CI->getPredicate();
9576 if (CI->isCommutative())
9577 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9578 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9579 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9580 hash_value(SwapPred),
9581 hash_value(CI->getOperand(0)->getType()));
9582 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9583 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9584 if (isTriviallyVectorizable(ID)) {
9585 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9586 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9587 SubKey = hash_combine(hash_value(I->getOpcode()),
9588 hash_value(Call->getCalledFunction()));
9589 } else {
9590 Key = hash_combine(hash_value(Call), Key);
9591 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9592 }
9593 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9594 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9595 hash_value(Op.Tag), SubKey);
9596 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9597 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9598 SubKey = hash_value(Gep->getPointerOperand());
9599 else
9600 SubKey = hash_value(Gep);
9601 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9602 !isa<ConstantInt>(I->getOperand(1))) {
9603 // Do not try to vectorize instructions with potentially high cost.
9604 SubKey = hash_value(I);
9605 } else {
9606 SubKey = hash_value(I->getOpcode());
9607 }
9608 Key = hash_combine(hash_value(I->getParent()), Key);
9609 }
9610 return std::make_pair(Key, SubKey);
9611}
9612
9613 /// Checks if the specified instruction \p I is a main operation for the given
9614/// \p MainOp and \p AltOp instructions.
9615static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9616 Instruction *AltOp, const TargetLibraryInfo &TLI);
9617
9618bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9619 ArrayRef<Value *> VL) const {
9620 Type *ScalarTy = S.getMainOp()->getType();
9621 unsigned Opcode0 = S.getOpcode();
9622 unsigned Opcode1 = S.getAltOpcode();
9623 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9624 // If this pattern is supported by the target then consider it profitable.
9625 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9626 Opcode1, OpcodeMask))
9627 return true;
9628 SmallVector<SmallVector<Value *>> Operands;
9629 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9630 Operands.emplace_back();
9631 // Prepare the operand vector.
9632 for (Value *V : VL) {
9633 if (isa<PoisonValue>(V)) {
9634 Operands.back().push_back(
9635 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9636 continue;
9637 }
9638 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9639 }
9640 }
9641 if (Operands.size() == 2) {
9642 // Try find best operands candidates.
9643 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9644 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9645 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9646 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9647 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9648 std::optional<int> Res = findBestRootPair(Candidates);
9649 switch (Res.value_or(0)) {
9650 case 0:
9651 break;
9652 case 1:
9653 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9654 break;
9655 case 2:
9656 std::swap(Operands[0][I], Operands[1][I]);
9657 break;
9658 default:
9659 llvm_unreachable("Unexpected index.");
9660 }
9661 }
9662 }
9663 DenseSet<unsigned> UniqueOpcodes;
9664 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9665 unsigned NonInstCnt = 0;
9666 // Estimate number of instructions, required for the vectorized node and for
9667 // the buildvector node.
9668 unsigned UndefCnt = 0;
9669 // Count the number of extra shuffles, required for vector nodes.
9670 unsigned ExtraShuffleInsts = 0;
9671 // Check that operands do not contain same values and create either perfect
9672 // diamond match or shuffled match.
9673 if (Operands.size() == 2) {
9674 // Do not count same operands twice.
9675 if (Operands.front() == Operands.back()) {
9676 Operands.erase(Operands.begin());
9677 } else if (!allConstant(Operands.front()) &&
9678 all_of(Operands.front(), [&](Value *V) {
9679 return is_contained(Operands.back(), V);
9680 })) {
9681 Operands.erase(Operands.begin());
9682 ++ExtraShuffleInsts;
9683 }
9684 }
9685 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9686 // Vectorize node, if:
9687 // 1. At least one operand is constant or splat.
9688 // 2. Operands have many loop invariants (the instructions are not loop
9689 // invariants).
9690 // 3. At least one unique operand is supposed to be vectorized.
9691 return none_of(Operands,
9692 [&](ArrayRef<Value *> Op) {
9693 if (allConstant(Op) ||
9694 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9695 getSameOpcode(Op, *TLI)))
9696 return false;
9697 DenseMap<Value *, unsigned> Uniques;
9698 for (Value *V : Op) {
9699 if (isa<Constant, ExtractElementInst>(V) ||
9700 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9701 if (isa<UndefValue>(V))
9702 ++UndefCnt;
9703 continue;
9704 }
9705 auto Res = Uniques.try_emplace(V, 0);
9706 // Found first duplicate - need to add shuffle.
9707 if (!Res.second && Res.first->second == 1)
9708 ++ExtraShuffleInsts;
9709 ++Res.first->getSecond();
9710 if (auto *I = dyn_cast<Instruction>(V))
9711 UniqueOpcodes.insert(I->getOpcode());
9712 else if (Res.second)
9713 ++NonInstCnt;
9714 }
9715 return none_of(Uniques, [&](const auto &P) {
9716 return P.first->hasNUsesOrMore(P.second + 1) &&
9717 none_of(P.first->users(), [&](User *U) {
9718 return isVectorized(U) || Uniques.contains(U);
9719 });
9720 });
9721 }) ||
9722 // Do not vectorize node, if estimated number of vector instructions is
9723 // more than estimated number of buildvector instructions. Number of
9724 // vector operands is number of vector instructions + number of vector
9725 // instructions for operands (buildvectors). Number of buildvector
9726 // instructions is just number_of_operands * number_of_scalars.
9727 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9728 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9729 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9730}
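// Illustrative arithmetic for the final check above (hypothetical counts): for
// a bundle of 4 binary instructions the buildvector estimate is
// 2 operands * 4 scalars = 8 instructions, while the vector estimate is
// UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + NumAltInsts. With,
// say, 3 unique opcodes, 1 non-instruction operand and 1 extra shuffle that is
// 3 + 1 + 1 + 3 = 8, which is not strictly smaller, so this clause alone would
// not mark the alternate node as profitable.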
9731
9732/// Builds the arguments types vector for the given call instruction with the
9733/// given \p ID for the specified vector factor.
9734 static SmallVector<Type *>
9735 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9736 const unsigned VF, unsigned MinBW,
9737 const TargetTransformInfo *TTI) {
9738 SmallVector<Type *> ArgTys;
9739 for (auto [Idx, Arg] : enumerate(CI->args())) {
9740 if (ID != Intrinsic::not_intrinsic) {
9741 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9742 ArgTys.push_back(Arg->getType());
9743 continue;
9744 }
9745 if (MinBW > 0) {
9746 ArgTys.push_back(
9747 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9748 continue;
9749 }
9750 }
9751 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9752 }
9753 return ArgTys;
9754}
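// Illustrative example (hypothetical call): widening a call to llvm.umin.i32
// to VF = 4 yields {<4 x i32>, <4 x i32>}; with MinBW = 16 the intrinsic
// arguments are narrowed instead and the result is {<4 x i16>, <4 x i16>}.
//   SmallVector<Type *> Tys =
//       buildIntrinsicArgTypes(CI, Intrinsic::umin, /*VF=*/4, /*MinBW=*/0, TTI);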
9755
9756/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9757/// function (if possible) calls. Returns invalid cost for the corresponding
9758/// calls, if they cannot be vectorized/will be scalarized.
9759static std::pair<InstructionCost, InstructionCost>
9760 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9761 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9762 ArrayRef<Type *> ArgTys) {
9763 auto Shape = VFShape::get(CI->getFunctionType(),
9764 ElementCount::getFixed(VecTy->getNumElements()),
9765 false /*HasGlobalPred*/);
9766 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9767 auto LibCost = InstructionCost::getInvalid();
9768 if (!CI->isNoBuiltin() && VecFunc) {
9769 // Calculate the cost of the vector library call.
9770 // If the corresponding vector call is cheaper, return its cost.
9771 LibCost =
9772 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9773 }
9774 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9775
9776 // Calculate the cost of the vector intrinsic call.
9777 FastMathFlags FMF;
9778 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9779 FMF = FPCI->getFastMathFlags();
9780 const InstructionCost ScalarLimit = 10000;
9781 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9782 LibCost.isValid() ? LibCost : ScalarLimit);
9783 auto IntrinsicCost =
9784 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9785 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9786 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9787 IntrinsicCost = InstructionCost::getInvalid();
9788
9789 return {IntrinsicCost, LibCost};
9790}
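// Hypothetical usage sketch: prefer the cheaper of the two lowerings and treat
// an invalid cost as "not available".
//   auto [IntrinCost, LibCost] = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
//   bool PreferLibCall =
//       LibCost.isValid() && (!IntrinCost.isValid() || LibCost < IntrinCost);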
9791
9792BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9793 const InstructionsState &S, ArrayRef<Value *> VL,
9794 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9795 SmallVectorImpl<Value *> &PointerOps) {
9796 assert(S.getMainOp() &&
9797 "Expected instructions with same/alternate opcodes only.");
9798
9799 unsigned ShuffleOrOp =
9800 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9801 Instruction *VL0 = S.getMainOp();
9802 switch (ShuffleOrOp) {
9803 case Instruction::PHI: {
9804 // Too many operands - gather, most probably won't be vectorized.
9805 if (VL0->getNumOperands() > MaxPHINumOperands)
9806 return TreeEntry::NeedToGather;
9807 // Check for terminator values (e.g. invoke).
9808 for (Value *V : VL) {
9809 auto *PHI = dyn_cast<PHINode>(V);
9810 if (!PHI)
9811 continue;
9812 for (Value *Incoming : PHI->incoming_values()) {
9813 Instruction *Term = dyn_cast<Instruction>(Incoming);
9814 if (Term && Term->isTerminator()) {
9815 LLVM_DEBUG(dbgs()
9816 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9817 return TreeEntry::NeedToGather;
9818 }
9819 }
9820 }
9821
9822 return TreeEntry::Vectorize;
9823 }
9824 case Instruction::ExtractElement:
9825 if (any_of(VL, [&](Value *V) {
9826 auto *EI = dyn_cast<ExtractElementInst>(V);
9827 if (!EI)
9828 return true;
9829 return isVectorized(EI->getOperand(0));
9830 }))
9831 return TreeEntry::NeedToGather;
9832 [[fallthrough]];
9833 case Instruction::ExtractValue: {
9834 bool Reuse = canReuseExtract(VL, CurrentOrder);
9835 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9836 // non-full registers).
9837 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9838 return TreeEntry::NeedToGather;
9839 if (Reuse || !CurrentOrder.empty())
9840 return TreeEntry::Vectorize;
9841 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9842 return TreeEntry::NeedToGather;
9843 }
9844 case Instruction::InsertElement: {
9845 // Check that we have a buildvector and not a shuffle of 2 or more
9846 // different vectors.
9847 ValueSet SourceVectors;
9848 for (Value *V : VL) {
9849 if (isa<PoisonValue>(V)) {
9850 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
9851 return TreeEntry::NeedToGather;
9852 }
9853 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
9854 assert(getElementIndex(V) != std::nullopt &&
9855 "Non-constant or undef index?");
9856 }
9857
9858 if (count_if(VL, [&SourceVectors](Value *V) {
9859 return !SourceVectors.contains(V);
9860 }) >= 2) {
9861 // Found 2nd source vector - cancel.
9862 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9863 "different source vectors.\n");
9864 return TreeEntry::NeedToGather;
9865 }
9866
9867 if (any_of(VL, [&SourceVectors](Value *V) {
9868 // The last InsertElement can have multiple uses.
9869 return SourceVectors.contains(V) && !V->hasOneUse();
9870 })) {
9871 assert(SLPReVec && "Only supported by REVEC.");
9872 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9873 "multiple uses.\n");
9874 return TreeEntry::NeedToGather;
9875 }
9876
9877 return TreeEntry::Vectorize;
9878 }
9879 case Instruction::Load: {
9880 // Check that a vectorized load would load the same memory as a scalar
9881 // load. For example, we don't want to vectorize loads that are smaller
9882 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9883 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9884 // from such a struct, we read/write packed bits disagreeing with the
9885 // unvectorized version.
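// Illustrative example: for i2, DL->getTypeSizeInBits() reports 2 while
// DL->getTypeAllocSizeInBits() reports 8, so a <4 x i2> load would read
// different bytes than four scalar i2 loads; such bundles are only gathered
// (see the NDEBUG diagnostics below).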
9886 auto IsGatheredNode = [&]() {
9887 if (!GatheredLoadsEntriesFirst)
9888 return false;
9889 return all_of(VL, [&](Value *V) {
9890 if (isa<PoisonValue>(V))
9891 return true;
9892 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
9893 return TE->Idx >= *GatheredLoadsEntriesFirst;
9894 });
9895 });
9896 };
9897 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
9898 case LoadsState::Vectorize:
9899 return TreeEntry::Vectorize;
9900 case LoadsState::CompressVectorize:
9901 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9902 // Delay slow vectorized nodes for better vectorization attempts.
9903 LoadEntriesToVectorize.insert(VectorizableTree.size());
9904 return TreeEntry::NeedToGather;
9905 }
9906 return IsGatheredNode() ? TreeEntry::NeedToGather
9907 : TreeEntry::CompressVectorize;
9908 case LoadsState::ScatterVectorize:
9909 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9910 // Delay slow vectorized nodes for better vectorization attempts.
9911 LoadEntriesToVectorize.insert(VectorizableTree.size());
9912 return TreeEntry::NeedToGather;
9913 }
9914 return IsGatheredNode() ? TreeEntry::NeedToGather
9915 : TreeEntry::ScatterVectorize;
9916 case LoadsState::StridedVectorize:
9917 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9918 // Delay slow vectorized nodes for better vectorization attempts.
9919 LoadEntriesToVectorize.insert(VectorizableTree.size());
9920 return TreeEntry::NeedToGather;
9921 }
9922 return IsGatheredNode() ? TreeEntry::NeedToGather
9923 : TreeEntry::StridedVectorize;
9924 case LoadsState::Gather:
9925#ifndef NDEBUG
9926 Type *ScalarTy = VL0->getType();
9927 if (DL->getTypeSizeInBits(ScalarTy) !=
9928 DL->getTypeAllocSizeInBits(ScalarTy))
9929 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9930 else if (any_of(VL, [](Value *V) {
9931 auto *LI = dyn_cast<LoadInst>(V);
9932 return !LI || !LI->isSimple();
9933 }))
9934 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9935 else
9936 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9937#endif // NDEBUG
9938 registerNonVectorizableLoads(VL);
9939 return TreeEntry::NeedToGather;
9940 }
9941 llvm_unreachable("Unexpected state of loads");
9942 }
9943 case Instruction::ZExt:
9944 case Instruction::SExt:
9945 case Instruction::FPToUI:
9946 case Instruction::FPToSI:
9947 case Instruction::FPExt:
9948 case Instruction::PtrToInt:
9949 case Instruction::IntToPtr:
9950 case Instruction::SIToFP:
9951 case Instruction::UIToFP:
9952 case Instruction::Trunc:
9953 case Instruction::FPTrunc:
9954 case Instruction::BitCast: {
9955 Type *SrcTy = VL0->getOperand(0)->getType();
9956 for (Value *V : VL) {
9957 if (isa<PoisonValue>(V))
9958 continue;
9959 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
9960 if (Ty != SrcTy || !isValidElementType(Ty)) {
9961 LLVM_DEBUG(
9962 dbgs() << "SLP: Gathering casts with different src types.\n");
9963 return TreeEntry::NeedToGather;
9964 }
9965 }
9966 return TreeEntry::Vectorize;
9967 }
9968 case Instruction::ICmp:
9969 case Instruction::FCmp: {
9970 // Check that all of the compares have the same predicate.
9971 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
9972 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
9973 Type *ComparedTy = VL0->getOperand(0)->getType();
9974 for (Value *V : VL) {
9975 if (isa<PoisonValue>(V))
9976 continue;
9977 auto *Cmp = cast<CmpInst>(V);
9978 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
9979 Cmp->getOperand(0)->getType() != ComparedTy) {
9980 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
9981 return TreeEntry::NeedToGather;
9982 }
9983 }
9984 return TreeEntry::Vectorize;
9985 }
9986 case Instruction::Select:
9987 case Instruction::FNeg:
9988 case Instruction::Add:
9989 case Instruction::FAdd:
9990 case Instruction::Sub:
9991 case Instruction::FSub:
9992 case Instruction::Mul:
9993 case Instruction::FMul:
9994 case Instruction::UDiv:
9995 case Instruction::SDiv:
9996 case Instruction::FDiv:
9997 case Instruction::URem:
9998 case Instruction::SRem:
9999 case Instruction::FRem:
10000 case Instruction::Shl:
10001 case Instruction::LShr:
10002 case Instruction::AShr:
10003 case Instruction::And:
10004 case Instruction::Or:
10005 case Instruction::Xor:
10006 case Instruction::Freeze:
10007 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10008 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10009 auto *I = dyn_cast<Instruction>(V);
10010 return I && I->isBinaryOp() && !I->isFast();
10011 }))
10012 return TreeEntry::NeedToGather;
10013 return TreeEntry::Vectorize;
10014 case Instruction::GetElementPtr: {
10015 // We don't combine GEPs with complicated (nested) indexing.
10016 for (Value *V : VL) {
10017 auto *I = dyn_cast<GetElementPtrInst>(V);
10018 if (!I)
10019 continue;
10020 if (I->getNumOperands() != 2) {
10021 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10022 return TreeEntry::NeedToGather;
10023 }
10024 }
10025
10026 // We can't combine several GEPs into one vector if they operate on
10027 // different types.
10028 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10029 for (Value *V : VL) {
10030 auto *GEP = dyn_cast<GEPOperator>(V);
10031 if (!GEP)
10032 continue;
10033 Type *CurTy = GEP->getSourceElementType();
10034 if (Ty0 != CurTy) {
10035 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10036 return TreeEntry::NeedToGather;
10037 }
10038 }
10039
10040 // We don't combine GEPs with non-constant indexes.
10041 Type *Ty1 = VL0->getOperand(1)->getType();
10042 for (Value *V : VL) {
10043 auto *I = dyn_cast<GetElementPtrInst>(V);
10044 if (!I)
10045 continue;
10046 auto *Op = I->getOperand(1);
10047 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10048 (Op->getType() != Ty1 &&
10049 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10050 Op->getType()->getScalarSizeInBits() >
10051 DL->getIndexSizeInBits(
10052 V->getType()->getPointerAddressSpace())))) {
10053 LLVM_DEBUG(
10054 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10055 return TreeEntry::NeedToGather;
10056 }
10057 }
10058
10059 return TreeEntry::Vectorize;
10060 }
10061 case Instruction::Store: {
10062 // Check if the stores are consecutive or if we need to swizzle them.
10063 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10064 // Avoid types that are padded when being allocated as scalars, while
10065 // being packed together in a vector (such as i1).
10066 if (DL->getTypeSizeInBits(ScalarTy) !=
10067 DL->getTypeAllocSizeInBits(ScalarTy)) {
10068 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10069 return TreeEntry::NeedToGather;
10070 }
10071 // Make sure all stores in the bundle are simple - we can't vectorize
10072 // atomic or volatile stores.
10073 for (Value *V : VL) {
10074 auto *SI = cast<StoreInst>(V);
10075 if (!SI->isSimple()) {
10076 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10077 return TreeEntry::NeedToGather;
10078 }
10079 PointerOps.push_back(SI->getPointerOperand());
10080 }
10081
10082 // Check the order of pointer operands.
10083 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10084 Value *Ptr0;
10085 Value *PtrN;
10086 if (CurrentOrder.empty()) {
10087 Ptr0 = PointerOps.front();
10088 PtrN = PointerOps.back();
10089 } else {
10090 Ptr0 = PointerOps[CurrentOrder.front()];
10091 PtrN = PointerOps[CurrentOrder.back()];
10092 }
10093 std::optional<int64_t> Dist =
10094 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10095 // Check that the sorted pointer operands are consecutive.
10096 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10097 return TreeEntry::Vectorize;
10098 }
10099
10100 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10101 return TreeEntry::NeedToGather;
10102 }
10103 case Instruction::Call: {
10104 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10105 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10106 auto *I = dyn_cast<Instruction>(V);
10107 return I && !I->isFast();
10108 }))
10109 return TreeEntry::NeedToGather;
10110 // Check if the calls are all to the same vectorizable intrinsic or
10111 // library function.
10112 CallInst *CI = cast<CallInst>(VL0);
10113 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10114
10115 VFShape Shape = VFShape::get(
10116 CI->getFunctionType(),
10117 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10118 false /*HasGlobalPred*/);
10119 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10120
10121 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10122 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10123 return TreeEntry::NeedToGather;
10124 }
10125 Function *F = CI->getCalledFunction();
10126 unsigned NumArgs = CI->arg_size();
10127 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10128 for (unsigned J = 0; J != NumArgs; ++J)
10129 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10130 ScalarArgs[J] = CI->getArgOperand(J);
10131 for (Value *V : VL) {
10132 CallInst *CI2 = dyn_cast<CallInst>(V);
10133 if (!CI2 || CI2->getCalledFunction() != F ||
10134 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10135 (VecFunc &&
10136 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10137 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10138 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10139 << "\n");
10140 return TreeEntry::NeedToGather;
10141 }
10142 // Some intrinsics have scalar arguments and should be same in order for
10143 // them to be vectorized.
10144 for (unsigned J = 0; J != NumArgs; ++J) {
10145 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10146 Value *A1J = CI2->getArgOperand(J);
10147 if (ScalarArgs[J] != A1J) {
10148 LLVM_DEBUG(dbgs()
10149 << "SLP: mismatched arguments in call:" << *CI
10150 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10151 return TreeEntry::NeedToGather;
10152 }
10153 }
10154 }
10155 // Verify that the bundle operands are identical between the two calls.
10156 if (CI->hasOperandBundles() &&
10157 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10158 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10159 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10160 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10161 << "!=" << *V << '\n');
10162 return TreeEntry::NeedToGather;
10163 }
10164 }
10165 SmallVector<Type *> ArgTys =
10166 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10167 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10168 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10169 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10170 return TreeEntry::NeedToGather;
10171
10172 return TreeEntry::Vectorize;
10173 }
10174 case Instruction::ShuffleVector: {
10175 if (!S.isAltShuffle()) {
10176 // REVEC can support non alternate shuffle.
10177 if (SLPReVec && getShufflevectorNumGroups(VL))
10178 return TreeEntry::Vectorize;
10179 // If this is not an alternate sequence of opcode like add-sub
10180 // then do not vectorize this instruction.
10181 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10182 return TreeEntry::NeedToGather;
10183 }
10184 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10185 LLVM_DEBUG(
10186 dbgs()
10187 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10188 "the whole alt sequence is not profitable.\n");
10189 return TreeEntry::NeedToGather;
10190 }
10191
10192 return TreeEntry::Vectorize;
10193 }
10194 default:
10195 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10196 return TreeEntry::NeedToGather;
10197 }
10198}
10199
10200namespace {
10201 /// Allows correct handling of the phi node operands, based on the \p Main
10202 /// PHINode's order of incoming basic blocks/values.
10203class PHIHandler {
10204 DominatorTree &DT;
10205 PHINode *Main = nullptr;
10206 SmallVector<Value *> Phis;
10207 SmallVector<SmallVector<Value *>> Operands;
10208
10209public:
10210 PHIHandler() = delete;
10211 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10212 : DT(DT), Main(Main), Phis(Phis),
10213 Operands(Main->getNumIncomingValues(),
10214 SmallVector<Value *>(Phis.size(), nullptr)) {}
10215 void buildOperands() {
10216 constexpr unsigned FastLimit = 4;
10217 if (Main->getNumIncomingValues() <= FastLimit) {
10218 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10219 BasicBlock *InBB = Main->getIncomingBlock(I);
10220 if (!DT.isReachableFromEntry(InBB)) {
10221 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10222 continue;
10223 }
10224 // Prepare the operand vector.
10225 for (auto [Idx, V] : enumerate(Phis)) {
10226 auto *P = dyn_cast<PHINode>(V);
10227 if (!P) {
10228 assert(isa<PoisonValue>(V) &&
10229 "Expected isa instruction or poison value.");
10230 Operands[I][Idx] = V;
10231 continue;
10232 }
10233 if (P->getIncomingBlock(I) == InBB)
10234 Operands[I][Idx] = P->getIncomingValue(I);
10235 else
10236 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10237 }
10238 }
10239 return;
10240 }
10241 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10242 Blocks;
10243 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10244 BasicBlock *InBB = Main->getIncomingBlock(I);
10245 if (!DT.isReachableFromEntry(InBB)) {
10246 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10247 continue;
10248 }
10249 Blocks.try_emplace(InBB).first->second.push_back(I);
10250 }
10251 for (auto [Idx, V] : enumerate(Phis)) {
10252 if (isa<PoisonValue>(V)) {
10253 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10254 Operands[I][Idx] = V;
10255 continue;
10256 }
10257 auto *P = cast<PHINode>(V);
10258 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10259 BasicBlock *InBB = P->getIncomingBlock(I);
10260 if (InBB == Main->getIncomingBlock(I)) {
10261 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10262 continue;
10263 Operands[I][Idx] = P->getIncomingValue(I);
10264 continue;
10265 }
10266 auto *It = Blocks.find(InBB);
10267 if (It == Blocks.end())
10268 continue;
10269 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10270 }
10271 }
10272 for (const auto &P : Blocks) {
10273 ArrayRef<unsigned> IncomingValues = P.second;
10274 if (IncomingValues.size() <= 1)
10275 continue;
10276 unsigned BasicI = IncomingValues.consume_front();
10277 for (unsigned I : IncomingValues) {
10278 assert(all_of(enumerate(Operands[I]),
10279 [&](const auto &Data) {
10280 return !Data.value() ||
10281 Data.value() == Operands[BasicI][Data.index()];
10282 }) &&
10283 "Expected empty operands list.");
10284 Operands[I] = Operands[BasicI];
10285 }
10286 }
10287 }
10288 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10289};
10290} // namespace
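// Hypothetical usage sketch for the helper above: collect per-incoming-block
// operand vectors for a bundle of PHIs rooted at the main PHI node.
//   PHIHandler Handler(*DT, cast<PHINode>(S.getMainOp()), VL);
//   Handler.buildOperands();
//   ArrayRef<Value *> FirstBlockOperands = Handler.getOperands(0);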
10291
10292/// Returns main/alternate instructions for the given \p VL. Unlike
10293/// getSameOpcode supports non-compatible instructions for better SplitVectorize
10294/// node support.
10295 /// \returns the first main/alt instructions if the list contains only poison
10296 /// values and instructions with exactly two opcodes; a pair of nullptrs otherwise.
10297static std::pair<Instruction *, Instruction *>
10298 getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10299 Instruction *MainOp = nullptr;
10300 Instruction *AltOp = nullptr;
10301 for (Value *V : VL) {
10302 if (isa<PoisonValue>(V))
10303 continue;
10304 auto *I = dyn_cast<Instruction>(V);
10305 if (!I)
10306 return {};
10307 if (!MainOp) {
10308 MainOp = I;
10309 continue;
10310 }
10311 if (MainOp->getOpcode() == I->getOpcode()) {
10312 if (I->getParent() != MainOp->getParent())
10313 return {};
10314 continue;
10315 }
10316 if (!AltOp) {
10317 AltOp = I;
10318 continue;
10319 }
10320 if (AltOp->getOpcode() == I->getOpcode()) {
10321 if (I->getParent() != AltOp->getParent())
10322 return {};
10323 continue;
10324 }
10325 return {};
10326 }
10327 if (!AltOp)
10328 return {};
10329 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10330 "Expected different main and alt instructions.");
10331 return std::make_pair(MainOp, AltOp);
10332}
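// Illustrative example (hypothetical bundle): for VL = {add, poison, sub, add}
// taken from a single basic block this returns {first add, sub}; a third
// distinct opcode, or any non-instruction value other than poison, makes the
// helper return a pair of nullptrs.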
10333
10334/// Checks that every instruction appears once in the list and if not, packs
10335/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10336/// unique scalars is extended by poison values to the whole register size.
10337///
10338/// \returns false if \p VL could not be uniquified, in which case \p VL is
10339/// unchanged and \p ReuseShuffleIndices is empty.
10340 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10341 SmallVectorImpl<int> &ReuseShuffleIndices,
10342 const TargetTransformInfo &TTI,
10343 const TargetLibraryInfo &TLI,
10344 const InstructionsState &S,
10345 const BoUpSLP::EdgeInfo &UserTreeIdx,
10346 bool TryPad = false) {
10347 // Check that every instruction appears once in this bundle.
10348 SmallVector<Value *> UniqueValues;
10349 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10350 for (Value *V : VL) {
10351 if (isConstant(V)) {
10352 // Constants are always considered distinct, even if the same constant
10353 // appears multiple times in VL.
10354 ReuseShuffleIndices.emplace_back(
10355 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10356 UniqueValues.emplace_back(V);
10357 continue;
10358 }
10359 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10360 ReuseShuffleIndices.emplace_back(Res.first->second);
10361 if (Res.second)
10362 UniqueValues.emplace_back(V);
10363 }
10364
10365 // Easy case: VL has unique values and a "natural" size
10366 size_t NumUniqueScalarValues = UniqueValues.size();
10367 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10368 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10369 if (NumUniqueScalarValues == VL.size() &&
10370 (VectorizeNonPowerOf2 || IsFullVectors)) {
10371 ReuseShuffleIndices.clear();
10372 return true;
10373 }
10374
10375 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
10376 if ((UserTreeIdx.UserTE &&
10377 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10378 !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
10379 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10380 "for nodes with padding.\n");
10381 ReuseShuffleIndices.clear();
10382 return false;
10383 }
10384
10385 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10386 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10387 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10388 return isa<UndefValue>(V) || !isConstant(V);
10389 }))) {
10390 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10391 S.getMainOp()->isSafeToRemove() &&
10392 (S.areInstructionsWithCopyableElements() ||
10393 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10394 // Find the number of elements, which forms full vectors.
10395 unsigned PWSz = getFullVectorNumberOfElements(
10396 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10397 PWSz = std::min<unsigned>(PWSz, VL.size());
10398 if (PWSz == VL.size()) {
10399 // We ended up with the same size after removing duplicates and
10400 // upgrading the resulting vector size to a "nice size". Just keep
10401 // the initial VL then.
10402 ReuseShuffleIndices.clear();
10403 } else {
10404 // Pad unique values with poison to grow the vector to a "nice" size
10405 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10406 UniqueValues.end());
10407 PaddedUniqueValues.append(
10408 PWSz - UniqueValues.size(),
10409 PoisonValue::get(UniqueValues.front()->getType()));
10410 // Check that the operations, extended with poisons/copyable elements, are
10411 // still valid for vectorization (div/rem are not allowed).
10412 if (!S.areInstructionsWithCopyableElements() &&
10413 !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
10414 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10415 ReuseShuffleIndices.clear();
10416 return false;
10417 }
10418 VL = std::move(PaddedUniqueValues);
10419 }
10420 return true;
10421 }
10422 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10423 ReuseShuffleIndices.clear();
10424 return false;
10425 }
10426 VL = std::move(UniqueValues);
10427 return true;
10428}
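// Illustrative example (hypothetical scalars): for VL = {A, B, A, B} the unique
// scalars are {A, B} and ReuseShuffleIndices becomes {0, 1, 0, 1}; whether the
// bundle is then reduced to the unique scalars, padded with poison to a "nice"
// size, or rejected depends on the register-size checks above.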
10429
10430bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10431 const InstructionsState &LocalState,
10432 SmallVectorImpl<Value *> &Op1, SmallVectorImpl<Value *> &Op2,
10433 OrdersType &ReorderIndices) const {
10435 constexpr unsigned SmallNodeSize = 4;
10436 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10437 !SplitAlternateInstructions)
10438 return false;
10439
10440 // Check if this is a duplicate of another split entry.
10441 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10442 << ".\n");
10443 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10444 if (E->isSame(VL)) {
10445 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10446 << *LocalState.getMainOp() << ".\n");
10447 return false;
10448 }
10449 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10450 if (all_of(VL, [&](Value *V) {
10451 return isa<PoisonValue>(V) || Values.contains(V);
10452 })) {
10453 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10454 return false;
10455 }
10456 }
10457
10458 ReorderIndices.assign(VL.size(), VL.size());
10459 SmallBitVector Op1Indices(VL.size());
10460 for (auto [Idx, V] : enumerate(VL)) {
10461 auto *I = dyn_cast<Instruction>(V);
10462 if (!I) {
10463 Op1.push_back(V);
10464 Op1Indices.set(Idx);
10465 continue;
10466 }
10467 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10468 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10469 *TLI)) ||
10470 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10471 !isAlternateInstruction(I, LocalState.getMainOp(),
10472 LocalState.getAltOp(), *TLI))) {
10473 Op1.push_back(V);
10474 Op1Indices.set(Idx);
10475 continue;
10476 }
10477 Op2.push_back(V);
10478 }
10479 Type *ScalarTy = getValueType(VL.front());
10480 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10481 unsigned Opcode0 = LocalState.getOpcode();
10482 unsigned Opcode1 = LocalState.getAltOpcode();
10483 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10484 // Enable split node, only if all nodes do not form legal alternate
10485 // instruction (like X86 addsub).
10486 SmallPtrSet<Value *, 8> UOp1(llvm::from_range, Op1);
10487 SmallPtrSet<Value *, 8> UOp2(llvm::from_range, Op2);
10488 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10489 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10490 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10491 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10492 return false;
10493 // Enable split node, only if all nodes are power-of-2/full registers.
10494 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10495 for (unsigned Idx : seq<unsigned>(VL.size())) {
10496 if (Op1Indices.test(Idx)) {
10497 ReorderIndices[Op1Cnt] = Idx;
10498 ++Op1Cnt;
10499 } else {
10500 ReorderIndices[Op2Cnt] = Idx;
10501 ++Op2Cnt;
10502 }
10503 }
10504 if (isIdentityOrder(ReorderIndices))
10505 ReorderIndices.clear();
10506 SmallVector<int> Mask;
10507 if (!ReorderIndices.empty())
10508 inversePermutation(ReorderIndices, Mask);
10509 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10510 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10511 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10512 // Check for non-profitable single-register ops, which are better represented
10513 // as alternate ops.
10514 if (NumParts >= VL.size())
10515 return false;
10516 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10517 InstructionCost InsertCost = ::getShuffleCost(
10518 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10519 FixedVectorType *SubVecTy =
10520 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10521 InstructionCost NewShuffleCost =
10522 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10523 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10524 (Mask.empty() || InsertCost >= NewShuffleCost))
10525 return false;
10526 if ((LocalState.getMainOp()->isBinaryOp() &&
10527 LocalState.getAltOp()->isBinaryOp() &&
10528 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10529 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10530 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10531 (LocalState.getMainOp()->isUnaryOp() &&
10532 LocalState.getAltOp()->isUnaryOp())) {
10533 InstructionCost OriginalVecOpsCost =
10534 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10535 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10536 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10537 for (unsigned Idx : seq<unsigned>(VL.size())) {
10538 if (isa<PoisonValue>(VL[Idx]))
10539 continue;
10540 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10541 }
10542 InstructionCost OriginalCost =
10543 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10544 VecTy, OriginalMask, Kind);
10545 InstructionCost NewVecOpsCost =
10546 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10547 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10548 InstructionCost NewCost =
10549 NewVecOpsCost + InsertCost +
10550 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10551 VectorizableTree.front()->getOpcode() == Instruction::Store
10552 ? NewShuffleCost
10553 : 0);
10554 // If not profitable to split - exit.
10555 if (NewCost >= OriginalCost)
10556 return false;
10557 }
10558 return true;
10559}
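// Illustrative outcome (hypothetical target costs): splitting an 8-wide add/sub
// bundle into a 4-wide add node plus a 4-wide sub node passes the cost check
// above only if NewVecOpsCost + InsertCost (plus NewShuffleCost for
// store-rooted trees) is strictly smaller than OriginalCost, i.e. the two
// full-width vector ops plus the two-source blend they would otherwise need.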
10560
10561namespace {
10562/// Class accepts incoming list of values, checks if it is able to model
10563/// "copyable" values as compatible operations, and generates the list of values
10564 /// for scheduling and the list of operands for the new nodes.
10565class InstructionsCompatibilityAnalysis {
10566 DominatorTree &DT;
10567 const DataLayout &DL;
10568 const TargetTransformInfo &TTI;
10569 const TargetLibraryInfo &TLI;
10570 unsigned MainOpcode = 0;
10571 Instruction *MainOp = nullptr;
10572
10573 /// Checks if the opcode is supported as the main opcode for copyable
10574 /// elements.
10575 static bool isSupportedOpcode(const unsigned Opcode) {
10576 return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10577 }
10578
10579 /// Identifies the best candidate value, which represents the main opcode
10580 /// operation.
10581 /// Currently the best candidate is the Add instruction whose parent block
10582 /// has the highest DFS in-number (i.e. the block dominated by the others).
10583 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10584 BasicBlock *Parent = nullptr;
10585 // Checks if the instruction has a supported opcode.
10586 auto IsSupportedInstruction = [&](Instruction *I) {
10587 return I && isSupportedOpcode(I->getOpcode()) &&
10588 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10589 };
10590 // Exclude operand instructions immediately to improve compile time; they
10591 // cannot be scheduled anyway.
10592 SmallDenseMap<unsigned, SmallVector<Instruction *>> Candidates;
10593 SmallPtrSet<Value *, 8> Operands;
10594 for (Value *V : VL) {
10595 auto *I = dyn_cast<Instruction>(V);
10596 if (!I)
10597 continue;
10598 if (!DT.isReachableFromEntry(I->getParent()))
10599 continue;
10600 if (Candidates.empty()) {
10601 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10602 Parent = I->getParent();
10603 Operands.insert(I->op_begin(), I->op_end());
10604 continue;
10605 }
10606 if (Parent == I->getParent()) {
10607 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10608 Operands.insert(I->op_begin(), I->op_end());
10609 continue;
10610 }
10611 auto *NodeA = DT.getNode(Parent);
10612 auto *NodeB = DT.getNode(I->getParent());
10613 assert(NodeA && "Should only process reachable instructions");
10614 assert(NodeB && "Should only process reachable instructions");
10615 assert((NodeA == NodeB) ==
10616 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10617 "Different nodes should have different DFS numbers");
10618 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10619 Candidates.clear();
10620 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10621 Parent = I->getParent();
10622 Operands.clear();
10623 Operands.insert(I->op_begin(), I->op_end());
10624 }
10625 }
10626 unsigned BestOpcodeNum = 0;
10627 MainOp = nullptr;
10628 for (const auto &P : Candidates) {
10629 if (P.second.size() < BestOpcodeNum)
10630 continue;
10631 for (Instruction *I : P.second) {
10632 if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10633 MainOp = I;
10634 BestOpcodeNum = P.second.size();
10635 break;
10636 }
10637 }
10638 }
10639 if (MainOp) {
10640 // Do not match if any copyable is a terminator from the same block as
10641 // the main operation.
10642 if (any_of(VL, [&](Value *V) {
10643 auto *I = dyn_cast<Instruction>(V);
10644 return I && I->getParent() == MainOp->getParent() &&
10645 I->isTerminator();
10646 })) {
10647 MainOp = nullptr;
10648 return;
10649 }
10650 MainOpcode = MainOp->getOpcode();
10651 }
10652 }
10653
10654 /// Returns the idempotent value for the \p MainOp with the detected \p
10655 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10656 /// the operand itself, since V or V == V.
10657 Value *selectBestIdempotentValue() const {
10658 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10659 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10660 !MainOp->isCommutative());
10661 }
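// For example, with MainOpcode == Instruction::Add the identity returned above
// is 0, so a copyable value V is later modeled as "V + 0"; for LShr the
// identity is likewise 0, i.e. "V >> 0".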
10662
10663 /// Returns the value and operands for \p V, considering whether it is an
10664 /// original instruction whose actual operands should be returned, or a
10665 /// copyable element that should be represented as an idempotent instruction.
10666 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10667 if (isa<PoisonValue>(V))
10668 return {V, V};
10669 if (!S.isCopyableElement(V))
10670 return convertTo(cast<Instruction>(V), S).second;
10671 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10672 return {V, selectBestIdempotentValue()};
10673 }
10674
10675 /// Builds operands for the original instructions.
10676 void
10677 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10678 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10679
10680 unsigned ShuffleOrOp =
10681 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10682 Instruction *VL0 = S.getMainOp();
10683
10684 switch (ShuffleOrOp) {
10685 case Instruction::PHI: {
10686 auto *PH = cast<PHINode>(VL0);
10687
10688 // Keeps the reordered operands to avoid code duplication.
10689 PHIHandler Handler(DT, PH, VL);
10690 Handler.buildOperands();
10691 Operands.assign(PH->getNumOperands(), {});
10692 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10693 Operands[I].assign(Handler.getOperands(I).begin(),
10694 Handler.getOperands(I).end());
10695 return;
10696 }
10697 case Instruction::ExtractValue:
10698 case Instruction::ExtractElement:
10699 // This is a special case, as it does not gather, but at the same time
10700 // we are not extending buildTreeRec() towards the operands.
10701 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10702 return;
10703 case Instruction::InsertElement:
10704 Operands.assign(2, {VL.size(), nullptr});
10705 for (auto [Idx, V] : enumerate(VL)) {
10706 auto *IE = cast<InsertElementInst>(V);
10707 for (auto [OpIdx, Ops] : enumerate(Operands))
10708 Ops[Idx] = IE->getOperand(OpIdx);
10709 }
10710 return;
10711 case Instruction::Load:
10712 Operands.assign(
10713 1, {VL.size(),
10714 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10715 for (auto [V, Op] : zip(VL, Operands.back())) {
10716 auto *LI = dyn_cast<LoadInst>(V);
10717 if (!LI)
10718 continue;
10719 Op = LI->getPointerOperand();
10720 }
10721 return;
10722 case Instruction::ZExt:
10723 case Instruction::SExt:
10724 case Instruction::FPToUI:
10725 case Instruction::FPToSI:
10726 case Instruction::FPExt:
10727 case Instruction::PtrToInt:
10728 case Instruction::IntToPtr:
10729 case Instruction::SIToFP:
10730 case Instruction::UIToFP:
10731 case Instruction::Trunc:
10732 case Instruction::FPTrunc:
10733 case Instruction::BitCast:
10734 case Instruction::ICmp:
10735 case Instruction::FCmp:
10736 case Instruction::Select:
10737 case Instruction::FNeg:
10738 case Instruction::Add:
10739 case Instruction::FAdd:
10740 case Instruction::Sub:
10741 case Instruction::FSub:
10742 case Instruction::Mul:
10743 case Instruction::FMul:
10744 case Instruction::UDiv:
10745 case Instruction::SDiv:
10746 case Instruction::FDiv:
10747 case Instruction::URem:
10748 case Instruction::SRem:
10749 case Instruction::FRem:
10750 case Instruction::Shl:
10751 case Instruction::LShr:
10752 case Instruction::AShr:
10753 case Instruction::And:
10754 case Instruction::Or:
10755 case Instruction::Xor:
10756 case Instruction::Freeze:
10757 case Instruction::Store:
10758 case Instruction::ShuffleVector:
10759 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10760 for (auto [Idx, V] : enumerate(VL)) {
10761 auto *I = dyn_cast<Instruction>(V);
10762 if (!I) {
10763 for (auto [OpIdx, Ops] : enumerate(Operands))
10764 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10765 continue;
10766 }
10767 auto [Op, ConvertedOps] = convertTo(I, S);
10768 for (auto [OpIdx, Ops] : enumerate(Operands))
10769 Ops[Idx] = ConvertedOps[OpIdx];
10770 }
10771 return;
10772 case Instruction::GetElementPtr: {
10773 Operands.assign(2, {VL.size(), nullptr});
10774 // Need to cast all indices to the same type before vectorization to
10775 // avoid crash.
10776 // Required to be able to find correct matches between different gather
10777 // nodes and reuse the vectorized values rather than trying to gather them
10778 // again.
10779 const unsigned IndexIdx = 1;
10780 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10781 Type *Ty =
10782 all_of(VL,
10783 [&](Value *V) {
10784 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10785 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10786 })
10787 ? VL0Ty
10788 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10789 ->getPointerOperandType()
10790 ->getScalarType());
10791 for (auto [Idx, V] : enumerate(VL)) {
10792 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10793 if (!GEP) {
10794 Operands[0][Idx] = V;
10795 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10796 continue;
10797 }
10798 Operands[0][Idx] = GEP->getPointerOperand();
10799 auto *Op = GEP->getOperand(IndexIdx);
10800 auto *CI = dyn_cast<ConstantInt>(Op);
10801 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10802 CI, Ty, CI->getValue().isSignBitSet(), DL)
10803 : Op;
10804 }
10805 return;
10806 }
10807 case Instruction::Call: {
10808 auto *CI = cast<CallInst>(VL0);
10809 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
10810 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10811 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
10812 continue;
10813 auto &Ops = Operands.emplace_back();
10814 for (Value *V : VL) {
10815 auto *I = dyn_cast<Instruction>(V);
10816 Ops.push_back(I ? I->getOperand(Idx)
10817 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10818 }
10819 }
10820 return;
10821 }
10822 default:
10823 break;
10824 }
10825 llvm_unreachable("Unexpected vectorization of the instructions.");
10826 }
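// Illustrative example: for a bundle VL = {a + b, c + d} of binary operators,
// the generic case above produces Operands[0] = {a, c} and Operands[1] = {b, d},
// i.e. one ValueList per operand position, indexed by the lane in VL.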
10827
10828public:
10829 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10830 const TargetTransformInfo &TTI,
10831 const TargetLibraryInfo &TLI)
10832 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10833
10834 InstructionsState
10835 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
10836 bool TryCopyableElementsVectorization,
10837 bool WithProfitabilityCheck = false,
10838 bool SkipSameCodeCheck = false) {
10839 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
10840 ? InstructionsState::invalid()
10841 : getSameOpcode(VL, TLI);
10842 if (S)
10843 return S;
10844 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
10845 return S;
10846 findAndSetMainInstruction(VL, R);
10847 if (!MainOp)
10848 return InstructionsState::invalid();
10849 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
10850 if (!WithProfitabilityCheck)
10851 return S;
10852 // Check if it is profitable to vectorize the instruction.
10853 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
10854 auto BuildCandidates =
10855 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
10856 Value *V2) {
10857 if (V1 != V2 && isa<PHINode>(V1))
10858 return;
10859 auto *I1 = dyn_cast<Instruction>(V1);
10860 auto *I2 = dyn_cast<Instruction>(V2);
10861 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
10862 I1->getParent() != I2->getParent())
10863 return;
10864 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
10865 };
10866 if (VL.size() == 2) {
10867 // Check if the operands allow better vectorization.
10868 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
10869 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
10870 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
10871 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
10872 R.findBestRootPair(Candidates1) &&
10873 R.findBestRootPair(Candidates2);
10874 if (!Res && isCommutative(MainOp)) {
10875 Candidates1.clear();
10876 Candidates2.clear();
10877 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
10878 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
10879 Res = !Candidates1.empty() && !Candidates2.empty() &&
10880 R.findBestRootPair(Candidates1) &&
10881 R.findBestRootPair(Candidates2);
10882 }
10883 if (!Res)
10884 return InstructionsState::invalid();
10885 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10886 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10887 InstructionCost VectorCost;
10888 FixedVectorType *VecTy =
10889 getWidenedType(S.getMainOp()->getType(), VL.size());
10890 switch (MainOpcode) {
10891 case Instruction::Add:
10892 case Instruction::LShr:
10893 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10894 break;
10895 default:
10896 llvm_unreachable("Unexpected instruction.");
10897 }
10898 if (VectorCost > ScalarCost)
10899 return InstructionsState::invalid();
10900 return S;
10901 }
10902 assert(Operands.size() == 2 && "Unexpected number of operands!");
10903 unsigned CopyableNum =
10904 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
10905 if (CopyableNum < VL.size() / 2)
10906 return S;
10907 // Too many phi copyables - exit.
10908 const unsigned Limit = VL.size() / 24;
10909 if ((CopyableNum >= VL.size() - Limit ||
10910 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
10911 CopyableNum >= MaxPHINumOperands) &&
10912 all_of(VL, [&](Value *V) {
10913 return isa<PHINode>(V) || !S.isCopyableElement(V);
10914 }))
10915 return InstructionsState::invalid();
10916 // Check profitability if number of copyables > VL.size() / 2.
10917 // 1. Reorder operands for better matching.
10918 if (isCommutative(MainOp)) {
10919 for (auto &Ops : Operands) {
10920 // Make instructions the first operands.
10921 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
10922 std::swap(Ops.front(), Ops.back());
10923 continue;
10924 }
10925 // Make constants the second operands.
10926 if (isa<Constant>(Ops.front())) {
10927 std::swap(Ops.front(), Ops.back());
10928 continue;
10929 }
10930 }
10931 }
10932 // 2. Check if the operands can be vectorized.
10933 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
10934 return InstructionsState::invalid();
10935 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
10936 if (allConstant(Ops) || isSplat(Ops))
10937 return true;
10938 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
10939 // single one is different.
10940 constexpr unsigned Limit = 4;
10941 if (Operands.front().size() >= Limit) {
10942 SmallDenseMap<Value *, unsigned> Counters;
10943 for (Value *V : Ops) {
10944 if (isa<UndefValue>(V))
10945 continue;
10946 ++Counters[V];
10947 }
10948 if (Counters.size() == 2 &&
10949 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
10950 return C.second == 1;
10951 }))
10952 return true;
10953 }
10954 // First operand not a constant or splat? Last attempt - check for
10955 // potential vectorization.
10956 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
10957 InstructionsState OpS = Analysis.buildInstructionsState(
10958 Ops, R, /*TryCopyableElementsVectorization=*/true);
10959 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
10960 return false;
10961 unsigned CopyableNum =
10962 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
10963 return CopyableNum <= VL.size() / 2;
10964 };
10965 if (!CheckOperand(Operands.front()))
10966 return InstructionsState::invalid();
10967
10968 return S;
10969 }
10970
10971 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
10972 ArrayRef<Value *> VL) {
10973 assert(S && "Invalid state!");
10974 SmallVector<BoUpSLP::ValueList> Operands;
10975 if (S.areInstructionsWithCopyableElements()) {
10976 MainOp = S.getMainOp();
10977 MainOpcode = S.getOpcode();
10978 Operands.assign(MainOp->getNumOperands(),
10979 BoUpSLP::ValueList(VL.size(), nullptr));
10980 for (auto [Idx, V] : enumerate(VL)) {
10981 SmallVector<Value *> OperandsForValue = getOperands(S, V);
10982 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
10983 Operands[OperandIdx][Idx] = Operand;
10984 }
10985 } else {
10986 buildOriginalOperands(S, VL, Operands);
10987 }
10988 return Operands;
10989 }
10990};
10991} // namespace
10992
10993BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
10994 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
10995 bool TryCopyableElementsVectorization) const {
10996 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
10997
10998 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
10999 InstructionsState S = Analysis.buildInstructionsState(
11000 VL, *this, TryCopyableElementsVectorization,
11001 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11002
11003 // Don't go into catchswitch blocks, which can happen with PHIs.
11004 // Such blocks can only have PHIs and the catchswitch. There is no
11005 // place to insert a shuffle if we need to, so just avoid that issue.
11006 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11007 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11008 // Do not try to pack to avoid extra instructions here.
11009 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11010 /*TryToFindDuplicates=*/false);
11011 }
11012
11013 // Check if this is a duplicate of another entry.
11014 if (S) {
11015 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11016 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11017 if (E->isSame(VL)) {
11018 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11019 << ".\n");
11020 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11021 }
11022 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11023 if (all_of(VL, [&](Value *V) {
11024 return isa<PoisonValue>(V) || Values.contains(V) ||
11025 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11026 LI->getLoopFor(S.getMainOp()->getParent()) &&
11027 isVectorized(V));
11028 })) {
11029 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11030 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11031 }
11032 }
11033 }
11034
11035 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11036 // a load), in which case peek through to include it in the tree, without
11037 // ballooning over-budget.
11038 if (Depth >= RecursionMaxDepth &&
11039 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11040 (match(S.getMainOp(), m_Load(m_Value())) ||
11041 all_of(VL, [&S](const Value *I) {
11042 return match(I,
11043 m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
11044 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11045 })))) {
11046 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11047 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11048 }
11049
11050 // Don't handle scalable vectors
11051 if (S && S.getOpcode() == Instruction::ExtractElement &&
11052 isa<ScalableVectorType>(
11053 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11054 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11055 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11056 }
11057
11058 // Don't handle vectors.
11059 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11060 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11061 // Do not try to pack to avoid extra instructions here.
11062 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11063 /*TryToFindDuplicates=*/false);
11064 }
11065
11066 // If all of the operands are identical or constant we have a simple solution.
11067 // If we deal with insert/extract instructions, they all must have constant
11068 // indices, otherwise we should gather them, not try to vectorize.
11069 // If alternate op node with 2 elements with gathered operands - do not
11070 // vectorize.
11071 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11072 if (!S || !S.isAltShuffle() || VL.size() > 2)
11073 return false;
11074 if (VectorizableTree.size() < MinTreeSize)
11075 return false;
11076 if (Depth >= RecursionMaxDepth - 1)
11077 return true;
11078 // Check if all operands are extracts, part of vector node or can build a
11079 // regular vectorize node.
11080 SmallVector<unsigned, 8> InstsCount;
11081 for (Value *V : VL) {
11082 auto *I = cast<Instruction>(V);
11083 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11084 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11085 }));
11086 }
11087 bool IsCommutative =
11088 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11089 if ((IsCommutative &&
11090 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11091 (!IsCommutative &&
11092 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11093 return true;
11094 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11095 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11096 auto *I1 = cast<Instruction>(VL.front());
11097 auto *I2 = cast<Instruction>(VL.back());
11098 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11099 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11100 I2->getOperand(Op));
11101 if (static_cast<unsigned>(count_if(
11102 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11104 })) >= S.getMainOp()->getNumOperands() / 2)
11105 return false;
11106 if (S.getMainOp()->getNumOperands() > 2)
11107 return true;
11108 if (IsCommutative) {
11109 // Check permuted operands.
11110 Candidates.clear();
11111 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11112 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11113 I2->getOperand((Op + 1) % E));
11114 if (any_of(
11115 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11117 }))
11118 return false;
11119 }
11120 return true;
11121 };
11122 SmallVector<unsigned> SortedIndices;
11123 BasicBlock *BB = nullptr;
11124 bool IsScatterVectorizeUserTE =
11125 UserTreeIdx.UserTE &&
11126 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11127 bool AreAllSameBlock = S.valid();
11128 bool AreScatterAllGEPSameBlock =
11129 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11130 VL.size() > 2 &&
11131 all_of(VL,
11132 [&BB](Value *V) {
11133 auto *I = dyn_cast<GetElementPtrInst>(V);
11134 if (!I)
11135 return doesNotNeedToBeScheduled(V);
11136 if (!BB)
11137 BB = I->getParent();
11138 return BB == I->getParent() && I->getNumOperands() == 2;
11139 }) &&
11140 BB &&
11141 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11142 SortedIndices));
11143 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11144 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11145 (S &&
11146 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11147 S.getMainOp()) &&
11148 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11149 NotProfitableForVectorization(VL)) {
11150 if (!S) {
11151 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11152 "C,S,B,O, small shuffle. \n";
11153 dbgs() << "[";
11154 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11155 dbgs() << "]\n");
11156 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11157 /*TryToFindDuplicates=*/true,
11158 /*TrySplitVectorize=*/true);
11159 }
11160 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11161 dbgs() << "[";
11162 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11163 dbgs() << "]\n");
11164 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11165 }
11166
11167 // Don't vectorize ephemeral values.
11168 if (S && !EphValues.empty()) {
11169 for (Value *V : VL) {
11170 if (EphValues.count(V)) {
11171 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11172 << ") is ephemeral.\n");
11173 // Do not try to pack to avoid extra instructions here.
11174 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11175 /*TryToFindDuplicates=*/false);
11176 }
11177 }
11178 }
11179
11180 // We now know that this is a vector of instructions of the same type from
11181 // the same block.
11182
11183 // Check whether some of the instructions in the bundle are already in the
11184 // tree and whether the node would not be profitable to vectorize as a small
11185 // alternate node.
11186 if (S && S.isAltShuffle()) {
11187 auto GetNumVectorizedExtracted = [&]() {
11188 APInt Extracted = APInt::getZero(VL.size());
11189 APInt Vectorized = APInt::getAllOnes(VL.size());
11190 for (auto [Idx, V] : enumerate(VL)) {
11191 auto *I = dyn_cast<Instruction>(V);
11192 if (!I || doesNotNeedToBeScheduled(I) ||
11193 all_of(I->operands(), [&](const Use &U) {
11194 return isa<ExtractElementInst>(U.get());
11195 }))
11196 continue;
11197 if (isVectorized(I))
11198 Vectorized.clearBit(Idx);
11199 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11200 Extracted.setBit(Idx);
11201 }
11202 return std::make_pair(Vectorized, Extracted);
11203 };
11204 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11205 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11206 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11207 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11208 // Rough cost estimation, if the vector code (+ potential extracts) is
11209 // more profitable than the scalar + buildvector.
11210 Type *ScalarTy = VL.front()->getType();
11211 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11212 InstructionCost VectorizeCostEstimate =
11214 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11215 /*Insert=*/false, /*Extract=*/true, Kind);
11216 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11217 *TTI, ScalarTy, VecTy, Vectorized,
11218 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11219 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11220 }
11221 if (PreferScalarize) {
11222 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11223 "node is not profitable.\n");
11224 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11225 }
11226 }
11227
11228 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11229 if (UserIgnoreList && !UserIgnoreList->empty()) {
11230 for (Value *V : VL) {
11231 if (UserIgnoreList->contains(V)) {
11232 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11233 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11234 }
11235 }
11236 }
11237
11238 // Special processing for sorted pointers for ScatterVectorize node with
11239 // constant indices only.
11240 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11241 assert(VL.front()->getType()->isPointerTy() &&
11242 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
11243 "Expected pointers only.");
11244 // Reset S to make it GetElementPtr kind of node.
11245 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11246 assert(It != VL.end() && "Expected at least one GEP.");
11247 S = getSameOpcode(*It, *TLI);
11248 }
11249
11250 // Check that all of the users of the scalars that we want to vectorize are
11251 // schedulable.
11252 Instruction *VL0 = S.getMainOp();
11253 BB = VL0->getParent();
11254
11255 if (S &&
11256 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11257 !DT->isReachableFromEntry(BB))) {
11258 // Don't go into unreachable blocks. They may contain instructions with
11259 // dependency cycles which confuse the final scheduling.
11260 // Do not vectorize EH and non-returning blocks, not profitable in most
11261 // cases.
11262 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11263 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11264 }
11265 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11266}
11267
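/// Recursively builds the vectorization tree for the bundle \p VLRef at the
/// given \p Depth: checks whether the scalars can legally be vectorized
/// (falling back to split or gather nodes when they cannot), tries to schedule
/// the bundle, and then creates a TreeEntry for the main opcode and recurses
/// into its operands.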
11268void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11269 const EdgeInfo &UserTreeIdx,
11270 unsigned InterleaveFactor) {
11271 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11272
11273 SmallVector<int> ReuseShuffleIndices;
11274 SmallVector<Value *> VL(VLRef);
11275
11276 // Tries to build split node.
11277 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11278 SmallVector<Value *> Op1, Op2;
11279 OrdersType ReorderIndices;
11280 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11281 return false;
11282
11283 SmallVector<Value *> NewVL(VL.size());
11284 copy(Op1, NewVL.begin());
11285 copy(Op2, std::next(NewVL.begin(), Op1.size()));
11286 auto Invalid = ScheduleBundle::invalid();
11287 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11288 UserTreeIdx, {}, ReorderIndices);
11289 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11290 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11291 InstructionsState S = getSameOpcode(Op, *TLI);
11292 if (S && (isa<LoadInst>(S.getMainOp()) ||
11293 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11294 // Build a gather node for loads; they will be gathered later.
11295 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11296 Idx == 0 ? 0 : Op1.size());
11297 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11298 } else {
11299 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11300 Idx == 0 ? 0 : Op1.size());
11301 buildTreeRec(Op, Depth, {TE, Idx});
11302 }
11303 };
11304 AddNode(Op1, 0);
11305 AddNode(Op2, 1);
11306 return true;
11307 };
11308
11309 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11310 bool AreConsts = false;
11311 for (Value *V : VL) {
11312 if (isa<PoisonValue>(V))
11313 continue;
11314 if (isa<Constant>(V)) {
11315 AreConsts = true;
11316 continue;
11317 }
11318 if (!isa<PHINode>(V))
11319 return false;
11320 }
11321 return AreConsts;
11322 };
11323 if (AreOnlyConstsWithPHIs(VL)) {
11324 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11325 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11326 return;
11327 }
11328
11329 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11330 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11331 InstructionsState S = Legality.getInstructionsState();
11332 if (!Legality.isLegal()) {
11333 if (Legality.trySplitVectorize()) {
11334 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11335 // Last chance to try to vectorize alternate node.
11336 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11337 return;
11338 }
11339 if (!S)
11340 Legality = getScalarsVectorizationLegality(
11341 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11342 if (!Legality.isLegal()) {
11343 if (Legality.tryToFindDuplicates())
11344 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11345 UserTreeIdx);
11346
11347 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11348 return;
11349 }
11350 S = Legality.getInstructionsState();
11351 }
11352
11353 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11354 if (S.isAltShuffle() && TrySplitNode(S))
11355 return;
11356
11357 // Check that every instruction appears once in this bundle.
11358 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11359 /*TryPad=*/true)) {
11360 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11361 return;
11362 }
11363
11364 // Perform specific checks for each particular instruction kind.
11365 bool IsScatterVectorizeUserTE =
11366 UserTreeIdx.UserTE &&
11367 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11368 OrdersType CurrentOrder;
11369 SmallVector<Value *> PointerOps;
11370 TreeEntry::EntryState State = getScalarsVectorizationState(
11371 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
11372 if (State == TreeEntry::NeedToGather) {
11373 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11374 return;
11375 }
11376
11377 Instruction *VL0 = S.getMainOp();
11378 BasicBlock *BB = VL0->getParent();
11379 auto &BSRef = BlocksSchedules[BB];
11380 if (!BSRef)
11381 BSRef = std::make_unique<BlockScheduling>(BB);
11382
11383 BlockScheduling &BS = *BSRef;
11384
11385 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11386 std::optional<ScheduleBundle *> BundlePtr =
11387 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11388#ifdef EXPENSIVE_CHECKS
11389 // Make sure we didn't break any internal invariants
11390 BS.verify();
11391#endif
11392 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11393 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11394 // Last chance to try to vectorize alternate node.
11395 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11396 return;
11397 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11398 NonScheduledFirst.insert(VL.front());
11399 if (S.getOpcode() == Instruction::Load &&
11400 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11401 registerNonVectorizableLoads(ArrayRef(VL));
11402 return;
11403 }
11404 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11405 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11406 ScheduleBundle Empty;
11407 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11408 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11409
11410 unsigned ShuffleOrOp =
11411 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11412 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11413 // Postpone PHI node creation.
11414 SmallVector<unsigned> PHIOps;
11415 for (unsigned I : seq<unsigned>(Operands.size())) {
11416 ArrayRef<Value *> Op = Operands[I];
11417 if (Op.empty())
11418 continue;
11419 InstructionsState S = getSameOpcode(Op, *TLI);
11420 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11421 buildTreeRec(Op, Depth + 1, {TE, I});
11422 else
11423 PHIOps.push_back(I);
11424 }
11425 for (unsigned I : PHIOps)
11426 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11427 };
11428 switch (ShuffleOrOp) {
11429 case Instruction::PHI: {
11430 TreeEntry *TE =
11431 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11432 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11433 TE->dump());
11434
11435 TE->setOperands(Operands);
11436 CreateOperandNodes(TE, Operands);
11437 return;
11438 }
11439 case Instruction::ExtractValue:
11440 case Instruction::ExtractElement: {
11441 if (CurrentOrder.empty()) {
11442 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11443 } else {
11444 LLVM_DEBUG({
11445 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11446 "with order";
11447 for (unsigned Idx : CurrentOrder)
11448 dbgs() << " " << Idx;
11449 dbgs() << "\n";
11450 });
11451 fixupOrderingIndices(CurrentOrder);
11452 }
11453 // Insert new order with initial value 0, if it does not exist,
11454 // otherwise return the iterator to the existing one.
11455 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11456 ReuseShuffleIndices, CurrentOrder);
11457 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11458 "(ExtractValueInst/ExtractElementInst).\n";
11459 TE->dump());
11460 // This is a special case, as it does not gather, but at the same time
11461 // we are not extending buildTreeRec() towards the operands.
11462 TE->setOperands(Operands);
11463 return;
11464 }
11465 case Instruction::InsertElement: {
11466 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11467
11468 auto OrdCompare = [](const std::pair<int, int> &P1,
11469 const std::pair<int, int> &P2) {
11470 return P1.first > P2.first;
11471 };
11472 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11473 decltype(OrdCompare)>
11474 Indices(OrdCompare);
11475 for (int I = 0, E = VL.size(); I < E; ++I) {
11476 unsigned Idx = *getElementIndex(VL[I]);
11477 Indices.emplace(Idx, I);
11478 }
11479 OrdersType CurrentOrder(VL.size(), VL.size());
11480 bool IsIdentity = true;
11481 for (int I = 0, E = VL.size(); I < E; ++I) {
11482 CurrentOrder[Indices.top().second] = I;
11483 IsIdentity &= Indices.top().second == I;
11484 Indices.pop();
11485 }
11486 if (IsIdentity)
11487 CurrentOrder.clear();
11488 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11489 {}, CurrentOrder);
11490 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11491 TE->dump());
11492
11493 TE->setOperands(Operands);
11494 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11495 return;
11496 }
11497 case Instruction::Load: {
11498 // Check that a vectorized load would load the same memory as a scalar
11499 // load. For example, we don't want to vectorize loads that are smaller
11500 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11501 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11502 // from such a struct, we read/write packed bits disagreeing with the
11503 // unvectorized version.
11504 TreeEntry *TE = nullptr;
11505 fixupOrderingIndices(CurrentOrder);
11506 switch (State) {
11507 case TreeEntry::Vectorize:
11508 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11509 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11510 if (CurrentOrder.empty())
11511 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11512 TE->dump());
11513 else
11514 LLVM_DEBUG(dbgs()
11515 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11516 TE->dump());
11517 break;
11518 case TreeEntry::CompressVectorize:
11519 // Vectorizing non-consecutive loads with (masked)load + compress.
11520 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11521 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11522 LLVM_DEBUG(
11523 dbgs()
11524 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11525 TE->dump());
11526 break;
11527 case TreeEntry::StridedVectorize:
11528 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11529 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11530 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11531 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11532 TE->dump());
11533 break;
11534 case TreeEntry::ScatterVectorize:
11535 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11536 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11537 UserTreeIdx, ReuseShuffleIndices);
11538 LLVM_DEBUG(
11539 dbgs()
11540 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11541 TE->dump());
11542 break;
11543 case TreeEntry::CombinedVectorize:
11544 case TreeEntry::SplitVectorize:
11545 case TreeEntry::NeedToGather:
11546 llvm_unreachable("Unexpected loads state.");
11547 }
11548 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11549 assert(Operands.size() == 1 && "Expected a single operand only");
11550 SmallVector<int> Mask;
11551 inversePermutation(CurrentOrder, Mask);
11552 reorderScalars(Operands.front(), Mask);
11553 }
11554 TE->setOperands(Operands);
11555 if (State == TreeEntry::ScatterVectorize)
11556 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11557 return;
11558 }
11559 case Instruction::ZExt:
11560 case Instruction::SExt:
11561 case Instruction::FPToUI:
11562 case Instruction::FPToSI:
11563 case Instruction::FPExt:
11564 case Instruction::PtrToInt:
11565 case Instruction::IntToPtr:
11566 case Instruction::SIToFP:
11567 case Instruction::UIToFP:
11568 case Instruction::Trunc:
11569 case Instruction::FPTrunc:
11570 case Instruction::BitCast: {
11571 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11572 std::make_pair(std::numeric_limits<unsigned>::min(),
11573 std::numeric_limits<unsigned>::max()));
11574 if (ShuffleOrOp == Instruction::ZExt ||
11575 ShuffleOrOp == Instruction::SExt) {
11576 CastMaxMinBWSizes = std::make_pair(
11577 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11578 PrevMaxBW),
11579 std::min<unsigned>(
11580 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11581 PrevMinBW));
11582 } else if (ShuffleOrOp == Instruction::Trunc) {
11583 CastMaxMinBWSizes = std::make_pair(
11584 std::max<unsigned>(
11585 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11586 PrevMaxBW),
11587 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11588 PrevMinBW));
11589 }
11590 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11591 ReuseShuffleIndices);
11592 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11593 TE->dump());
11594
11595 TE->setOperands(Operands);
11596 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11597 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11598 if (ShuffleOrOp == Instruction::Trunc) {
11599 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11600 } else if (ShuffleOrOp == Instruction::SIToFP ||
11601 ShuffleOrOp == Instruction::UIToFP) {
11602 unsigned NumSignBits =
11603 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11604 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11605 APInt Mask = DB->getDemandedBits(OpI);
11606 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11607 }
11608 if (NumSignBits * 2 >=
11609 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11610 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11611 }
11612 return;
11613 }
11614 case Instruction::ICmp:
11615 case Instruction::FCmp: {
11616 // Check that all of the compares have the same predicate.
11617 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11618 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11619 ReuseShuffleIndices);
11620 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11621 TE->dump());
11622
11623 VLOperands Ops(VL, Operands, S, *this);
11624 if (cast<CmpInst>(VL0)->isCommutative()) {
11625 // Commutative predicate - collect + sort operands of the instructions
11626 // so that each side is more likely to have the same opcode.
11627 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
11628 "Commutative Predicate mismatch");
11629 Ops.reorder();
11630 Operands.front() = Ops.getVL(0);
11631 Operands.back() = Ops.getVL(1);
11632 } else {
11633 // Collect operands - commute if it uses the swapped predicate.
11634 for (auto [Idx, V] : enumerate(VL)) {
11635 if (isa<PoisonValue>(V))
11636 continue;
11637 auto *Cmp = cast<CmpInst>(V);
11638 if (Cmp->getPredicate() != P0)
11639 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11640 }
11641 }
11642 TE->setOperands(Operands);
11643 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11644 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11645 if (ShuffleOrOp == Instruction::ICmp) {
11646 unsigned NumSignBits0 =
11647 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11648 if (NumSignBits0 * 2 >=
11649 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11650 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11651 unsigned NumSignBits1 =
11652 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11653 if (NumSignBits1 * 2 >=
11654 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11655 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11656 }
11657 return;
11658 }
11659 case Instruction::Select:
11660 case Instruction::FNeg:
11661 case Instruction::Add:
11662 case Instruction::FAdd:
11663 case Instruction::Sub:
11664 case Instruction::FSub:
11665 case Instruction::Mul:
11666 case Instruction::FMul:
11667 case Instruction::UDiv:
11668 case Instruction::SDiv:
11669 case Instruction::FDiv:
11670 case Instruction::URem:
11671 case Instruction::SRem:
11672 case Instruction::FRem:
11673 case Instruction::Shl:
11674 case Instruction::LShr:
11675 case Instruction::AShr:
11676 case Instruction::And:
11677 case Instruction::Or:
11678 case Instruction::Xor:
11679 case Instruction::Freeze: {
11680 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11681 ReuseShuffleIndices);
11682 LLVM_DEBUG(
11683 dbgs() << "SLP: added a new TreeEntry "
11684 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11685 TE->dump());
11686
11687 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11688 VLOperands Ops(VL, Operands, S, *this);
11689 Ops.reorder();
11690 Operands[0] = Ops.getVL(0);
11691 Operands[1] = Ops.getVL(1);
11692 }
11693 TE->setOperands(Operands);
11694 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11695 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11696 return;
11697 }
11698 case Instruction::GetElementPtr: {
11699 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11700 ReuseShuffleIndices);
11701 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11702 TE->dump());
11703 TE->setOperands(Operands);
11704
11705 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11706 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11707 return;
11708 }
11709 case Instruction::Store: {
11710 bool Consecutive = CurrentOrder.empty();
11711 if (!Consecutive)
11712 fixupOrderingIndices(CurrentOrder);
11713 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11714 ReuseShuffleIndices, CurrentOrder);
11715 if (Consecutive)
11716 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11717 TE->dump());
11718 else
11719 LLVM_DEBUG(
11720 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11721 TE->dump());
11722 TE->setOperands(Operands);
11723 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11724 return;
11725 }
11726 case Instruction::Call: {
11727 // Check if the calls are all to the same vectorizable intrinsic or
11728 // library function.
11729 CallInst *CI = cast<CallInst>(VL0);
11730 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11731
11732 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11733 ReuseShuffleIndices);
11734 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11735 TE->dump());
11736 if (isCommutative(VL0)) {
11737 VLOperands Ops(VL, Operands, S, *this);
11738 Ops.reorder();
11739 Operands[0] = Ops.getVL(0);
11740 Operands[1] = Ops.getVL(1);
11741 }
11742 TE->setOperands(Operands);
11743 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11744 // For scalar operands there is no need to create an entry since there is
11745 // nothing to vectorize.
11746 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
11747 continue;
11748 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11749 }
11750 return;
11751 }
11752 case Instruction::ShuffleVector: {
11753 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11754 ReuseShuffleIndices);
11755 if (S.isAltShuffle()) {
11756 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11757 TE->dump());
11758 } else {
11759 assert(SLPReVec && "Only supported by REVEC.");
11760 LLVM_DEBUG(
11761 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11762 TE->dump());
11763 }
11764
11765 // Reorder operands if reordering would enable vectorization.
11766 auto *CI = dyn_cast<CmpInst>(VL0);
11767 if (CI && any_of(VL, [](Value *V) {
11768 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11769 })) {
11770 auto *MainCI = cast<CmpInst>(S.getMainOp());
11771 auto *AltCI = cast<CmpInst>(S.getAltOp());
11772 CmpInst::Predicate MainP = MainCI->getPredicate();
11773 CmpInst::Predicate AltP = AltCI->getPredicate();
11774 assert(MainP != AltP &&
11775 "Expected different main/alternate predicates.");
11776 // Collect operands - commute if it uses the swapped predicate or
11777 // alternate operation.
11778 for (auto [Idx, V] : enumerate(VL)) {
11779 if (isa<PoisonValue>(V))
11780 continue;
11781 auto *Cmp = cast<CmpInst>(V);
11782
11783 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11784 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11785 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11786 } else {
11787 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11788 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11789 }
11790 }
11791 TE->setOperands(Operands);
11792 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11793 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11794 return;
11795 }
11796
11797 if (isa<BinaryOperator>(VL0) || CI) {
11798 VLOperands Ops(VL, Operands, S, *this);
11799 Ops.reorder();
11800 Operands[0] = Ops.getVL(0);
11801 Operands[1] = Ops.getVL(1);
11802 }
11803 TE->setOperands(Operands);
11804 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11805 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11806 return;
11807 }
11808 default:
11809 break;
11810 }
11811 llvm_unreachable("Unexpected vectorization of the instructions.");
11812}
11813
11814 unsigned BoUpSLP::canMapToVector(Type *T) const {
11815 unsigned N = 1;
11816 Type *EltTy = T;
11817
11818 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
11819 if (EltTy->isEmptyTy())
11820 return 0;
11821 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11822 // Check that struct is homogeneous.
11823 for (const auto *Ty : ST->elements())
11824 if (Ty != *ST->element_begin())
11825 return 0;
11826 N *= ST->getNumElements();
11827 EltTy = *ST->element_begin();
11828 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11829 N *= AT->getNumElements();
11830 EltTy = AT->getElementType();
11831 } else {
11832 auto *VT = cast<FixedVectorType>(EltTy);
11833 N *= VT->getNumElements();
11834 EltTy = VT->getElementType();
11835 }
11836 }
11837
11838 if (!isValidElementType(EltTy))
11839 return 0;
11840 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
11841 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
11842 VTSize != DL->getTypeStoreSizeInBits(T))
11843 return 0;
11844 return N;
11845}
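// Example: for a homogeneous aggregate such as struct { i32, i32, i32, i32 },
// the loop above yields EltTy == i32 and N == 4; the result is accepted only if
// the equivalent <4 x i32> fits into [MinVecRegSize, MaxVecRegSize] and has the
// same store size as the original type.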
11846
11847bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
11848 SmallVectorImpl<unsigned> &CurrentOrder,
11849 bool ResizeAllowed) const {
11850 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
11851 assert(It != VL.end() && "Expected at least one extract instruction.");
11852 auto *E0 = cast<Instruction>(*It);
11853 assert(
11854 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
11855 "Invalid opcode");
11856 // Check if all of the extracts come from the same vector and from the
11857 // correct offset.
11858 Value *Vec = E0->getOperand(0);
11859
11860 CurrentOrder.clear();
11861
11862 // We have to extract from a vector/aggregate with the same number of elements.
11863 unsigned NElts;
11864 if (E0->getOpcode() == Instruction::ExtractValue) {
11865 NElts = canMapToVector(Vec->getType());
11866 if (!NElts)
11867 return false;
11868 // Check if load can be rewritten as load of vector.
11869 LoadInst *LI = dyn_cast<LoadInst>(Vec);
11870 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
11871 return false;
11872 } else {
11873 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
11874 }
11875
11876 unsigned E = VL.size();
11877 if (!ResizeAllowed && NElts != E)
11878 return false;
11879 SmallVector<int> Indices(E, PoisonMaskElem);
11880 unsigned MinIdx = NElts, MaxIdx = 0;
11881 for (auto [I, V] : enumerate(VL)) {
11882 auto *Inst = dyn_cast<Instruction>(V);
11883 if (!Inst)
11884 continue;
11885 if (Inst->getOperand(0) != Vec)
11886 return false;
11887 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
11888 if (isa<UndefValue>(EE->getIndexOperand()))
11889 continue;
11890 std::optional<unsigned> Idx = getExtractIndex(Inst);
11891 if (!Idx)
11892 return false;
11893 const unsigned ExtIdx = *Idx;
11894 if (ExtIdx >= NElts)
11895 continue;
11896 Indices[I] = ExtIdx;
11897 if (MinIdx > ExtIdx)
11898 MinIdx = ExtIdx;
11899 if (MaxIdx < ExtIdx)
11900 MaxIdx = ExtIdx;
11901 }
11902 if (MaxIdx - MinIdx + 1 > E)
11903 return false;
11904 if (MaxIdx + 1 <= E)
11905 MinIdx = 0;
11906
11907 // Check that all of the indices extract from the correct offset.
11908 bool ShouldKeepOrder = true;
11909 // Assign to all items the initial value E so we can check if the extract
11910 // instruction index was used already.
11911 // Also, later we can check that all the indices are used and we have
11912 // consecutive accesses in the extract instructions, by checking that no
11913 // element of CurrentOrder still has value E.
11914 CurrentOrder.assign(E, E);
11915 for (unsigned I = 0; I < E; ++I) {
11916 if (Indices[I] == PoisonMaskElem)
11917 continue;
11918 const unsigned ExtIdx = Indices[I] - MinIdx;
11919 if (CurrentOrder[ExtIdx] != E) {
11920 CurrentOrder.clear();
11921 return false;
11922 }
11923 ShouldKeepOrder &= ExtIdx == I;
11924 CurrentOrder[ExtIdx] = I;
11925 }
11926 if (ShouldKeepOrder)
11927 CurrentOrder.clear();
11928
11929 return ShouldKeepOrder;
11930}
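// For instance, extracts <x[1], x[0], x[3], x[2]> from a 4-element vector make
// this return false with CurrentOrder = {1, 0, 3, 2}, while in-order extracts
// <x[0], x[1], x[2], x[3]> return true with CurrentOrder left empty.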
11931
11932bool BoUpSLP::areAllUsersVectorized(
11933 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
11934 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
11935 all_of(I->users(), [this](User *U) {
11936 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
11937 (isa<ExtractElementInst>(U) && MustGather.contains(U));
11938 });
11939}
11940
11941void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
11942 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
11943 SmallVectorImpl<Value *> *OpScalars,
11944 SmallVectorImpl<Value *> *AltScalars) const {
11945 unsigned Sz = Scalars.size();
11946 Mask.assign(Sz, PoisonMaskElem);
11947 SmallVector<int> OrderMask;
11948 if (!ReorderIndices.empty())
11949 inversePermutation(ReorderIndices, OrderMask);
11950 for (unsigned I = 0; I < Sz; ++I) {
11951 unsigned Idx = I;
11952 if (!ReorderIndices.empty())
11953 Idx = OrderMask[I];
11954 if (isa<PoisonValue>(Scalars[Idx]))
11955 continue;
11956 auto *OpInst = cast<Instruction>(Scalars[Idx]);
11957 if (IsAltOp(OpInst)) {
11958 Mask[I] = Sz + Idx;
11959 if (AltScalars)
11960 AltScalars->push_back(OpInst);
11961 } else {
11962 Mask[I] = Idx;
11963 if (OpScalars)
11964 OpScalars->push_back(OpInst);
11965 }
11966 }
11967 if (!ReuseShuffleIndices.empty()) {
11968 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
11969 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
11970 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
11971 });
11972 Mask.swap(NewMask);
11973 }
11974}
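// As an example, for Scalars = {a0+b0, a1-b1, a2+b2, a3-b3} with IsAltOp
// selecting the subtractions (and no reordering/reuse), the resulting Mask is
// <0, Sz+1, 2, Sz+3> with Sz == 4, i.e. <0, 5, 2, 7>.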
11975
11976 static bool isMainInstruction(Instruction *I, Instruction *MainOp,
11977 Instruction *AltOp,
11978 const TargetLibraryInfo &TLI) {
11979 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
11980}
11981
11982 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
11983 Instruction *AltOp,
11984 const TargetLibraryInfo &TLI) {
11985 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
11986 auto *AltCI = cast<CmpInst>(AltOp);
11987 CmpInst::Predicate MainP = MainCI->getPredicate();
11988 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
11989 assert(MainP != AltP && "Expected different main/alternate predicates.");
11990 auto *CI = cast<CmpInst>(I);
11991 if (isCmpSameOrSwapped(MainCI, CI, TLI))
11992 return false;
11993 if (isCmpSameOrSwapped(AltCI, CI, TLI))
11994 return true;
11995 CmpInst::Predicate P = CI->getPredicate();
11996 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
11997
11998 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
11999 "CmpInst expected to match either main or alternate predicate or "
12000 "their swap.");
12001 return MainP != P && MainP != SwappedP;
12002 }
12003 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12004}
12005
12006TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12007 assert(!Ops.empty());
12008 const auto *Op0 = Ops.front();
12009
12010 const bool IsConstant = all_of(Ops, [](Value *V) {
12011 // TODO: We should allow undef elements here
12012 return isConstant(V) && !isa<UndefValue>(V);
12013 });
12014 const bool IsUniform = all_of(Ops, [=](Value *V) {
12015 // TODO: We should allow undef elements here
12016 return V == Op0;
12017 });
12018 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12019 // TODO: We should allow undef elements here
12020 if (auto *CI = dyn_cast<ConstantInt>(V))
12021 return CI->getValue().isPowerOf2();
12022 return false;
12023 });
12024 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12025 // TODO: We should allow undef elements here
12026 if (auto *CI = dyn_cast<ConstantInt>(V))
12027 return CI->getValue().isNegatedPowerOf2();
12028 return false;
12029 });
12030
12031 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12032 if (IsConstant && IsUniform)
12033 VK = TTI::OK_UniformConstantValue;
12034 else if (IsConstant)
12035 VK = TTI::OK_NonUniformConstantValue;
12036 else if (IsUniform)
12037 VK = TTI::OK_UniformValue;
12038
12039 TTI::OperandValueProperties VP = TTI::OP_None;
12040 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12041 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12042
12043 return {VK, VP};
12044}
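// For example, Ops = {4, 4, 4, 4} is classified as a uniform constant with the
// power-of-two property, while Ops = {x, x, x, x} for a non-constant x is a
// uniform value with no special properties.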
12045
12046namespace {
12047/// The base class for shuffle instruction emission and shuffle cost estimation.
12048class BaseShuffleAnalysis {
12049protected:
12050 Type *ScalarTy = nullptr;
12051
12052 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12053
12054 /// V is expected to be a vectorized value.
12055 /// When REVEC is disabled, there is no difference between VF and
12056 /// VNumElements.
12057 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12058 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12059 /// of 8.
12060 unsigned getVF(Value *V) const {
12061 assert(V && "V cannot be nullptr");
12062 assert(isa<FixedVectorType>(V->getType()) &&
12063 "V does not have FixedVectorType");
12064 assert(ScalarTy && "ScalarTy cannot be nullptr");
12065 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12066 unsigned VNumElements =
12067 cast<FixedVectorType>(V->getType())->getNumElements();
12068 assert(VNumElements > ScalarTyNumElements &&
12069 "the number of elements of V is not large enough");
12070 assert(VNumElements % ScalarTyNumElements == 0 &&
12071 "the number of elements of V is not a vectorized value");
12072 return VNumElements / ScalarTyNumElements;
12073 }
12074
12075 /// Checks if the mask is an identity mask.
12076 /// \param IsStrict if is true the function returns false if mask size does
12077 /// not match vector size.
12078 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12079 bool IsStrict) {
12080 int Limit = Mask.size();
12081 int VF = VecTy->getNumElements();
12082 int Index = -1;
12083 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12084 return true;
12085 if (!IsStrict) {
12086 // Consider extract subvector starting from index 0.
12087 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12088 Index == 0)
12089 return true;
12090 // All VF-size submasks are identity (e.g.
12091 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12092 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12093 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12094 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12095 ShuffleVectorInst::isIdentityMask(Slice, VF);
12096 }))
12097 return true;
12098 }
12099 return false;
12100 }
12101
12102 /// Tries to combine 2 different masks into a single one.
12103 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12104 /// change the size of the vector, \p LocalVF is the original size of the
12105 /// shuffled vector.
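/// Illustrative example: with \p LocalVF = 4, \p Mask = <2, 3, 0, 1> and
/// \p ExtMask = <1, 0, 3, 2>, the combined mask is <3, 2, 1, 0>, i.e. the same
/// permutation as applying \p Mask first and then \p ExtMask on its result.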
12106 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12107 ArrayRef<int> ExtMask) {
12108 unsigned VF = Mask.size();
12109 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12110 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12111 if (ExtMask[I] == PoisonMaskElem)
12112 continue;
12113 int MaskedIdx = Mask[ExtMask[I] % VF];
12114 NewMask[I] =
12115 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12116 }
12117 Mask.swap(NewMask);
12118 }
12119
12120 /// Looks through shuffles trying to reduce the final number of shuffles in
12121 /// the code. The function looks through the previously emitted shuffle
12122 /// instructions and properly marks indices in the mask as undef.
12123 /// For example, given the code
12124 /// \code
12125 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12126 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12127 /// \endcode
12128 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12129 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12130 /// <0, 1, 2, 3> for the shuffle.
12131 /// If 2 operands are of different size, the smallest one will be resized and
12132 /// the mask recalculated properly.
12133 /// For example, given the code
12134 /// \code
12135 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12136 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12137 /// \endcode
12138 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12139 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12140 /// <0, 1, 2, 3> for the shuffle.
12141 /// So, it tries to transform permutations to simple vector merge, if
12142 /// possible.
12143 /// \param V The input vector which must be shuffled using the given \p Mask.
12144 /// If the better candidate is found, \p V is set to this best candidate
12145 /// vector.
12146 /// \param Mask The input mask for the shuffle. If the best candidate is found
12147 /// during looking-through-shuffles attempt, it is updated accordingly.
12148 /// \param SinglePermute true if the shuffle operation is originally a
12149 /// single-value-permutation. In this case the look-through-shuffles procedure
12150 /// may look for resizing shuffles as the best candidates.
12151 /// \return true if the shuffle results in the non-resizing identity shuffle
12152 /// (and thus can be ignored), false - otherwise.
12153 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12154 bool SinglePermute) {
12155 Value *Op = V;
12156 ShuffleVectorInst *IdentityOp = nullptr;
12157 SmallVector<int> IdentityMask;
12158 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12159 // Exit if this is not a fixed vector type or a size-changing shuffle.
12160 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12161 if (!SVTy)
12162 break;
12163 // Remember the identity or broadcast mask, if it is not a resizing
12164 // shuffle. If no better candidates are found, this Op and Mask will be
12165 // used in the final shuffle.
12166 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12167 if (!IdentityOp || !SinglePermute ||
12168 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12169 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12170 IdentityMask.size()))) {
12171 IdentityOp = SV;
12172 // Store the current mask in IdentityMask so we do not lose this info
12173 // later if IdentityOp is selected as the best candidate for the
12174 // permutation.
12175 IdentityMask.assign(Mask);
12176 }
12177 }
12178 // Remember the broadcast mask. If no better candidates are found, this Op
12179 // and Mask will be used in the final shuffle.
12180 // Zero splat can be used as identity too, since it might be used with
12181 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12182 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12183 // expensive, and the analysis finds out that the source vector is just a
12184 // broadcast, the original mask can be transformed to the identity mask <0,
12185 // 1, 2, 3>.
12186 // \code
12187 // %0 = shuffle %v, poison, zeroinitializer
12188 // %res = shuffle %0, poison, <3, 1, 2, 0>
12189 // \endcode
12190 // may be transformed to
12191 // \code
12192 // %0 = shuffle %v, poison, zeroinitializer
12193 // %res = shuffle %0, poison, <0, 1, 2, 3>
12194 // \endcode
12195 if (SV->isZeroEltSplat()) {
12196 IdentityOp = SV;
12197 IdentityMask.assign(Mask);
12198 }
12199 int LocalVF = Mask.size();
12200 if (auto *SVOpTy =
12201 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12202 LocalVF = SVOpTy->getNumElements();
12203 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12204 for (auto [Idx, I] : enumerate(Mask)) {
12205 if (I == PoisonMaskElem ||
12206 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12207 continue;
12208 ExtMask[Idx] = SV->getMaskValue(I);
12209 }
12210 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12211 SV->getOperand(0),
12212 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12213 .all();
12214 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12215 SV->getOperand(1),
12216 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12217 .all();
12218 if (!IsOp1Undef && !IsOp2Undef) {
12219 // Update mask and mark undef elems.
12220 for (int &I : Mask) {
12221 if (I == PoisonMaskElem)
12222 continue;
12223 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12224 PoisonMaskElem)
12225 I = PoisonMaskElem;
12226 }
12227 break;
12228 }
12229 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12230 combineMasks(LocalVF, ShuffleMask, Mask);
12231 Mask.swap(ShuffleMask);
12232 if (IsOp2Undef)
12233 Op = SV->getOperand(0);
12234 else
12235 Op = SV->getOperand(1);
12236 }
12237 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12238 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12239 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12240 if (IdentityOp) {
12241 V = IdentityOp;
12242 assert(Mask.size() == IdentityMask.size() &&
12243 "Expected masks of same sizes.");
12244 // Clear known poison elements.
12245 for (auto [I, Idx] : enumerate(Mask))
12246 if (Idx == PoisonMaskElem)
12247 IdentityMask[I] = PoisonMaskElem;
12248 Mask.swap(IdentityMask);
12249 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12250 return SinglePermute &&
12251 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12252 /*IsStrict=*/true) ||
12253 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12254 Shuffle->isZeroEltSplat() &&
12255 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12256 all_of(enumerate(Mask), [&](const auto &P) {
12257 return P.value() == PoisonMaskElem ||
12258 Shuffle->getShuffleMask()[P.index()] == 0;
12259 })));
12260 }
12261 V = Op;
12262 return false;
12263 }
12264 V = Op;
12265 return true;
12266 }
12267
12268 /// Smart shuffle instruction emission, walks through shuffles trees and
12269 /// tries to find the best matching vector for the actual shuffle
12270 /// instruction.
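/// Illustrative example: given
/// \code
/// %s = shufflevector <4 x ty> %v, poison, <1, 0, 3, 2>
/// \endcode
/// a request to shuffle %s with the mask <1, 0, 3, 2> composes the two
/// permutations and emits just an identity of %v; no new shuffle is needed.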
12271 template <typename T, typename ShuffleBuilderTy>
12272 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12273 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12274 assert(V1 && "Expected at least one vector value.");
12275 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12276 SmallVector<int> NewMask(Mask);
12277 if (ScalarTyNumElements != 1) {
12278 assert(SLPReVec && "FixedVectorType is not expected.");
12279 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12280 Mask = NewMask;
12281 }
12282 if (V2)
12283 Builder.resizeToMatch(V1, V2);
12284 int VF = Mask.size();
12285 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12286 VF = FTy->getNumElements();
12287 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
12288 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12289 .all()) {
12290 // Peek through shuffles.
12291 Value *Op1 = V1;
12292 Value *Op2 = V2;
12293 int VF =
12294 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12295 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12296 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12297 for (int I = 0, E = Mask.size(); I < E; ++I) {
12298 if (Mask[I] < VF)
12299 CombinedMask1[I] = Mask[I];
12300 else
12301 CombinedMask2[I] = Mask[I] - VF;
12302 }
12303 Value *PrevOp1;
12304 Value *PrevOp2;
12305 do {
12306 PrevOp1 = Op1;
12307 PrevOp2 = Op2;
12308 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12309 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12310 // Check if we have 2 resizing shuffles - need to peek through operands
12311 // again.
12312 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12313 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12314 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12315 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12316 if (I == PoisonMaskElem)
12317 continue;
12318 ExtMask1[Idx] = SV1->getMaskValue(I);
12319 }
12320 SmallBitVector UseMask1 = buildUseMask(
12321 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12322 ->getNumElements(),
12323 ExtMask1, UseMask::SecondArg);
12324 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12325 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12326 if (I == PoisonMaskElem)
12327 continue;
12328 ExtMask2[Idx] = SV2->getMaskValue(I);
12329 }
12330 SmallBitVector UseMask2 = buildUseMask(
12331 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12332 ->getNumElements(),
12333 ExtMask2, UseMask::SecondArg);
12334 if (SV1->getOperand(0)->getType() ==
12335 SV2->getOperand(0)->getType() &&
12336 SV1->getOperand(0)->getType() != SV1->getType() &&
12337 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12338 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12339 Op1 = SV1->getOperand(0);
12340 Op2 = SV2->getOperand(0);
12341 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12342 int LocalVF = ShuffleMask1.size();
12343 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12344 LocalVF = FTy->getNumElements();
12345 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12346 CombinedMask1.swap(ShuffleMask1);
12347 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12348 LocalVF = ShuffleMask2.size();
12349 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12350 LocalVF = FTy->getNumElements();
12351 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12352 CombinedMask2.swap(ShuffleMask2);
12353 }
12354 }
12355 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12356 Builder.resizeToMatch(Op1, Op2);
12357 VF = std::max(cast<VectorType>(Op1->getType())
12358 ->getElementCount()
12359 .getKnownMinValue(),
12360 cast<VectorType>(Op2->getType())
12361 ->getElementCount()
12362 .getKnownMinValue());
12363 for (int I = 0, E = Mask.size(); I < E; ++I) {
12364 if (CombinedMask2[I] != PoisonMaskElem) {
12365 assert(CombinedMask1[I] == PoisonMaskElem &&
12366 "Expected undefined mask element");
12367 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12368 }
12369 }
12370 if (Op1 == Op2 &&
12371 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12372 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12373 isa<ShuffleVectorInst>(Op1) &&
12374 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12375 ArrayRef(CombinedMask1))))
12376 return Builder.createIdentity(Op1);
12377 return Builder.createShuffleVector(
12378 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12379 CombinedMask1);
12380 }
12381 if (isa<PoisonValue>(V1))
12382 return Builder.createPoison(
12383 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12384 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12385 assert(V1 && "Expected non-null value after looking through shuffles.");
12386
12387 if (!IsIdentity)
12388 return Builder.createShuffleVector(V1, NewMask);
12389 return Builder.createIdentity(V1);
12390 }
12391
12392 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12393 /// shuffle emission.
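/// Illustrative example: with \p CommonMask = <1, 1, poison, poison> and
/// \p Mask = <poison, poison, 2, 3>, \p CommonMask becomes <1, 1, 2, 3>, i.e.
/// the just-shuffled elements are referenced by their position in the result.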
12394 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12395 ArrayRef<int> Mask) {
12396 for (unsigned I : seq<unsigned>(CommonMask.size()))
12397 if (Mask[I] != PoisonMaskElem)
12398 CommonMask[I] = I;
12399 }
12400};
12401} // namespace
12402
12403/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
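/// Illustrative example: for four consecutive scalar loads the scalar cost
/// covers the four address GEPs, while the vector cost only accounts for the
/// base pointer of the wide load plus any GEPs that must stay live for users
/// outside the vectorized code.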
12404static std::pair<InstructionCost, InstructionCost>
12405 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12406 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12407 Type *ScalarTy, VectorType *VecTy) {
12408 InstructionCost ScalarCost = 0;
12409 InstructionCost VecCost = 0;
12410 // Here we differentiate two cases: (1) when Ptrs represent a regular
12411 // vectorization tree node (as they are pointer arguments of scattered
12412 // loads) or (2) when Ptrs are the arguments of loads or stores being
12413 // vectorized as plain wide unit-stride load/store since all the
12414 // loads/stores are known to be from/to adjacent locations.
12415 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12416 // Case 2: estimate costs for pointer related costs when vectorizing to
12417 // a wide load/store.
12418 // Scalar cost is estimated as a set of pointers with known relationship
12419 // between them.
12420 // For vector code we will use BasePtr as argument for the wide load/store
12421 // but we also need to account all the instructions which are going to
12422 // stay in vectorized code due to uses outside of these scalar
12423 // loads/stores.
12424 ScalarCost = TTI.getPointersChainCost(
12425 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12426 CostKind);
12427
12428 SmallVector<const Value *> PtrsRetainedInVecCode;
12429 for (Value *V : Ptrs) {
12430 if (V == BasePtr) {
12431 PtrsRetainedInVecCode.push_back(V);
12432 continue;
12433 }
12434 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12435 // For simplicity, assume Ptr stays in vectorized code if it's not a
12436 // GEP instruction. We don't care, since its cost is considered free.
12437 // TODO: We should check for any uses outside of vectorizable tree
12438 // rather than just single use.
12439 if (!Ptr || !Ptr->hasOneUse())
12440 PtrsRetainedInVecCode.push_back(V);
12441 }
12442
12443 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12444 // If all pointers stay in vectorized code then we don't have
12445 // any savings on that.
12446 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12447 }
12448 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12449 TTI::PointersChainInfo::getKnownStride(),
12450 VecTy, CostKind);
12451 } else {
12452 // Case 1: Ptrs are the arguments of loads that we are going to transform
12453 // into masked gather load intrinsic.
12454 // All the scalar GEPs will be removed as a result of vectorization.
12455 // For any external uses of some lanes, extractelement instructions will
12456 // be generated (their cost is estimated separately).
12457 TTI::PointersChainInfo PtrsInfo =
12458 all_of(Ptrs,
12459 [](const Value *V) {
12460 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12461 return Ptr && !Ptr->hasAllConstantIndices();
12462 })
12463 ? TTI::PointersChainInfo::getUnknownStride()
12464 : TTI::PointersChainInfo::getKnownStride();
12465
12466 ScalarCost =
12467 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12468 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12469 if (!BaseGEP) {
12470 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12471 if (It != Ptrs.end())
12472 BaseGEP = cast<GEPOperator>(*It);
12473 }
12474 if (BaseGEP) {
12475 SmallVector<const Value *> Indices(BaseGEP->indices());
12476 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12477 BaseGEP->getPointerOperand(), Indices, VecTy,
12478 CostKind);
12479 }
12480 }
12481
12482 return std::make_pair(ScalarCost, VecCost);
12483}
12484
12485void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12486 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12487 "Expected gather node without reordering.");
12488 MapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12489 SmallSet<size_t, 2> LoadKeyUsed;
12490
12491 // Do not reorder nodes if they are small (just 2 elements), all-constant, or
12492 // if all instructions already have the same opcode.
12493 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12494 all_of(TE.Scalars, isConstant))
12495 return;
12496
12497 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12498 return VectorizableTree[Idx]->isSame(TE.Scalars);
12499 }))
12500 return;
12501
12502 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12503 Key = hash_combine(hash_value(LI->getParent()), Key);
12504 Value *Ptr =
12505 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12506 if (LoadKeyUsed.contains(Key)) {
12507 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12508 if (LIt != LoadsMap.end()) {
12509 for (LoadInst *RLI : LIt->second) {
12510 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12511 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12512 /*StrictCheck=*/true))
12513 return hash_value(RLI->getPointerOperand());
12514 }
12515 for (LoadInst *RLI : LIt->second) {
12516 if (arePointersCompatible(RLI->getPointerOperand(),
12517 LI->getPointerOperand(), *TLI)) {
12518 hash_code SubKey = hash_value(RLI->getPointerOperand());
12519 return SubKey;
12520 }
12521 }
12522 if (LIt->second.size() > 2) {
12523 hash_code SubKey =
12524 hash_value(LIt->second.back()->getPointerOperand());
12525 return SubKey;
12526 }
12527 }
12528 }
12529 LoadKeyUsed.insert(Key);
12530 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12531 return hash_value(LI->getPointerOperand());
12532 };
12533 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12534 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12535 bool IsOrdered = true;
12536 unsigned NumInstructions = 0;
12537 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12538 // nodes.
12539 for (auto [I, V] : enumerate(TE.Scalars)) {
12540 size_t Key = 1, Idx = 1;
12541 if (auto *Inst = dyn_cast<Instruction>(V);
12542 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
12543 !isDeleted(Inst) && !isVectorized(V)) {
12544 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12545 /*AllowAlternate=*/false);
12546 ++NumInstructions;
12547 }
12548 auto &Container = SortedValues[Key];
12549 if (IsOrdered && !KeyToIndex.contains(V) &&
12550 !(isa<Constant, ExtractElementInst>(V) ||
12551 isVectorLikeInstWithConstOps(V)) &&
12552 ((Container.contains(Idx) &&
12553 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12554 (!Container.empty() && !Container.contains(Idx) &&
12555 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12556 IsOrdered = false;
12557 auto &KTI = KeyToIndex[V];
12558 if (KTI.empty())
12559 Container[Idx].push_back(V);
12560 KTI.push_back(I);
12561 }
12562 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12563 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12564 if (!IsOrdered && NumInstructions > 1) {
12565 unsigned Cnt = 0;
12566 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12567 for (const auto &D : SortedValues) {
12568 for (const auto &P : D.second) {
12569 unsigned Sz = 0;
12570 for (Value *V : P.second) {
12571 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12572 for (auto [K, Idx] : enumerate(Indices)) {
12573 TE.ReorderIndices[Cnt + K] = Idx;
12574 TE.Scalars[Cnt + K] = V;
12575 }
12576 Sz += Indices.size();
12577 Cnt += Indices.size();
12578 }
12579 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12580 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12581 *TTI, TE.Scalars.front()->getType(), Sz);
12582 SubVectors.emplace_back(Cnt - Sz, SubVF);
12583 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12584 DemandedElts.clearBit(I);
12585 } else if (!P.second.empty() && isConstant(P.second.front())) {
12586 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12587 DemandedElts.clearBit(I);
12588 }
12589 }
12590 }
12591 }
12592 // Reuses always require shuffles, so consider it as profitable.
12593 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12594 return;
12595 // Do simple cost estimation.
12596 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12597 InstructionCost Cost = 0;
12598 auto *ScalarTy = TE.Scalars.front()->getType();
12599 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12600 for (auto [Idx, Sz] : SubVectors) {
12601 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12602 Idx, getWidenedType(ScalarTy, Sz));
12603 }
12604 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12605 /*Insert=*/true,
12606 /*Extract=*/false, CostKind);
12607 int Sz = TE.Scalars.size();
12608 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12609 TE.ReorderIndices.end());
12610 for (unsigned I : seq<unsigned>(Sz)) {
12611 Value *V = TE.getOrdered(I);
12612 if (isa<PoisonValue>(V)) {
12613 ReorderMask[I] = PoisonMaskElem;
12614 } else if (isConstant(V) || DemandedElts[I]) {
12615 ReorderMask[I] = I + TE.ReorderIndices.size();
12616 }
12617 }
12618 Cost += ::getShuffleCost(*TTI,
12619 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12620 ? TTI::SK_PermuteTwoSrc
12621 : TTI::SK_PermuteSingleSrc,
12622 VecTy, ReorderMask);
12623 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12624 ReorderMask.assign(Sz, PoisonMaskElem);
12625 for (unsigned I : seq<unsigned>(Sz)) {
12626 Value *V = TE.getOrdered(I);
12627 if (isConstant(V)) {
12628 DemandedElts.clearBit(I);
12629 if (!isa<PoisonValue>(V))
12630 ReorderMask[I] = I;
12631 } else {
12632 ReorderMask[I] = I + Sz;
12633 }
12634 }
12635 InstructionCost BVCost =
12636 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12637 /*Insert=*/true, /*Extract=*/false, CostKind);
12638 if (!DemandedElts.isAllOnes())
12639 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12640 if (Cost >= BVCost) {
12641 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12642 reorderScalars(TE.Scalars, Mask);
12643 TE.ReorderIndices.clear();
12644 }
12645}
12646
12647/// Check if we can convert fadd/fsub sequence to FMAD.
12648/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
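/// Illustrative example of the scalar pattern being costed:
/// \code
/// %mul = fmul contract float %a, %b
/// %add = fadd contract float %mul, %c
/// \endcode
/// which may be cheaper to emit as a single llvm.fmuladd(%a, %b, %c) call.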
12649 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12650 const InstructionsState &S,
12651 DominatorTree &DT, const DataLayout &DL,
12652 const TargetTransformInfo &TTI,
12653 const TargetLibraryInfo &TLI) {
12654 assert(all_of(VL,
12655 [](Value *V) {
12656 return V->getType()->getScalarType()->isFloatingPointTy();
12657 }) &&
12658 "Can only convert to FMA for floating point types");
12659 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12660
12661 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12662 FastMathFlags FMF;
12663 FMF.set();
12664 for (Value *V : VL) {
12665 auto *I = dyn_cast<Instruction>(V);
12666 if (!I)
12667 continue;
12668 if (S.isCopyableElement(I))
12669 continue;
12670 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12671 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12672 continue;
12673 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12674 FMF &= FPCI->getFastMathFlags();
12675 }
12676 return FMF.allowContract();
12677 };
12678 if (!CheckForContractable(VL))
12679 return InstructionCost::getInvalid();
12680 // The fmul also should be contractable.
12681 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12682 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12683
12684 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12685 if (!OpS.valid())
12686 return InstructionCost::getInvalid();
12687
12688 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12689 return InstructionCost::getInvalid();
12690 if (!CheckForContractable(Operands.front()))
12691 return InstructionCost::getInvalid();
12692 // Compare the costs.
12693 InstructionCost FMulPlusFAddCost = 0;
12694 InstructionCost FMACost = 0;
12695 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12696 FastMathFlags FMF;
12697 FMF.set();
12698 for (Value *V : VL) {
12699 auto *I = dyn_cast<Instruction>(V);
12700 if (!I)
12701 continue;
12702 if (!S.isCopyableElement(I))
12703 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12704 FMF &= FPCI->getFastMathFlags();
12705 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12706 }
12707 unsigned NumOps = 0;
12708 for (auto [V, Op] : zip(VL, Operands.front())) {
12709 if (S.isCopyableElement(V))
12710 continue;
12711 auto *I = dyn_cast<Instruction>(Op);
12712 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12713 if (auto *OpI = dyn_cast<Instruction>(V))
12714 FMACost += TTI.getInstructionCost(OpI, CostKind);
12715 if (I)
12716 FMACost += TTI.getInstructionCost(I, CostKind);
12717 continue;
12718 }
12719 ++NumOps;
12720 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12721 FMF &= FPCI->getFastMathFlags();
12722 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12723 }
12724 Type *Ty = VL.front()->getType();
12725 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12726 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12727 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12728}
12729
12730 void BoUpSLP::transformNodes() {
12731 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12732 BaseGraphSize = VectorizableTree.size();
12733 // Turn graph transforming mode on and off, when done.
12734 class GraphTransformModeRAAI {
12735 bool &SavedIsGraphTransformMode;
12736
12737 public:
12738 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12739 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12740 IsGraphTransformMode = true;
12741 }
12742 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12743 } TransformContext(IsGraphTransformMode);
12744 // Operands are profitable if they are:
12745 // 1. At least one constant
12746 // or
12747 // 2. Splats
12748 // or
12749 // 3. Results in good vectorization opportunity, i.e. may generate vector
12750 // nodes and reduce cost of the graph.
12751 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12752 const InstructionsState &S) {
12753 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12754 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12755 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12756 I2->getOperand(Op));
12757 return all_of(
12758 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12759 return all_of(Cand,
12760 [](const std::pair<Value *, Value *> &P) {
12761 return isa<Constant>(P.first) ||
12762 isa<Constant>(P.second) || P.first == P.second;
12763 }) ||
12764 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12765 });
12766 };
12767
12768 // Try to reorder gather nodes for better vectorization opportunities.
12769 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12770 TreeEntry &E = *VectorizableTree[Idx];
12771 if (E.isGather())
12772 reorderGatherNode(E);
12773 }
12774
12775 // Better to use full gathered loads analysis, if there are only 2 loads
12776 // gathered nodes each having less than 16 elements.
12777 constexpr unsigned VFLimit = 16;
12778 bool ForceLoadGather =
12779 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12780 return TE->isGather() && TE->hasState() &&
12781 TE->getOpcode() == Instruction::Load &&
12782 TE->getVectorFactor() < VFLimit;
12783 }) == 2;
12784
12785 // Checks if the scalars are used in other node.
12786 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12787 function_ref<bool(Value *)> CheckContainer) {
12788 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12789 if (isa<PoisonValue>(V))
12790 return true;
12791 auto *I = dyn_cast<Instruction>(V);
12792 if (!I)
12793 return false;
12794 return is_contained(TE->Scalars, I) || CheckContainer(I);
12795 });
12796 };
12797 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12798 if (E.hasState()) {
12799 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12800 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12801 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12802 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12803 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12804 return is_contained(TEs, TE);
12805 });
12806 });
12807 }))
12808 return true;
12809 ;
12810 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12811 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12812 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12813 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12814 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12815 return is_contained(TEs, TE);
12816 });
12817 });
12818 }))
12819 return true;
12820 } else {
12821 // Check if the gather node is a full copy of a split node.
12822 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12823 if (It != E.Scalars.end()) {
12824 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12825 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12826 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12827 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12828 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12829 return is_contained(TEs, TE);
12830 });
12831 });
12832 }))
12833 return true;
12834 }
12835 }
12836 return false;
12837 };
12838 // The tree may grow here, so iterate over nodes, built before.
12839 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12840 TreeEntry &E = *VectorizableTree[Idx];
12841 if (E.isGather()) {
12842 ArrayRef<Value *> VL = E.Scalars;
12843 const unsigned Sz = getVectorElementSize(VL.front());
12844 unsigned MinVF = getMinVF(2 * Sz);
12845 // Do not try partial vectorization for small nodes (<= 2 elements), nodes
12846 // with the same opcode and same parent block, or all-constant nodes.
12847 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
12848 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
12849 // We use allSameOpcode instead of isAltShuffle because we don't
12850 // want to use interchangeable instructions here.
12851 !allSameOpcode(VL) || !allSameBlock(VL)) ||
12852 allConstant(VL) || isSplat(VL))
12853 continue;
12854 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
12855 continue;
12856 // Check if the node is a copy of other vector nodes.
12857 if (CheckForSameVectorNodes(E))
12858 continue;
12859 // Try to find vectorizable sequences and transform them into a series of
12860 // insertvector instructions.
12861 unsigned StartIdx = 0;
12862 unsigned End = VL.size();
12863 for (unsigned VF = getFloorFullVectorNumberOfElements(
12864 *TTI, VL.front()->getType(), VL.size() - 1);
12865 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
12866 *TTI, VL.front()->getType(), VF - 1)) {
12867 if (StartIdx + VF > End)
12868 continue;
12869 SmallVector<std::pair<unsigned, unsigned>> Slices;
12870 bool AllStrided = true;
12871 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
12872 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
12873 // If any instruction is vectorized already - do not try again.
12874 // Reuse the existing node, if it fully matches the slice.
12875 if (isVectorized(Slice.front()) &&
12876 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
12877 continue;
12878 // Constant already handled effectively - skip.
12879 if (allConstant(Slice))
12880 continue;
12881 // Do not try to vectorize small splats (smaller than a vector register and
12882 // with only a single non-undef element).
12883 bool IsSplat = isSplat(Slice);
12884 bool IsTwoRegisterSplat = true;
12885 if (IsSplat && VF == 2) {
12886 unsigned NumRegs2VF = ::getNumberOfParts(
12887 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
12888 IsTwoRegisterSplat = NumRegs2VF == 2;
12889 }
12890 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
12891 count(Slice, Slice.front()) ==
12892 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
12893 : 1)) {
12894 if (IsSplat)
12895 continue;
12896 InstructionsState S = getSameOpcode(Slice, *TLI);
12897 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
12898 (S.getOpcode() == Instruction::Load &&
12899 areKnownNonVectorizableLoads(Slice)) ||
12900 (S.getOpcode() != Instruction::Load &&
12901 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
12902 continue;
12903 if (VF == 2) {
12904 // Try to vectorize reduced values or if all users are vectorized.
12905 // For expensive instructions extra extracts might be profitable.
12906 if ((!UserIgnoreList || E.Idx != 0) &&
12907 TTI->getInstructionCost(S.getMainOp(), CostKind) <
12908 TTI::TCC_Expensive &&
12909 !all_of(Slice, [&](Value *V) {
12910 if (isa<PoisonValue>(V))
12911 return true;
12912 return areAllUsersVectorized(cast<Instruction>(V),
12913 UserIgnoreList);
12914 }))
12915 continue;
12916 if (S.getOpcode() == Instruction::Load) {
12917 OrdersType Order;
12918 SmallVector<Value *> PointerOps;
12919 LoadsState Res =
12920 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
12921 AllStrided &= Res == LoadsState::StridedVectorize ||
12922 Res == LoadsState::ScatterVectorize ||
12923 Res == LoadsState::Gather;
12924 // Do not vectorize gathers.
12925 if (Res == LoadsState::ScatterVectorize ||
12926 Res == LoadsState::Gather) {
12927 if (Res == LoadsState::Gather) {
12928 registerNonVectorizableLoads(Slice);
12929 // If reductions and the scalars from the root node are
12930 // analyzed - mark as non-vectorizable reduction.
12931 if (UserIgnoreList && E.Idx == 0)
12932 analyzedReductionVals(Slice);
12933 }
12934 continue;
12935 }
12936 } else if (S.getOpcode() == Instruction::ExtractElement ||
12937 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
12938 TTI::TCC_Expensive &&
12939 !CheckOperandsProfitability(
12940 S.getMainOp(),
12941 cast<Instruction>(*find_if(reverse(Slice),
12942 IsaPred<Instruction>)),
12943 S))) {
12944 // Do not vectorize extractelements (handled effectively
12945 // already). Do not vectorize non-profitable instructions (with
12946 // low cost and non-vectorizable operands).
12947 continue;
12948 }
12949 }
12950 }
12951 Slices.emplace_back(Cnt, Slice.size());
12952 }
12953 // Do not try to vectorize if all slices are strided or gathered with
12954 // vector factor 2 and there are more than 2 slices. Better to handle
12955 // them in the gathered loads analysis; it may result in better vectorization.
12956 if (VF == 2 && AllStrided && Slices.size() > 2)
12957 continue;
12958 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
12959 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
12960 if (StartIdx == Cnt)
12961 StartIdx = Cnt + Sz;
12962 if (End == Cnt + Sz)
12963 End = Cnt;
12964 };
12965 for (auto [Cnt, Sz] : Slices) {
12966 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
12967 const TreeEntry *SameTE = nullptr;
12968 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
12969 It != Slice.end()) {
12970 // If any instruction is vectorized already - do not try again.
12971 SameTE = getSameValuesTreeEntry(*It, Slice);
12972 }
12973 unsigned PrevSize = VectorizableTree.size();
12974 [[maybe_unused]] unsigned PrevEntriesSize =
12975 LoadEntriesToVectorize.size();
12976 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
12977 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
12978 VectorizableTree[PrevSize]->isGather() &&
12979 VectorizableTree[PrevSize]->hasState() &&
12980 VectorizableTree[PrevSize]->getOpcode() !=
12981 Instruction::ExtractElement &&
12982 !isSplat(Slice)) {
12983 if (UserIgnoreList && E.Idx == 0 && VF == 2)
12984 analyzedReductionVals(Slice);
12985 VectorizableTree.pop_back();
12986 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
12987 "LoadEntriesToVectorize expected to remain the same");
12988 continue;
12989 }
12990 AddCombinedNode(PrevSize, Cnt, Sz);
12991 }
12992 }
12993 // Restore ordering, if no extra vectorization happened.
12994 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
12995 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
12996 reorderScalars(E.Scalars, Mask);
12997 E.ReorderIndices.clear();
12998 }
12999 }
13000 if (!E.hasState())
13001 continue;
13002 switch (E.getOpcode()) {
13003 case Instruction::Load: {
13004 // No need to reorder masked gather loads, just reorder the scalar
13005 // operands.
13006 if (E.State != TreeEntry::Vectorize)
13007 break;
13008 Type *ScalarTy = E.getMainOp()->getType();
13009 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13010 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13011 // Check if profitable to represent consecutive load + reverse as strided
13012 // load with stride -1.
13013 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13014 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13015 SmallVector<int> Mask;
13016 inversePermutation(E.ReorderIndices, Mask);
13017 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13018 InstructionCost OriginalVecCost =
13019 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13020 BaseLI->getPointerAddressSpace(), CostKind,
13021 TTI::OperandValueInfo()) +
13022 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13023 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13024 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13025 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13026 if (StridedCost < OriginalVecCost)
13027 // Strided load is more profitable than consecutive load + reverse -
13028 // transform the node to strided load.
13029 E.State = TreeEntry::StridedVectorize;
13030 }
13031 break;
13032 }
13033 case Instruction::Store: {
13034 Type *ScalarTy =
13035 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13036 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13037 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13038 // Check if profitable to represent consecutive store + reverse as strided
13039 // store with stride -1.
13040 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13041 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13042 SmallVector<int> Mask;
13043 inversePermutation(E.ReorderIndices, Mask);
13044 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13045 InstructionCost OriginalVecCost =
13046 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13047 BaseSI->getPointerAddressSpace(), CostKind,
13048 TTI::OperandValueInfo()) +
13049 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13050 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13051 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13052 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13053 if (StridedCost < OriginalVecCost)
13054 // Strided store is more profitable than reverse + consecutive store -
13055 // transform the node to strided store.
13056 E.State = TreeEntry::StridedVectorize;
13057 } else if (!E.ReorderIndices.empty()) {
13058 // Check for interleaved stores.
13059 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13060 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13061 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13062 if (Mask.size() < 4)
13063 return 0u;
13064 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13065 if (ShuffleVectorInst::isInterleaveMask(
13066 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13067 TTI.isLegalInterleavedAccessType(
13068 VecTy, Factor, BaseSI->getAlign(),
13069 BaseSI->getPointerAddressSpace()))
13070 return Factor;
13071 }
13072
13073 return 0u;
13074 };
13075 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13076 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13077 if (InterleaveFactor != 0)
13078 E.setInterleave(InterleaveFactor);
13079 }
13080 break;
13081 }
13082 case Instruction::Select: {
13083 if (E.State != TreeEntry::Vectorize)
13084 break;
13085 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13086 if (MinMaxID == Intrinsic::not_intrinsic)
13087 break;
13088 // This node is a minmax node.
13089 E.CombinedOp = TreeEntry::MinMax;
13090 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13091 if (SelectOnly && CondEntry->UserTreeIndex &&
13092 CondEntry->State == TreeEntry::Vectorize) {
13093 // The condition node is part of the combined minmax node.
13094 CondEntry->State = TreeEntry::CombinedVectorize;
13095 }
13096 break;
13097 }
13098 case Instruction::FSub:
13099 case Instruction::FAdd: {
13100 // Check if possible to convert (a*b)+c to fma.
13101 if (E.State != TreeEntry::Vectorize ||
13102 !E.getOperations().isAddSubLikeOp())
13103 break;
13104 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13105 .isValid())
13106 break;
13107 // This node is a fmuladd node.
13108 E.CombinedOp = TreeEntry::FMulAdd;
13109 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13110 if (FMulEntry->UserTreeIndex &&
13111 FMulEntry->State == TreeEntry::Vectorize) {
13112 // The FMul node is part of the combined fmuladd node.
13113 FMulEntry->State = TreeEntry::CombinedVectorize;
13114 }
13115 break;
13116 }
13117 default:
13118 break;
13119 }
13120 }
13121
13122 if (LoadEntriesToVectorize.empty()) {
13123 // Single load node - exit.
13124 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13125 VectorizableTree.front()->getOpcode() == Instruction::Load)
13126 return;
13127 // Small graph with small VF - exit.
13128 constexpr unsigned SmallTree = 3;
13129 constexpr unsigned SmallVF = 2;
13130 if ((VectorizableTree.size() <= SmallTree &&
13131 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13132 (VectorizableTree.size() <= 2 && UserIgnoreList))
13133 return;
13134
13135 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13136 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13137 getCanonicalGraphSize() <= SmallTree &&
13138 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13139 [](const std::unique_ptr<TreeEntry> &TE) {
13140 return TE->isGather() && TE->hasState() &&
13141 TE->getOpcode() == Instruction::Load &&
13142 !allSameBlock(TE->Scalars);
13143 }) == 1)
13144 return;
13145 }
13146
13147 // A list of loads to be gathered during the vectorization process. We can
13148 // try to vectorize them at the end, if profitable.
13149 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13150 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13151 GatheredLoads;
13152
13153 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13154 TreeEntry &E = *TE;
13155 if (E.isGather() &&
13156 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13157 (!E.hasState() && any_of(E.Scalars,
13158 [&](Value *V) {
13159 return isa<LoadInst>(V) &&
13160 !isVectorized(V) &&
13161 !isDeleted(cast<Instruction>(V));
13162 }))) &&
13163 !isSplat(E.Scalars)) {
13164 for (Value *V : E.Scalars) {
13165 auto *LI = dyn_cast<LoadInst>(V);
13166 if (!LI)
13167 continue;
13168 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13169 continue;
13170 gatherPossiblyVectorizableLoads(
13171 *this, V, *DL, *SE, *TTI,
13172 GatheredLoads[std::make_tuple(
13173 LI->getParent(),
13174 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13175 LI->getType())]);
13176 }
13177 }
13178 }
13179 // Try to vectorize gathered loads if this is not just a gather of loads.
13180 if (!GatheredLoads.empty())
13181 tryToVectorizeGatheredLoads(GatheredLoads);
13182}
13183
13184/// Merges shuffle masks and emits final shuffle instruction, if required. It
13185 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
13186 /// where the actual shuffle instruction is generated only if it is actually
13187/// required. Otherwise, the shuffle instruction emission is delayed till the
13188/// end of the process, to reduce the number of emitted instructions and further
13189/// analysis/transformations.
13190class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13191 bool IsFinalized = false;
13192 SmallVector<int> CommonMask;
13193 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13194 const TargetTransformInfo &TTI;
13195 InstructionCost Cost = 0;
13196 SmallDenseSet<Value *> VectorizedVals;
13197 BoUpSLP &R;
13198 SmallPtrSetImpl<Value *> &CheckedExtracts;
13199 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13200 /// While set, we are still trying to estimate the cost for the same nodes, so
13201 /// the actual cost estimation (virtual shuffle instruction emission) can be
13202 /// delayed. This may help to better estimate the cost if the same nodes must
13203 /// be permuted, and allows moving most of the long shuffle cost estimation to TTI.
13204 bool SameNodesEstimated = true;
13205
13206 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13207 if (Ty->getScalarType()->isPointerTy()) {
13208 Constant *Res = ConstantExpr::getIntToPtr(
13209 Constant::getAllOnesValue(
13210 IntegerType::get(Ty->getContext(),
13211 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13212 Ty->getScalarType());
13213 if (auto *VTy = dyn_cast<VectorType>(Ty))
13214 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13215 return Res;
13216 }
13217 return Constant::getAllOnesValue(Ty);
13218 }
13219
13220 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13221 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13222 return TTI::TCC_Free;
13223 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13224 InstructionCost GatherCost = 0;
13225 SmallVector<Value *> Gathers(VL);
13226 if (!Root && isSplat(VL)) {
13227 // Found the broadcasting of the single scalar, calculate the cost as
13228 // the broadcast.
13229 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13230 assert(It != VL.end() && "Expected at least one non-undef value.");
13231 // Add broadcast for non-identity shuffle only.
13232 bool NeedShuffle =
13233 count(VL, *It) > 1 &&
13234 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13235 if (!NeedShuffle) {
13236 if (isa<FixedVectorType>(ScalarTy)) {
13237 assert(SLPReVec && "FixedVectorType is not expected.");
13238 return TTI.getShuffleCost(
13239 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13240 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13241 cast<FixedVectorType>(ScalarTy));
13242 }
13243 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13244 CostKind, std::distance(VL.begin(), It),
13245 PoisonValue::get(VecTy), *It);
13246 }
13247
13248 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13249 transform(VL, ShuffleMask.begin(), [](Value *V) {
13250 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13251 });
13252 InstructionCost InsertCost =
13253 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13254 PoisonValue::get(VecTy), *It);
13255 return InsertCost + ::getShuffleCost(TTI,
13256 TTI::SK_Broadcast,
13257 VecTy, ShuffleMask, CostKind,
13258 /*Index=*/0, /*SubTp=*/nullptr,
13259 /*Args=*/*It);
13260 }
13261 return GatherCost +
13262 (all_of(Gathers, IsaPred<UndefValue>)
13263 ? TTI::TCC_Free
13264 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13265 ScalarTy));
13266 };
13267
13268 /// Compute the cost of creating a vector containing the extracted values from
13269 /// \p VL.
13270 InstructionCost
13271 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13272 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13273 unsigned NumParts) {
13274 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13275 unsigned NumElts =
13276 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13277 auto *EE = dyn_cast<ExtractElementInst>(V);
13278 if (!EE)
13279 return Sz;
13280 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13281 if (!VecTy)
13282 return Sz;
13283 return std::max(Sz, VecTy->getNumElements());
13284 });
13285 // FIXME: this must be moved to TTI for better estimation.
13286 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13287 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13288 SmallVectorImpl<unsigned> &Indices,
13289 SmallVectorImpl<unsigned> &SubVecSizes)
13290 -> std::optional<TTI::ShuffleKind> {
13291 if (NumElts <= EltsPerVector)
13292 return std::nullopt;
13293 int OffsetReg0 =
13294 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13295 [](int S, int I) {
13296 if (I == PoisonMaskElem)
13297 return S;
13298 return std::min(S, I);
13299 }),
13300 EltsPerVector);
13301 int OffsetReg1 = OffsetReg0;
13302 DenseSet<int> RegIndices;
13303 // Check if trying to permute the same single/2 input vectors.
13304 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13305 int FirstRegId = -1;
13306 Indices.assign(1, OffsetReg0);
13307 for (auto [Pos, I] : enumerate(Mask)) {
13308 if (I == PoisonMaskElem)
13309 continue;
13310 int Idx = I - OffsetReg0;
13311 int RegId =
13312 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13313 if (FirstRegId < 0)
13314 FirstRegId = RegId;
13315 RegIndices.insert(RegId);
13316 if (RegIndices.size() > 2)
13317 return std::nullopt;
13318 if (RegIndices.size() == 2) {
13319 ShuffleKind = TTI::SK_PermuteTwoSrc;
13320 if (Indices.size() == 1) {
13321 OffsetReg1 = alignDown(
13322 std::accumulate(
13323 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13324 [&](int S, int I) {
13325 if (I == PoisonMaskElem)
13326 return S;
13327 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13328 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13329 if (RegId == FirstRegId)
13330 return S;
13331 return std::min(S, I);
13332 }),
13333 EltsPerVector);
13334 unsigned Index = OffsetReg1 % NumElts;
13335 Indices.push_back(Index);
13336 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13337 }
13338 Idx = I - OffsetReg1;
13339 }
13340 I = (Idx % NumElts) % EltsPerVector +
13341 (RegId == FirstRegId ? 0 : EltsPerVector);
13342 }
13343 return ShuffleKind;
13344 };
13345 InstructionCost Cost = 0;
13346
13347 // Process extracts in blocks of EltsPerVector to check if the source vector
13348 // operand can be re-used directly. If not, add the cost of creating a
13349 // shuffle to extract the values into a vector register.
13350 for (unsigned Part : seq<unsigned>(NumParts)) {
13351 if (!ShuffleKinds[Part])
13352 continue;
13353 ArrayRef<int> MaskSlice = Mask.slice(
13354 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13355 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13356 copy(MaskSlice, SubMask.begin());
13357 SmallVector<unsigned, 2> Indices;
13358 SmallVector<unsigned, 2> SubVecSizes;
13359 std::optional<TTI::ShuffleKind> RegShuffleKind =
13360 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13361 if (!RegShuffleKind) {
13362 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13363 !ShuffleVectorInst::isIdentityMask(
13364 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13365 Cost +=
13366 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13367 getWidenedType(ScalarTy, NumElts), MaskSlice);
13368 continue;
13369 }
13370 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13371 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13372 Cost +=
13373 ::getShuffleCost(TTI, *RegShuffleKind,
13374 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13375 }
13376 const unsigned BaseVF = getFullVectorNumberOfElements(
13377 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13378 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13379 assert((Idx + SubVecSize) <= BaseVF &&
13380 "SK_ExtractSubvector index out of range");
13381 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13382 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13383 Idx, getWidenedType(ScalarTy, SubVecSize));
13384 }
13385 // Second attempt to check if just a permute has a better estimated cost
13386 // than a subvector extract.
13387 SubMask.assign(NumElts, PoisonMaskElem);
13388 copy(MaskSlice, SubMask.begin());
13389 InstructionCost OriginalCost = ::getShuffleCost(
13390 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13391 if (OriginalCost < Cost)
13392 Cost = OriginalCost;
13393 }
13394 return Cost;
13395 }
13396 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13397 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13398 /// elements.
13399 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13400 ArrayRef<int> Mask, unsigned Part,
13401 unsigned SliceSize) {
13402 if (SameNodesEstimated) {
13403 // Delay the cost estimation if the same nodes are reshuffling.
13404 // If we already requested the cost of reshuffling of E1 and E2 before, no
13405 // need to estimate another cost with the sub-Mask, instead include this
13406 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13407 // estimation.
13408 if ((InVectors.size() == 2 &&
13409 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13410 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13411 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13412 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13413 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13414 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13415 "Expected all poisoned elements.");
13416 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13417 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13418 return;
13419 }
13420 // Found non-matching nodes - need to estimate the cost for the matched
13421 // and transform mask.
13422 Cost += createShuffle(InVectors.front(),
13423 InVectors.size() == 1 ? nullptr : InVectors.back(),
13424 CommonMask);
13425 transformMaskAfterShuffle(CommonMask, CommonMask);
13426 } else if (InVectors.size() == 2) {
13427 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13428 transformMaskAfterShuffle(CommonMask, CommonMask);
13429 }
13430 SameNodesEstimated = false;
13431 if (!E2 && InVectors.size() == 1) {
13432 unsigned VF = E1.getVectorFactor();
13433 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13434 VF = std::max(VF, getVF(V1));
13435 } else {
13436 const auto *E = cast<const TreeEntry *>(InVectors.front());
13437 VF = std::max(VF, E->getVectorFactor());
13438 }
13439 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13440 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13441 CommonMask[Idx] = Mask[Idx] + VF;
13442 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13443 transformMaskAfterShuffle(CommonMask, CommonMask);
13444 } else {
13445 auto P = InVectors.front();
13446 Cost += createShuffle(&E1, E2, Mask);
13447 unsigned VF = Mask.size();
13448 if (Value *V1 = dyn_cast<Value *>(P)) {
13449 VF = std::max(VF,
13450 getNumElements(V1->getType()));
13451 } else {
13452 const auto *E = cast<const TreeEntry *>(P);
13453 VF = std::max(VF, E->getVectorFactor());
13454 }
13455 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13456 if (Mask[Idx] != PoisonMaskElem)
13457 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13458 Cost += createShuffle(P, InVectors.front(), CommonMask);
13459 transformMaskAfterShuffle(CommonMask, CommonMask);
13460 }
13461 }
13462
13463 class ShuffleCostBuilder {
13464 const TargetTransformInfo &TTI;
13465
13466 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13467 int Index = -1;
13468 return Mask.empty() ||
13469 (VF == Mask.size() &&
13470 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13471 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13472 Index == 0);
13473 }
13474
13475 public:
13476 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13477 ~ShuffleCostBuilder() = default;
13478 InstructionCost createShuffleVector(Value *V1, Value *,
13479 ArrayRef<int> Mask) const {
13480 // Empty mask or identity mask are free.
13481 unsigned VF =
13482 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13483 if (isEmptyOrIdentity(Mask, VF))
13484 return TTI::TCC_Free;
13485 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13486 cast<VectorType>(V1->getType()), Mask);
13487 }
13488 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13489 // Empty mask or identity mask are free.
13490 unsigned VF =
13491 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13492 if (isEmptyOrIdentity(Mask, VF))
13493 return TTI::TCC_Free;
13494 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13495 cast<VectorType>(V1->getType()), Mask);
13496 }
13497 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13498 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13499 return TTI::TCC_Free;
13500 }
13501 void resizeToMatch(Value *&, Value *&) const {}
13502 };
13503
13504 /// Smart shuffle instruction emission, walks through shuffles trees and
13505 /// tries to find the best matching vector for the actual shuffle
13506 /// instruction.
13507 InstructionCost
13508 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13509 const PointerUnion<Value *, const TreeEntry *> &P2,
13510 ArrayRef<int> Mask) {
13511 ShuffleCostBuilder Builder(TTI);
13512 SmallVector<int> CommonMask(Mask);
13513 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13514 unsigned CommonVF = Mask.size();
13515 InstructionCost ExtraCost = 0;
13516 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13517 unsigned VF) -> InstructionCost {
13518 if (E.isGather() && allConstant(E.Scalars))
13519 return TTI::TCC_Free;
13520 Type *EScalarTy = E.Scalars.front()->getType();
13521 bool IsSigned = true;
13522 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13523 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13524 IsSigned = It->second.second;
13525 }
13526 if (EScalarTy != ScalarTy) {
13527 unsigned CastOpcode = Instruction::Trunc;
13528 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13529 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13530 if (DstSz > SrcSz)
13531 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13532 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13533 getWidenedType(EScalarTy, VF),
13534 TTI::CastContextHint::None, CostKind);
13535 }
13536 return TTI::TCC_Free;
13537 };
13538 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13539 if (isa<Constant>(V))
13540 return TTI::TCC_Free;
13541 auto *VecTy = cast<VectorType>(V->getType());
13542 Type *EScalarTy = VecTy->getElementType();
13543 if (EScalarTy != ScalarTy) {
13544 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13545 unsigned CastOpcode = Instruction::Trunc;
13546 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13547 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13548 if (DstSz > SrcSz)
13549 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13550 return TTI.getCastInstrCost(
13551 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13552 VecTy, TTI::CastContextHint::None, CostKind);
13553 }
13554 return TTI::TCC_Free;
13555 };
13556 if (!V1 && !V2 && !P2.isNull()) {
13557 // Shuffle 2 entry nodes.
13558 const TreeEntry *E = cast<const TreeEntry *>(P1);
13559 unsigned VF = E->getVectorFactor();
13560 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13561 CommonVF = std::max(VF, E2->getVectorFactor());
13562 assert(all_of(Mask,
13563 [=](int Idx) {
13564 return Idx < 2 * static_cast<int>(CommonVF);
13565 }) &&
13566 "All elements in mask must be less than 2 * CommonVF.");
13567 if (E->Scalars.size() == E2->Scalars.size()) {
13568 SmallVector<int> EMask = E->getCommonMask();
13569 SmallVector<int> E2Mask = E2->getCommonMask();
13570 if (!EMask.empty() || !E2Mask.empty()) {
13571 for (int &Idx : CommonMask) {
13572 if (Idx == PoisonMaskElem)
13573 continue;
13574 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13575 Idx = EMask[Idx];
13576 else if (Idx >= static_cast<int>(CommonVF))
13577 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13578 E->Scalars.size();
13579 }
13580 }
13581 CommonVF = E->Scalars.size();
13582 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13583 GetNodeMinBWAffectedCost(*E2, CommonVF);
13584 } else {
13585 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13586 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13587 }
13588 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13589 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13590 } else if (!V1 && P2.isNull()) {
13591 // Shuffle single entry node.
13592 const TreeEntry *E = cast<const TreeEntry *>(P1);
13593 unsigned VF = E->getVectorFactor();
13594 CommonVF = VF;
13595 assert(
13596 all_of(Mask,
13597 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13598 "All elements in mask must be less than CommonVF.");
13599 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13600 SmallVector<int> EMask = E->getCommonMask();
13601 assert(!EMask.empty() && "Expected non-empty common mask.");
13602 for (int &Idx : CommonMask) {
13603 if (Idx != PoisonMaskElem)
13604 Idx = EMask[Idx];
13605 }
13606 CommonVF = E->Scalars.size();
13607 } else if (unsigned Factor = E->getInterleaveFactor();
13608 Factor > 0 && E->Scalars.size() != Mask.size() &&
13609 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13610 Factor)) {
13611 // Deinterleaved nodes are free.
13612 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13613 }
13614 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13615 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13616 // Not identity/broadcast? Try to see if the original vector is better.
13617 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13618 CommonVF == CommonMask.size() &&
13619 any_of(enumerate(CommonMask),
13620 [](const auto &&P) {
13621 return P.value() != PoisonMaskElem &&
13622 static_cast<unsigned>(P.value()) != P.index();
13623 }) &&
13624 any_of(CommonMask,
13625 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13626 SmallVector<int> ReorderMask;
13627 inversePermutation(E->ReorderIndices, ReorderMask);
13628 ::addMask(CommonMask, ReorderMask);
13629 }
13630 } else if (V1 && P2.isNull()) {
13631 // Shuffle single vector.
13632 ExtraCost += GetValueMinBWAffectedCost(V1);
13633 CommonVF = getVF(V1);
13634 assert(
13635 all_of(Mask,
13636 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13637 "All elements in mask must be less than CommonVF.");
13638 } else if (V1 && !V2) {
13639 // Shuffle vector and tree node.
13640 unsigned VF = getVF(V1);
13641 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13642 CommonVF = std::max(VF, E2->getVectorFactor());
13643 assert(all_of(Mask,
13644 [=](int Idx) {
13645 return Idx < 2 * static_cast<int>(CommonVF);
13646 }) &&
13647 "All elements in mask must be less than 2 * CommonVF.");
13648 if (E2->Scalars.size() == VF && VF != CommonVF) {
13649 SmallVector<int> E2Mask = E2->getCommonMask();
13650 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13651 for (int &Idx : CommonMask) {
13652 if (Idx == PoisonMaskElem)
13653 continue;
13654 if (Idx >= static_cast<int>(CommonVF))
13655 Idx = E2Mask[Idx - CommonVF] + VF;
13656 }
13657 CommonVF = VF;
13658 }
13659 ExtraCost += GetValueMinBWAffectedCost(V1);
13660 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13661 ExtraCost += GetNodeMinBWAffectedCost(
13662 *E2, std::min(CommonVF, E2->getVectorFactor()));
13663 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13664 } else if (!V1 && V2) {
13665 // Shuffle vector and tree node.
13666 unsigned VF = getVF(V2);
13667 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13668 CommonVF = std::max(VF, E1->getVectorFactor());
13669 assert(all_of(Mask,
13670 [=](int Idx) {
13671 return Idx < 2 * static_cast<int>(CommonVF);
13672 }) &&
13673 "All elements in mask must be less than 2 * CommonVF.");
13674 if (E1->Scalars.size() == VF && VF != CommonVF) {
13675 SmallVector<int> E1Mask = E1->getCommonMask();
13676 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13677 for (int &Idx : CommonMask) {
13678 if (Idx == PoisonMaskElem)
13679 continue;
13680 if (Idx >= static_cast<int>(CommonVF))
13681 Idx = E1Mask[Idx - CommonVF] + VF;
13682 else
13683 Idx = E1Mask[Idx];
13684 }
13685 CommonVF = VF;
13686 }
13687 ExtraCost += GetNodeMinBWAffectedCost(
13688 *E1, std::min(CommonVF, E1->getVectorFactor()));
13689 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13690 ExtraCost += GetValueMinBWAffectedCost(V2);
13691 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13692 } else {
13693 assert(V1 && V2 && "Expected both vectors.");
13694 unsigned VF = getVF(V1);
13695 CommonVF = std::max(VF, getVF(V2));
13696 assert(all_of(Mask,
13697 [=](int Idx) {
13698 return Idx < 2 * static_cast<int>(CommonVF);
13699 }) &&
13700 "All elements in mask must be less than 2 * CommonVF.");
13701 ExtraCost +=
13702 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13703 if (V1->getType() != V2->getType()) {
13704 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13705 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13706 } else {
13707 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13708 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13709 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13710 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13711 }
13712 }
13713 InVectors.front() =
13714 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13715 if (InVectors.size() == 2)
13716 InVectors.pop_back();
13717 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13718 V1, V2, CommonMask, Builder, ScalarTy);
13719 }
13720
13721 public:
13722 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13723 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13724 SmallPtrSetImpl<Value *> &CheckedExtracts)
13725 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13726 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13727 CheckedExtracts(CheckedExtracts) {}
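/// Adjusts the cost model for a gather of extractelement instructions: any
/// extract whose users are all vectorized (and which therefore becomes dead)
/// gets its scalarization overhead refunded, and the common source vector is
/// returned so the gather can be modeled as a permutation of that vector.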
13728 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13729 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13730 unsigned NumParts, bool &UseVecBaseAsInput) {
13731 UseVecBaseAsInput = false;
13732 if (Mask.empty())
13733 return nullptr;
13734 Value *VecBase = nullptr;
13735 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13736 if (!E->ReorderIndices.empty()) {
13737 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13738 E->ReorderIndices.end());
13739 reorderScalars(VL, ReorderMask);
13740 }
13741 // Check if the extracts can be considered reused, i.e. the same
13742 // extractelements were already vectorized in an earlier node.
13743 bool PrevNodeFound = any_of(
13744 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13745 [&](const std::unique_ptr<TreeEntry> &TE) {
13746 return ((TE->hasState() && !TE->isAltShuffle() &&
13747 TE->getOpcode() == Instruction::ExtractElement) ||
13748 TE->isGather()) &&
13749 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13750 return VL.size() > Data.index() &&
13751 (Mask[Data.index()] == PoisonMaskElem ||
13752 isa<UndefValue>(VL[Data.index()]) ||
13753 Data.value() == VL[Data.index()]);
13754 });
13755 });
13756 SmallPtrSet<Value *, 4> UniqueBases;
13757 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13758 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13759 for (unsigned Part : seq<unsigned>(NumParts)) {
13760 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13761 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13762 for (auto [I, V] :
13763 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13764 // Ignore non-extractelement scalars.
13765 if (isa<UndefValue>(V) ||
13766 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13767 continue;
13768 // If all users of the instruction are going to be vectorized and the
13769 // instruction itself is not going to be vectorized, consider this
13770 // instruction as dead and remove its cost from the final cost of the
13771 // vectorized tree.
13772 // Also, avoid adjusting the cost for extractelements with multiple uses
13773 // in different graph entries.
13774 auto *EE = cast<ExtractElementInst>(V);
13775 VecBase = EE->getVectorOperand();
13776 UniqueBases.insert(VecBase);
13777 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13778 if (!CheckedExtracts.insert(V).second ||
13779 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13780 any_of(EE->users(),
13781 [&](User *U) {
13782 return isa<GetElementPtrInst>(U) &&
13783 !R.areAllUsersVectorized(cast<Instruction>(U),
13784 &VectorizedVals);
13785 }) ||
13786 (!VEs.empty() && !is_contained(VEs, E)))
13787 continue;
13788 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13789 if (!EEIdx)
13790 continue;
13791 unsigned Idx = *EEIdx;
13792 // Take credit for instruction that will become dead.
13793 if (EE->hasOneUse() || !PrevNodeFound) {
13794 Instruction *Ext = EE->user_back();
13795 if (isa<SExtInst, ZExtInst>(Ext) &&
13796 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13797 // Use getExtractWithExtendCost() to calculate the cost of
13798 // extractelement/ext pair.
13799 Cost -= TTI.getExtractWithExtendCost(
13800 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13801 Idx, CostKind);
13802 // Add back the cost of s|zext which is subtracted separately.
13803 Cost += TTI.getCastInstrCost(
13804 Ext->getOpcode(), Ext->getType(), EE->getType(),
13805 TTI::getCastContextHint(Ext), CostKind, Ext);
13806 continue;
13807 }
13808 }
13809 APInt &DemandedElts =
13810 VectorOpsToExtracts
13811 .try_emplace(VecBase,
13812 APInt::getZero(getNumElements(VecBase->getType())))
13813 .first->getSecond();
13814 DemandedElts.setBit(Idx);
13815 }
13816 }
13817 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13818 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13819 DemandedElts, /*Insert=*/false,
13820 /*Extract=*/true, CostKind);
13821 // Check that the gather of extractelements can be represented as just a
13822 // shuffle of one or two vectors from which the scalars are extracted.
13823 // We have found the bunch of extractelement instructions that must be
13824 // gathered into a vector and can be represented as a permutation of
13825 // elements of a single input vector or of 2 input vectors.
13826 // This is skipped if the same extractelements were already vectorized.
13827 if (!PrevNodeFound)
13828 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
13829 InVectors.assign(1, E);
13830 CommonMask.assign(Mask.begin(), Mask.end());
13831 transformMaskAfterShuffle(CommonMask, CommonMask);
13832 SameNodesEstimated = false;
13833 if (NumParts != 1 && UniqueBases.size() != 1) {
13834 UseVecBaseAsInput = true;
13835 VecBase =
13836 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13837 }
13838 return VecBase;
13839 }
13840 /// Checks if the specified entry \p E needs to be delayed because of its
13841 /// dependency nodes.
13842 std::optional<InstructionCost>
13843 needToDelay(const TreeEntry *,
13844 ArrayRef<SmallVector<const TreeEntry *>>) const {
13845 // No need to delay the cost estimation during analysis.
13846 return std::nullopt;
13847 }
13848 /// Reset the builder to handle perfect diamond match.
13849 void resetForSameNode() {
13850 IsFinalized = false;
13851 CommonMask.clear();
13852 InVectors.clear();
13853 Cost = 0;
13854 VectorizedVals.clear();
13855 SameNodesEstimated = true;
13856 }
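// The add() overloads below accumulate the input vectors (tree entries or
// already vectorized values) and a common mask; pending two-source shuffles
// are priced lazily via createShuffle() and the mask is rewritten to refer to
// the folded result. Illustrative example: after add(E1, {0,1,2,3}) and
// add(E2, {0,1,2,3}), CommonMask selects lanes 0..3 from E1 and lanes 4..7
// from E2, and the final two-source permutation is costed in finalize().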
13857 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
13858 if (&E1 == &E2) {
13859 assert(all_of(Mask,
13860 [&](int Idx) {
13861 return Idx < static_cast<int>(E1.getVectorFactor());
13862 }) &&
13863 "Expected single vector shuffle mask.");
13864 add(E1, Mask);
13865 return;
13866 }
13867 if (InVectors.empty()) {
13868 CommonMask.assign(Mask.begin(), Mask.end());
13869 InVectors.assign({&E1, &E2});
13870 return;
13871 }
13872 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13873 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13874 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13875 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13876 const auto *It =
13877 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13878 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13879 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
13880 }
13881 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
13882 if (InVectors.empty()) {
13883 CommonMask.assign(Mask.begin(), Mask.end());
13884 InVectors.assign(1, &E1);
13885 return;
13886 }
13887 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13888 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13889 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13890 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13891 const auto *It =
13892 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13893 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13894 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
13895 if (!SameNodesEstimated && InVectors.size() == 1)
13896 InVectors.emplace_back(&E1);
13897 }
13898 /// Adds 2 input vectors and the mask for their shuffling.
13899 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
13900 // May come only for shuffling of 2 vectors with extractelements, already
13901 // handled in adjustExtracts.
13902 assert(InVectors.size() == 1 &&
13903 all_of(enumerate(CommonMask),
13904 [&](auto P) {
13905 if (P.value() == PoisonMaskElem)
13906 return Mask[P.index()] == PoisonMaskElem;
13907 auto *EI = cast<ExtractElementInst>(
13908 cast<const TreeEntry *>(InVectors.front())
13909 ->getOrdered(P.index()));
13910 return EI->getVectorOperand() == V1 ||
13911 EI->getVectorOperand() == V2;
13912 }) &&
13913 "Expected extractelement vectors.");
13914 }
13915 /// Adds another one input vector and the mask for the shuffling.
13916 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
13917 if (InVectors.empty()) {
13918 assert(CommonMask.empty() && !ForExtracts &&
13919 "Expected empty input mask/vectors.");
13920 CommonMask.assign(Mask.begin(), Mask.end());
13921 InVectors.assign(1, V1);
13922 return;
13923 }
13924 if (ForExtracts) {
13925 // No need to add vectors here, already handled them in adjustExtracts.
13926 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
13927 !CommonMask.empty() &&
13928 all_of(enumerate(CommonMask),
13929 [&](auto P) {
13930 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
13931 ->getOrdered(P.index());
13932 if (P.value() == PoisonMaskElem)
13933 return P.value() == Mask[P.index()] ||
13934 isa<UndefValue>(Scalar);
13935 if (isa<Constant>(V1))
13936 return true;
13937 auto *EI = cast<ExtractElementInst>(Scalar);
13938 return EI->getVectorOperand() == V1;
13939 }) &&
13940 "Expected only tree entry for extractelement vectors.");
13941 return;
13942 }
13943 assert(!InVectors.empty() && !CommonMask.empty() &&
13944 "Expected only tree entries from extracts/reused buildvectors.");
13945 unsigned VF = getVF(V1);
13946 if (InVectors.size() == 2) {
13947 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13948 transformMaskAfterShuffle(CommonMask, CommonMask);
13949 VF = std::max<unsigned>(VF, CommonMask.size());
13950 } else if (const auto *InTE =
13951 InVectors.front().dyn_cast<const TreeEntry *>()) {
13952 VF = std::max(VF, InTE->getVectorFactor());
13953 } else {
13954 VF = std::max(
13955 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
13956 ->getNumElements());
13957 }
13958 InVectors.push_back(V1);
13959 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13960 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13961 CommonMask[Idx] = Mask[Idx] + VF;
13962 }
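/// Estimates the cost of building a vector from the scalars in \p VL and
/// returns a constant placeholder of the right width; only its type matters
/// for the rest of the cost analysis, no real IR is emitted here.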
13963 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
13964 Value *Root = nullptr) {
13965 Cost += getBuildVectorCost(VL, Root);
13966 if (!Root) {
13967 // FIXME: Need to find a way to avoid use of getNullValue here.
13968 SmallVector<Constant *> Vals;
13969 unsigned VF = VL.size();
13970 if (MaskVF != 0)
13971 VF = std::min(VF, MaskVF);
13972 Type *VLScalarTy = VL.front()->getType();
13973 for (Value *V : VL.take_front(VF)) {
13974 Type *ScalarTy = VLScalarTy->getScalarType();
13975 if (isa<PoisonValue>(V)) {
13976 Vals.push_back(PoisonValue::get(ScalarTy));
13977 continue;
13978 }
13979 if (isa<UndefValue>(V)) {
13980 Vals.push_back(UndefValue::get(ScalarTy));
13981 continue;
13982 }
13983 Vals.push_back(Constant::getNullValue(ScalarTy));
13984 }
13985 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
13986 assert(SLPReVec && "FixedVectorType is not expected.");
13987 // When REVEC is enabled, we need to expand vector types into scalar
13988 // types.
13989 Vals = replicateMask(Vals, VecTy->getNumElements());
13990 }
13991 return ConstantVector::get(Vals);
13992 }
13993 return ConstantVector::getSplat(
13994 ElementCount::getFixed(
13995 cast<FixedVectorType>(Root->getType())->getNumElements()),
13996 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
13997 }
13999 /// Finalize emission of the shuffles.
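/// Folds any remaining pending shuffle, prices the insertion of the combined
/// sub-entries (including ext/trunc when their element type was narrowed by
/// MinBWs), applies \p ExtMask on top of the accumulated mask and returns the
/// total estimated cost.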
14000 InstructionCost finalize(
14001 ArrayRef<int> ExtMask,
14002 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14003 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14004 function_ref<void(Value *&, SmallVectorImpl<int> &,
14005 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14006 Action = {}) {
14007 IsFinalized = true;
14008 if (Action) {
14009 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14010 if (InVectors.size() == 2)
14011 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14012 else
14013 Cost += createShuffle(Vec, nullptr, CommonMask);
14014 transformMaskAfterShuffle(CommonMask, CommonMask);
14015 assert(VF > 0 &&
14016 "Expected vector length for the final value before action.");
14017 Value *V = cast<Value *>(Vec);
14018 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14019 Cost += createShuffle(V1, V2, Mask);
14020 return V1;
14021 });
14022 InVectors.front() = V;
14023 }
14024 if (!SubVectors.empty()) {
14025 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14026 if (InVectors.size() == 2)
14027 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14028 else
14029 Cost += createShuffle(Vec, nullptr, CommonMask);
14030 transformMaskAfterShuffle(CommonMask, CommonMask);
14031 // Add subvectors permutation cost.
14032 if (!SubVectorsMask.empty()) {
14033 assert(SubVectorsMask.size() <= CommonMask.size() &&
14034 "Expected same size of masks for subvectors and common mask.");
14035 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14036 copy(SubVectorsMask, SVMask.begin());
14037 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14038 if (I2 != PoisonMaskElem) {
14039 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14040 I1 = I2 + CommonMask.size();
14041 }
14042 }
14043 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14044 getWidenedType(ScalarTy, CommonMask.size()),
14045 SVMask, CostKind);
14046 }
14047 for (auto [E, Idx] : SubVectors) {
14048 Type *EScalarTy = E->Scalars.front()->getType();
14049 bool IsSigned = true;
14050 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14051 EScalarTy =
14052 IntegerType::get(EScalarTy->getContext(), It->second.first);
14053 IsSigned = It->second.second;
14054 }
14055 if (ScalarTy != EScalarTy) {
14056 unsigned CastOpcode = Instruction::Trunc;
14057 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14058 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14059 if (DstSz > SrcSz)
14060 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14061 Cost += TTI.getCastInstrCost(
14062 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14063 getWidenedType(EScalarTy, E->getVectorFactor()),
14064 TTI::CastContextHint::None, CostKind);
14065 }
14066 Cost += ::getShuffleCost(
14067 TTI, TTI::SK_InsertSubvector,
14068 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14069 getWidenedType(ScalarTy, E->getVectorFactor()));
14070 if (!CommonMask.empty()) {
14071 std::iota(std::next(CommonMask.begin(), Idx),
14072 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14073 Idx);
14074 }
14075 }
14076 }
14077
14078 if (!ExtMask.empty()) {
14079 if (CommonMask.empty()) {
14080 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14081 } else {
14082 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14083 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14084 if (ExtMask[I] == PoisonMaskElem)
14085 continue;
14086 NewMask[I] = CommonMask[ExtMask[I]];
14087 }
14088 CommonMask.swap(NewMask);
14089 }
14090 }
14091 if (CommonMask.empty()) {
14092 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14093 return Cost;
14094 }
14095 return Cost +
14096 createShuffle(InVectors.front(),
14097 InVectors.size() == 2 ? InVectors.back() : nullptr,
14098 CommonMask);
14099 }
14100
14101 ~ShuffleCostEstimator() {
14102 assert((IsFinalized || CommonMask.empty()) &&
14103 "Shuffle construction must be finalized.");
14104 }
14105};
14106
14107const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14108 unsigned Idx) const {
14109 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14110 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14111 return Op;
14112}
14113
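/// Maps the memory-access shape of the given tree entry to the cast-context
/// hint used by the TTI cast cost model: gather/scatter and strided loads,
/// masked (compressed) loads, reversed loads and plain consecutive loads all
/// get different hints.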
14114TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14115 if (TE.State == TreeEntry::ScatterVectorize ||
14116 TE.State == TreeEntry::StridedVectorize)
14117 return TTI::CastContextHint::GatherScatter;
14118 if (TE.State == TreeEntry::CompressVectorize)
14119 return TTI::CastContextHint::Masked;
14120 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14121 !TE.isAltShuffle()) {
14122 if (TE.ReorderIndices.empty())
14123 return TTI::CastContextHint::Normal;
14124 SmallVector<int> Mask;
14125 inversePermutation(TE.ReorderIndices, Mask);
14126 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14127 return TTI::CastContextHint::Reversed;
14128 }
14129 return TTI::CastContextHint::None;
14130 }
14131
14132 InstructionCost
14133 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14134 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14135 ArrayRef<Value *> VL = E->Scalars;
14136
14137 Type *ScalarTy = getValueType(VL[0]);
14138 if (!isValidElementType(ScalarTy))
14139 return InstructionCost::getInvalid();
14140
14141
14142 // If we have computed a smaller type for the expression, update VecTy so
14143 // that the costs will be accurate.
14144 auto It = MinBWs.find(E);
14145 Type *OrigScalarTy = ScalarTy;
14146 if (It != MinBWs.end()) {
14147 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14148 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14149 if (VecTy)
14150 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14151 }
14152 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14153 unsigned EntryVF = E->getVectorFactor();
14154 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14155
14156 if (E->isGather()) {
14157 if (allConstant(VL))
14158 return 0;
14159 if (isa<InsertElementInst>(VL[0]))
14160 return InstructionCost::getInvalid();
14161 if (isa<CmpInst>(VL.front()))
14162 ScalarTy = VL.front()->getType();
14163 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14164 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14165 }
14166 if (E->State == TreeEntry::SplitVectorize) {
14167 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14168 "Expected exactly 2 combined entries.");
14169 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14170 InstructionCost VectorCost = 0;
14171 if (E->ReorderIndices.empty()) {
14172 VectorCost = ::getShuffleCost(
14173 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14174 E->CombinedEntriesWithIndices.back().second,
14175 getWidenedType(
14176 ScalarTy,
14177 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14178 ->getVectorFactor()));
14179 } else {
14180 unsigned CommonVF =
14181 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14182 ->getVectorFactor(),
14183 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14184 ->getVectorFactor());
14185 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14186 getWidenedType(ScalarTy, CommonVF),
14187 E->getSplitMask(), CostKind);
14188 }
14189 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14190 return VectorCost;
14191 }
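// CommonCost models the single permutation implied by ReorderIndices and
// ReuseShuffleIndices; each case below either adds it to its vector cost or
// deliberately drops it when the final shuffle is already folded into the
// vectorized operation itself.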
14192 InstructionCost CommonCost = 0;
14193 SmallVector<int> Mask;
14194 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14195 (E->State != TreeEntry::StridedVectorize ||
14196 !isReverseOrder(E->ReorderIndices))) {
14197 SmallVector<int> NewMask;
14198 if (E->getOpcode() == Instruction::Store) {
14199 // For stores the order is actually a mask.
14200 NewMask.resize(E->ReorderIndices.size());
14201 copy(E->ReorderIndices, NewMask.begin());
14202 } else {
14203 inversePermutation(E->ReorderIndices, NewMask);
14204 }
14205 ::addMask(Mask, NewMask);
14206 }
14207 if (!E->ReuseShuffleIndices.empty())
14208 ::addMask(Mask, E->ReuseShuffleIndices);
14209 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14210 CommonCost =
14211 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14212 assert((E->State == TreeEntry::Vectorize ||
14213 E->State == TreeEntry::ScatterVectorize ||
14214 E->State == TreeEntry::StridedVectorize ||
14215 E->State == TreeEntry::CompressVectorize) &&
14216 "Unhandled state");
14217 assert(E->getOpcode() &&
14218 ((allSameType(VL) && allSameBlock(VL)) ||
14219 (E->getOpcode() == Instruction::GetElementPtr &&
14220 E->getMainOp()->getType()->isPointerTy()) ||
14221 E->hasCopyableElements()) &&
14222 "Invalid VL");
14223 Instruction *VL0 = E->getMainOp();
14224 unsigned ShuffleOrOp =
14225 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14226 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14227 ShuffleOrOp = E->CombinedOp;
14228 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14229 const unsigned Sz = UniqueValues.size();
14230 SmallBitVector UsedScalars(Sz, false);
14231 for (unsigned I = 0; I < Sz; ++I) {
14232 if (isa<Instruction>(UniqueValues[I]) &&
14233 !E->isCopyableElement(UniqueValues[I]) &&
14234 getTreeEntries(UniqueValues[I]).front() == E)
14235 continue;
14236 UsedScalars.set(I);
14237 }
14238 auto GetCastContextHint = [&](Value *V) {
14239 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14240 return getCastContextHint(*OpTEs.front());
14241 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14242 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14243 !SrcState.isAltShuffle())
14244 return TTI::CastContextHint::GatherScatter;
14245 return TTI::CastContextHint::None;
14246 };
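// GetCostDiff returns (vector cost, including CommonCost) minus (sum of the
// scalar costs of the unique, not-yet-counted scalars); a negative result
// means vectorizing this entry is profitable. It also charges an extra cast
// when this node was narrowed by MinBWs but its user expects a wider type.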
14247 auto GetCostDiff =
14248 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14249 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14250 // Calculate the cost of this instruction.
14251 InstructionCost ScalarCost = 0;
14252 if (isa<CastInst, CallInst>(VL0)) {
14253 // For some of the instructions no need to calculate cost for each
14254 // particular instruction, we can use the cost of the single
14255 // instruction x total number of scalar instructions.
14256 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14257 } else {
14258 for (unsigned I = 0; I < Sz; ++I) {
14259 if (UsedScalars.test(I))
14260 continue;
14261 ScalarCost += ScalarEltCost(I);
14262 }
14263 }
14264
14265 InstructionCost VecCost = VectorCost(CommonCost);
14266 // Check if the current node must be resized, if the parent node is not
14267 // resized.
14268 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14269 E->Idx != 0 &&
14270 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14271 const EdgeInfo &EI = E->UserTreeIndex;
14272 if (!EI.UserTE->hasState() ||
14273 EI.UserTE->getOpcode() != Instruction::Select ||
14274 EI.EdgeIdx != 0) {
14275 auto UserBWIt = MinBWs.find(EI.UserTE);
14276 Type *UserScalarTy =
14277 (EI.UserTE->isGather() ||
14278 EI.UserTE->State == TreeEntry::SplitVectorize)
14279 ? EI.UserTE->Scalars.front()->getType()
14280 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14281 if (UserBWIt != MinBWs.end())
14282 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14283 UserBWIt->second.first);
14284 if (ScalarTy != UserScalarTy) {
14285 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14286 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14287 unsigned VecOpcode;
14288 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14289 if (BWSz > SrcBWSz)
14290 VecOpcode = Instruction::Trunc;
14291 else
14292 VecOpcode =
14293 It->second.second ? Instruction::SExt : Instruction::ZExt;
14294 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14295 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14296 CostKind);
14297 }
14298 }
14299 }
14300 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14301 ScalarCost, "Calculated costs for Tree"));
14302 return VecCost - ScalarCost;
14303 };
14304 // Calculate cost difference from vectorizing set of GEPs.
14305 // Negative value means vectorizing is profitable.
14306 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14307 assert((E->State == TreeEntry::Vectorize ||
14308 E->State == TreeEntry::StridedVectorize ||
14309 E->State == TreeEntry::CompressVectorize) &&
14310 "Entry state expected to be Vectorize, StridedVectorize or "
14311 "MaskedLoadCompressVectorize here.");
14312 InstructionCost ScalarCost = 0;
14313 InstructionCost VecCost = 0;
14314 std::tie(ScalarCost, VecCost) = getGEPCosts(
14315 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14316 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14317 "Calculated GEPs cost for Tree"));
14318
14319 return VecCost - ScalarCost;
14320 };
14321
14322 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14323 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14324 if (MinMaxID == Intrinsic::not_intrinsic)
14325 return InstructionCost::getInvalid();
14326 Type *CanonicalType = Ty;
14327 if (CanonicalType->isPtrOrPtrVectorTy())
14328 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14329 CanonicalType->getContext(),
14330 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14331
14332 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14333 {CanonicalType, CanonicalType});
14334 InstructionCost IntrinsicCost =
14335 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14336 // If the selects are the only uses of the compares, they will be
14337 // dead and we can adjust the cost by removing their cost.
14338 if (VI && SelectOnly) {
14339 assert((!Ty->isVectorTy() || SLPReVec) &&
14340 "Expected only for scalar type.");
14341 auto *CI = cast<CmpInst>(VI->getOperand(0));
14342 IntrinsicCost -= TTI->getCmpSelInstrCost(
14343 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14344 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14345 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14346 }
14347 return IntrinsicCost;
14348 };
14349 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14350 Instruction *VI) {
14351 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14352 return Cost;
14353 };
14354 switch (ShuffleOrOp) {
14355 case Instruction::PHI: {
14356 // Count reused scalars.
14357 InstructionCost ScalarCost = 0;
14358 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14359 for (Value *V : UniqueValues) {
14360 auto *PHI = dyn_cast<PHINode>(V);
14361 if (!PHI)
14362 continue;
14363
14364 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14365 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14366 Value *Op = PHI->getIncomingValue(I);
14367 Operands[I] = Op;
14368 }
14369 if (const TreeEntry *OpTE =
14370 getSameValuesTreeEntry(Operands.front(), Operands))
14371 if (CountedOps.insert(OpTE).second &&
14372 !OpTE->ReuseShuffleIndices.empty())
14373 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14374 OpTE->Scalars.size());
14375 }
14376
14377 return CommonCost - ScalarCost;
14378 }
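// Extracts are modeled as the scalarization overhead of the demanded lanes of
// the source vector; an extract whose only user is a s/zext feeding GEPs is
// priced as a combined extract-with-extend instead.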
14379 case Instruction::ExtractValue:
14380 case Instruction::ExtractElement: {
14381 APInt DemandedElts;
14382 VectorType *SrcVecTy = nullptr;
14383 auto GetScalarCost = [&](unsigned Idx) {
14384 if (isa<PoisonValue>(UniqueValues[Idx]))
14385 return InstructionCost(TTI::TCC_Free);
14386
14387 auto *I = cast<Instruction>(UniqueValues[Idx]);
14388 if (!SrcVecTy) {
14389 if (ShuffleOrOp == Instruction::ExtractElement) {
14390 auto *EE = cast<ExtractElementInst>(I);
14391 SrcVecTy = EE->getVectorOperandType();
14392 } else {
14393 auto *EV = cast<ExtractValueInst>(I);
14394 Type *AggregateTy = EV->getAggregateOperand()->getType();
14395 unsigned NumElts;
14396 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14397 NumElts = ATy->getNumElements();
14398 else
14399 NumElts = AggregateTy->getStructNumElements();
14400 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14401 }
14402 }
14403 if (I->hasOneUse()) {
14404 Instruction *Ext = I->user_back();
14405 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14406 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14407 // Use getExtractWithExtendCost() to calculate the cost of
14408 // extractelement/ext pair.
14409 InstructionCost Cost = TTI->getExtractWithExtendCost(
14410 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14411 CostKind);
14412 // Subtract the cost of s|zext which is subtracted separately.
14413 Cost -= TTI->getCastInstrCost(
14414 Ext->getOpcode(), Ext->getType(), I->getType(),
14415 TTI::getCastContextHint(Ext), CostKind, Ext);
14416 return Cost;
14417 }
14418 }
14419 if (DemandedElts.isZero())
14420 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14421 DemandedElts.setBit(*getExtractIndex(I));
14422 return InstructionCost(TTI::TCC_Free);
14423 };
14424 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14425 return CommonCost - (DemandedElts.isZero()
14426 ? TTI::TCC_Free
14427 : TTI.getScalarizationOverhead(
14428 SrcVecTy, DemandedElts, /*Insert=*/false,
14429 /*Extract=*/true, CostKind));
14430 };
14431 return GetCostDiff(GetScalarCost, GetVectorCost);
14432 }
14433 case Instruction::InsertElement: {
14434 assert(E->ReuseShuffleIndices.empty() &&
14435 "Unique insertelements only are expected.");
14436 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14437 unsigned const NumElts = SrcVecTy->getNumElements();
14438 unsigned const NumScalars = VL.size();
14439
14440 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14441
14442 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14443 unsigned OffsetBeg = *getElementIndex(VL.front());
14444 unsigned OffsetEnd = OffsetBeg;
14445 InsertMask[OffsetBeg] = 0;
14446 for (auto [I, V] : enumerate(VL.drop_front())) {
14447 unsigned Idx = *getElementIndex(V);
14448 if (OffsetBeg > Idx)
14449 OffsetBeg = Idx;
14450 else if (OffsetEnd < Idx)
14451 OffsetEnd = Idx;
14452 InsertMask[Idx] = I + 1;
14453 }
14454 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14455 if (NumOfParts > 0 && NumOfParts < NumElts)
14456 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14457 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14458 VecScalarsSz;
14459 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14460 unsigned InsertVecSz = std::min<unsigned>(
14461 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14462 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14463 bool IsWholeSubvector =
14464 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14465 // Check if we can safely insert a subvector. If it is not possible, just
14466 // generate a whole-sized vector and shuffle the source vector and the new
14467 // subvector.
14468 if (OffsetBeg + InsertVecSz > VecSz) {
14469 // Align OffsetBeg to generate correct mask.
14470 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14471 InsertVecSz = VecSz;
14472 }
14473
14474 APInt DemandedElts = APInt::getZero(NumElts);
14475 // TODO: Add support for Instruction::InsertValue.
14476 SmallVector<int> Mask;
14477 if (!E->ReorderIndices.empty()) {
14478 inversePermutation(E->ReorderIndices, Mask);
14479 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14480 } else {
14481 Mask.assign(VecSz, PoisonMaskElem);
14482 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14483 }
14484 bool IsIdentity = true;
14485 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14486 Mask.swap(PrevMask);
14487 for (unsigned I = 0; I < NumScalars; ++I) {
14488 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14489 DemandedElts.setBit(InsertIdx);
14490 IsIdentity &= InsertIdx - OffsetBeg == I;
14491 Mask[InsertIdx - OffsetBeg] = I;
14492 }
14493 assert(Offset < NumElts && "Failed to find vector index offset");
14494
14495 InstructionCost Cost = 0;
14496 Cost -=
14497 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14498 /*Insert*/ true, /*Extract*/ false, CostKind);
14499
14500 // First cost - resize to actual vector size if not identity shuffle or
14501 // need to shift the vector.
14502 // Do not calculate the cost if the actual size is the register size and
14503 // we can merge this shuffle with the following SK_Select.
14504 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14505 if (!IsIdentity)
14506 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14507 InsertVecTy, Mask);
14508 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14509 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14510 }));
14511 // Second cost - permutation with subvector, if some elements are from the
14512 // initial vector or inserting a subvector.
14513 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14514 // subvector of ActualVecTy.
14515 SmallBitVector InMask =
14516 isUndefVector(FirstInsert->getOperand(0),
14517 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14518 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14519 if (InsertVecSz != VecSz) {
14520 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14521 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14522 CostKind, OffsetBeg - Offset, InsertVecTy);
14523 } else {
14524 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14525 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14526 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14527 I <= End; ++I)
14528 if (Mask[I] != PoisonMaskElem)
14529 Mask[I] = I + VecSz;
14530 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14531 Mask[I] =
14532 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14533 Cost +=
14534 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14535 }
14536 }
14537 return Cost;
14538 }
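// For casts the vector opcode may differ from the scalar one: when MinBWs has
// narrowed the source and/or destination type, the cast can degenerate into a
// bitcast (free), a trunc, or a s/zext chosen by the signedness recorded in
// MinBWs.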
14539 case Instruction::ZExt:
14540 case Instruction::SExt:
14541 case Instruction::FPToUI:
14542 case Instruction::FPToSI:
14543 case Instruction::FPExt:
14544 case Instruction::PtrToInt:
14545 case Instruction::IntToPtr:
14546 case Instruction::SIToFP:
14547 case Instruction::UIToFP:
14548 case Instruction::Trunc:
14549 case Instruction::FPTrunc:
14550 case Instruction::BitCast: {
14551 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14552 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14553 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14554 unsigned Opcode = ShuffleOrOp;
14555 unsigned VecOpcode = Opcode;
14556 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14557 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14558 // Check if the values are candidates to demote.
14559 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14560 if (SrcIt != MinBWs.end()) {
14561 SrcBWSz = SrcIt->second.first;
14562 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14563 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14564 SrcVecTy =
14565 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14566 }
14567 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14568 if (BWSz == SrcBWSz) {
14569 VecOpcode = Instruction::BitCast;
14570 } else if (BWSz < SrcBWSz) {
14571 VecOpcode = Instruction::Trunc;
14572 } else if (It != MinBWs.end()) {
14573 assert(BWSz > SrcBWSz && "Invalid cast!");
14574 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14575 } else if (SrcIt != MinBWs.end()) {
14576 assert(BWSz > SrcBWSz && "Invalid cast!");
14577 VecOpcode =
14578 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14579 }
14580 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14581 !SrcIt->second.second) {
14582 VecOpcode = Instruction::UIToFP;
14583 }
14584 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14585 assert(Idx == 0 && "Expected 0 index only");
14586 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14587 VL0->getOperand(0)->getType(),
14588 TTI::getCastContextHint(VL0), CostKind, VL0);
14589 };
14590 auto GetVectorCost = [=](InstructionCost CommonCost) {
14591 // Do not count cost here if minimum bitwidth is in effect and it is just
14592 // a bitcast (here it is just a noop).
14593 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14594 return CommonCost;
14595 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14596 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14597
14598 bool IsArithmeticExtendedReduction =
14599 E->Idx == 0 && UserIgnoreList &&
14600 all_of(*UserIgnoreList, [](Value *V) {
14601 auto *I = cast<Instruction>(V);
14602 return is_contained({Instruction::Add, Instruction::FAdd,
14603 Instruction::Mul, Instruction::FMul,
14604 Instruction::And, Instruction::Or,
14605 Instruction::Xor},
14606 I->getOpcode());
14607 });
14608 if (IsArithmeticExtendedReduction &&
14609 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14610 return CommonCost;
14611 return CommonCost +
14612 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14613 VecOpcode == Opcode ? VI : nullptr);
14614 };
14615 return GetCostDiff(GetScalarCost, GetVectorCost);
14616 }
14617 case Instruction::FCmp:
14618 case Instruction::ICmp:
14619 case Instruction::Select: {
14620 CmpPredicate VecPred, SwappedVecPred;
14621 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14622 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14623 match(VL0, MatchCmp))
14624 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14625 else
14626 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14627 ? CmpInst::BAD_FCMP_PREDICATE
14628 : CmpInst::BAD_ICMP_PREDICATE;
14629 auto GetScalarCost = [&](unsigned Idx) {
14630 if (isa<PoisonValue>(UniqueValues[Idx]))
14631 return InstructionCost(TTI::TCC_Free);
14632
14633 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14634 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14635 ? CmpInst::BAD_FCMP_PREDICATE
14636 : CmpInst::BAD_ICMP_PREDICATE;
14637 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14638 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14639 !match(VI, MatchCmp)) ||
14640 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14641 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14642 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14643 ? CmpInst::BAD_FCMP_PREDICATE
14644 : CmpInst::BAD_ICMP_PREDICATE;
14645
14646 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14647 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14648 CostKind, getOperandInfo(VI->getOperand(0)),
14649 getOperandInfo(VI->getOperand(1)), VI);
14650 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14651 if (IntrinsicCost.isValid())
14652 ScalarCost = IntrinsicCost;
14653
14654 return ScalarCost;
14655 };
14656 auto GetVectorCost = [&](InstructionCost CommonCost) {
14657 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14658
14659 InstructionCost VecCost =
14660 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14661 CostKind, getOperandInfo(E->getOperand(0)),
14662 getOperandInfo(E->getOperand(1)), VL0);
14663 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14664 auto *CondType =
14665 getWidenedType(SI->getCondition()->getType(), VL.size());
14666 unsigned CondNumElements = CondType->getNumElements();
14667 unsigned VecTyNumElements = getNumElements(VecTy);
14668 assert(VecTyNumElements >= CondNumElements &&
14669 VecTyNumElements % CondNumElements == 0 &&
14670 "Cannot vectorize Instruction::Select");
14671 if (CondNumElements != VecTyNumElements) {
14672 // When the return type is i1 but the source is fixed vector type, we
14673 // need to duplicate the condition value.
14674 VecCost += ::getShuffleCost(
14675 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14676 createReplicatedMask(VecTyNumElements / CondNumElements,
14677 CondNumElements));
14678 }
14679 }
14680 return VecCost + CommonCost;
14681 };
14682 return GetCostDiff(GetScalarCost, GetVectorCost);
14683 }
14684 case TreeEntry::MinMax: {
14685 auto GetScalarCost = [&](unsigned Idx) {
14686 return GetMinMaxCost(OrigScalarTy);
14687 };
14688 auto GetVectorCost = [&](InstructionCost CommonCost) {
14689 InstructionCost VecCost = GetMinMaxCost(VecTy);
14690 return VecCost + CommonCost;
14691 };
14692 return GetCostDiff(GetScalarCost, GetVectorCost);
14693 }
14694 case TreeEntry::FMulAdd: {
14695 auto GetScalarCost = [&](unsigned Idx) {
14696 if (isa<PoisonValue>(UniqueValues[Idx]))
14697 return InstructionCost(TTI::TCC_Free);
14698 return GetFMulAddCost(E->getOperations(),
14699 cast<Instruction>(UniqueValues[Idx]));
14700 };
14701 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14702 FastMathFlags FMF;
14703 FMF.set();
14704 for (Value *V : E->Scalars) {
14705 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14706 FMF &= FPCI->getFastMathFlags();
14707 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14708 FMF &= FPCIOp->getFastMathFlags();
14709 }
14710 }
14711 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14712 {VecTy, VecTy, VecTy}, FMF);
14713 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14714 return VecCost + CommonCost;
14715 };
14716 return GetCostDiff(GetScalarCost, GetVectorCost);
14717 }
14718 case Instruction::FNeg:
14719 case Instruction::Add:
14720 case Instruction::FAdd:
14721 case Instruction::Sub:
14722 case Instruction::FSub:
14723 case Instruction::Mul:
14724 case Instruction::FMul:
14725 case Instruction::UDiv:
14726 case Instruction::SDiv:
14727 case Instruction::FDiv:
14728 case Instruction::URem:
14729 case Instruction::SRem:
14730 case Instruction::FRem:
14731 case Instruction::Shl:
14732 case Instruction::LShr:
14733 case Instruction::AShr:
14734 case Instruction::And:
14735 case Instruction::Or:
14736 case Instruction::Xor: {
14737 auto GetScalarCost = [&](unsigned Idx) {
14738 if (isa<PoisonValue>(UniqueValues[Idx]))
14739 return InstructionCost(TTI::TCC_Free);
14740
14741 // We cannot retrieve the operand from UniqueValues[Idx] because an
14742 // interchangeable instruction may be used. The order and the actual
14743 // operand might differ from what is retrieved from UniqueValues[Idx].
14744 Value *Op1 = E->getOperand(0)[Idx];
14745 Value *Op2;
14746 SmallVector<const Value *, 2> Operands(1, Op1);
14747 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14748 Op2 = Op1;
14749 } else {
14750 Op2 = E->getOperand(1)[Idx];
14751 Operands.push_back(Op2);
14752 }
14753 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14754 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14755 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14756 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14757 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14758 I && (ShuffleOrOp == Instruction::FAdd ||
14759 ShuffleOrOp == Instruction::FSub)) {
14760 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14761 if (IntrinsicCost.isValid())
14762 ScalarCost = IntrinsicCost;
14763 }
14764 return ScalarCost;
14765 };
14766 auto GetVectorCost = [=](InstructionCost CommonCost) {
14767 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14768 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14769 ArrayRef<Value *> Ops = E->getOperand(I);
14770 if (all_of(Ops, [&](Value *Op) {
14771 auto *CI = dyn_cast<ConstantInt>(Op);
14772 return CI && CI->getValue().countr_one() >= It->second.first;
14773 }))
14774 return CommonCost;
14775 }
14776 }
14777 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14778 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14779 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14780 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14781 Op2Info, {}, nullptr, TLI) +
14782 CommonCost;
14783 };
14784 return GetCostDiff(GetScalarCost, GetVectorCost);
14785 }
14786 case Instruction::GetElementPtr: {
14787 return CommonCost + GetGEPCostDiff(VL, VL0);
14788 }
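// Loads are priced according to the entry's state: consecutive (possibly
// interleaved) loads, strided loads, compressed/masked loads plus a
// decompressing shuffle, or a gather. For everything except the gather the
// node is a terminator, so the cost of its address GEPs is added here as well.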
14789 case Instruction::Load: {
14790 auto GetScalarCost = [&](unsigned Idx) {
14791 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14792 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14793 VI->getAlign(), VI->getPointerAddressSpace(),
14794 CostKind, TTI::OperandValueInfo(), VI);
14795 };
14796 auto *LI0 = cast<LoadInst>(VL0);
14797 auto GetVectorCost = [&](InstructionCost CommonCost) {
14798 InstructionCost VecLdCost;
14799 switch (E->State) {
14800 case TreeEntry::Vectorize:
14801 if (unsigned Factor = E->getInterleaveFactor()) {
14802 VecLdCost = TTI->getInterleavedMemoryOpCost(
14803 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14804 LI0->getPointerAddressSpace(), CostKind);
14805
14806 } else {
14807 VecLdCost = TTI->getMemoryOpCost(
14808 Instruction::Load, VecTy, LI0->getAlign(),
14809 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14810 }
14811 break;
14812 case TreeEntry::StridedVectorize: {
14813 Align CommonAlignment =
14814 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14815 VecLdCost = TTI->getStridedMemoryOpCost(
14816 Instruction::Load, VecTy, LI0->getPointerOperand(),
14817 /*VariableMask=*/false, CommonAlignment, CostKind);
14818 break;
14819 }
14820 case TreeEntry::CompressVectorize: {
14821 bool IsMasked;
14822 unsigned InterleaveFactor;
14823 SmallVector<int> CompressMask;
14824 VectorType *LoadVecTy;
14825 SmallVector<Value *> Scalars(VL);
14826 if (!E->ReorderIndices.empty()) {
14827 SmallVector<int> Mask(E->ReorderIndices.begin(),
14828 E->ReorderIndices.end());
14829 reorderScalars(Scalars, Mask);
14830 }
14831 SmallVector<Value *> PointerOps(Scalars.size());
14832 for (auto [I, V] : enumerate(Scalars))
14833 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14834 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
14835 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
14836 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
14837 CompressMask, LoadVecTy);
14838 assert(IsVectorized && "Failed to vectorize load");
14839 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
14840 InterleaveFactor, IsMasked);
14841 Align CommonAlignment = LI0->getAlign();
14842 if (InterleaveFactor) {
14843 VecLdCost = TTI->getInterleavedMemoryOpCost(
14844 Instruction::Load, LoadVecTy, InterleaveFactor, {},
14845 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
14846 } else if (IsMasked) {
14847 VecLdCost = TTI->getMaskedMemoryOpCost(
14848 Instruction::Load, LoadVecTy, CommonAlignment,
14849 LI0->getPointerAddressSpace(), CostKind);
14850 // TODO: include this cost into CommonCost.
14851 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14852 LoadVecTy, CompressMask, CostKind);
14853 } else {
14854 VecLdCost = TTI->getMemoryOpCost(
14855 Instruction::Load, LoadVecTy, CommonAlignment,
14856 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14857 // TODO: include this cost into CommonCost.
14858 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14859 LoadVecTy, CompressMask, CostKind);
14860 }
14861 break;
14862 }
14863 case TreeEntry::ScatterVectorize: {
14864 Align CommonAlignment =
14865 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14866 VecLdCost = TTI->getGatherScatterOpCost(
14867 Instruction::Load, VecTy, LI0->getPointerOperand(),
14868 /*VariableMask=*/false, CommonAlignment, CostKind);
14869 break;
14870 }
14871 case TreeEntry::CombinedVectorize:
14872 case TreeEntry::SplitVectorize:
14873 case TreeEntry::NeedToGather:
14874 llvm_unreachable("Unexpected vectorization state.");
14875 }
14876 return VecLdCost + CommonCost;
14877 };
14878
14879 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
14880 // If this node generates a masked gather load then it is not a terminal
14881 // node, and the address operand cost is estimated separately.
14882 if (E->State == TreeEntry::ScatterVectorize)
14883 return Cost;
14884
14885 // Estimate cost of GEPs since this tree node is a terminator.
14886 SmallVector<Value *> PointerOps(VL.size());
14887 for (auto [I, V] : enumerate(VL))
14888 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14889 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
14890 }
14891 case Instruction::Store: {
14892 bool IsReorder = !E->ReorderIndices.empty();
14893 auto GetScalarCost = [=](unsigned Idx) {
14894 auto *VI = cast<StoreInst>(VL[Idx]);
14895 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
14896 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
14897 VI->getAlign(), VI->getPointerAddressSpace(),
14898 CostKind, OpInfo, VI);
14899 };
14900 auto *BaseSI =
14901 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
14902 auto GetVectorCost = [=](InstructionCost CommonCost) {
14903 // We know that we can merge the stores. Calculate the cost.
14904 InstructionCost VecStCost;
14905 if (E->State == TreeEntry::StridedVectorize) {
14906 Align CommonAlignment =
14907 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
14908 VecStCost = TTI->getStridedMemoryOpCost(
14909 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
14910 /*VariableMask=*/false, CommonAlignment, CostKind);
14911 } else {
14912 assert(E->State == TreeEntry::Vectorize &&
14913 "Expected either strided or consecutive stores.");
14914 if (unsigned Factor = E->getInterleaveFactor()) {
14915 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
14916 "No reused shuffles expected");
14917 CommonCost = 0;
14918 VecStCost = TTI->getInterleavedMemoryOpCost(
14919 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
14920 BaseSI->getPointerAddressSpace(), CostKind);
14921 } else {
14922 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
14923 VecStCost = TTI->getMemoryOpCost(
14924 Instruction::Store, VecTy, BaseSI->getAlign(),
14925 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
14926 }
14927 }
14928 return VecStCost + CommonCost;
14929 };
14930 SmallVector<Value *> PointerOps(VL.size());
14931 for (auto [I, V] : enumerate(VL)) {
14932 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
14933 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
14934 }
14935
14936 return GetCostDiff(GetScalarCost, GetVectorCost) +
14937 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
14938 }
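// Calls are vectorized either as a vector intrinsic or as a vector library
// call; the cheaper of the two alternatives returned by getVectorCallCosts is
// used as the vector cost.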
14939 case Instruction::Call: {
14940 auto GetScalarCost = [&](unsigned Idx) {
14941 auto *CI = cast<CallInst>(UniqueValues[Idx]);
14942 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
14943 if (ID != Intrinsic::not_intrinsic) {
14944 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
14945 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14946 }
14947 return TTI->getCallInstrCost(CI->getCalledFunction(),
14948 CI->getFunctionType()->getReturnType(),
14949 CI->getFunctionType()->params(), CostKind);
14950 };
14951 auto GetVectorCost = [=](InstructionCost CommonCost) {
14952 auto *CI = cast<CallInst>(VL0);
14953 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
14954 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
14955 CI, ID, VecTy->getNumElements(),
14956 It != MinBWs.end() ? It->second.first : 0, TTI);
14957 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
14958 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
14959 };
14960 return GetCostDiff(GetScalarCost, GetVectorCost);
14961 }
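// Alternate-opcode nodes (e.g. mixed fadd/fsub) are priced as both vector
// opcodes plus the blending shuffle produced by buildAltOpShuffleMask; if an
// earlier node already computed the same operands (a "diamond match"), the
// vector operations themselves are free and only the shuffle is counted.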
14962 case Instruction::ShuffleVector: {
14963 if (!SLPReVec || E->isAltShuffle())
14964 assert(E->isAltShuffle() &&
14965 ((Instruction::isBinaryOp(E->getOpcode()) &&
14966 Instruction::isBinaryOp(E->getAltOpcode())) ||
14967 (Instruction::isCast(E->getOpcode()) &&
14968 Instruction::isCast(E->getAltOpcode())) ||
14969 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
14970 "Invalid Shuffle Vector Operand");
14971 // Try to find the previous shuffle node with the same operands and same
14972 // main/alternate ops.
14973 auto TryFindNodeWithEqualOperands = [=]() {
14974 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14975 if (TE.get() == E)
14976 break;
14977 if (TE->hasState() && TE->isAltShuffle() &&
14978 ((TE->getOpcode() == E->getOpcode() &&
14979 TE->getAltOpcode() == E->getAltOpcode()) ||
14980 (TE->getOpcode() == E->getAltOpcode() &&
14981 TE->getAltOpcode() == E->getOpcode())) &&
14982 TE->hasEqualOperands(*E))
14983 return true;
14984 }
14985 return false;
14986 };
14987 auto GetScalarCost = [&](unsigned Idx) {
14988 if (isa<PoisonValue>(UniqueValues[Idx]))
14989 return InstructionCost(TTI::TCC_Free);
14990
14991 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14992 assert(E->getMatchingMainOpOrAltOp(VI) &&
14993 "Unexpected main/alternate opcode");
14994 (void)E;
14995 return TTI->getInstructionCost(VI, CostKind);
14996 };
14997 // Need to clear CommonCost since the final shuffle cost is included in the
14998 // vector cost.
14999 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15000 // VecCost is equal to sum of the cost of creating 2 vectors
15001 // and the cost of creating shuffle.
15002 InstructionCost VecCost = 0;
15003 if (TryFindNodeWithEqualOperands()) {
15004 LLVM_DEBUG({
15005 dbgs() << "SLP: diamond match for alternate node found.\n";
15006 E->dump();
15007 });
15008 // No need to add new vector costs here since we're going to reuse
15009 // same main/alternate vector ops, just do different shuffling.
15010 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15011 VecCost =
15012 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15013 VecCost +=
15014 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15015 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15016 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15017 VecCost = TTIRef.getCmpSelInstrCost(
15018 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15019 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15020 VL0);
15021 VecCost += TTIRef.getCmpSelInstrCost(
15022 E->getOpcode(), VecTy, MaskTy,
15023 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15024 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15025 E->getAltOp());
15026 } else {
15027 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15028 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15029 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15030 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15031 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15032 unsigned SrcBWSz =
15033 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15034 if (SrcIt != MinBWs.end()) {
15035 SrcBWSz = SrcIt->second.first;
15036 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15037 SrcTy = getWidenedType(SrcSclTy, VL.size());
15038 }
15039 if (BWSz <= SrcBWSz) {
15040 if (BWSz < SrcBWSz)
15041 VecCost =
15042 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15043 TTI::CastContextHint::None, CostKind);
15044 LLVM_DEBUG({
15045 dbgs()
15046 << "SLP: alternate extension, which should be truncated.\n";
15047 E->dump();
15048 });
15049 return VecCost;
15050 }
15051 }
15052 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15053 TTI::CastContextHint::None, CostKind);
15054 VecCost +=
15055 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15056 TTI::CastContextHint::None, CostKind);
15057 }
15058 SmallVector<int> Mask;
15059 E->buildAltOpShuffleMask(
15060 [&](Instruction *I) {
15061 assert(E->getMatchingMainOpOrAltOp(I) &&
15062 "Unexpected main/alternate opcode");
15063 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15064 *TLI);
15065 },
15066 Mask);
15067 VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
15068 FinalVecTy, Mask, CostKind);
15069 // Patterns like [fadd,fsub] can be combined into a single instruction
15070 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15071 // need to take into account their order when looking for the most used
15072 // order.
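// For example, on X86 a [fadd, fsub] pair over <2 x double> can map to a
// single addsub-style instruction, so the original lane order may be cheaper
// than the "most used" order; isLegalAltInstr below checks for such support.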
15073 unsigned Opcode0 = E->getOpcode();
15074 unsigned Opcode1 = E->getAltOpcode();
15075 SmallBitVector OpcodeMask(
15076 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15077 // If this pattern is supported by the target then we consider the
15078 // order.
15079 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15080 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15081 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15082 return AltVecCost < VecCost ? AltVecCost : VecCost;
15083 }
15084 // TODO: Check the reverse order too.
15085 return VecCost;
15086 };
15087 if (SLPReVec && !E->isAltShuffle())
15088 return GetCostDiff(
15089 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15090 // If a group uses mask in order, the shufflevector can be
15091 // eliminated by instcombine. Then the cost is 0.
15092 assert(isa<ShuffleVectorInst>(VL.front()) &&
15093 "Not supported shufflevector usage.");
15094 auto *SV = cast<ShuffleVectorInst>(VL.front());
15095 unsigned SVNumElements =
15096 cast<FixedVectorType>(SV->getOperand(0)->getType())
15097 ->getNumElements();
15098 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15099 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15100 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15101 int NextIndex = 0;
15102 if (!all_of(Group, [&](Value *V) {
15103 assert(isa<ShuffleVectorInst>(V) &&
15104 "Not supported shufflevector usage.");
15105 auto *SV = cast<ShuffleVectorInst>(V);
15106 int Index;
15107 [[maybe_unused]] bool IsExtractSubvectorMask =
15108 SV->isExtractSubvectorMask(Index);
15109 assert(IsExtractSubvectorMask &&
15110 "Not supported shufflevector usage.");
15111 if (NextIndex != Index)
15112 return false;
15113 NextIndex += SV->getShuffleMask().size();
15114 return true;
15115 }))
15116 return ::getShuffleCost(
15117 *TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
15118 calculateShufflevectorMask(E->Scalars));
15119 }
15120 return TTI::TCC_Free;
15121 });
15122 return GetCostDiff(GetScalarCost, GetVectorCost);
15123 }
15124 case Instruction::Freeze:
15125 return CommonCost;
15126 default:
15127 llvm_unreachable("Unknown instruction");
15128 }
15129}
15130
15131bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15132 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15133 << VectorizableTree.size() << " is fully vectorizable.\n");
15134
15135 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15136 SmallVector<int> Mask;
15137 return TE->isGather() &&
15138 !any_of(TE->Scalars,
15139 [this](Value *V) { return EphValues.contains(V); }) &&
15140 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15141 TE->Scalars.size() < Limit ||
15142 (((TE->hasState() &&
15143 TE->getOpcode() == Instruction::ExtractElement) ||
15144 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15145 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15146 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15147 !TE->isAltShuffle()) ||
15148 any_of(TE->Scalars, IsaPred<LoadInst>));
15149 };
15150
15151 // We only handle trees of heights 1 and 2.
15152 if (VectorizableTree.size() == 1 &&
15153 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15154 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15155 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15156 (ForReduction &&
15157 AreVectorizableGathers(VectorizableTree[0].get(),
15158 VectorizableTree[0]->Scalars.size()) &&
15159 VectorizableTree[0]->getVectorFactor() > 2)))
15160 return true;
15161
15162 if (VectorizableTree.size() != 2)
15163 return false;
15164
15165 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15166 // whose second node is a gather with fewer scalar operands than the initial
15167 // tree element (it may be profitable to shuffle the second gather), or whose
15168 // scalars are extractelements that form a shuffle.
15169 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15170 AreVectorizableGathers(VectorizableTree[1].get(),
15171 VectorizableTree[0]->Scalars.size()))
15172 return true;
15173
15174 // Gathering cost would be too much for tiny trees.
15175 if (VectorizableTree[0]->isGather() ||
15176 (VectorizableTree[1]->isGather() &&
15177 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15178 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15179 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15180 return false;
15181
15182 return true;
15183}
15184
15185 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15186 TargetTransformInfo *TTI,
15187 bool MustMatchOrInst) {
15188 // Look past the root to find a source value. Arbitrarily follow the
15189 // path through operand 0 of any 'or'. Also, peek through optional
15190 // shift-left-by-multiple-of-8-bits.
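// A typical candidate for a 4-byte little-endian combine looks like:
//   or (or (shl (zext(load i8), 24), shl (zext(load i8), 16)),
//       or (shl (zext(load i8), 8), zext(load i8)))
// where the loads read adjacent bytes.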
15191 Value *ZextLoad = Root;
15192 const APInt *ShAmtC;
15193 bool FoundOr = false;
15194 while (!isa<ConstantExpr>(ZextLoad) &&
15195 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15196 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15197 ShAmtC->urem(8) == 0))) {
15198 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15199 ZextLoad = BinOp->getOperand(0);
15200 if (BinOp->getOpcode() == Instruction::Or)
15201 FoundOr = true;
15202 }
15203 // Check if the input is an extended load of the required or/shift expression.
15204 Value *Load;
15205 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15206 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15207 return false;
15208
15209 // Require that the total load bit width is a legal integer type.
15210 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15211 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15212 Type *SrcTy = Load->getType();
15213 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15214 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15215 return false;
15216
15217 // Everything matched - assume that we can fold the whole sequence using
15218 // load combining.
15219 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15220 << *(cast<Instruction>(Root)) << "\n");
15221
15222 return true;
15223}
15224
15225 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15226 if (RdxKind != RecurKind::Or)
15227 return false;
15228
15229 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15230 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15231 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15232 /* MatchOr */ false);
15233}
15234
15235 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15236 // Peek through a final sequence of stores and check if all operations are
15237 // likely to be load-combined.
15238 unsigned NumElts = Stores.size();
15239 for (Value *Scalar : Stores) {
15240 Value *X;
15241 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15242 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15243 return false;
15244 }
15245 return true;
15246}
15247
15248bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15249 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15250 return true;
15251
15252 // Graph is empty - do nothing.
15253 if (VectorizableTree.empty()) {
15254 assert(ExternalUses.empty() && "We shouldn't have any external users");
15255
15256 return true;
15257 }
15258
15259 // No need to vectorize inserts of gathered values.
15260 if (VectorizableTree.size() == 2 &&
15261 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15262 VectorizableTree[1]->isGather() &&
15263 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15264 !(isSplat(VectorizableTree[1]->Scalars) ||
15265 allConstant(VectorizableTree[1]->Scalars))))
15266 return true;
15267
15268 // If the graph includes only PHI nodes and gathers, it is definitely not
15269 // profitable for the vectorization, we can skip it, if the cost threshold is
15270 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15271 // gathers/buildvectors.
15272 constexpr int Limit = 4;
15273 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15274 !VectorizableTree.empty() &&
15275 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15276 return (TE->isGather() &&
15277 (!TE->hasState() ||
15278 TE->getOpcode() != Instruction::ExtractElement) &&
15279 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15280 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15281 }))
15282 return true;
15283
15284 // Do not vectorize a small tree of phis only, if all vector phis are also
15285 // gathered.
15286 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15287 VectorizableTree.size() <= Limit &&
15288 all_of(VectorizableTree,
15289 [&](const std::unique_ptr<TreeEntry> &TE) {
15290 return (TE->isGather() &&
15291 (!TE->hasState() ||
15292 TE->getOpcode() != Instruction::ExtractElement) &&
15293 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15294 Limit) ||
15295 (TE->hasState() &&
15296 (TE->getOpcode() == Instruction::InsertElement ||
15297 (TE->getOpcode() == Instruction::PHI &&
15298 all_of(TE->Scalars, [&](Value *V) {
15299 return isa<PoisonValue>(V) || MustGather.contains(V);
15300 }))));
15301 }) &&
15302 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15303 return TE->State == TreeEntry::Vectorize &&
15304 TE->getOpcode() == Instruction::PHI;
15305 }))
15306 return true;
15307
15308 // If the tree contains only phis, buildvectors, split nodes and
15309 // small nodes with reuses, we can skip it.
15310 SmallVector<const TreeEntry *> StoreLoadNodes;
15311 unsigned NumGathers = 0;
15312 constexpr int LimitTreeSize = 36;
15313 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15314 all_of(VectorizableTree,
15315 [&](const std::unique_ptr<TreeEntry> &TE) {
15316 if (!TE->isGather() && TE->hasState() &&
15317 (TE->getOpcode() == Instruction::Load ||
15318 TE->getOpcode() == Instruction::Store)) {
15319 StoreLoadNodes.push_back(TE.get());
15320 return true;
15321 }
15322 if (TE->isGather())
15323 ++NumGathers;
15324 return TE->State == TreeEntry::SplitVectorize ||
15325 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15326 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15327 VectorizableTree.size() > LimitTreeSize) ||
15328 (TE->isGather() &&
15329 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15330 (TE->hasState() &&
15331 (TE->getOpcode() == Instruction::PHI ||
15332 (TE->hasCopyableElements() &&
15333 static_cast<unsigned>(count_if(
15334 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15335 TE->Scalars.size() / 2) ||
15336 ((!TE->ReuseShuffleIndices.empty() ||
15337 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15338 TE->Scalars.size() == 2)));
15339 }) &&
15340 (StoreLoadNodes.empty() ||
15341 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15342 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15343 return TE->getOpcode() == Instruction::Store ||
15344 all_of(TE->Scalars, [&](Value *V) {
15345 return !isa<LoadInst>(V) ||
15346 areAllUsersVectorized(cast<Instruction>(V));
15347 });
15348 })))))
15349 return true;
15350
15351 // If the tree contains only a buildvector, 2 non-buildvectors (with the root
15352 // tree node as their user) and other buildvectors, we can skip it.
15353 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15354 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15355 VectorizableTree.size() >= Limit &&
15356 count_if(ArrayRef(VectorizableTree).drop_front(),
15357 [&](const std::unique_ptr<TreeEntry> &TE) {
15358 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15359 TE->UserTreeIndex.UserTE->Idx == 0;
15360 }) == 2)
15361 return true;
15362
15363 // If the tree contains only vectorization of the phi node from the
15364 // buildvector - skip it.
15365 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15366 VectorizableTree.size() > 2 &&
15367 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15368 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15369 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15370 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15371 all_of(
15372 ArrayRef(VectorizableTree).drop_front(2),
15373 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15374 return true;
15375
15376 // We can vectorize the tree if its size is greater than or equal to the
15377 // minimum size specified by the MinTreeSize command line option.
15378 if (VectorizableTree.size() >= MinTreeSize)
15379 return false;
15380
15381 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15382 // can vectorize it if we can prove it fully vectorizable.
15383 if (isFullyVectorizableTinyTree(ForReduction))
15384 return false;
15385
15386 // Check if any of the gather node forms an insertelement buildvector
15387 // somewhere.
15388 bool IsAllowedSingleBVNode =
15389 VectorizableTree.size() > 1 ||
15390 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15391 !VectorizableTree.front()->isAltShuffle() &&
15392 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15393 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15394 allSameBlock(VectorizableTree.front()->Scalars));
15395 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15396 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15397 return isa<ExtractElementInst, Constant>(V) ||
15398 (IsAllowedSingleBVNode &&
15399 !V->hasNUsesOrMore(UsesLimit) &&
15400 any_of(V->users(), IsaPred<InsertElementInst>));
15401 });
15402 }))
15403 return false;
15404
15405 if (VectorizableTree.back()->isGather() &&
15406 VectorizableTree.back()->hasState() &&
15407 VectorizableTree.back()->isAltShuffle() &&
15408 VectorizableTree.back()->getVectorFactor() > 2 &&
15409 allSameBlock(VectorizableTree.back()->Scalars) &&
15410 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15411 TTI->getScalarizationOverhead(
15412 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15413 VectorizableTree.back()->getVectorFactor()),
15414 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15415 /*Insert=*/true, /*Extract=*/false,
15416 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
15417 return false;
15418
15419 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15420 // vectorizable.
15421 return true;
15422}
15423
15424 bool BoUpSLP::isTreeNotExtendable() const {
15425 if (getCanonicalGraphSize() != getTreeSize()) {
15426 constexpr unsigned SmallTree = 3;
15427 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15428 getCanonicalGraphSize() <= SmallTree &&
15429 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15430 [](const std::unique_ptr<TreeEntry> &TE) {
15431 return TE->isGather() && TE->hasState() &&
15432 TE->getOpcode() == Instruction::Load &&
15433 !allSameBlock(TE->Scalars);
15434 }) == 1)
15435 return true;
15436 return false;
15437 }
15438 bool Res = false;
15439 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15440 TreeEntry &E = *VectorizableTree[Idx];
15441 if (E.State == TreeEntry::SplitVectorize)
15442 return false;
15443 if (!E.isGather())
15444 continue;
15445 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15446 (!E.hasState() &&
15447 all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
15448 (isa<ExtractElementInst>(E.Scalars.front()) &&
15449 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15450 return false;
15451 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15452 continue;
15453 Res = true;
15454 }
15455 return Res;
15456}
15457
15458 InstructionCost BoUpSLP::getSpillCost() {
15459 // Walk from the bottom of the tree to the top, tracking which values are
15460 // live. When we see a call instruction that is not part of our tree,
15461 // query TTI to see if there is a cost to keeping values live over it
15462 // (for example, if spills and fills are required).
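// The walk is performed per tree entry rather than per scalar, and it is
// bounded by a budget derived from ScheduleRegionSizeBudget to keep compile
// time under control.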
15463
15464 const TreeEntry *Root = VectorizableTree.front().get();
15465 if (Root->isGather())
15466 return 0;
15467
15468 InstructionCost Cost = 0;
15469 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15470 EntriesToOperands;
15471 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15472 SmallPtrSet<const Instruction *, 8> LastInstructions;
15473 for (const auto &TEPtr : VectorizableTree) {
15474 if (!TEPtr->isGather()) {
15475 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15476 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15477 LastInstructions.insert(LastInst);
15478 }
15479 if (TEPtr->UserTreeIndex)
15480 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15481 }
15482
15483 auto NoCallIntrinsic = [this](const Instruction *I) {
15484 const auto *II = dyn_cast<IntrinsicInst>(I);
15485 if (!II)
15486 return false;
15487 if (II->isAssumeLikeIntrinsic())
15488 return true;
15489 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15490 InstructionCost IntrCost =
15491 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15492 InstructionCost CallCost = TTI->getCallInstrCost(
15493 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15494 return IntrCost < CallCost;
15495 };
15496
15497 // Maps the last instruction of an entry to the last instruction of one of
15498 // its operand entries and a flag. If the flag is true, there are no calls in
15499 // between these instructions.
15500 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15501 CheckedInstructions;
15502 unsigned Budget = 0;
15503 const unsigned BudgetLimit =
15504 ScheduleRegionSizeBudget / VectorizableTree.size();
15505 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15506 const Instruction *Last) {
15507 assert(First->getParent() == Last->getParent() &&
15508 "Expected instructions in same block.");
15509 if (auto It = CheckedInstructions.find(Last);
15510 It != CheckedInstructions.end()) {
15511 const Instruction *Checked = It->second.getPointer();
15512 if (Checked == First || Checked->comesBefore(First))
15513 return It->second.getInt() != 0;
15514 Last = Checked;
15515 } else if (Last == First || Last->comesBefore(First)) {
15516 return true;
15517 }
15518 BasicBlock::const_reverse_iterator InstIt =
15519 ++First->getIterator().getReverse(),
15520 PrevInstIt =
15521 Last->getIterator().getReverse();
15522 SmallVector<const Instruction *> LastInstsInRange;
15523 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15524 // Debug information does not impact spill cost.
15525 // Vectorized calls, represented as vector intrinsics, do not impact spill
15526 // cost.
15527 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15528 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15529 for (const Instruction *LastInst : LastInstsInRange)
15530 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15531 return false;
15532 }
15533 if (LastInstructions.contains(&*PrevInstIt))
15534 LastInstsInRange.push_back(&*PrevInstIt);
15535
15536 ++PrevInstIt;
15537 ++Budget;
15538 }
15539 for (const Instruction *LastInst : LastInstsInRange)
15540 CheckedInstructions.try_emplace(
15541 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15542 Budget <= BudgetLimit ? 1 : 0);
15543 return Budget <= BudgetLimit;
15544 };
15545 auto AddCosts = [&](const TreeEntry *Op) {
15546 Type *ScalarTy = Op->Scalars.front()->getType();
15547 auto It = MinBWs.find(Op);
15548 if (It != MinBWs.end())
15549 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15550 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15551 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15552 if (ScalarTy->isVectorTy()) {
15553 // Handle revec dead vector instructions.
15554 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15555 }
15556 };
15557 // Memoize the relationship between blocks, i.e. whether there is (at least
15558 // one) non-vectorized call between two blocks. This allows skipping the
15559 // analysis of the same block paths multiple times.
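// The key is the (block, operand block) pair; the cached flag is true only if
// no non-vectorized call was found on the paths between them within the budget.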
15560 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15561 ParentOpParentToPreds;
15562 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15563 BasicBlock *OpParent) {
15564 auto Key = std::make_pair(Root, OpParent);
15565 if (auto It = ParentOpParentToPreds.find(Key);
15566 It != ParentOpParentToPreds.end())
15567 return It->second;
15568 SmallVector<BasicBlock *> Worklist;
15569 if (Pred)
15570 Worklist.push_back(Pred);
15571 else
15572 Worklist.append(pred_begin(Root), pred_end(Root));
15573 SmallPtrSet<const BasicBlock *, 16> Visited;
15574 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 4>
15575 ParentsPairsToAdd;
15576 bool Res = false;
15577 auto Cleanup = make_scope_exit([&]() {
15578 for (const auto &KeyPair : ParentsPairsToAdd) {
15579 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15580 "Should not have been added before.");
15581 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15582 }
15583 });
15584 while (!Worklist.empty()) {
15585 BasicBlock *BB = Worklist.pop_back_val();
15586 if (BB == OpParent || !Visited.insert(BB).second)
15587 continue;
15588 auto Pair = std::make_pair(BB, OpParent);
15589 if (auto It = ParentOpParentToPreds.find(Pair);
15590 It != ParentOpParentToPreds.end()) {
15591 Res = It->second;
15592 return Res;
15593 }
15594 ParentsPairsToAdd.insert(Pair);
15595 unsigned BlockSize = BB->size();
15596 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15597 return Res;
15598 Budget += BlockSize;
15599 if (Budget > BudgetLimit)
15600 return Res;
15601 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15602 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15603 BB->getTerminator()))
15604 return Res;
15605 Worklist.append(pred_begin(BB), pred_end(BB));
15606 }
15607 Res = true;
15608 return Res;
15609 };
15610 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15611 while (!LiveEntries.empty()) {
15612 const TreeEntry *Entry = LiveEntries.pop_back_val();
15613 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15614 if (Operands.empty())
15615 continue;
15616 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15617 BasicBlock *Parent = LastInst->getParent();
15618 for (const TreeEntry *Op : Operands) {
15619 if (!Op->isGather())
15620 LiveEntries.push_back(Op);
15621 if (Entry->State == TreeEntry::SplitVectorize ||
15622 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15623 (Op->isGather() && allConstant(Op->Scalars)))
15624 continue;
15625 Budget = 0;
15626 BasicBlock *Pred = nullptr;
15627 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15628 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15629 BasicBlock *OpParent;
15630 Instruction *OpLastInst;
15631 if (Op->isGather()) {
15632 assert(Entry->getOpcode() == Instruction::PHI &&
15633 "Expected phi node only.");
15634 OpParent = cast<PHINode>(Entry->getMainOp())
15635 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15636 OpLastInst = OpParent->getTerminator();
15637 for (Value *V : Op->Scalars) {
15638 auto *Inst = dyn_cast<Instruction>(V);
15639 if (!Inst)
15640 continue;
15641 if (isVectorized(V)) {
15642 OpParent = Inst->getParent();
15643 OpLastInst = Inst;
15644 break;
15645 }
15646 }
15647 } else {
15648 OpLastInst = EntriesToLastInstruction.at(Op);
15649 OpParent = OpLastInst->getParent();
15650 }
15651 // Check the call instructions within the same basic blocks.
15652 if (OpParent == Parent) {
15653 if (Entry->getOpcode() == Instruction::PHI) {
15654 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15655 AddCosts(Op);
15656 continue;
15657 }
15658 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15659 AddCosts(Op);
15660 continue;
15661 }
15662 // Check for call instruction in between blocks.
15663 // 1. Check entry's block to the head.
15664 if (Entry->getOpcode() != Instruction::PHI &&
15665 !CheckForNonVecCallsInSameBlock(
15666 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15667 LastInst)) {
15668 AddCosts(Op);
15669 continue;
15670 }
15671 // 2. Check op's block from the end.
15672 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15673 OpParent->getTerminator())) {
15674 AddCosts(Op);
15675 continue;
15676 }
15677 // 3. Check the predecessors of entry's block till op's block.
15678 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15679 AddCosts(Op);
15680 continue;
15681 }
15682 }
15683 }
15684
15685 return Cost;
15686}
15687
15688 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in the
15689/// buildvector sequence.
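/// Both chains of inserts are walked upward through operand 0; \p IE1 is
/// considered the first one if it is reached while walking up from \p IE2.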
15690 static bool isFirstInsertElement(const InsertElementInst *IE1,
15691 const InsertElementInst *IE2) {
15692 if (IE1 == IE2)
15693 return false;
15694 const auto *I1 = IE1;
15695 const auto *I2 = IE2;
15696 const InsertElementInst *PrevI1;
15697 const InsertElementInst *PrevI2;
15698 unsigned Idx1 = *getElementIndex(IE1);
15699 unsigned Idx2 = *getElementIndex(IE2);
15700 do {
15701 if (I2 == IE1)
15702 return true;
15703 if (I1 == IE2)
15704 return false;
15705 PrevI1 = I1;
15706 PrevI2 = I2;
15707 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15708 getElementIndex(I1).value_or(Idx2) != Idx2)
15709 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15710 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15711 getElementIndex(I2).value_or(Idx1) != Idx1)
15712 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15713 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15714 llvm_unreachable("Two different buildvectors not expected.");
15715}
15716
15717namespace {
15718 /// Returns the incoming Value * if the requested type is Value * too, or a
15719 /// default-constructed value otherwise.
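/// For example, ValueSelect::get<Value *>(V) yields V itself, while
/// ValueSelect::get<InstructionCost>(V) yields a default InstructionCost().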
15720struct ValueSelect {
15721 template <typename U>
15722 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15723 return V;
15724 }
15725 template <typename U>
15726 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15727 return U();
15728 }
15729};
15730} // namespace
15731
15732/// Does the analysis of the provided shuffle masks and performs the requested
15733/// actions on the vectors with the given shuffle masks. It tries to do it in
15734/// several steps.
15735 /// 1. If the Base vector is not an undef vector, resize the very first mask to
15736 /// have a common VF and perform the action for 2 input vectors (including the
15737 /// non-undef Base). Other shuffle masks are combined with the result of the
15738 /// first stage and processed as a shuffle of 2 elements.
15739 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15740 /// the action only for 1 vector with the given mask, if it is not the identity
15741 /// mask.
15742 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15743 /// vectors at a time, combining the masks properly between the steps.
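/// For example, with VF = 4 and per-vector masks {0,_,_,_}, {_,1,_,_} and
/// {_,_,2,_}, the first two vectors are combined with mask {0,5,_,_}, and the
/// result is then combined with the third vector using mask {0,1,6,_}.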
15744 template <typename T>
15745 static T *performExtractsShuffleAction(
15746 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15747 function_ref<unsigned(T *)> GetVF,
15748 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15749 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15750 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15751 SmallVector<int> Mask(ShuffleMask.begin()->second);
15752 auto VMIt = std::next(ShuffleMask.begin());
15753 T *Prev = nullptr;
15754 SmallBitVector UseMask =
15755 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15756 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15757 if (!IsBaseUndef.all()) {
15758 // Base is not undef, need to combine it with the next subvectors.
15759 std::pair<T *, bool> Res =
15760 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15761 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15762 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15763 if (Mask[Idx] == PoisonMaskElem)
15764 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15765 else
15766 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15767 }
15768 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15769 assert((!V || GetVF(V) == Mask.size()) &&
15770 "Expected base vector of VF number of elements.");
15771 Prev = Action(Mask, {nullptr, Res.first});
15772 } else if (ShuffleMask.size() == 1) {
15773 // Base is undef and only 1 vector is shuffled - perform the action only for
15774 // a single vector, if the mask is not the identity mask.
15775 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15776 /*ForSingleMask=*/true);
15777 if (Res.second)
15778 // Identity mask is found.
15779 Prev = Res.first;
15780 else
15781 Prev = Action(Mask, {ShuffleMask.begin()->first});
15782 } else {
15783 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
15784 // shuffles step by step, combining the shuffles between the steps.
15785 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15786 unsigned Vec2VF = GetVF(VMIt->first);
15787 if (Vec1VF == Vec2VF) {
15788 // No need to resize the input vectors since they are of the same size; we
15789 // can shuffle them directly.
15790 ArrayRef<int> SecMask = VMIt->second;
15791 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15792 if (SecMask[I] != PoisonMaskElem) {
15793 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15794 Mask[I] = SecMask[I] + Vec1VF;
15795 }
15796 }
15797 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15798 } else {
15799 // Vectors of different sizes - resize and reshuffle.
15800 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15801 /*ForSingleMask=*/false);
15802 std::pair<T *, bool> Res2 =
15803 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15804 ArrayRef<int> SecMask = VMIt->second;
15805 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15806 if (Mask[I] != PoisonMaskElem) {
15807 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15808 if (Res1.second)
15809 Mask[I] = I;
15810 } else if (SecMask[I] != PoisonMaskElem) {
15811 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15812 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15813 }
15814 }
15815 Prev = Action(Mask, {Res1.first, Res2.first});
15816 }
15817 VMIt = std::next(VMIt);
15818 }
15819 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15820 // Perform requested actions for the remaining masks/vectors.
15821 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
15822 // Shuffle other input vectors, if any.
15823 std::pair<T *, bool> Res =
15824 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15825 ArrayRef<int> SecMask = VMIt->second;
15826 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15827 if (SecMask[I] != PoisonMaskElem) {
15828 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
15829 "Multiple uses of scalars.");
15830 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
15831 } else if (Mask[I] != PoisonMaskElem) {
15832 Mask[I] = I;
15833 }
15834 }
15835 Prev = Action(Mask, {Prev, Res.first});
15836 }
15837 return Prev;
15838}
15839
15840namespace {
15841/// Data type for handling buildvector sequences with the reused scalars from
15842/// other tree entries.
15843template <typename T> struct ShuffledInsertData {
15844 /// List of insertelements to be replaced by shuffles.
15845 SmallVector<InsertElementInst *> InsertElements;
15846 /// The parent vectors and shuffle mask for the given list of inserts.
15847 MapVector<T, SmallVector<int>> ValueMasks;
15848 };
15849} // namespace
15850
15851 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
15852 InstructionCost ReductionCost) {
15853 InstructionCost Cost = ReductionCost;
15854 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
15855 << VectorizableTree.size() << ".\n");
15856
15857 SmallPtrSet<Value *, 4> CheckedExtracts;
15858 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
15859 TreeEntry &TE = *VectorizableTree[I];
15860 // No need to count the cost for combined entries: they are combined with
15861 // other nodes, so just skip their cost.
15862 if (TE.State == TreeEntry::CombinedVectorize) {
15863 LLVM_DEBUG(
15864 dbgs() << "SLP: Skipping cost for combined node that starts with "
15865 << *TE.Scalars[0] << ".\n";
15866 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
15867 continue;
15868 }
15869 if (TE.hasState() &&
15870 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
15871 if (const TreeEntry *E =
15872 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
15873 E && E->getVectorFactor() == TE.getVectorFactor()) {
15874 // Some gather nodes might be absolutely the same as some vectorizable
15875 // nodes after reordering, need to handle it.
15876 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
15877 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15878 << "SLP: Current total cost = " << Cost << "\n");
15879 continue;
15880 }
15881 }
15882
15883 // Exclude cost of gather loads nodes which are not used. These nodes were
15884 // built as part of the final attempt to vectorize gathered loads.
15885 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
15886 "Expected gather nodes with users only.");
15887
15888 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
15889 Cost += C;
15890 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
15891 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15892 << "SLP: Current total cost = " << Cost << "\n");
15893 }
15894
15895 if (Cost >= -SLPCostThreshold &&
15896 none_of(ExternalUses, [](const ExternalUser &EU) {
15897 return isa_and_nonnull<InsertElementInst>(EU.User);
15898 }))
15899 return Cost;
15900
15901 SmallPtrSet<Value *, 16> ExtractCostCalculated;
15902 InstructionCost ExtractCost = 0;
15903 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
15904 SmallVector<APInt> DemandedElts;
15905 SmallDenseSet<Value *, 4> UsedInserts;
15906 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
15907 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
15908 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
15909 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
15910 // Keep track of the {Scalar, Index, User} tuples.
15911 // On AArch64, this helps in fusing a mov instruction, associated with
15912 // extractelement, with fmul in the backend so that extractelement is free.
15913 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
15914 for (ExternalUser &EU : ExternalUses) {
15915 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
15916 }
15917 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
15918 for (ExternalUser &EU : ExternalUses) {
15919 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
15920 << EU.E.Idx << " in lane " << EU.Lane << "\n");
15921 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
15922 else dbgs() << " User: nullptr\n");
15923 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
15924
15925 // Uses by ephemeral values are free (because the ephemeral value will be
15926 // removed prior to code generation, and so the extraction will be
15927 // removed as well).
15928 if (EphValues.count(EU.User))
15929 continue;
15930
15931 // Check if the scalar for the given user or all users is accounted already.
15932 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
15933 (EU.User &&
15934 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
15935 continue;
15936
15937 // Used in unreachable blocks or in EH pads (rarely executed) or is
15938 // terminated with unreachable instruction.
15939 if (BasicBlock *UserParent =
15940 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
15941 UserParent &&
15942 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
15943 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
15944 continue;
15945
15946 // We only add extract cost once for the same scalar.
15947 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
15948 !ExtractCostCalculated.insert(EU.Scalar).second)
15949 continue;
15950
15951 // No extract cost for vector "scalar" if REVEC is disabled
15952 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
15953 continue;
15954
15955 // If found user is an insertelement, do not calculate extract cost but try
15956 // to detect it as a final shuffled/identity match.
15957 // TODO: what if a user is insertvalue when REVEC is enabled?
15958 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
15959 VU && VU->getOperand(1) == EU.Scalar) {
15960 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
15961 if (!UsedInserts.insert(VU).second)
15962 continue;
15963 std::optional<unsigned> InsertIdx = getElementIndex(VU);
15964 if (InsertIdx) {
15965 const TreeEntry *ScalarTE = &EU.E;
15966 auto *It = find_if(
15967 ShuffledInserts,
15968 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
15969 // Checks if 2 insertelements are from the same buildvector.
15970 InsertElementInst *VecInsert = Data.InsertElements.front();
15971 return areTwoInsertFromSameBuildVector(
15972 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
15973 Value *Op0 = II->getOperand(0);
15974 if (isVectorized(II) && !isVectorized(Op0))
15975 return nullptr;
15976 return Op0;
15977 });
15978 });
15979 int VecId = -1;
15980 if (It == ShuffledInserts.end()) {
15981 auto &Data = ShuffledInserts.emplace_back();
15982 Data.InsertElements.emplace_back(VU);
15983 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
15984 VecId = ShuffledInserts.size() - 1;
15985 auto It = MinBWs.find(ScalarTE);
15986 if (It != MinBWs.end() &&
15987 VectorCasts
15988 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
15989 .second) {
15990 unsigned BWSz = It->second.first;
15991 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
15992 unsigned VecOpcode;
15993 if (DstBWSz < BWSz)
15994 VecOpcode = Instruction::Trunc;
15995 else
15996 VecOpcode =
15997 It->second.second ? Instruction::SExt : Instruction::ZExt;
15998 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15999 InstructionCost C = TTI->getCastInstrCost(
16000 VecOpcode, FTy,
16001 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16002 FTy->getNumElements()),
16003 TTI::CastContextHint::None, CostKind);
16004 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16005 << " for extending externally used vector with "
16006 "non-equal minimum bitwidth.\n");
16007 Cost += C;
16008 }
16009 } else {
16010 if (isFirstInsertElement(VU, It->InsertElements.front()))
16011 It->InsertElements.front() = VU;
16012 VecId = std::distance(ShuffledInserts.begin(), It);
16013 }
16014 int InIdx = *InsertIdx;
16015 SmallVectorImpl<int> &Mask =
16016 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16017 if (Mask.empty())
16018 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16019 Mask[InIdx] = EU.Lane;
16020 DemandedElts[VecId].setBit(InIdx);
16021 continue;
16022 }
16023 }
16024 }
16025
16026 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16027 // If we plan to rewrite the tree in a smaller type, we will need to sign
16028 // extend the extracted value back to the original type. Here, we account
16029 // for the extract and the added cost of the sign extend if needed.
16030 InstructionCost ExtraCost = TTI::TCC_Free;
16031 auto *ScalarTy = EU.Scalar->getType();
16032 const unsigned BundleWidth = EU.E.getVectorFactor();
16033 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16034 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16035 const TreeEntry *Entry = &EU.E;
16036 auto It = MinBWs.find(Entry);
16037 if (It != MinBWs.end()) {
16038 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16039 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16040 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16041 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16042 ? Instruction::ZExt
16043 : Instruction::SExt;
16044 VecTy = getWidenedType(MinTy, BundleWidth);
16045 ExtraCost =
16046 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16047 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16048 << ExtraCost << "\n");
16049 } else {
16050 ExtraCost =
16051 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16052 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16053 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16054 << *VecTy << ": " << ExtraCost << "\n");
16055 }
16056 // Leave the scalar instructions as is if they are cheaper than extracts.
16057 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16058 Entry->getOpcode() == Instruction::Load) {
16059 // Checks if the user of the external scalar is phi in loop body.
16060 auto IsPhiInLoop = [&](const ExternalUser &U) {
16061 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16062 auto *I = cast<Instruction>(U.Scalar);
16063 const Loop *L = LI->getLoopFor(Phi->getParent());
16064 return L && (Phi->getParent() == I->getParent() ||
16065 L == LI->getLoopFor(I->getParent()));
16066 }
16067 return false;
16068 };
16069 if (!ValueToExtUses) {
16070 ValueToExtUses.emplace();
16071 for (const auto &P : enumerate(ExternalUses)) {
16072 // Ignore phis in loops.
16073 if (IsPhiInLoop(P.value()))
16074 continue;
16075
16076 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16077 }
16078 }
16079 // Can use original instruction, if no operands vectorized or they are
16080 // marked as externally used already.
16081 auto *Inst = cast<Instruction>(EU.Scalar);
16082 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16083 auto OperandIsScalar = [&](Value *V) {
16084 if (!isVectorized(V)) {
16085 // Some extractelements might be not vectorized, but
16086 // transformed into shuffle and removed from the function,
16087 // consider it here.
16088 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16089 return !EE->hasOneUse() || !MustGather.contains(EE);
16090 return true;
16091 }
16092 return ValueToExtUses->contains(V);
16093 };
16094 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16095 bool CanBeUsedAsScalarCast = false;
16096 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16097 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16098 Op && all_of(Op->operands(), OperandIsScalar)) {
16099 InstructionCost OpCost =
16100 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16101 ? TTI->getInstructionCost(Op, CostKind)
16102 : 0;
16103 if (ScalarCost + OpCost <= ExtraCost) {
16104 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16105 ScalarCost += OpCost;
16106 }
16107 }
16108 }
16109 if (CanBeUsedAsScalar) {
16110 bool KeepScalar = ScalarCost <= ExtraCost;
16111 // Try to keep original scalar if the user is the phi node from the same
16112 // block as the root phis, currently vectorized. It allows to keep
16113 // better ordering info of PHIs, being vectorized currently.
16114 bool IsProfitablePHIUser =
16115 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16116 VectorizableTree.front()->Scalars.size() > 2)) &&
16117 VectorizableTree.front()->hasState() &&
16118 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16119 !Inst->hasNUsesOrMore(UsesLimit) &&
16120 none_of(Inst->users(),
16121 [&](User *U) {
16122 auto *PHIUser = dyn_cast<PHINode>(U);
16123 return (!PHIUser ||
16124 PHIUser->getParent() !=
16125 cast<Instruction>(
16126 VectorizableTree.front()->getMainOp())
16127 ->getParent()) &&
16128 !isVectorized(U);
16129 }) &&
16130 count_if(Entry->Scalars, [&](Value *V) {
16131 return ValueToExtUses->contains(V);
16132 }) <= 2;
16133 if (IsProfitablePHIUser) {
16134 KeepScalar = true;
16135 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16136 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16137 (!GatheredLoadsEntriesFirst.has_value() ||
16138 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16139 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16140 return ValueToExtUses->contains(V);
16141 });
16142 auto It = ExtractsCount.find(Entry);
16143 if (It != ExtractsCount.end()) {
16144 assert(ScalarUsesCount >= It->getSecond().size() &&
16145 "Expected total number of external uses not less than "
16146 "number of scalar uses.");
16147 ScalarUsesCount -= It->getSecond().size();
16148 }
16149 // Keep the original scalar if the number of externally used instructions in
16150 // the same entry is not a power of 2. It may help to do some extra
16151 // vectorization for now.
16152 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16153 }
16154 if (KeepScalar) {
16155 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16156 for (Value *V : Inst->operands()) {
16157 auto It = ValueToExtUses->find(V);
16158 if (It != ValueToExtUses->end()) {
16159 // Replace all uses to avoid compiler crash.
16160 ExternalUses[It->second].User = nullptr;
16161 }
16162 }
16163 ExtraCost = ScalarCost;
16164 if (!IsPhiInLoop(EU))
16165 ExtractsCount[Entry].insert(Inst);
16166 if (CanBeUsedAsScalarCast) {
16167 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16168 // Update the users of the operands of the cast operand to avoid
16169 // compiler crash.
16170 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16171 for (Value *V : IOp->operands()) {
16172 auto It = ValueToExtUses->find(V);
16173 if (It != ValueToExtUses->end()) {
16174 // Replace all uses to avoid compiler crash.
16175 ExternalUses[It->second].User = nullptr;
16176 }
16177 }
16178 }
16179 }
16180 }
16181 }
16182 }
16183
16184 ExtractCost += ExtraCost;
16185 }
16186 // Insert externals for extract of operands of casts to be emitted as scalars
16187 // instead of extractelement.
16188 for (Value *V : ScalarOpsFromCasts) {
16189 ExternalUsesAsOriginalScalar.insert(V);
16190 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16191 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16192 TEs.front()->findLaneForValue(V));
16193 }
16194 }
16195 // Add reduced value cost, if resized.
16196 if (!VectorizedVals.empty()) {
16197 const TreeEntry &Root = *VectorizableTree.front();
16198 auto BWIt = MinBWs.find(&Root);
16199 if (BWIt != MinBWs.end()) {
16200 Type *DstTy = Root.Scalars.front()->getType();
16201 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16202 unsigned SrcSz =
16203 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16204 if (OriginalSz != SrcSz) {
16205 unsigned Opcode = Instruction::Trunc;
16206 if (OriginalSz > SrcSz)
16207 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16208 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16209 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16210 assert(SLPReVec && "Only supported by REVEC.");
16211 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16212 }
16213 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16214 TTI::CastContextHint::None,
16215 TTI::TCK_RecipThroughput);
16216 }
16217 }
16218 }
16219
16220 Cost += ExtractCost;
16221 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16222 bool ForSingleMask) {
16223 InstructionCost C = 0;
16224 unsigned VF = Mask.size();
16225 unsigned VecVF = TE->getVectorFactor();
16226 bool HasLargeIndex =
16227 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16228 if ((VF != VecVF && HasLargeIndex) ||
16229 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16230
16231 if (HasLargeIndex) {
16232 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16233 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16234 OrigMask.begin());
16235 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16236 getWidenedType(TE->getMainOp()->getType(), VecVF),
16237 OrigMask);
16238 LLVM_DEBUG(
16239 dbgs() << "SLP: Adding cost " << C
16240 << " for final shuffle of insertelement external users.\n";
16241 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16242 Cost += C;
16243 return std::make_pair(TE, true);
16244 }
16245
16246 if (!ForSingleMask) {
16247 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16248 for (unsigned I = 0; I < VF; ++I) {
16249 if (Mask[I] != PoisonMaskElem)
16250 ResizeMask[Mask[I]] = Mask[I];
16251 }
16252 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16253 C = ::getShuffleCost(
16254 *TTI, TTI::SK_PermuteSingleSrc,
16255 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16256 LLVM_DEBUG(
16257 dbgs() << "SLP: Adding cost " << C
16258 << " for final shuffle of insertelement external users.\n";
16259 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16260
16261 Cost += C;
16262 }
16263 }
16264 return std::make_pair(TE, false);
16265 };
16266 // Calculate the cost of the reshuffled vectors, if any.
16267 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16268 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16269 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16270 unsigned VF = 0;
16271 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16272 ArrayRef<const TreeEntry *> TEs) {
16273 assert((TEs.size() == 1 || TEs.size() == 2) &&
16274 "Expected exactly 1 or 2 tree entries.");
16275 if (TEs.size() == 1) {
16276 if (VF == 0)
16277 VF = TEs.front()->getVectorFactor();
16278 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16279 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16280 !all_of(enumerate(Mask), [=](const auto &Data) {
16281 return Data.value() == PoisonMaskElem ||
16282 (Data.index() < VF &&
16283 static_cast<int>(Data.index()) == Data.value());
16284 })) {
16285 InstructionCost C =
16286 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16287 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16288 << " for final shuffle of insertelement "
16289 "external users.\n";
16290 TEs.front()->dump();
16291 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16292 Cost += C;
16293 }
16294 } else {
16295 if (VF == 0) {
16296 if (TEs.front() &&
16297 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16298 VF = TEs.front()->getVectorFactor();
16299 else
16300 VF = Mask.size();
16301 }
16302 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16303 InstructionCost C =
16304 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16305 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16306 << " for final shuffle of vector node and external "
16307 "insertelement users.\n";
16308 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16309 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16310 Cost += C;
16311 }
16312 VF = Mask.size();
16313 return TEs.back();
16314 };
16315 (void)performExtractsShuffleAction<const TreeEntry>(
16316 MutableArrayRef(Vector.data(), Vector.size()), Base,
16317 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16318 EstimateShufflesCost);
16319 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16320 cast<FixedVectorType>(
16321 ShuffledInserts[I].InsertElements.front()->getType()),
16322 DemandedElts[I],
16323 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16324 Cost -= InsertCost;
16325 }
16326
16327 // Add the cost for reduced value resize (if required).
16328 if (ReductionBitWidth != 0) {
16329 assert(UserIgnoreList && "Expected reduction tree.");
16330 const TreeEntry &E = *VectorizableTree.front();
16331 auto It = MinBWs.find(&E);
16332 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16333 unsigned SrcSize = It->second.first;
16334 unsigned DstSize = ReductionBitWidth;
16335 unsigned Opcode = Instruction::Trunc;
16336 if (SrcSize < DstSize) {
16337 bool IsArithmeticExtendedReduction =
16338 all_of(*UserIgnoreList, [](Value *V) {
16339 auto *I = cast<Instruction>(V);
16340 return is_contained({Instruction::Add, Instruction::FAdd,
16341 Instruction::Mul, Instruction::FMul,
16342 Instruction::And, Instruction::Or,
16343 Instruction::Xor},
16344 I->getOpcode());
16345 });
16346 if (IsArithmeticExtendedReduction)
16347 Opcode =
16348 Instruction::BitCast; // Handle it by getExtendedReductionCost
16349 else
16350 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16351 }
16352 if (Opcode != Instruction::BitCast) {
16353 auto *SrcVecTy =
16354 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16355 auto *DstVecTy =
16356 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16357 TTI::CastContextHint CCH = getCastContextHint(E);
16358 InstructionCost CastCost;
16359 switch (E.getOpcode()) {
16360 case Instruction::SExt:
16361 case Instruction::ZExt:
16362 case Instruction::Trunc: {
16363 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16364 CCH = getCastContextHint(*OpTE);
16365 break;
16366 }
16367 default:
16368 break;
16369 }
16370 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16372 Cost += CastCost;
16373 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16374 << " for final resize for reduction from " << SrcVecTy
16375 << " to " << DstVecTy << "\n";
16376 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16377 }
16378 }
16379 }
16380
16381 std::optional<InstructionCost> SpillCost;
16382 if (Cost < -SLPCostThreshold) {
16383 SpillCost = getSpillCost();
16384 Cost += *SpillCost;
16385 }
16386#ifndef NDEBUG
16387 SmallString<256> Str;
16388 {
16389 raw_svector_ostream OS(Str);
16390 OS << "SLP: Spill Cost = ";
16391 if (SpillCost)
16392 OS << *SpillCost;
16393 else
16394 OS << "<skipped>";
16395 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16396 << "SLP: Total Cost = " << Cost << ".\n";
16397 }
16398 LLVM_DEBUG(dbgs() << Str);
16399 if (ViewSLPTree)
16400 ViewGraph(this, "SLP" + F->getName(), false, Str);
16401#endif
16402
16403 return Cost;
16404}
16405
16406/// Tries to find extractelement instructions with constant indices from fixed
16407 /// vector type and gather such instructions into a bunch, which is highly
16408 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16409/// successful, the matched scalars are replaced by poison values in \p VL for
16410/// future analysis.
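/// Extracts from (mostly) undef vectors, as well as extracts with undefined or
/// out-of-range indices, are collected separately and treated as free slots
/// that either shuffle input may cover.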
16411std::optional<TTI::ShuffleKind>
16412 BoUpSLP::tryToGatherSingleRegisterExtractElements(
16413 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16414 // Scan list of gathered scalars for extractelements that can be represented
16415 // as shuffles.
16416 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16417 SmallVector<int> UndefVectorExtracts;
16418 for (int I = 0, E = VL.size(); I < E; ++I) {
16419 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16420 if (!EI) {
16421 if (isa<UndefValue>(VL[I]))
16422 UndefVectorExtracts.push_back(I);
16423 continue;
16424 }
16425 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16426 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16427 continue;
16428 std::optional<unsigned> Idx = getExtractIndex(EI);
16429 // Undefined index.
16430 if (!Idx) {
16431 UndefVectorExtracts.push_back(I);
16432 continue;
16433 }
16434 if (Idx >= VecTy->getNumElements()) {
16435 UndefVectorExtracts.push_back(I);
16436 continue;
16437 }
16438 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16439 ExtractMask.reset(*Idx);
16440 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16441 UndefVectorExtracts.push_back(I);
16442 continue;
16443 }
16444 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16445 }
16446 // Sort the vector operands by the maximum number of uses in extractelements.
16447 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16448 VectorOpToIdx.takeVector();
16449 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16450 return P1.second.size() > P2.second.size();
16451 });
16452 // Find the best pair of the vectors or a single vector.
16453 const int UndefSz = UndefVectorExtracts.size();
16454 unsigned SingleMax = 0;
16455 unsigned PairMax = 0;
16456 if (!Vectors.empty()) {
16457 SingleMax = Vectors.front().second.size() + UndefSz;
16458 if (Vectors.size() > 1) {
16459 auto *ItNext = std::next(Vectors.begin());
16460 PairMax = SingleMax + ItNext->second.size();
16461 }
16462 }
16463 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16464 return std::nullopt;
16465 // Check if better to perform a shuffle of 2 vectors or just of a single
16466 // vector.
16467 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16468 SmallVector<Value *> GatheredExtracts(
16469 VL.size(), PoisonValue::get(VL.front()->getType()));
16470 if (SingleMax >= PairMax && SingleMax) {
16471 for (int Idx : Vectors.front().second)
16472 std::swap(GatheredExtracts[Idx], VL[Idx]);
16473 } else if (!Vectors.empty()) {
16474 for (unsigned Idx : {0, 1})
16475 for (int Idx : Vectors[Idx].second)
16476 std::swap(GatheredExtracts[Idx], VL[Idx]);
16477 }
16478 // Add extracts from undefs too.
16479 for (int Idx : UndefVectorExtracts)
16480 std::swap(GatheredExtracts[Idx], VL[Idx]);
16481 // Check that gather of extractelements can be represented as just a
16482 // shuffle of a single/two vectors the scalars are extracted from.
16483 std::optional<TTI::ShuffleKind> Res =
16484 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16485 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16486 // TODO: try to check other subsets if possible.
16487 // Restore the original VL if attempt was not successful.
16488 copy(SavedVL, VL.begin());
16489 return std::nullopt;
16490 }
16491 // Restore unused scalars from mask, if some of the extractelements were not
16492 // selected for shuffle.
16493 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16494 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16495 isa<UndefValue>(GatheredExtracts[I])) {
16496 std::swap(VL[I], GatheredExtracts[I]);
16497 continue;
16498 }
16499 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16500 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16501 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16502 is_contained(UndefVectorExtracts, I))
16503 continue;
16504 }
16505 return Res;
16506}
16507
16508/// Tries to find extractelement instructions with constant indices from fixed
16509/// vector type and gather such instructions into a bunch, which highly likely
16510/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16511/// successful, the matched scalars are replaced by poison values in \p VL for
16512/// future analysis.
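/// As a hedged illustration with hypothetical sizes: if VL.size() == 8 and
/// \p NumParts == 2, SliceSize is 4, so slice 0 covers VL[0..3] and slice 1
/// covers VL[4..7]; a sub-mask such as
/// \code
/// <1, 0, 3, 2>
/// \endcode
/// computed for slice 1 is copied into \p Mask starting at offset 4, while
/// slice 0 fills offsets 0..3.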
16513SmallVector<std::optional<TTI::ShuffleKind>>
16514BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16515 SmallVectorImpl<int> &Mask,
16516 unsigned NumParts) const {
16517 assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
16518 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16519 Mask.assign(VL.size(), PoisonMaskElem);
16520 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16521 for (unsigned Part : seq<unsigned>(NumParts)) {
16522 // Scan list of gathered scalars for extractelements that can be represented
16523 // as shuffles.
16524 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16525 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16526 SmallVector<int> SubMask;
16527 std::optional<TTI::ShuffleKind> Res =
16528 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16529 ShufflesRes[Part] = Res;
16530 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16531 }
16532 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16533 return Res.has_value();
16534 }))
16535 ShufflesRes.clear();
16536 return ShufflesRes;
16537}
16538
16539std::optional<TargetTransformInfo::ShuffleKind>
16540BoUpSLP::isGatherShuffledSingleRegisterEntry(
16541 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16542 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16543 Entries.clear();
16544 // TODO: currently checking only for Scalars in the tree entry, need to count
16545 // reused elements too for better cost estimation.
16546 auto GetUserEntry = [&](const TreeEntry *TE) {
16547 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16548 TE = TE->UserTreeIndex.UserTE;
16549 if (TE == VectorizableTree.front().get())
16550 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16551 return TE->UserTreeIndex;
16552 };
16553 auto HasGatherUser = [&](const TreeEntry *TE) {
16554 while (TE->Idx != 0 && TE->UserTreeIndex) {
16555 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16556 return true;
16557 TE = TE->UserTreeIndex.UserTE;
16558 }
16559 return false;
16560 };
16561 const EdgeInfo TEUseEI = GetUserEntry(TE);
16562 if (!TEUseEI)
16563 return std::nullopt;
16564 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16565 const BasicBlock *TEInsertBlock = nullptr;
16566 // Main node of PHI entries keeps the correct order of operands/incoming
16567 // blocks.
16568 if (auto *PHI = dyn_cast_or_null<PHINode>(
16569 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16570 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16571 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16572 TEInsertPt = TEInsertBlock->getTerminator();
16573 } else {
16574 TEInsertBlock = TEInsertPt->getParent();
16575 }
16576 if (!DT->isReachableFromEntry(TEInsertBlock))
16577 return std::nullopt;
16578 auto *NodeUI = DT->getNode(TEInsertBlock);
16579 assert(NodeUI && "Should only process reachable instructions");
16580 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16581 auto CheckOrdering = [&](const Instruction *InsertPt) {
16582 // Argument InsertPt is an instruction where vector code for some other
16583 // tree entry (one that shares one or more scalars with TE) is going to be
16584 // generated. This lambda returns true if insertion point of vector code
16585 // for the TE dominates that point (otherwise dependency is the other way
16586 // around). The other node is not limited to be of a gather kind. Gather
16587 // nodes are not scheduled and their vector code is inserted before their
16588 // first user. If user is PHI, that is supposed to be at the end of a
16589 // predecessor block. Otherwise it is the last instruction among scalars of
16590 // the user node. So, instead of checking dependency between instructions
16591 // themselves, we check dependency between their insertion points for vector
16592 // code (since each scalar instruction ends up as a lane of a vector
16593 // instruction).
16594 const BasicBlock *InsertBlock = InsertPt->getParent();
16595 auto *NodeEUI = DT->getNode(InsertBlock);
16596 if (!NodeEUI)
16597 return false;
16598 assert((NodeUI == NodeEUI) ==
16599 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16600 "Different nodes should have different DFS numbers");
16601 // Check the order of the gather nodes users.
16602 if (TEInsertPt->getParent() != InsertBlock &&
16603 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16604 return false;
16605 if (TEInsertPt->getParent() == InsertBlock &&
16606 TEInsertPt->comesBefore(InsertPt))
16607 return false;
16608 return true;
16609 };
16610 // Find all tree entries used by the gathered values. If no common entries
16611 // found - not a shuffle.
16612 // Here we build a set of tree nodes for each gathered value and try to
16613 // find the intersection between these sets. If we have at least one common
16614 // tree node for each gathered value - we have just a permutation of the
16615 // single vector. If we have 2 different sets, we're in situation where we
16616 // have a permutation of 2 input vectors.
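// As a hedged illustration with hypothetical scalars: if %a is gathered in
// tree entries {E1, E2} and %b in {E2, E3}, the intersection {E2} means both
// can be taken from E2's vector; if a later scalar is only found in {E4}, a
// second source set {E4} is recorded and the result is a permutation of two
// input vectors.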
16617 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16618 SmallDenseMap<Value *, int> UsedValuesEntry;
16619 SmallPtrSet<const Value *, 16> VisitedValue;
16620 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16621 // The node is reused - exit.
16622 if ((TEPtr->getVectorFactor() != VL.size() &&
16623 TEPtr->Scalars.size() != VL.size()) ||
16624 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16625 return false;
16626 UsedTEs.clear();
16627 UsedTEs.emplace_back().insert(TEPtr);
16628 for (Value *V : VL) {
16629 if (isConstant(V))
16630 continue;
16631 UsedValuesEntry.try_emplace(V, 0);
16632 }
16633 return true;
16634 };
16635 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16636 unsigned EdgeIdx) {
16637 const TreeEntry *Ptr1 = User1;
16638 const TreeEntry *Ptr2 = User2;
16639 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16640 while (Ptr2) {
16641 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16642 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16643 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16644 }
16645 while (Ptr1) {
16646 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16647 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16648 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16649 return Idx < It->second;
16650 }
16651 return false;
16652 };
16653 for (Value *V : VL) {
16654 if (isConstant(V) || !VisitedValue.insert(V).second)
16655 continue;
16656 // Build a list of tree entries where V is used.
16657 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16658 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16659 if (TEPtr == TE || TEPtr->Idx == 0)
16660 continue;
16661 assert(any_of(TEPtr->Scalars,
16662 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16663 "Must contain at least single gathered value.");
16664 assert(TEPtr->UserTreeIndex &&
16665 "Expected only single user of a gather node.");
16666 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16667
16668 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16669 UseEI.UserTE->hasState())
16670 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16671 : nullptr;
16672 Instruction *InsertPt =
16673 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16674 : &getLastInstructionInBundle(UseEI.UserTE);
16675 if (TEInsertPt == InsertPt) {
16676 // Check nodes, which might be emitted first.
16677 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16678 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16679 TEUseEI.UserTE->isAltShuffle()) &&
16680 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16681 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16682 (UseEI.UserTE->hasState() &&
16683 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16684 !UseEI.UserTE->isAltShuffle()) ||
16685 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16686 continue;
16687 }
16688
16689 // If the schedulable insertion point is used in multiple entries - just
16690 // exit, no known ordering at this point, available only after real
16691 // scheduling.
16692 if (!doesNotNeedToBeScheduled(InsertPt) &&
16693 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16694 continue;
16695 // If the users are the PHI nodes with the same incoming blocks - skip.
16696 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16697 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16698 UseEI.UserTE->State == TreeEntry::Vectorize &&
16699 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16700 TEUseEI.UserTE != UseEI.UserTE)
16701 continue;
16702 // If 2 gathers are operands of the same entry (regardless of whether
16703 // user is PHI or else), compare operands indices, use the earlier one
16704 // as the base.
16705 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16706 continue;
16707 // If the user instruction is used for some reason in different
16708 // vectorized nodes - make it depend on index.
16709 if (TEUseEI.UserTE != UseEI.UserTE &&
16710 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16711 HasGatherUser(TEUseEI.UserTE)))
16712 continue;
16713 // If the user node is the operand of the other user node - skip.
16714 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16715 continue;
16716 }
16717
16718 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16719 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16720 UseEI.UserTE->doesNotNeedToSchedule() &&
16721 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16722 continue;
16723 // Check if the user node of the TE comes after user node of TEPtr,
16724 // otherwise TEPtr depends on TE.
16725 if ((TEInsertBlock != InsertPt->getParent() ||
16726 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16727 !CheckOrdering(InsertPt))
16728 continue;
16729 // The node is reused - exit.
16730 if (CheckAndUseSameNode(TEPtr))
16731 break;
16732 VToTEs.insert(TEPtr);
16733 }
16734 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16735 const auto *It = find_if(
16736 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16737 if (It != VTEs.end()) {
16738 const TreeEntry *VTE = *It;
16739 if (none_of(TE->CombinedEntriesWithIndices,
16740 [&](const auto &P) { return P.first == VTE->Idx; })) {
16741 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16742 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16743 continue;
16744 }
16745 // The node is reused - exit.
16746 if (CheckAndUseSameNode(VTE))
16747 break;
16748 VToTEs.insert(VTE);
16749 }
16750 }
16751 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16752 const TreeEntry *VTE = VTEs.front();
16753 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16754 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16755 VTEs = VTEs.drop_front();
16756 // Iterate through all vectorized nodes.
16757 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16758 return MTE->State == TreeEntry::Vectorize;
16759 });
16760 if (MIt == VTEs.end())
16761 continue;
16762 VTE = *MIt;
16763 }
16764 if (none_of(TE->CombinedEntriesWithIndices,
16765 [&](const auto &P) { return P.first == VTE->Idx; })) {
16766 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16767 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16768 continue;
16769 }
16770 // The node is reused - exit.
16771 if (CheckAndUseSameNode(VTE))
16772 break;
16773 VToTEs.insert(VTE);
16774 }
16775 if (VToTEs.empty())
16776 continue;
16777 if (UsedTEs.empty()) {
16778 // The first iteration, just insert the list of nodes to vector.
16779 UsedTEs.push_back(VToTEs);
16780 UsedValuesEntry.try_emplace(V, 0);
16781 } else {
16782 // Need to check if there are any previously used tree nodes which use V.
16783 // If there are no such nodes, consider that we have one more input
16784 // vector.
16785 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16786 unsigned Idx = 0;
16787 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16788 // Do we have a non-empty intersection of previously listed tree entries
16789 // and tree entries using current V?
16790 set_intersect(VToTEs, Set);
16791 if (!VToTEs.empty()) {
16792 // Yes, write the new subset and continue analysis for the next
16793 // scalar.
16794 Set.swap(VToTEs);
16795 break;
16796 }
16797 VToTEs = SavedVToTEs;
16798 ++Idx;
16799 }
16800 // No non-empty intersection found - need to add a second set of possible
16801 // source vectors.
16802 if (Idx == UsedTEs.size()) {
16803 // If the number of input vectors is greater than 2 - not a permutation,
16804 // fallback to the regular gather.
16805 // TODO: support multiple reshuffled nodes.
16806 if (UsedTEs.size() == 2)
16807 continue;
16808 UsedTEs.push_back(SavedVToTEs);
16809 Idx = UsedTEs.size() - 1;
16810 }
16811 UsedValuesEntry.try_emplace(V, Idx);
16812 }
16813 }
16814
16815 if (UsedTEs.empty()) {
16816 Entries.clear();
16817 return std::nullopt;
16818 }
16819
16820 unsigned VF = 0;
16821 if (UsedTEs.size() == 1) {
16822 // Keep the order to avoid non-determinism.
16823 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
16824 UsedTEs.front().end());
16825 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16826 return TE1->Idx < TE2->Idx;
16827 });
16828 // Try to find the perfect match in another gather node at first.
16829 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
16830 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
16831 });
16832 if (It != FirstEntries.end() &&
16833 ((*It)->getVectorFactor() == VL.size() ||
16834 ((*It)->getVectorFactor() == TE->Scalars.size() &&
16835 TE->ReuseShuffleIndices.size() == VL.size() &&
16836 (*It)->isSame(TE->Scalars)))) {
16837 Entries.push_back(*It);
16838 if ((*It)->getVectorFactor() == VL.size()) {
16839 std::iota(std::next(Mask.begin(), Part * VL.size()),
16840 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
16841 } else {
16842 SmallVector<int> CommonMask = TE->getCommonMask();
16843 copy(CommonMask, Mask.begin());
16844 }
16845 // Clear undef scalars.
16846 for (unsigned I : seq<unsigned>(VL.size()))
16847 if (isa<PoisonValue>(VL[I]))
16848 Mask[Part * VL.size() + I] = PoisonMaskElem;
16849 return TargetTransformInfo::SK_PermuteSingleSrc;
16850 }
16851 // No perfect match, just shuffle, so choose the first tree node from the
16852 // tree.
16853 Entries.push_back(FirstEntries.front());
16854 // Update mapping between values and corresponding tree entries.
16855 for (auto &P : UsedValuesEntry)
16856 P.second = 0;
16857 VF = FirstEntries.front()->getVectorFactor();
16858 } else {
16859 // Try to find nodes with the same vector factor.
16860 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
16861 // Keep the order of tree nodes to avoid non-determinism.
16862 DenseMap<unsigned, const TreeEntry *> VFToTE;
16863 for (const TreeEntry *TE : UsedTEs.front()) {
16864 unsigned VF = TE->getVectorFactor();
16865 auto It = VFToTE.find(VF);
16866 if (It != VFToTE.end()) {
16867 if (It->second->Idx > TE->Idx)
16868 It->getSecond() = TE;
16869 continue;
16870 }
16871 VFToTE.try_emplace(VF, TE);
16872 }
16873 // Same, keep the order to avoid non-determinism.
16874 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
16875 UsedTEs.back().end());
16876 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16877 return TE1->Idx < TE2->Idx;
16878 });
16879 for (const TreeEntry *TE : SecondEntries) {
16880 auto It = VFToTE.find(TE->getVectorFactor());
16881 if (It != VFToTE.end()) {
16882 VF = It->first;
16883 Entries.push_back(It->second);
16884 Entries.push_back(TE);
16885 break;
16886 }
16887 }
16888 // No 2 source vectors with the same vector factor - just choose 2 with max
16889 // index.
16890 if (Entries.empty()) {
16891 Entries.push_back(*llvm::max_element(
16892 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
16893 return TE1->Idx < TE2->Idx;
16894 }));
16895 Entries.push_back(SecondEntries.front());
16896 VF = std::max(Entries.front()->getVectorFactor(),
16897 Entries.back()->getVectorFactor());
16898 } else {
16899 VF = Entries.front()->getVectorFactor();
16900 }
16901 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
16902 for (const TreeEntry *E : Entries)
16903 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
16904 E->Scalars.end());
16905 // Update mapping between values and corresponding tree entries.
16906 for (auto &P : UsedValuesEntry) {
16907 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
16908 if (ValuesToEntries[Idx].contains(P.first)) {
16909 P.second = Idx;
16910 break;
16911 }
16912 }
16913 }
16914
16915 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
16916 // Checks if the 2 PHIs are compatible in terms of high possibility to be
16917 // vectorized.
16918 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
16919 auto *PHI = cast<PHINode>(V);
16920 auto *PHI1 = cast<PHINode>(V1);
16921 // Check that all incoming values are compatible/from same parent (if they
16922 // are instructions).
16923 // The incoming values are compatible if they all are constants, or
16924 // instruction with the same/alternate opcodes from the same basic block.
16925 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
16926 Value *In = PHI->getIncomingValue(I);
16927 Value *In1 = PHI1->getIncomingValue(I);
16928 if (isConstant(In) && isConstant(In1))
16929 continue;
16930 if (!getSameOpcode({In, In1}, *TLI))
16931 return false;
16932 if (cast<Instruction>(In)->getParent() !=
16933 cast<Instruction>(In1)->getParent())
16934 return false;
16935 }
16936 return true;
16937 };
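// A hedged illustration with hypothetical IR: the PHIs
//   %p = phi i32 [ %x, %bb0 ], [ 1, %bb1 ]
//   %q = phi i32 [ %y, %bb0 ], [ 2, %bb1 ]
// are considered compatible if %x and %y have the same (or alternate) opcode
// and live in the same basic block; the second incoming pair is fine because
// both values are constants.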
16938 // Check if the value can be ignored during analysis for shuffled gathers.
16939 // We suppose it is better to ignore instructions which do not form splats,
16940 // are not vectorized and are not extractelements (these are handled by the
16941 // extractelements processing), or which may form a vector node in the future.
16942 auto MightBeIgnored = [=](Value *V) {
16943 auto *I = dyn_cast<Instruction>(V);
16944 return I && !IsSplatOrUndefs && !isVectorized(I) &&
16945 !isVectorLikeInstWithConstOps(I) &&
16946 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
16947 };
16948 // Check that the neighbor instruction may form a full vector node with the
16949 // current instruction V. It is possible, if they have same/alternate opcode
16950 // and same parent basic block.
16951 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
16952 Value *V1 = VL[Idx];
16953 bool UsedInSameVTE = false;
16954 auto It = UsedValuesEntry.find(V1);
16955 if (It != UsedValuesEntry.end())
16956 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
16957 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
16958 getSameOpcode({V, V1}, *TLI) &&
16959 cast<Instruction>(V)->getParent() ==
16960 cast<Instruction>(V1)->getParent() &&
16961 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
16962 };
16963 // Build a shuffle mask for better cost estimation and vector emission.
16964 SmallBitVector UsedIdxs(Entries.size());
16965 SmallVector<std::pair<unsigned, int>> EntryLanes;
16966 for (int I = 0, E = VL.size(); I < E; ++I) {
16967 Value *V = VL[I];
16968 auto It = UsedValuesEntry.find(V);
16969 if (It == UsedValuesEntry.end())
16970 continue;
16971 // Do not try to shuffle scalars, if they are constants, or instructions
16972 // that can be vectorized as a result of the following vector build
16973 // vectorization.
16974 if (isConstant(V) || (MightBeIgnored(V) &&
16975 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
16976 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
16977 continue;
16978 unsigned Idx = It->second;
16979 EntryLanes.emplace_back(Idx, I);
16980 UsedIdxs.set(Idx);
16981 }
16982 // Iterate through all shuffled scalars and select entries, which can be used
16983 // for final shuffle.
16984 SmallVector<const TreeEntry *> TempEntries;
16985 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
16986 if (!UsedIdxs.test(I))
16987 continue;
16988 // Fix the entry number for the given scalar. If it is the first entry, set
16989 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
16990 // These indices are used when calculating final shuffle mask as the vector
16991 // offset.
16992 for (std::pair<unsigned, int> &Pair : EntryLanes)
16993 if (Pair.first == I)
16994 Pair.first = TempEntries.size();
16995 TempEntries.push_back(Entries[I]);
16996 }
16997 Entries.swap(TempEntries);
16998 if (EntryLanes.size() == Entries.size() &&
16999 !VL.equals(ArrayRef(TE->Scalars)
17000 .slice(Part * VL.size(),
17001 std::min<int>(VL.size(), TE->Scalars.size())))) {
17002 // We may have here 1 or 2 entries only. If the number of scalars is equal
17003 // to the number of entries, no need to do the analysis, it is not very
17004 // profitable. Since VL is not the same as TE->Scalars, it means we already
17005 // have some shuffles before. Cut off not profitable case.
17006 Entries.clear();
17007 return std::nullopt;
17008 }
17009 // Build the final mask, check for the identity shuffle, if possible.
17010 bool IsIdentity = Entries.size() == 1;
17011 // Pair.first is the offset to the vector, while Pair.second is the index of
17012 // scalar in the list.
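// As a hedged worked example with hypothetical numbers: with VF == 4, a scalar
// found at lane 2 of the second selected entry (Pair.first == 1) gets the mask
// value 1 * 4 + 2 == 6, while the same lane in the first entry would yield 2.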
17013 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17014 unsigned Idx = Part * VL.size() + Pair.second;
17015 Mask[Idx] =
17016 Pair.first * VF +
17017 (ForOrder ? std::distance(
17018 Entries[Pair.first]->Scalars.begin(),
17019 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17020 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17021 IsIdentity &= Mask[Idx] == Pair.second;
17022 }
17023 if (ForOrder || IsIdentity || Entries.empty()) {
17024 switch (Entries.size()) {
17025 case 1:
17026 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17027 return TargetTransformInfo::SK_PermuteSingleSrc;
17028 break;
17029 case 2:
17030 if (EntryLanes.size() > 2 || VL.size() <= 2)
17031 return TargetTransformInfo::SK_PermuteTwoSrc;
17032 break;
17033 default:
17034 break;
17035 }
17036 } else if (!isa<VectorType>(VL.front()->getType()) &&
17037 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17038 // Do the cost estimation if the shuffle is more beneficial than a buildvector.
17039 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17040 std::next(Mask.begin(), (Part + 1) * VL.size()));
17041 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17042 for (int Idx : SubMask) {
17043 if (Idx == PoisonMaskElem)
17044 continue;
17045 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17046 MinElement = Idx;
17047 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17048 MaxElement = Idx;
17049 }
17050 assert(MaxElement >= 0 && MinElement >= 0 &&
17051 MaxElement % VF >= MinElement % VF &&
17052 "Expected at least single element.");
17053 unsigned NewVF = std::max<unsigned>(
17054 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17055 (MaxElement % VF) -
17056 (MinElement % VF) + 1));
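// A hedged worked example with hypothetical numbers: if VF == 8, VL.size() == 4
// and the sub-mask only references lanes 4..7 of its sources (so
// MinElement % VF == 4 and MaxElement % VF == 7), NewVF becomes 4 (assuming 4
// elements already form a whole register for this type); the remapping below
// then turns an index of 5 (first source) into 1 and an index of 12 (second
// source, 12 % 8 == 4) into 0 + NewVF == 4.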
17057 if (NewVF < VF) {
17058 for (int &Idx : SubMask) {
17059 if (Idx == PoisonMaskElem)
17060 continue;
17061 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17062 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17063 }
17064 } else {
17065 NewVF = VF;
17066 }
17067
17068 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17069 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17070 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17071 auto GetShuffleCost = [&,
17072 &TTI = *TTI](ArrayRef<int> Mask,
17073 ArrayRef<const TreeEntry *> Entries,
17074 VectorType *VecTy) -> InstructionCost {
17075 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17076 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17077 Mask, Entries.front()->getInterleaveFactor()))
17078 return TTI::TCC_Free;
17079 return ::getShuffleCost(TTI,
17080 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17081 : TTI::SK_PermuteSingleSrc,
17082 VecTy, Mask, CostKind);
17083 };
17084 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17085 InstructionCost FirstShuffleCost = 0;
17086 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17087 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17088 FirstShuffleCost = ShuffleCost;
17089 } else {
17090 // Transform mask to include only first entry.
17091 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17092 bool IsIdentity = true;
17093 for (auto [I, Idx] : enumerate(FirstMask)) {
17094 if (Idx >= static_cast<int>(NewVF)) {
17095 Idx = PoisonMaskElem;
17096 } else {
17097 DemandedElts.clearBit(I);
17098 if (Idx != PoisonMaskElem)
17099 IsIdentity &= static_cast<int>(I) == Idx;
17100 }
17101 }
17102 if (!IsIdentity)
17103 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17104 FirstShuffleCost += getScalarizationOverhead(
17105 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17106 /*Extract=*/false, CostKind);
17107 }
17108 InstructionCost SecondShuffleCost = 0;
17109 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17110 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17111 SecondShuffleCost = ShuffleCost;
17112 } else {
17113 // Transform mask to include only the second entry.
17114 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17115 bool IsIdentity = true;
17116 for (auto [I, Idx] : enumerate(SecondMask)) {
17117 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17118 Idx = PoisonMaskElem;
17119 } else {
17120 DemandedElts.clearBit(I);
17121 if (Idx != PoisonMaskElem) {
17122 Idx -= NewVF;
17123 IsIdentity &= static_cast<int>(I) == Idx;
17124 }
17125 }
17126 }
17127 if (!IsIdentity)
17128 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17129 SecondShuffleCost += getScalarizationOverhead(
17130 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17131 /*Extract=*/false, CostKind);
17132 }
17133 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17134 for (auto [I, Idx] : enumerate(SubMask))
17135 if (Idx == PoisonMaskElem)
17136 DemandedElts.clearBit(I);
17137 InstructionCost BuildVectorCost = getScalarizationOverhead(
17138 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17139 /*Extract=*/false, CostKind);
17140 const TreeEntry *BestEntry = nullptr;
17141 if (FirstShuffleCost < ShuffleCost) {
17142 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17143 std::next(Mask.begin(), (Part + 1) * VL.size()),
17144 [&](int &Idx) {
17145 if (Idx >= static_cast<int>(VF))
17146 Idx = PoisonMaskElem;
17147 });
17148 BestEntry = Entries.front();
17149 ShuffleCost = FirstShuffleCost;
17150 }
17151 if (SecondShuffleCost < ShuffleCost) {
17152 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17153 std::next(Mask.begin(), (Part + 1) * VL.size()),
17154 [&](int &Idx) {
17155 if (Idx < static_cast<int>(VF))
17156 Idx = PoisonMaskElem;
17157 else
17158 Idx -= VF;
17159 });
17160 BestEntry = Entries[1];
17161 ShuffleCost = SecondShuffleCost;
17162 }
17163 if (BuildVectorCost >= ShuffleCost) {
17164 if (BestEntry) {
17165 Entries.clear();
17166 Entries.push_back(BestEntry);
17167 }
17168 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17169 : TargetTransformInfo::SK_PermuteSingleSrc;
17170 }
17171 }
17172 Entries.clear();
17173 // Clear the corresponding mask elements.
17174 std::fill(std::next(Mask.begin(), Part * VL.size()),
17175 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17176 return std::nullopt;
17177}
17178
17179SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
17180BoUpSLP::isGatherShuffledEntry(
17181 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17182 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17183 bool ForOrder) {
17184 assert(NumParts > 0 && NumParts < VL.size() &&
17185 "Expected positive number of registers.");
17186 Entries.clear();
17187 // No need to check for the topmost gather node.
17188 if (TE == VectorizableTree.front().get() &&
17189 (!GatheredLoadsEntriesFirst.has_value() ||
17190 none_of(ArrayRef(VectorizableTree).drop_front(),
17191 [](const std::unique_ptr<TreeEntry> &TE) {
17192 return !TE->isGather();
17193 })))
17194 return {};
17195 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17196 // implemented yet.
17197 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17198 return {};
17199 Mask.assign(VL.size(), PoisonMaskElem);
17200 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17201 "Expected only single user of the gather node.");
17202 assert(VL.size() % NumParts == 0 &&
17203 "Number of scalars must be divisible by NumParts.");
17204 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17205 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17206 (TE->Idx == 0 ||
17207 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17208 isSplat(TE->Scalars) ||
17209 (TE->hasState() &&
17210 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17211 return {};
17212 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17213 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17214 for (unsigned Part : seq<unsigned>(NumParts)) {
17215 ArrayRef<Value *> SubVL =
17216 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17217 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17218 std::optional<TTI::ShuffleKind> SubRes =
17219 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17220 ForOrder);
17221 if (!SubRes)
17222 SubEntries.clear();
17223 Res.push_back(SubRes);
17224 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17225 SubEntries.front()->getVectorFactor() == VL.size() &&
17226 (SubEntries.front()->isSame(TE->Scalars) ||
17227 SubEntries.front()->isSame(VL))) {
17228 SmallVector<const TreeEntry *> LocalSubEntries;
17229 LocalSubEntries.swap(SubEntries);
17230 Entries.clear();
17231 Res.clear();
17232 std::iota(Mask.begin(), Mask.end(), 0);
17233 // Clear undef scalars.
17234 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17235 if (isa<PoisonValue>(VL[I]))
17236 Mask[I] = PoisonMaskElem;
17237 Entries.emplace_back(1, LocalSubEntries.front());
17238 Res.push_back(TTI::SK_PermuteSingleSrc);
17239 return Res;
17240 }
17241 }
17242 if (all_of(Res,
17243 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17244 Entries.clear();
17245 return {};
17246 }
17247 return Res;
17248}
17249
17250InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17251 Type *ScalarTy) const {
17252 const unsigned VF = VL.size();
17253 auto *VecTy = getWidenedType(ScalarTy, VF);
17254 // Find the cost of inserting/extracting values from the vector.
17255 // Check if the same elements are inserted several times and count them as
17256 // shuffle candidates.
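// As a hedged illustration with a hypothetical gather {%a, 3, poison, %b}:
// %a and %b set bits in DemandedElements and are costed as insertions, the
// constant 3 is routed through ConstantShuffleMask when ForPoisonSrc is false
// (so it can be merged from a constant vector with a single shuffle), and the
// poison lane is skipped entirely.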
17257 APInt DemandedElements = APInt::getZero(VF);
17258 InstructionCost Cost;
17259 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17260 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17261 DemandedElements.setBit(I);
17262 if (V->getType() != ScalarTy)
17263 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17264 TTI::CastContextHint::None, CostKind);
17265 };
17266 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17267 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17268 for (auto [I, V] : enumerate(VL)) {
17269 // No need to shuffle duplicates for constants.
17270 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17271 continue;
17272
17273 if (isConstant(V)) {
17274 ConstantShuffleMask[I] = I + VF;
17275 continue;
17276 }
17277 EstimateInsertCost(I, V);
17278 }
17279 // FIXME: add a cost for constant vector materialization.
17280 bool IsAnyNonUndefConst =
17281 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17282 // 1. Shuffle input source vector and constant vector.
17283 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17284 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
17285 ConstantShuffleMask);
17286 }
17287
17288 // 2. Insert unique non-constants.
17289 if (!DemandedElements.isZero())
17290 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17291 /*Insert=*/true,
17292 /*Extract=*/false, CostKind,
17293 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17294 return Cost;
17295}
17296
17297Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17298 auto It = EntryToLastInstruction.find(E);
17299 if (It != EntryToLastInstruction.end())
17300 return *cast<Instruction>(It->second);
17301 Instruction *Res = nullptr;
17302 // Get the basic block this bundle is in. All instructions in the bundle
17303 // should be in this block (except for extractelement-like instructions with
17304 // constant indices or gathered loads or copyables).
17305 Instruction *Front;
17306 unsigned Opcode;
17307 if (E->hasState()) {
17308 Front = E->getMainOp();
17309 Opcode = E->getOpcode();
17310 } else {
17311 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17312 Opcode = Front->getOpcode();
17313 }
17314 auto *BB = Front->getParent();
17315 assert(
17316 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17317 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17318 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17319 all_of(E->Scalars,
17320 [=](Value *V) -> bool {
17321 if (Opcode == Instruction::GetElementPtr &&
17322 !isa<GetElementPtrInst>(V))
17323 return true;
17324 auto *I = dyn_cast<Instruction>(V);
17325 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17326 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17327 })) &&
17328 "Expected gathered loads or GEPs or instructions from same basic "
17329 "block.");
17330
17331 auto FindLastInst = [&]() {
17332 Instruction *LastInst = Front;
17333 for (Value *V : E->Scalars) {
17334 auto *I = dyn_cast<Instruction>(V);
17335 if (!I)
17336 continue;
17337 if (E->isCopyableElement(I))
17338 continue;
17339 if (LastInst->getParent() == I->getParent()) {
17340 if (LastInst->comesBefore(I))
17341 LastInst = I;
17342 continue;
17343 }
17344 assert(((Opcode == Instruction::GetElementPtr &&
17345 !isa<GetElementPtrInst>(I)) ||
17346 E->State == TreeEntry::SplitVectorize ||
17347 (isVectorLikeInstWithConstOps(LastInst) &&
17348 isVectorLikeInstWithConstOps(I)) ||
17349 (GatheredLoadsEntriesFirst.has_value() &&
17350 Opcode == Instruction::Load && E->isGather() &&
17351 E->Idx < *GatheredLoadsEntriesFirst)) &&
17352 "Expected vector-like or non-GEP in GEP node insts only.");
17353 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17354 LastInst = I;
17355 continue;
17356 }
17357 if (!DT->isReachableFromEntry(I->getParent()))
17358 continue;
17359 auto *NodeA = DT->getNode(LastInst->getParent());
17360 auto *NodeB = DT->getNode(I->getParent());
17361 assert(NodeA && "Should only process reachable instructions");
17362 assert(NodeB && "Should only process reachable instructions");
17363 assert((NodeA == NodeB) ==
17364 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17365 "Different nodes should have different DFS numbers");
17366 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17367 LastInst = I;
17368 }
17369 BB = LastInst->getParent();
17370 return LastInst;
17371 };
17372
17373 auto FindFirstInst = [&]() {
17374 Instruction *FirstInst = Front;
17375 for (Value *V : E->Scalars) {
17376 auto *I = dyn_cast<Instruction>(V);
17377 if (!I)
17378 continue;
17379 if (E->isCopyableElement(I))
17380 continue;
17381 if (FirstInst->getParent() == I->getParent()) {
17382 if (I->comesBefore(FirstInst))
17383 FirstInst = I;
17384 continue;
17385 }
17386 assert(((Opcode == Instruction::GetElementPtr &&
17387 !isa<GetElementPtrInst>(I)) ||
17388 (isVectorLikeInstWithConstOps(FirstInst) &&
17389 isVectorLikeInstWithConstOps(I))) &&
17390 "Expected vector-like or non-GEP in GEP node insts only.");
17391 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17392 FirstInst = I;
17393 continue;
17394 }
17395 if (!DT->isReachableFromEntry(I->getParent()))
17396 continue;
17397 auto *NodeA = DT->getNode(FirstInst->getParent());
17398 auto *NodeB = DT->getNode(I->getParent());
17399 assert(NodeA && "Should only process reachable instructions");
17400 assert(NodeB && "Should only process reachable instructions");
17401 assert((NodeA == NodeB) ==
17402 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17403 "Different nodes should have different DFS numbers");
17404 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17405 FirstInst = I;
17406 }
17407 return FirstInst;
17408 };
17409
17410 if (E->State == TreeEntry::SplitVectorize) {
17411 Res = FindLastInst();
17412 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17413 for (auto *E : Entries) {
17414 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17415 if (!I)
17416 I = &getLastInstructionInBundle(E);
17417 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17418 Res = I;
17419 }
17420 }
17421 EntryToLastInstruction.try_emplace(E, Res);
17422 return *Res;
17423 }
17424
17425 // Set insertpoint for gathered loads to the very first load.
17426 if (GatheredLoadsEntriesFirst.has_value() &&
17427 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17428 Opcode == Instruction::Load) {
17429 Res = FindFirstInst();
17430 EntryToLastInstruction.try_emplace(E, Res);
17431 return *Res;
17432 }
17433
17434 // Set the insert point to the beginning of the basic block if the entry
17435 // should not be scheduled.
17436 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17437 if (E->isGather())
17438 return nullptr;
17439 // Found previously that the instruction does not need to be scheduled.
17440 const auto *It = BlocksSchedules.find(BB);
17441 if (It == BlocksSchedules.end())
17442 return nullptr;
17443 for (Value *V : E->Scalars) {
17444 auto *I = dyn_cast<Instruction>(V);
17445 if (!I || isa<PHINode>(I) ||
17446 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17447 continue;
17448 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17449 if (Bundles.empty())
17450 continue;
17451 const auto *It = find_if(
17452 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17453 if (It != Bundles.end())
17454 return *It;
17455 }
17456 return nullptr;
17457 };
17458 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17459 if (!E->isGather() && !Bundle) {
17460 if ((Opcode == Instruction::GetElementPtr &&
17461 any_of(E->Scalars,
17462 [](Value *V) {
17463 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17464 })) ||
17465 all_of(E->Scalars, [&](Value *V) {
17466 return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
17467 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17468 }))
17469 Res = FindLastInst();
17470 else
17471 Res = FindFirstInst();
17472 EntryToLastInstruction.try_emplace(E, Res);
17473 return *Res;
17474 }
17475
17476 // Find the last instruction. The common case should be that BB has been
17477 // scheduled, and the last instruction is VL.back(). So we start with
17478 // VL.back() and iterate over schedule data until we reach the end of the
17479 // bundle. The end of the bundle is marked by null ScheduleData.
17480 if (Bundle) {
17481 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17482 Res = Bundle->getBundle().back()->getInst();
17483 EntryToLastInstruction.try_emplace(E, Res);
17484 return *Res;
17485 }
17486
17487 // LastInst can still be null at this point if there's either not an entry
17488 // for BB in BlocksSchedules or there's no ScheduleData available for
17489 // VL.back(). This can be the case if buildTreeRec aborts for various
17490 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17491 // size is reached, etc.). ScheduleData is initialized in the scheduling
17492 // "dry-run".
17493 //
17494 // If this happens, we can still find the last instruction by brute force. We
17495 // iterate forwards from Front (inclusive) until we either see all
17496 // instructions in the bundle or reach the end of the block. If Front is the
17497 // last instruction in program order, LastInst will be set to Front, and we
17498 // will visit all the remaining instructions in the block.
17499 //
17500 // One of the reasons we exit early from buildTreeRec is to place an upper
17501 // bound on compile-time. Thus, taking an additional compile-time hit here is
17502 // not ideal. However, this should be exceedingly rare since it requires that
17503 // we both exit early from buildTreeRec and that the bundle be out-of-order
17504 // (causing us to iterate all the way to the end of the block).
17505 if (!Res)
17506 Res = FindLastInst();
17507 assert(Res && "Failed to find last instruction in bundle");
17508 EntryToLastInstruction.try_emplace(E, Res);
17509 return *Res;
17510}
17511
17512void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17513 auto *Front = E->getMainOp();
17514 Instruction *LastInst = &getLastInstructionInBundle(E);
17515 assert(LastInst && "Failed to find last instruction in bundle");
17516 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17517 // If the instruction is PHI, set the insert point after all the PHIs.
17518 bool IsPHI = isa<PHINode>(LastInst);
17519 if (IsPHI) {
17520 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17521 if (LastInstIt != LastInst->getParent()->end() &&
17522 LastInstIt->getParent()->isLandingPad())
17523 LastInstIt = std::next(LastInstIt);
17524 }
17525 if (IsPHI ||
17526 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17527 E->doesNotNeedToSchedule()) ||
17528 (GatheredLoadsEntriesFirst.has_value() &&
17529 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17530 E->getOpcode() == Instruction::Load)) {
17531 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17532 } else {
17533 // Set the insertion point after the last instruction in the bundle. Set the
17534 // debug location to Front.
17535 Builder.SetInsertPoint(
17536 LastInst->getParent(),
17537 LastInst->getNextNode()->getIterator());
17538 }
17539 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17540}
17541
17542Value *BoUpSLP::gather(
17543 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17544 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17545 // List of instructions/lanes from current block and/or the blocks which are
17546 // part of the current loop. These instructions will be inserted at the end to
17547 // make it possible to optimize loops and hoist invariant instructions out of
17548 // the loop's body with better chances for success.
17549 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17550 SmallSet<int, 4> PostponedIndices;
17551 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17552 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17553 SmallPtrSet<const BasicBlock *, 4> Visited;
17554 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17555 InsertBB = InsertBB->getSinglePredecessor();
17556 return InsertBB && InsertBB == InstBB;
17557 };
17558 for (int I = 0, E = VL.size(); I < E; ++I) {
17559 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17560 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17561 isVectorized(Inst) ||
17562 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17563 PostponedIndices.insert(I).second)
17564 PostponedInsts.emplace_back(Inst, I);
17565 }
17566
17567 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17568 Type *Ty) {
17569 Value *Scalar = V;
17570 if (Scalar->getType() != Ty) {
17571 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17572 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17573 Value *V = Scalar;
17574 if (auto *CI = dyn_cast<CastInst>(Scalar);
17575 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
17576 Value *Op = CI->getOperand(0);
17577 if (auto *IOp = dyn_cast<Instruction>(Op);
17578 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17579 V = Op;
17580 }
17581 Scalar = Builder.CreateIntCast(
17582 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17583 }
17584
17585 Instruction *InsElt;
17586 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17587 assert(SLPReVec && "FixedVectorType is not expected.");
17588 Vec =
17589 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17590 auto *II = dyn_cast<Instruction>(Vec);
17591 if (!II)
17592 return Vec;
17593 InsElt = II;
17594 } else {
17595 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17596 InsElt = dyn_cast<InsertElementInst>(Vec);
17597 if (!InsElt)
17598 return Vec;
17599 }
17600 GatherShuffleExtractSeq.insert(InsElt);
17601 CSEBlocks.insert(InsElt->getParent());
17602 // Add to our 'need-to-extract' list.
17603 if (isa<Instruction>(V)) {
17604 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17605 // Find which lane we need to extract.
17606 User *UserOp = nullptr;
17607 if (Scalar != V) {
17608 if (auto *SI = dyn_cast<Instruction>(Scalar))
17609 UserOp = SI;
17610 } else {
17611 if (V->getType()->isVectorTy()) {
17612 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17613 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17614 // Find shufflevector, caused by resize.
17615 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17616 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17617 if (SV->getOperand(0) == V)
17618 return SV;
17619 if (SV->getOperand(1) == V)
17620 return SV;
17621 }
17622 return nullptr;
17623 };
17624 InsElt = nullptr;
17625 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17626 InsElt = User;
17627 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17628 InsElt = User;
17629 assert(InsElt &&
17630 "Failed to find shufflevector, caused by resize.");
17631 }
17632 }
17633 UserOp = InsElt;
17634 }
17635 if (UserOp) {
17636 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17637 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17638 }
17639 }
17640 }
17641 return Vec;
17642 };
17643 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17644 Value *Vec = PoisonValue::get(VecTy);
17645 SmallVector<int> NonConsts;
17646 SmallVector<int> Mask(VL.size(), PoisonMaskElem);
17647 std::iota(Mask.begin(), Mask.end(), 0);
17648 Value *OriginalRoot = Root;
17649 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17650 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17651 SV->getOperand(0)->getType() == VecTy) {
17652 Root = SV->getOperand(0);
17653 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17654 }
17655 // Insert constant values at first.
17656 for (int I = 0, E = VL.size(); I < E; ++I) {
17657 if (PostponedIndices.contains(I))
17658 continue;
17659 if (!isConstant(VL[I])) {
17660 NonConsts.push_back(I);
17661 continue;
17662 }
17663 if (isa<PoisonValue>(VL[I]))
17664 continue;
17665 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17666 Mask[I] = I + E;
17667 }
17668 if (Root) {
17669 if (isa<PoisonValue>(Vec)) {
17670 Vec = OriginalRoot;
17671 } else {
17672 Vec = CreateShuffle(Root, Vec, Mask);
17673 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17674 OI && OI->use_empty() &&
17675 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17676 return TE->VectorizedValue == OI;
17677 }))
17678 eraseInstruction(OI);
17679 }
17680 }
17681 // Insert non-constant values.
17682 for (int I : NonConsts)
17683 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17684 // Append instructions, which are/may be part of the loop, in the end to make
17685 // it possible to hoist non-loop-based instructions.
17686 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17687 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17688
17689 return Vec;
17690}
17691
17692/// Merges shuffle masks and emits final shuffle instruction, if required. It
17693/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
17694/// when the actual shuffle instruction is generated only if this is actually
17695/// required. Otherwise, the shuffle instruction emission is delayed till the
17696/// end of the process, to reduce the number of emitted instructions and further
17697/// analysis/transformations.
17698/// The class will also look through the previously emitted shuffle instructions
17699/// and properly mark indices in mask as undef.
17700/// For example, given the code
17701/// \code
17702/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17703/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17704/// \endcode
17705/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17706/// look through %s1 and %s2 and emit
17707/// \code
17708/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17709/// \endcode
17710/// instead.
17711/// If 2 operands are of different size, the smallest one will be resized and
17712/// the mask recalculated properly.
17713/// For example, given the code
17714/// \code
17715/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17716/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17717/// \endcode
17718/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17719/// look through %s1 and %s2 and emit
17720/// \code
17721/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17722/// \endcode
17723/// instead.
17724class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17725 bool IsFinalized = false;
17726 /// Combined mask for all applied operands and masks. It is built during
17727 /// analysis and actual emission of shuffle vector instructions.
17728 SmallVector<int> CommonMask;
17729 /// List of operands for the shuffle vector instruction. It holds at most 2
17730 /// operands; if a 3rd one is going to be added, the first 2 are combined into
17731 /// a shuffle with the \p CommonMask mask, the first operand is set to the
17732 /// resulting shuffle and the second operand is set to the newly added
17733 /// operand. The \p CommonMask is transformed in the proper way after that.
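 /// For a hedged illustration: with operands {V1, V2} and a third value V3 to
 /// be added, V1 and V2 are first folded into shuffle(V1, V2, CommonMask), the
 /// list becomes {that shuffle, V3}, and \p CommonMask is rewritten to index
 /// into the intermediate result.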
17734 SmallVector<Value *, 2> InVectors;
17735 IRBuilderBase &Builder;
17736 BoUpSLP &R;
17737
17738 class ShuffleIRBuilder {
17739 IRBuilderBase &Builder;
17740 /// Holds all of the instructions that we gathered.
17741 SetVector<Instruction *> &GatherShuffleExtractSeq;
17742 /// A list of blocks that we are going to CSE.
17743 DenseSet<BasicBlock *> &CSEBlocks;
17744 /// Data layout.
17745 const DataLayout &DL;
17746
17747 public:
17748 ShuffleIRBuilder(IRBuilderBase &Builder,
17749 SetVector<Instruction *> &GatherShuffleExtractSeq,
17750 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17751 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17752 CSEBlocks(CSEBlocks), DL(DL) {}
17753 ~ShuffleIRBuilder() = default;
17754 /// Creates shufflevector for the 2 operands with the given mask.
17755 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17756 if (V1->getType() != V2->getType()) {
17757 assert(V2->getType()->isIntOrIntVectorTy() &&
17758 V1->getType()->isIntOrIntVectorTy() &&
17759 "Expected integer vector types only.");
17760 if (V1->getType() != V2->getType()) {
17761 if (cast<VectorType>(V2->getType())
17762 ->getElementType()
17763 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17764 ->getElementType()
17765 ->getIntegerBitWidth())
17766 V2 = Builder.CreateIntCast(
17767 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17768 else
17769 V1 = Builder.CreateIntCast(
17770 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17771 }
17772 }
17773 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17774 if (auto *I = dyn_cast<Instruction>(Vec)) {
17775 GatherShuffleExtractSeq.insert(I);
17776 CSEBlocks.insert(I->getParent());
17777 }
17778 return Vec;
17779 }
17780 /// Creates permutation of the single vector operand with the given mask, if
17781 /// it is not identity mask.
17782 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17783 if (Mask.empty())
17784 return V1;
17785 unsigned VF = Mask.size();
17786 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17787 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17788 return V1;
17789 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17790 if (auto *I = dyn_cast<Instruction>(Vec)) {
17791 GatherShuffleExtractSeq.insert(I);
17792 CSEBlocks.insert(I->getParent());
17793 }
17794 return Vec;
17795 }
17796 Value *createIdentity(Value *V) { return V; }
17797 Value *createPoison(Type *Ty, unsigned VF) {
17798 return PoisonValue::get(getWidenedType(Ty, VF));
17799 }
17800 /// Resizes 2 input vectors to match their sizes, if they are not equal
17801 /// yet. The smallest vector is resized to the size of the larger vector.
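 /// A hedged illustration with hypothetical operands: to match
 /// \code
 /// %small = ... ; <2 x i32>
 /// %big   = ... ; <4 x i32>
 /// \endcode
 /// %small is widened with
 /// \code
 /// %small.resized = shufflevector <2 x i32> %small, <2 x i32> poison,
 ///                                <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 /// \endcode
 /// so that both operands can feed a single shuffle.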
17802 void resizeToMatch(Value *&V1, Value *&V2) {
17803 if (V1->getType() == V2->getType())
17804 return;
17805 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17806 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17807 int VF = std::max(V1VF, V2VF);
17808 int MinVF = std::min(V1VF, V2VF);
17809 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17810 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17811 0);
17812 Value *&Op = MinVF == V1VF ? V1 : V2;
17813 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17814 if (auto *I = dyn_cast<Instruction>(Op)) {
17815 GatherShuffleExtractSeq.insert(I);
17816 CSEBlocks.insert(I->getParent());
17817 }
17818 if (MinVF == V1VF)
17819 V1 = Op;
17820 else
17821 V2 = Op;
17822 }
17823 };
17824
17825 /// Smart shuffle instruction emission, walks through shuffles trees and
17826 /// tries to find the best matching vector for the actual shuffle
17827 /// instruction.
17828 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
17829 assert(V1 && "Expected at least one vector value.");
17830 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17831 R.CSEBlocks, *R.DL);
17832 return BaseShuffleAnalysis::createShuffle<Value *>(
17833 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17834 }
17835
17836 /// Cast value \p V to the vector type with the same number of elements, but
17837 /// the base type \p ScalarTy.
17838 Value *castToScalarTyElem(Value *V,
17839 std::optional<bool> IsSigned = std::nullopt) {
17840 auto *VecTy = cast<VectorType>(V->getType());
17841 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
17842 if (VecTy->getElementType() == ScalarTy->getScalarType())
17843 return V;
17844 return Builder.CreateIntCast(
17845 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
17846 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
17847 }
17848
17849 Value *getVectorizedValue(const TreeEntry &E) {
17850 Value *Vec = E.VectorizedValue;
17851 if (!Vec->getType()->isIntOrIntVectorTy())
17852 return Vec;
17853 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
17854 return !isa<PoisonValue>(V) &&
17855 !isKnownNonNegative(
17856 V, SimplifyQuery(*R.DL));
17857 }));
17858 }
17859
17860public:
17861 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
17862 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17863
17864 /// Adjusts extractelements after reusing them.
17865 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
17866 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17867 unsigned NumParts, bool &UseVecBaseAsInput) {
17868 UseVecBaseAsInput = false;
17869 SmallPtrSet<Value *, 4> UniqueBases;
17870 Value *VecBase = nullptr;
17871 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
17872 if (!E->ReorderIndices.empty()) {
17873 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
17874 E->ReorderIndices.end());
17875 reorderScalars(VL, ReorderMask);
17876 }
17877 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17878 int Idx = Mask[I];
17879 if (Idx == PoisonMaskElem)
17880 continue;
17881 auto *EI = cast<ExtractElementInst>(VL[I]);
17882 VecBase = EI->getVectorOperand();
17883 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
17884 VecBase = TEs.front()->VectorizedValue;
17885 assert(VecBase && "Expected vectorized value.");
17886 UniqueBases.insert(VecBase);
17887 // If the extractelement's only use is vectorized - we can delete the
17888 // extractelement itself.
17889 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17890 (NumParts != 1 && count(VL, EI) > 1) ||
17891 any_of(EI->users(), [&](User *U) {
17892 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17893 return UTEs.empty() || UTEs.size() > 1 ||
17894 (isa<GetElementPtrInst>(U) &&
17895 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17896 (!UTEs.empty() &&
17897 count_if(R.VectorizableTree,
17898 [&](const std::unique_ptr<TreeEntry> &TE) {
17899 return TE->UserTreeIndex.UserTE ==
17900 UTEs.front() &&
17901 is_contained(VL, EI);
17902 }) != 1);
17903 }))
17904 continue;
17905 R.eraseInstruction(EI);
17906 }
17907 if (NumParts == 1 || UniqueBases.size() == 1) {
17908 assert(VecBase && "Expected vectorized value.");
17909 return castToScalarTyElem(VecBase);
17910 }
17911 UseVecBaseAsInput = true;
17912 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
17913 for (auto [I, Idx] : enumerate(Mask))
17914 if (Idx != PoisonMaskElem)
17915 Idx = I;
17916 };
17917 // Perform a multi-register vector shuffle, joining the parts into a single
17918 // virtual long vector.
17919 // Each part needs to be shuffled independently and then inserted into a
17920 // long virtual vector register, forming the original vector.
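// For example, with NumParts == 2 and an 8-element mask, lanes 0-3 are
// shuffled from the bases used by the first slice and lanes 4-7 from the
// bases of the second slice; the per-part results are then chained into Vec
// with two-source shuffles whose second-source indices are offset past the
// elements already covered by Vec.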
17921 Value *Vec = nullptr;
17922 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17923 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17924 for (unsigned Part : seq<unsigned>(NumParts)) {
17925 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
17926 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
17927 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
17928 constexpr int MaxBases = 2;
17929 SmallVector<Value *, MaxBases> Bases(MaxBases);
17930 auto VLMask = zip(SubVL, SubMask);
17931 const unsigned VF = std::accumulate(
17932 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
17933 if (std::get<1>(D) == PoisonMaskElem)
17934 return S;
17935 Value *VecOp =
17936 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
17937 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
17938 !TEs.empty())
17939 VecOp = TEs.front()->VectorizedValue;
17940 assert(VecOp && "Expected vectorized value.");
17941 const unsigned Size =
17942 cast<FixedVectorType>(VecOp->getType())->getNumElements();
17943 return std::max(S, Size);
17944 });
17945 for (const auto [V, I] : VLMask) {
17946 if (I == PoisonMaskElem)
17947 continue;
17948 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
17949 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
17950 VecOp = TEs.front()->VectorizedValue;
17951 assert(VecOp && "Expected vectorized value.");
17952 VecOp = castToScalarTyElem(VecOp);
17953 Bases[I / VF] = VecOp;
17954 }
17955 if (!Bases.front())
17956 continue;
17957 Value *SubVec;
17958 if (Bases.back()) {
17959 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
17960 TransformToIdentity(SubMask);
17961 } else {
17962 SubVec = Bases.front();
17963 }
17964 if (!Vec) {
17965 Vec = SubVec;
17966 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
17967 [&](unsigned P) {
17968 ArrayRef<int> SubMask =
17969 Mask.slice(P * SliceSize,
17970 getNumElems(Mask.size(),
17971 SliceSize, P));
17972 return all_of(SubMask, [](int Idx) {
17973 return Idx == PoisonMaskElem;
17974 });
17975 })) &&
17976 "Expected first part or all previous parts masked.");
17977 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17978 } else {
17979 unsigned NewVF =
17980 cast<FixedVectorType>(Vec->getType())->getNumElements();
17981 if (Vec->getType() != SubVec->getType()) {
17982 unsigned SubVecVF =
17983 cast<FixedVectorType>(SubVec->getType())->getNumElements();
17984 NewVF = std::max(NewVF, SubVecVF);
17985 }
17986 // Adjust SubMask.
17987 for (int &Idx : SubMask)
17988 if (Idx != PoisonMaskElem)
17989 Idx += NewVF;
17990 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17991 Vec = createShuffle(Vec, SubVec, VecMask);
17992 TransformToIdentity(VecMask);
17993 }
17994 }
17995 copy(VecMask, Mask.begin());
17996 return Vec;
17997 }
17998 /// Checks if the specified entry \p E needs to be delayed because of its
17999 /// dependency nodes.
18000 std::optional<Value *>
18001 needToDelay(const TreeEntry *E,
18002 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18003 // No need to delay emission if all deps are ready.
18004 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18005 return all_of(
18006 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18007 }))
18008 return std::nullopt;
18009 // Postpone gather emission; it will be emitted after the end of the
18010 // process to keep the correct order.
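// The aligned load from a poison pointer below is only a type-correct
// placeholder for the delayed gather; it is replaced by the real vector
// value once the postponed node is finally emitted.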
18011 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18012 return Builder.CreateAlignedLoad(
18013 ResVecTy,
18014 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18015 MaybeAlign());
18016 }
18017 /// Reset the builder to handle perfect diamond match.
18018 void resetForSameNode() {
18019 IsFinalized = false;
18020 CommonMask.clear();
18021 InVectors.clear();
18022 }
18023 /// Adds 2 input vectors (in the form of tree entries) and the mask for
18024 /// their shuffling.
18025 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18026 Value *V1 = getVectorizedValue(E1);
18027 Value *V2 = getVectorizedValue(E2);
18028 add(V1, V2, Mask);
18029 }
18030 /// Adds a single input vector (in the form of a tree entry) and the mask
18031 /// for its shuffling.
18032 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18033 Value *V1 = getVectorizedValue(E1);
18034 add(V1, Mask);
18035 }
18036 /// Adds 2 input vectors and the mask for their shuffling.
18037 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18038 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18039 assert(isa<FixedVectorType>(V1->getType()) &&
18040 isa<FixedVectorType>(V2->getType()) &&
18041 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18042 V1 = castToScalarTyElem(V1);
18043 V2 = castToScalarTyElem(V2);
18044 if (InVectors.empty()) {
18045 InVectors.push_back(V1);
18046 InVectors.push_back(V2);
18047 CommonMask.assign(Mask.begin(), Mask.end());
18048 return;
18049 }
18050 Value *Vec = InVectors.front();
18051 if (InVectors.size() == 2) {
18052 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18053 transformMaskAfterShuffle(CommonMask, CommonMask);
18054 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18055 Mask.size()) {
18056 Vec = createShuffle(Vec, nullptr, CommonMask);
18057 transformMaskAfterShuffle(CommonMask, CommonMask);
18058 }
18059 V1 = createShuffle(V1, V2, Mask);
18060 unsigned VF = std::max(getVF(V1), getVF(Vec));
18061 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18062 if (Mask[Idx] != PoisonMaskElem)
18063 CommonMask[Idx] = Idx + VF;
18064 InVectors.front() = Vec;
18065 if (InVectors.size() == 2)
18066 InVectors.back() = V1;
18067 else
18068 InVectors.push_back(V1);
18069 }
18070 /// Adds one more input vector and the mask for the shuffling.
18071 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18072 assert(isa<FixedVectorType>(V1->getType()) &&
18073 "castToScalarTyElem expects V1 to be FixedVectorType");
18074 V1 = castToScalarTyElem(V1);
18075 if (InVectors.empty()) {
18076 InVectors.push_back(V1);
18077 CommonMask.assign(Mask.begin(), Mask.end());
18078 return;
18079 }
18080 const auto *It = find(InVectors, V1);
18081 if (It == InVectors.end()) {
18082 if (InVectors.size() == 2 ||
18083 InVectors.front()->getType() != V1->getType()) {
18084 Value *V = InVectors.front();
18085 if (InVectors.size() == 2) {
18086 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18087 transformMaskAfterShuffle(CommonMask, CommonMask);
18088 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18089 CommonMask.size()) {
18090 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18091 transformMaskAfterShuffle(CommonMask, CommonMask);
18092 }
18093 unsigned VF = std::max(CommonMask.size(), Mask.size());
18094 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18095 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18096 CommonMask[Idx] = V->getType() != V1->getType()
18097 ? Idx + VF
18098 : Mask[Idx] + getVF(V1);
18099 if (V->getType() != V1->getType())
18100 V1 = createShuffle(V1, nullptr, Mask);
18101 InVectors.front() = V;
18102 if (InVectors.size() == 2)
18103 InVectors.back() = V1;
18104 else
18105 InVectors.push_back(V1);
18106 return;
18107 }
18108 // Check if the second vector is required, i.e. whether it defines any
18109 // elements that are not already provided by the first one.
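// E.g., with CommonMask == <0, 1, poison, poison> and Mask == <poison,
// poison, 0, 1> the new vector defines previously undefined lanes and is
// added as a second source; if Mask only touches lanes that CommonMask
// already defines, the new vector is not added at all.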
18110 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18111 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18112 InVectors.push_back(V1);
18113 break;
18114 }
18115 }
18116 unsigned VF = 0;
18117 for (Value *V : InVectors)
18118 VF = std::max(VF, getVF(V));
18119 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18120 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18121 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18122 }
18123 /// Adds one more input vector and the mask for the shuffling.
18124 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18125 SmallVector<int> NewMask;
18126 inversePermutation(Order, NewMask);
18127 add(V1, NewMask);
18128 }
18129 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18130 Value *Root = nullptr) {
18131 return R.gather(VL, Root, ScalarTy,
18132 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18133 return createShuffle(V1, V2, Mask);
18134 });
18135 }
18136 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18137 /// Finalize emission of the shuffles.
18138 /// \param Action the action (if any) to be performed before the final
18139 /// application of the \p ExtMask mask.
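/// For example, if the accumulated CommonMask is <2, 0, 3, 1> and \p ExtMask
/// is <1, 1, poison, 2>, the final mask becomes <0, 0, poison, 3>, i.e.
/// \p ExtMask indexes into the already shuffled result.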
18140 Value *finalize(
18141 ArrayRef<int> ExtMask,
18142 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18143 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18144 function_ref<void(Value *&, SmallVectorImpl<int> &,
18145 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18146 Action = {}) {
18147 IsFinalized = true;
18148 if (Action) {
18149 Value *Vec = InVectors.front();
18150 if (InVectors.size() == 2) {
18151 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18152 InVectors.pop_back();
18153 } else {
18154 Vec = createShuffle(Vec, nullptr, CommonMask);
18155 }
18156 transformMaskAfterShuffle(CommonMask, CommonMask);
18157 assert(VF > 0 &&
18158 "Expected vector length for the final value before action.");
18159 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18160 if (VecVF < VF) {
18161 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18162 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18163 Vec = createShuffle(Vec, nullptr, ResizeMask);
18164 }
18165 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18166 return createShuffle(V1, V2, Mask);
18167 });
18168 InVectors.front() = Vec;
18169 }
18170 if (!SubVectors.empty()) {
18171 Value *Vec = InVectors.front();
18172 if (InVectors.size() == 2) {
18173 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18174 InVectors.pop_back();
18175 } else {
18176 Vec = createShuffle(Vec, nullptr, CommonMask);
18177 }
18178 transformMaskAfterShuffle(CommonMask, CommonMask);
18179 auto CreateSubVectors = [&](Value *Vec,
18180 SmallVectorImpl<int> &CommonMask) {
18181 for (auto [E, Idx] : SubVectors) {
18182 Value *V = getVectorizedValue(*E);
18183 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18184 // Use the scalar version of ScalarTy to correctly handle shuffles
18185 // for revectorization. The revectorization mode operates on vectors,
18186 // but here we need to operate on the scalars, because the
18187 // masks were already transformed for the vector elements and we don't
18188 // need to do this transformation again.
18189 Type *OrigScalarTy = ScalarTy;
18190 ScalarTy = ScalarTy->getScalarType();
18191 Vec = createInsertVector(
18192 Builder, Vec, V, InsertionIndex,
18193 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18194 _3));
18195 ScalarTy = OrigScalarTy;
18196 if (!CommonMask.empty()) {
18197 std::iota(std::next(CommonMask.begin(), Idx),
18198 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18199 Idx);
18200 }
18201 }
18202 return Vec;
18203 };
18204 if (SubVectorsMask.empty()) {
18205 Vec = CreateSubVectors(Vec, CommonMask);
18206 } else {
18207 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18208 copy(SubVectorsMask, SVMask.begin());
18209 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18210 if (I2 != PoisonMaskElem) {
18211 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18212 I1 = I2 + CommonMask.size();
18213 }
18214 }
18215 Value *InsertVec =
18216 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18217 Vec = createShuffle(InsertVec, Vec, SVMask);
18218 transformMaskAfterShuffle(CommonMask, SVMask);
18219 }
18220 InVectors.front() = Vec;
18221 }
18222
18223 if (!ExtMask.empty()) {
18224 if (CommonMask.empty()) {
18225 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18226 } else {
18227 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18228 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18229 if (ExtMask[I] == PoisonMaskElem)
18230 continue;
18231 NewMask[I] = CommonMask[ExtMask[I]];
18232 }
18233 CommonMask.swap(NewMask);
18234 }
18235 }
18236 if (CommonMask.empty()) {
18237 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18238 return InVectors.front();
18239 }
18240 if (InVectors.size() == 2)
18241 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18242 return createShuffle(InVectors.front(), nullptr, CommonMask);
18243 }
18244
18245 ~ShuffleInstructionBuilder() {
18246 assert((IsFinalized || CommonMask.empty()) &&
18247 "Shuffle construction must be finalized.");
18248 }
18249};
18250
18251Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18252 return vectorizeTree(getOperandEntry(E, NodeIdx));
18253}
18254
18255template <typename BVTy, typename ResTy, typename... Args>
18256ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18257 Args &...Params) {
18258 assert(E->isGather() && "Expected gather node.");
18259 unsigned VF = E->getVectorFactor();
18260
18261 bool NeedFreeze = false;
18262 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18263 // Clear values, to be replaced by insertvector instructions.
18264 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18265 for_each(MutableArrayRef(GatheredScalars)
18266 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18267 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18268 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18269 E->CombinedEntriesWithIndices.size());
18270 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18271 [&](const auto &P) {
18272 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18273 });
18274 // Build a mask out of the reorder indices and reorder scalars per this
18275 // mask.
18276 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18277 E->ReorderIndices.end());
18278 if (!ReorderMask.empty())
18279 reorderScalars(GatheredScalars, ReorderMask);
18280 SmallVector<int> SubVectorsMask;
18281 inversePermutation(E->ReorderIndices, SubVectorsMask);
18282 // Transform non-clustered elements in the mask to poison (-1).
18283 // "Clustered" operations will be reordered using this mask later.
18284 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18285 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18286 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18287 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18288 } else {
18289 SubVectorsMask.clear();
18290 }
18291 SmallVector<Value *> StoredGS(GatheredScalars);
18292 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18293 unsigned I, unsigned SliceSize,
18294 bool IsNotPoisonous) {
18295 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18296 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18297 }))
18298 return false;
18299 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18300 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18301 if (UserTE->getNumOperands() != 2)
18302 return false;
18303 if (!IsNotPoisonous) {
18304 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18305 [=](const std::unique_ptr<TreeEntry> &TE) {
18306 return TE->UserTreeIndex.UserTE == UserTE &&
18307 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18308 });
18309 if (It == VectorizableTree.end())
18310 return false;
18311 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18312 if (!(*It)->ReorderIndices.empty()) {
18313 inversePermutation((*It)->ReorderIndices, ReorderMask);
18314 reorderScalars(GS, ReorderMask);
18315 }
18316 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18317 Value *V0 = std::get<0>(P);
18318 Value *V1 = std::get<1>(P);
18319 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18320 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18321 is_contained(E->Scalars, V1));
18322 }))
18323 return false;
18324 }
18325 int Idx;
18326 if ((Mask.size() < InputVF &&
18327 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18328 Idx == 0) ||
18329 (Mask.size() == InputVF &&
18330 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18331 std::iota(
18332 std::next(Mask.begin(), I * SliceSize),
18333 std::next(Mask.begin(),
18334 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18335 0);
18336 } else {
18337 unsigned IVal =
18338 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18339 std::fill(
18340 std::next(Mask.begin(), I * SliceSize),
18341 std::next(Mask.begin(),
18342 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18343 IVal);
18344 }
18345 return true;
18346 };
18347 BVTy ShuffleBuilder(ScalarTy, Params...);
18348 ResTy Res = ResTy();
18349 SmallVector<int> Mask;
18350 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18351 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18352 Value *ExtractVecBase = nullptr;
18353 bool UseVecBaseAsInput = false;
18354 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18355 SmallVector<SmallVector<const TreeEntry *>> Entries;
18356 Type *OrigScalarTy = GatheredScalars.front()->getType();
18357 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18358 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18359 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18360 // Check for gathered extracts.
18361 bool Resized = false;
18362 ExtractShuffles =
18363 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18364 if (!ExtractShuffles.empty()) {
18365 SmallVector<const TreeEntry *> ExtractEntries;
18366 for (auto [Idx, I] : enumerate(ExtractMask)) {
18367 if (I == PoisonMaskElem)
18368 continue;
18369 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18370 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18371 !TEs.empty())
18372 ExtractEntries.append(TEs.begin(), TEs.end());
18373 }
18374 if (std::optional<ResTy> Delayed =
18375 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18376 // Delay emission of gathers which are not ready yet.
18377 PostponedGathers.insert(E);
18378 // Postpone gather emission; it will be emitted after the end of the
18379 // process to keep the correct order.
18380 return *Delayed;
18381 }
18382 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18383 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18384 ExtractVecBase = VecBase;
18385 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18386 if (VF == VecBaseTy->getNumElements() &&
18387 GatheredScalars.size() != VF) {
18388 Resized = true;
18389 GatheredScalars.append(VF - GatheredScalars.size(),
18390 PoisonValue::get(OrigScalarTy));
18391 NumParts =
18392 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18393 }
18394 }
18395 }
18396 // Gather extracts only after we check for fully matched gathers.
18397 if (!ExtractShuffles.empty() || !E->hasState() ||
18398 E->getOpcode() != Instruction::Load ||
18399 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18400 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18401 any_of(E->Scalars,
18402 [this](Value *V) {
18403 return isa<LoadInst>(V) && isVectorized(V);
18404 })) ||
18405 (E->hasState() && E->isAltShuffle()) ||
18406 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18407 isSplat(E->Scalars) ||
18408 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18409 GatherShuffles =
18410 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18411 }
18412 if (!GatherShuffles.empty()) {
18413 if (std::optional<ResTy> Delayed =
18414 ShuffleBuilder.needToDelay(E, Entries)) {
18415 // Delay emission of gathers which are not ready yet.
18416 PostponedGathers.insert(E);
18417 // Postpone gather emission; it will be emitted after the end of the
18418 // process to keep the correct order.
18419 return *Delayed;
18420 }
18421 if (GatherShuffles.size() == 1 &&
18422 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18423 Entries.front().front()->isSame(E->Scalars)) {
18424 // Perfect match in the graph, will reuse the previously vectorized
18425 // node. Cost is 0.
18426 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18427 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18428 // Restore the mask for previous partially matched values.
18429 Mask.resize(E->Scalars.size());
18430 const TreeEntry *FrontTE = Entries.front().front();
18431 if (FrontTE->ReorderIndices.empty() &&
18432 ((FrontTE->ReuseShuffleIndices.empty() &&
18433 E->Scalars.size() == FrontTE->Scalars.size()) ||
18434 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18435 std::iota(Mask.begin(), Mask.end(), 0);
18436 } else {
18437 for (auto [I, V] : enumerate(E->Scalars)) {
18438 if (isa<PoisonValue>(V)) {
18439 Mask[I] = PoisonMaskElem;
18440 continue;
18441 }
18442 Mask[I] = FrontTE->findLaneForValue(V);
18443 }
18444 }
18445 // Reset the builder(s) to correctly handle perfect diamond matched
18446 // nodes.
18447 ShuffleBuilder.resetForSameNode();
18448 ShuffleBuilder.add(*FrontTE, Mask);
18449 // Fully matched entry found, no need to insert subvectors.
18450 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18451 return Res;
18452 }
18453 if (!Resized) {
18454 if (GatheredScalars.size() != VF &&
18455 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18456 return any_of(TEs, [&](const TreeEntry *TE) {
18457 return TE->getVectorFactor() == VF;
18458 });
18459 }))
18460 GatheredScalars.append(VF - GatheredScalars.size(),
18461 PoisonValue::get(OrigScalarTy));
18462 }
18463 // Remove shuffled elements from list of gathers.
18464 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18465 if (Mask[I] != PoisonMaskElem)
18466 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18467 }
18468 }
18469 }
18470 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18471 SmallVectorImpl<int> &ReuseMask,
18472 bool IsRootPoison) {
18473 // For splats we can emit broadcasts instead of gathers, so try to find
18474 // such sequences.
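// E.g., for gathered scalars <%x, %x, %x, %x> only lane 0 keeps %x and the
// reuse mask becomes <0, 0, 0, 0>, so a single insertelement plus a
// broadcast shuffle is emitted instead of one insert per lane.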
18475 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18476 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18477 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18478 SmallVector<int> UndefPos;
18479 DenseMap<Value *, unsigned> UniquePositions;
18480 // Gather unique non-const values and all constant values.
18481 // For repeated values, just shuffle them.
18482 int NumNonConsts = 0;
18483 int SinglePos = 0;
18484 for (auto [I, V] : enumerate(Scalars)) {
18485 if (isa<UndefValue>(V)) {
18486 if (!isa<PoisonValue>(V)) {
18487 ReuseMask[I] = I;
18488 UndefPos.push_back(I);
18489 }
18490 continue;
18491 }
18492 if (isConstant(V)) {
18493 ReuseMask[I] = I;
18494 continue;
18495 }
18496 ++NumNonConsts;
18497 SinglePos = I;
18498 Value *OrigV = V;
18499 Scalars[I] = PoisonValue::get(OrigScalarTy);
18500 if (IsSplat) {
18501 Scalars.front() = OrigV;
18502 ReuseMask[I] = 0;
18503 } else {
18504 const auto Res = UniquePositions.try_emplace(OrigV, I);
18505 Scalars[Res.first->second] = OrigV;
18506 ReuseMask[I] = Res.first->second;
18507 }
18508 }
18509 if (NumNonConsts == 1) {
18510 // Restore single insert element.
18511 if (IsSplat) {
18512 ReuseMask.assign(VF, PoisonMaskElem);
18513 std::swap(Scalars.front(), Scalars[SinglePos]);
18514 if (!UndefPos.empty() && UndefPos.front() == 0)
18515 Scalars.front() = UndefValue::get(OrigScalarTy);
18516 }
18517 ReuseMask[SinglePos] = SinglePos;
18518 } else if (!UndefPos.empty() && IsSplat) {
18519 // For undef values, try to replace them with a simple broadcast.
18520 // We can do it if the broadcasted value is guaranteed to be
18521 // non-poisonous, or by freezing the incoming scalar value first.
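// E.g., for <%x, undef, %x, undef> with %x known non-poisonous the undef
// lanes are remapped to %x's lane in the reuse mask; otherwise those lanes
// become poison and the whole resulting vector is frozen afterwards.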
18522 auto *It = find_if(Scalars, [this, E](Value *V) {
18523 return !isa<UndefValue>(V) &&
18524 (isGuaranteedNotToBePoison(V, AC) ||
18525 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18526 // Check if the value is already used in the same operation in
18527 // one of the nodes.
18528 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18529 is_contained(E->UserTreeIndex.UserTE->Scalars,
18530 U.getUser());
18531 })));
18532 });
18533 if (It != Scalars.end()) {
18534 // Replace undefs by the non-poisoned scalars and emit broadcast.
18535 int Pos = std::distance(Scalars.begin(), It);
18536 for (int I : UndefPos) {
18537 // Set the undef position to the non-poisoned scalar.
18538 ReuseMask[I] = Pos;
18539 // Replace the undef by poison; in the mask it is already replaced
18540 // by the non-poisoned scalar.
18541 if (I != Pos)
18542 Scalars[I] = PoisonValue::get(OrigScalarTy);
18543 }
18544 } else {
18545 // Replace undefs by the poisons, emit broadcast and then emit
18546 // freeze.
18547 for (int I : UndefPos) {
18548 ReuseMask[I] = PoisonMaskElem;
18549 if (isa<UndefValue>(Scalars[I]))
18550 Scalars[I] = PoisonValue::get(OrigScalarTy);
18551 }
18552 NeedFreeze = true;
18553 }
18554 }
18555 };
18556 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18557 bool IsNonPoisoned = true;
18558 bool IsUsedInExpr = true;
18559 Value *Vec1 = nullptr;
18560 if (!ExtractShuffles.empty()) {
18561 // Gather of extractelements can be represented as just a shuffle of
18562 // a single/two vectors the scalars are extracted from.
18563 // Find input vectors.
18564 Value *Vec2 = nullptr;
18565 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18566 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18567 ExtractMask[I] = PoisonMaskElem;
18568 }
18569 if (UseVecBaseAsInput) {
18570 Vec1 = ExtractVecBase;
18571 } else {
18572 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18573 if (ExtractMask[I] == PoisonMaskElem)
18574 continue;
18575 if (isa<UndefValue>(StoredGS[I]))
18576 continue;
18577 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18578 Value *VecOp = EI->getVectorOperand();
18579 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18580 !TEs.empty() && TEs.front()->VectorizedValue)
18581 VecOp = TEs.front()->VectorizedValue;
18582 if (!Vec1) {
18583 Vec1 = VecOp;
18584 } else if (Vec1 != VecOp) {
18585 assert((!Vec2 || Vec2 == VecOp) &&
18586 "Expected only 1 or 2 vectors shuffle.");
18587 Vec2 = VecOp;
18588 }
18589 }
18590 }
18591 if (Vec2) {
18592 IsUsedInExpr = false;
18593 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18594 isGuaranteedNotToBePoison(Vec2, AC);
18595 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18596 } else if (Vec1) {
18597 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18598 IsUsedInExpr &= FindReusedSplat(
18599 ExtractMask,
18600 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18601 ExtractMask.size(), IsNotPoisonedVec);
18602 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18603 IsNonPoisoned &= IsNotPoisonedVec;
18604 } else {
18605 IsUsedInExpr = false;
18606 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18607 /*ForExtracts=*/true);
18608 }
18609 }
18610 if (!GatherShuffles.empty()) {
18611 unsigned SliceSize =
18612 getPartNumElems(E->Scalars.size(),
18613 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18614 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18615 for (const auto [I, TEs] : enumerate(Entries)) {
18616 if (TEs.empty()) {
18617 assert(!GatherShuffles[I] &&
18618 "No shuffles with empty entries list expected.");
18619 continue;
18620 }
18621 assert((TEs.size() == 1 || TEs.size() == 2) &&
18622 "Expected shuffle of 1 or 2 entries.");
18623 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18624 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18625 VecMask.assign(VecMask.size(), PoisonMaskElem);
18626 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18627 if (TEs.size() == 1) {
18628 bool IsNotPoisonedVec =
18629 TEs.front()->VectorizedValue
18630 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18631 : true;
18632 IsUsedInExpr &=
18633 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18634 SliceSize, IsNotPoisonedVec);
18635 ShuffleBuilder.add(*TEs.front(), VecMask);
18636 IsNonPoisoned &= IsNotPoisonedVec;
18637 } else {
18638 IsUsedInExpr = false;
18639 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18640 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18641 IsNonPoisoned &=
18642 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18643 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18644 }
18645 }
18646 }
18647 // Try to figure out the best way to combine values: build a shuffle and
18648 // insert elements or just build several shuffles.
18649 // Insert non-constant scalars.
18650 SmallVector<Value *> NonConstants(GatheredScalars);
18651 int EMSz = ExtractMask.size();
18652 int MSz = Mask.size();
18653 // Try to build a constant vector and shuffle with it only if currently we
18654 // have a single permutation and more than 1 scalar constant.
18655 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18656 bool IsIdentityShuffle =
18657 ((UseVecBaseAsInput ||
18658 all_of(ExtractShuffles,
18659 [](const std::optional<TTI::ShuffleKind> &SK) {
18660 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18661 TTI::SK_PermuteSingleSrc;
18662 })) &&
18663 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18664 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18665 (!GatherShuffles.empty() &&
18666 all_of(GatherShuffles,
18667 [](const std::optional<TTI::ShuffleKind> &SK) {
18668 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18669 TTI::SK_PermuteSingleSrc;
18670 }) &&
18671 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18672 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18673 bool EnoughConstsForShuffle =
18674 IsSingleShuffle &&
18675 (none_of(GatheredScalars,
18676 [](Value *V) {
18677 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18678 }) ||
18679 any_of(GatheredScalars,
18680 [](Value *V) {
18681 return isa<Constant>(V) && !isa<UndefValue>(V);
18682 })) &&
18683 (!IsIdentityShuffle ||
18684 (GatheredScalars.size() == 2 &&
18685 any_of(GatheredScalars,
18686 [](Value *V) { return !isa<UndefValue>(V); })) ||
18687 count_if(GatheredScalars, [](Value *V) {
18688 return isa<Constant>(V) && !isa<PoisonValue>(V);
18689 }) > 1);
18690 // The NonConstants array contains just non-constant values, GatheredScalars
18691 // contains only constants to build the final vector and then shuffle.
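// E.g., <1, %a, 2, %b> is split into the constant part <1, poison, 2, poison>
// (left in GatheredScalars) and the non-constant part <poison, %a, poison, %b>
// (left in NonConstants); each part is built separately and the two are
// combined by the final shuffle.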
18692 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18693 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18694 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18695 else
18696 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18697 }
18698 // Generate constants for final shuffle and build a mask for them.
18699 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18700 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18701 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18702 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18703 ShuffleBuilder.add(BV, BVMask);
18704 }
18705 if (all_of(NonConstants, [=](Value *V) {
18706 return isa<PoisonValue>(V) ||
18707 (IsSingleShuffle && ((IsIdentityShuffle &&
18708 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18709 }))
18710 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18711 SubVectorsMask);
18712 else
18713 Res = ShuffleBuilder.finalize(
18714 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18715 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18716 bool IsSplat = isSplat(NonConstants);
18717 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18718 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18719 auto CheckIfSplatIsProfitable = [&]() {
18720 // Estimate the cost of splatting + shuffle and compare with
18721 // insert + shuffle.
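// The splat variant inserts the value into lane 0 of a fresh vector and
// blends it in with a single two-source shuffle; the buildvector variant
// inserts the value directly into Vec and may need an extra shuffle when
// more than one lane has to be rewritten.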
18722 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18723 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18724 if (isa<ExtractElementInst>(V) || isVectorized(V))
18725 return false;
18726 InstructionCost SplatCost = TTI->getVectorInstrCost(
18727 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18728 PoisonValue::get(VecTy), V);
18729 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18730 for (auto [Idx, I] : enumerate(BVMask))
18731 if (I != PoisonMaskElem)
18732 NewMask[Idx] = Mask.size();
18733 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18734 NewMask, CostKind);
18735 InstructionCost BVCost = TTI->getVectorInstrCost(
18736 Instruction::InsertElement, VecTy, CostKind,
18737 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18738 Vec, V);
18739 // Shuffle required?
18740 if (count(BVMask, PoisonMaskElem) <
18741 static_cast<int>(BVMask.size() - 1)) {
18742 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18743 for (auto [Idx, I] : enumerate(BVMask))
18744 if (I != PoisonMaskElem)
18745 NewMask[Idx] = I;
18746 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
18747 VecTy, NewMask, CostKind);
18748 }
18749 return SplatCost <= BVCost;
18750 };
18751 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18752 for (auto [Idx, I] : enumerate(BVMask))
18753 if (I != PoisonMaskElem)
18754 Mask[Idx] = I;
18755 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18756 } else {
18757 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18758 SmallVector<Value *> Values(NonConstants.size(),
18759 PoisonValue::get(ScalarTy));
18760 Values[0] = V;
18761 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18762 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18763 transform(BVMask, SplatMask.begin(), [](int I) {
18764 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18765 });
18766 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18767 BV = CreateShuffle(BV, nullptr, SplatMask);
18768 for (auto [Idx, I] : enumerate(BVMask))
18769 if (I != PoisonMaskElem)
18770 Mask[Idx] = BVMask.size() + Idx;
18771 Vec = CreateShuffle(Vec, BV, Mask);
18772 for (auto [Idx, I] : enumerate(Mask))
18773 if (I != PoisonMaskElem)
18774 Mask[Idx] = Idx;
18775 }
18776 });
18777 } else if (!allConstant(GatheredScalars)) {
18778 // Gather unique scalars and all constants.
18779 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18780 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18781 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18782 ShuffleBuilder.add(BV, ReuseMask);
18783 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18784 SubVectorsMask);
18785 } else {
18786 // Gather all constants.
18787 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18788 for (auto [I, V] : enumerate(GatheredScalars)) {
18789 if (!isa<PoisonValue>(V))
18790 Mask[I] = I;
18791 }
18792 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18793 ShuffleBuilder.add(BV, Mask);
18794 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18795 SubVectorsMask);
18796 }
18797
18798 if (NeedFreeze)
18799 Res = ShuffleBuilder.createFreeze(Res);
18800 return Res;
18801}
18802
18803Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18804 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18805 (void)vectorizeTree(VectorizableTree[EIdx].get());
18806 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18807 Builder, *this);
18808}
18809
18810 /// \returns \p Inst after propagating metadata from \p VL only for
18811 /// instructions in \p VL.
18812 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18813 SmallVector<Value *> Insts;
18814 for (Value *V : VL)
18815 if (isa<Instruction>(V))
18816 Insts.push_back(V);
18817 return llvm::propagateMetadata(Inst, Insts);
18818}
18819
18821 if (DebugLoc DL = PN.getDebugLoc())
18822 return DL;
18823 return DebugLoc::getUnknown();
18824}
18825
18826Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
18827 IRBuilderBase::InsertPointGuard Guard(Builder);
18828
18829 Value *V = E->Scalars.front();
18830 Type *ScalarTy = V->getType();
18831 if (!isa<CmpInst>(V))
18832 ScalarTy = getValueType(V);
18833 auto It = MinBWs.find(E);
18834 if (It != MinBWs.end()) {
18835 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18836 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
18837 if (VecTy)
18838 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
18839 }
18840 if (E->VectorizedValue)
18841 return E->VectorizedValue;
18842 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
18843 if (E->isGather()) {
18844 // Set insert point for non-reduction initial nodes.
18845 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18846 setInsertPointAfterBundle(E);
18847 Value *Vec = createBuildVector(E, ScalarTy);
18848 E->VectorizedValue = Vec;
18849 return Vec;
18850 }
18851 if (E->State == TreeEntry::SplitVectorize) {
18852 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18853 "Expected exactly 2 combined entries.");
18854 setInsertPointAfterBundle(E);
18855 TreeEntry &OpTE1 =
18856 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18857 assert(OpTE1.isSame(
18858 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18859 "Expected same first part of scalars.");
18860 Value *Op1 = vectorizeTree(&OpTE1);
18861 TreeEntry &OpTE2 =
18862 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18863 assert(
18864 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18865 "Expected same second part of scalars.");
18866 Value *Op2 = vectorizeTree(&OpTE2);
18867 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
18868 bool IsSigned = false;
18869 auto It = MinBWs.find(OpE);
18870 if (It != MinBWs.end())
18871 IsSigned = It->second.second;
18872 else
18873 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18874 if (isa<PoisonValue>(R))
18875 return false;
18876 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18877 });
18878 return IsSigned;
18879 };
18880 if (cast<VectorType>(Op1->getType())->getElementType() !=
18881 ScalarTy->getScalarType()) {
18882 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18883 Op1 = Builder.CreateIntCast(
18884 Op1,
18885 getWidenedType(
18886 ScalarTy,
18887 cast<FixedVectorType>(Op1->getType())->getNumElements()),
18888 GetOperandSignedness(&OpTE1));
18889 }
18890 if (cast<VectorType>(Op2->getType())->getElementType() !=
18891 ScalarTy->getScalarType()) {
18892 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18893 Op2 = Builder.CreateIntCast(
18894 Op2,
18895 getWidenedType(
18896 ScalarTy,
18897 cast<FixedVectorType>(Op2->getType())->getNumElements()),
18898 GetOperandSignedness(&OpTE2));
18899 }
18900 if (E->ReorderIndices.empty()) {
18901 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
18902 std::iota(
18903 Mask.begin(),
18904 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
18905 0);
18906 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18907 if (ScalarTyNumElements != 1) {
18908 assert(SLPReVec && "Only supported by REVEC.");
18909 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
18910 }
18911 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
18912 Vec = createInsertVector(Builder, Vec, Op2,
18913 E->CombinedEntriesWithIndices.back().second *
18914 ScalarTyNumElements);
18915 E->VectorizedValue = Vec;
18916 return Vec;
18917 }
18918 unsigned CommonVF =
18919 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18920 if (getNumElements(Op1->getType()) != CommonVF) {
18921 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18922 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
18923 0);
18924 Op1 = Builder.CreateShuffleVector(Op1, Mask);
18925 }
18926 if (getNumElements(Op2->getType()) != CommonVF) {
18927 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18928 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
18929 0);
18930 Op2 = Builder.CreateShuffleVector(Op2, Mask);
18931 }
18932 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
18933 E->VectorizedValue = Vec;
18934 return Vec;
18935 }
18936
18937 bool IsReverseOrder =
18938 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
18939 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
18940 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
18941 if (E->getOpcode() == Instruction::Store &&
18942 E->State == TreeEntry::Vectorize) {
18943 ArrayRef<int> Mask =
18944 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
18945 E->ReorderIndices.size());
18946 ShuffleBuilder.add(V, Mask);
18947 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
18948 E->State == TreeEntry::CompressVectorize) {
18949 ShuffleBuilder.addOrdered(V, {});
18950 } else {
18951 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
18952 }
18953 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18954 E->CombinedEntriesWithIndices.size());
18955 transform(
18956 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
18957 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18958 });
18959 assert(
18960 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
18961 "Expected either combined subnodes or reordering");
18962 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
18963 };
18964
18965 assert(!E->isGather() && "Unhandled state");
18966 unsigned ShuffleOrOp =
18967 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
18968 Instruction *VL0 = E->getMainOp();
18969 auto GetOperandSignedness = [&](unsigned Idx) {
18970 const TreeEntry *OpE = getOperandEntry(E, Idx);
18971 bool IsSigned = false;
18972 auto It = MinBWs.find(OpE);
18973 if (It != MinBWs.end())
18974 IsSigned = It->second.second;
18975 else
18976 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18977 if (isa<PoisonValue>(R))
18978 return false;
18979 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18980 });
18981 return IsSigned;
18982 };
18983 switch (ShuffleOrOp) {
18984 case Instruction::PHI: {
18985 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
18986 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
18987 "PHI reordering is free.");
18988 auto *PH = cast<PHINode>(VL0);
18989 Builder.SetInsertPoint(PH->getParent(),
18990 PH->getParent()->getFirstNonPHIIt());
18992 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
18993 Value *V = NewPhi;
18994
18995 // Adjust insertion point once all PHIs have been generated.
18996 Builder.SetInsertPoint(PH->getParent(),
18997 PH->getParent()->getFirstInsertionPt());
18999
19000 V = FinalShuffle(V, E);
19001
19002 E->VectorizedValue = V;
19003 // If phi node is fully emitted - exit.
19004 if (NewPhi->getNumIncomingValues() != 0)
19005 return NewPhi;
19006
19007 // PHINodes may have multiple entries from the same block. We want to
19008 // visit every block once.
19009 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19010
19011 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19012 BasicBlock *IBB = PH->getIncomingBlock(I);
19013
19014 // Stop emission if all incoming values are generated.
19015 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19016 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19017 return NewPhi;
19018 }
19019
19020 if (!VisitedBBs.insert(IBB).second) {
19021 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19022 NewPhi->addIncoming(VecOp, IBB);
19023 TreeEntry *OpTE = getOperandEntry(E, I);
19024 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19025 OpTE->VectorizedValue = VecOp;
19026 continue;
19027 }
19028
19029 Builder.SetInsertPoint(IBB->getTerminator());
19031 Value *Vec = vectorizeOperand(E, I);
19032 if (VecTy != Vec->getType()) {
19033 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19034 MinBWs.contains(getOperandEntry(E, I))) &&
19035 "Expected item in MinBWs.");
19036 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19037 }
19038 NewPhi->addIncoming(Vec, IBB);
19039 }
19040
19041 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19042 "Invalid number of incoming values");
19043 assert(E->VectorizedValue && "Expected vectorized value.");
19044 return E->VectorizedValue;
19045 }
19046
19047 case Instruction::ExtractElement: {
19048 Value *V = E->getSingleOperand(0);
19049 setInsertPointAfterBundle(E);
19050 V = FinalShuffle(V, E);
19051 E->VectorizedValue = V;
19052 return V;
19053 }
19054 case Instruction::ExtractValue: {
19055 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19056 Builder.SetInsertPoint(LI);
19057 Value *Ptr = LI->getPointerOperand();
19058 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19059 Value *NewV = ::propagateMetadata(V, E->Scalars);
19060 NewV = FinalShuffle(NewV, E);
19061 E->VectorizedValue = NewV;
19062 return NewV;
19063 }
19064 case Instruction::InsertElement: {
19065 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19066 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19067 Value *V = vectorizeOperand(E, 1);
19068 ArrayRef<Value *> Op = E->getOperand(1);
19069 Type *ScalarTy = Op.front()->getType();
19070 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19071 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19072 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19073 assert(Res.first > 0 && "Expected item in MinBWs.");
19074 V = Builder.CreateIntCast(
19075 V,
19076 getWidenedType(
19077 ScalarTy,
19078 cast<FixedVectorType>(V->getType())->getNumElements()),
19079 Res.second);
19080 }
19081
19082 // Create InsertVector shuffle if necessary
19083 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19084 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19085 }));
19086 const unsigned NumElts =
19087 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19088 const unsigned NumScalars = E->Scalars.size();
19089
19090 unsigned Offset = *getElementIndex(VL0);
19091 assert(Offset < NumElts && "Failed to find vector index offset");
19092
19093 // Create shuffle to resize vector
19094 SmallVector<int> Mask;
19095 if (!E->ReorderIndices.empty()) {
19096 inversePermutation(E->ReorderIndices, Mask);
19097 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19098 } else {
19099 Mask.assign(NumElts, PoisonMaskElem);
19100 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19101 }
19102 // Create InsertVector shuffle if necessary
19103 bool IsIdentity = true;
19104 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19105 Mask.swap(PrevMask);
19106 for (unsigned I = 0; I < NumScalars; ++I) {
19107 Value *Scalar = E->Scalars[PrevMask[I]];
19108 unsigned InsertIdx = *getElementIndex(Scalar);
19109 IsIdentity &= InsertIdx - Offset == I;
19110 Mask[InsertIdx - Offset] = I;
19111 }
19112 if (!IsIdentity || NumElts != NumScalars) {
19113 Value *V2 = nullptr;
19114 bool IsVNonPoisonous =
19115 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
19116 SmallVector<int> InsertMask(Mask);
19117 if (NumElts != NumScalars && Offset == 0) {
19118 // Follow all insert element instructions from the current buildvector
19119 // sequence.
19120 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19121 do {
19122 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19123 if (!InsertIdx)
19124 break;
19125 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19126 InsertMask[*InsertIdx] = *InsertIdx;
19127 if (!Ins->hasOneUse())
19128 break;
19129 Ins = dyn_cast_or_null<InsertElementInst>(
19130 Ins->getUniqueUndroppableUser());
19131 } while (Ins);
19132 SmallBitVector UseMask =
19133 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19134 SmallBitVector IsFirstPoison =
19135 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19136 SmallBitVector IsFirstUndef =
19137 isUndefVector(FirstInsert->getOperand(0), UseMask);
19138 if (!IsFirstPoison.all()) {
19139 unsigned Idx = 0;
19140 for (unsigned I = 0; I < NumElts; I++) {
19141 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19142 IsFirstUndef.test(I)) {
19143 if (IsVNonPoisonous) {
19144 InsertMask[I] = I < NumScalars ? I : 0;
19145 continue;
19146 }
19147 if (!V2)
19148 V2 = UndefValue::get(V->getType());
19149 if (Idx >= NumScalars)
19150 Idx = NumScalars - 1;
19151 InsertMask[I] = NumScalars + Idx;
19152 ++Idx;
19153 } else if (InsertMask[I] != PoisonMaskElem &&
19154 Mask[I] == PoisonMaskElem) {
19155 InsertMask[I] = PoisonMaskElem;
19156 }
19157 }
19158 } else {
19159 InsertMask = Mask;
19160 }
19161 }
19162 if (!V2)
19163 V2 = PoisonValue::get(V->getType());
19164 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19165 if (auto *I = dyn_cast<Instruction>(V)) {
19166 GatherShuffleExtractSeq.insert(I);
19167 CSEBlocks.insert(I->getParent());
19168 }
19169 }
19170
19171 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19172 for (unsigned I = 0; I < NumElts; I++) {
19173 if (Mask[I] != PoisonMaskElem)
19174 InsertMask[Offset + I] = I;
19175 }
19176 SmallBitVector UseMask =
19177 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19178 SmallBitVector IsFirstUndef =
19179 isUndefVector(FirstInsert->getOperand(0), UseMask);
19180 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19181 NumElts != NumScalars) {
19182 if (IsFirstUndef.all()) {
19183 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19184 SmallBitVector IsFirstPoison =
19185 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19186 if (!IsFirstPoison.all()) {
19187 for (unsigned I = 0; I < NumElts; I++) {
19188 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19189 InsertMask[I] = I + NumElts;
19190 }
19191 }
19192 V = Builder.CreateShuffleVector(
19193 V,
19194 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19195 : FirstInsert->getOperand(0),
19196 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19197 if (auto *I = dyn_cast<Instruction>(V)) {
19198 GatherShuffleExtractSeq.insert(I);
19199 CSEBlocks.insert(I->getParent());
19200 }
19201 }
19202 } else {
19203 SmallBitVector IsFirstPoison =
19204 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19205 for (unsigned I = 0; I < NumElts; I++) {
19206 if (InsertMask[I] == PoisonMaskElem)
19207 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19208 else
19209 InsertMask[I] += NumElts;
19210 }
19211 V = Builder.CreateShuffleVector(
19212 FirstInsert->getOperand(0), V, InsertMask,
19213 cast<Instruction>(E->Scalars.back())->getName());
19214 if (auto *I = dyn_cast<Instruction>(V)) {
19215 GatherShuffleExtractSeq.insert(I);
19216 CSEBlocks.insert(I->getParent());
19217 }
19218 }
19219 }
19220
19221 ++NumVectorInstructions;
19222 E->VectorizedValue = V;
19223 return V;
19224 }
19225 case Instruction::ZExt:
19226 case Instruction::SExt:
19227 case Instruction::FPToUI:
19228 case Instruction::FPToSI:
19229 case Instruction::FPExt:
19230 case Instruction::PtrToInt:
19231 case Instruction::IntToPtr:
19232 case Instruction::SIToFP:
19233 case Instruction::UIToFP:
19234 case Instruction::Trunc:
19235 case Instruction::FPTrunc:
19236 case Instruction::BitCast: {
19237 setInsertPointAfterBundle(E);
19238
19239 Value *InVec = vectorizeOperand(E, 0);
19240
19241 auto *CI = cast<CastInst>(VL0);
19242 Instruction::CastOps VecOpcode = CI->getOpcode();
19243 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19244 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19245 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19246 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19247 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19248 // Check if the values are candidates to demote.
19249 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19250 if (SrcIt != MinBWs.end())
19251 SrcBWSz = SrcIt->second.first;
19252 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19253 if (BWSz == SrcBWSz) {
19254 VecOpcode = Instruction::BitCast;
19255 } else if (BWSz < SrcBWSz) {
19256 VecOpcode = Instruction::Trunc;
19257 } else if (It != MinBWs.end()) {
19258 assert(BWSz > SrcBWSz && "Invalid cast!");
19259 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19260 } else if (SrcIt != MinBWs.end()) {
19261 assert(BWSz > SrcBWSz && "Invalid cast!");
19262 VecOpcode =
19263 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19264 }
19265 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19266 !SrcIt->second.second) {
19267 VecOpcode = Instruction::UIToFP;
19268 }
19269 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19270 ? InVec
19271 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19272 V = FinalShuffle(V, E);
19273
19274 E->VectorizedValue = V;
19275 ++NumVectorInstructions;
19276 return V;
19277 }
19278 case Instruction::FCmp:
19279 case Instruction::ICmp: {
19280 setInsertPointAfterBundle(E);
19281
19282 Value *L = vectorizeOperand(E, 0);
19283 Value *R = vectorizeOperand(E, 1);
19284 if (L->getType() != R->getType()) {
19285 assert((getOperandEntry(E, 0)->isGather() ||
19286 getOperandEntry(E, 1)->isGather() ||
19287 MinBWs.contains(getOperandEntry(E, 0)) ||
19288 MinBWs.contains(getOperandEntry(E, 1))) &&
19289 "Expected item in MinBWs.");
19290 if (cast<VectorType>(L->getType())
19291 ->getElementType()
19292 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19293 ->getElementType()
19294 ->getIntegerBitWidth()) {
19295 Type *CastTy = R->getType();
19296 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19297 } else {
19298 Type *CastTy = L->getType();
19299 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19300 }
19301 }
19302
19303 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19304 Value *V = Builder.CreateCmp(P0, L, R);
19305 propagateIRFlags(V, E->Scalars, VL0);
19306 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19307 ICmp->setSameSign(/*B=*/false);
19308 // Do not cast for cmps.
19309 VecTy = cast<FixedVectorType>(V->getType());
19310 V = FinalShuffle(V, E);
19311
19312 E->VectorizedValue = V;
19313 ++NumVectorInstructions;
19314 return V;
19315 }
19316 case Instruction::Select: {
19317 setInsertPointAfterBundle(E);
19318
19319 Value *Cond = vectorizeOperand(E, 0);
19320 Value *True = vectorizeOperand(E, 1);
19321 Value *False = vectorizeOperand(E, 2);
19322 if (True->getType() != VecTy || False->getType() != VecTy) {
19323 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19324 getOperandEntry(E, 2)->isGather() ||
19325 MinBWs.contains(getOperandEntry(E, 1)) ||
19326 MinBWs.contains(getOperandEntry(E, 2))) &&
19327 "Expected item in MinBWs.");
19328 if (True->getType() != VecTy)
19329 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19330 if (False->getType() != VecTy)
19331 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19332 }
19333
19334 unsigned CondNumElements = getNumElements(Cond->getType());
19335 unsigned TrueNumElements = getNumElements(True->getType());
19336 assert(TrueNumElements >= CondNumElements &&
19337 TrueNumElements % CondNumElements == 0 &&
19338 "Cannot vectorize Instruction::Select");
19339 assert(TrueNumElements == getNumElements(False->getType()) &&
19340 "Cannot vectorize Instruction::Select");
19341 if (CondNumElements != TrueNumElements) {
19342 // When the condition is i1 per scalar but the operands are fixed vectors,
19343 // the condition value needs to be duplicated to match their element count.
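// E.g., a <4 x i1> condition driving 8-element true/false operands is
// widened with the replicated mask <0, 0, 1, 1, 2, 2, 3, 3>.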
19344 Cond = Builder.CreateShuffleVector(
19345 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19346 CondNumElements));
19347 }
19348 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19349 "Cannot vectorize Instruction::Select");
19350 Value *V = Builder.CreateSelect(Cond, True, False);
19351 V = FinalShuffle(V, E);
19352
19353 E->VectorizedValue = V;
19354 ++NumVectorInstructions;
19355 return V;
19356 }
19357 case Instruction::FNeg: {
19358 setInsertPointAfterBundle(E);
19359
19360 Value *Op = vectorizeOperand(E, 0);
19361
19362 Value *V = Builder.CreateUnOp(
19363 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19364 propagateIRFlags(V, E->Scalars, VL0);
19365 if (auto *I = dyn_cast<Instruction>(V))
19366 V = ::propagateMetadata(I, E->Scalars);
19367
19368 V = FinalShuffle(V, E);
19369
19370 E->VectorizedValue = V;
19371 ++NumVectorInstructions;
19372
19373 return V;
19374 }
19375 case Instruction::Freeze: {
19376 setInsertPointAfterBundle(E);
19377
19378 Value *Op = vectorizeOperand(E, 0);
19379
19380 if (Op->getType() != VecTy) {
19381 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19382 MinBWs.contains(getOperandEntry(E, 0))) &&
19383 "Expected item in MinBWs.");
19384 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19385 }
19386 Value *V = Builder.CreateFreeze(Op);
19387 V = FinalShuffle(V, E);
19388
19389 E->VectorizedValue = V;
19390 ++NumVectorInstructions;
19391
19392 return V;
19393 }
19394 case Instruction::Add:
19395 case Instruction::FAdd:
19396 case Instruction::Sub:
19397 case Instruction::FSub:
19398 case Instruction::Mul:
19399 case Instruction::FMul:
19400 case Instruction::UDiv:
19401 case Instruction::SDiv:
19402 case Instruction::FDiv:
19403 case Instruction::URem:
19404 case Instruction::SRem:
19405 case Instruction::FRem:
19406 case Instruction::Shl:
19407 case Instruction::LShr:
19408 case Instruction::AShr:
19409 case Instruction::And:
19410 case Instruction::Or:
19411 case Instruction::Xor: {
19412 setInsertPointAfterBundle(E);
19413
19414 Value *LHS = vectorizeOperand(E, 0);
19415 Value *RHS = vectorizeOperand(E, 1);
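// If this is an 'and' whose constant operand has at least as many trailing
// ones as the minimized bit width, the 'and' is a no-op on the demoted type
// (e.g. x & 0xFF once the computation is narrowed to i8), so the other
// operand can be reused directly.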
19416 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19417 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19418 ArrayRef<Value *> Ops = E->getOperand(I);
19419 if (all_of(Ops, [&](Value *Op) {
19420 auto *CI = dyn_cast<ConstantInt>(Op);
19421 return CI && CI->getValue().countr_one() >= It->second.first;
19422 })) {
19423 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19424 E->VectorizedValue = V;
19425 ++NumVectorInstructions;
19426 return V;
19427 }
19428 }
19429 }
19430 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19431 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19432 getOperandEntry(E, 1)->isGather() ||
19433 MinBWs.contains(getOperandEntry(E, 0)) ||
19434 MinBWs.contains(getOperandEntry(E, 1))) &&
19435 "Expected item in MinBWs.");
19436 if (LHS->getType() != VecTy)
19437 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19438 if (RHS->getType() != VecTy)
19439 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19440 }
19441
19442 Value *V = Builder.CreateBinOp(
19443 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19444 RHS);
19445 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19446 if (auto *I = dyn_cast<Instruction>(V)) {
19447 V = ::propagateMetadata(I, E->Scalars);
19448 // Drop nuw flags for abs(sub(commutative), true).
19449 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19450 any_of(E->Scalars, [](Value *V) {
19451 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19452 }))
19453 I->setHasNoUnsignedWrap(/*b=*/false);
19454 }
19455
19456 V = FinalShuffle(V, E);
19457
19458 E->VectorizedValue = V;
19459 ++NumVectorInstructions;
19460
19461 return V;
19462 }
19463 case Instruction::Load: {
19464 // Loads are inserted at the head of the tree because we don't want to
19465 // sink them all the way down past store instructions.
19466 setInsertPointAfterBundle(E);
19467
19468 LoadInst *LI = cast<LoadInst>(VL0);
19469 Instruction *NewLI;
19470 Value *PO = LI->getPointerOperand();
19471 if (E->State == TreeEntry::Vectorize) {
19472 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19473 } else if (E->State == TreeEntry::CompressVectorize) {
19474 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19475 CompressEntryToData.at(E);
19476 Align CommonAlignment = LI->getAlign();
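// Compressed loads: load one contiguous (optionally masked) vector and pick
// the needed lanes with a shuffle. Illustrative example: scalars at offsets
// {0,2,3} of a 4-wide block use the load mask <1,0,1,1> and the shuffle
// mask <0,2,3>.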
19477 if (IsMasked) {
19478 unsigned VF = getNumElements(LoadVecTy);
19479 SmallVector<Constant *> MaskValues(
19480 VF / getNumElements(LI->getType()),
19481 ConstantInt::getFalse(VecTy->getContext()));
19482 for (int I : CompressMask)
19483 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19484 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19485 assert(SLPReVec && "Only supported by REVEC.");
19486 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19487 }
19488 Constant *MaskValue = ConstantVector::get(MaskValues);
19489 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19490 MaskValue);
19491 } else {
19492 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19493 }
19494 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19495 // TODO: include this cost into CommonCost.
19496 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19497 assert(SLPReVec && "FixedVectorType is not expected.");
19498 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19499 CompressMask);
19500 }
19501 NewLI =
19502 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19503 } else if (E->State == TreeEntry::StridedVectorize) {
19504 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19505 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19506 PO = IsReverseOrder ? PtrN : Ptr0;
19507 std::optional<int64_t> Diff = getPointersDiff(
19508 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
19509 Type *StrideTy = DL->getIndexType(PO->getType());
19510 Value *StrideVal;
19511 if (Diff) {
19512 int64_t Stride =
19513 *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
19514 StrideVal =
19515 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
19516 DL->getTypeAllocSize(ScalarTy));
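// Illustrative example (forward order): four i32 loads spaced two elements
// apart give Diff = 6, so Stride = 6 / 3 = 2 elements and the byte stride
// passed to the intrinsic is 2 * 4 = 8.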
19517 } else {
19518 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
19519 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
19520 return cast<LoadInst>(V)->getPointerOperand();
19521 });
19522 OrdersType Order;
19523 std::optional<Value *> Stride =
19524 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
19525 &*Builder.GetInsertPoint());
19526 Value *NewStride =
19527 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
19528 StrideVal = Builder.CreateMul(
19529 NewStride,
19530 ConstantInt::get(
19531 StrideTy,
19532 (IsReverseOrder ? -1 : 1) *
19533 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
19534 }
19535 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19536 auto *Inst = Builder.CreateIntrinsic(
19537 Intrinsic::experimental_vp_strided_load,
19538 {VecTy, PO->getType(), StrideTy},
19539 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
19540 Builder.getInt32(E->Scalars.size())});
19541 Inst->addParamAttr(
19542 /*ArgNo=*/0,
19543 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19544 NewLI = Inst;
19545 } else {
19546 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19547 Value *VecPtr = vectorizeOperand(E, 0);
19548 if (isa<FixedVectorType>(ScalarTy)) {
19549 assert(SLPReVec && "FixedVectorType is not expected.");
19550 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19551 // to expand VecPtr if ScalarTy is a vector type.
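// Illustrative example: for ScalarTy = <2 x i32> and VecTy = <8 x i32>,
// VF = 4; the pointers are replicated with the mask <0,0,1,1,2,2,3,3> and
// offset by the indices <0,1,0,1,0,1,0,1> to form the 8 lane addresses.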
19552 unsigned ScalarTyNumElements =
19553 cast<FixedVectorType>(ScalarTy)->getNumElements();
19554 unsigned VecTyNumElements =
19555 cast<FixedVectorType>(VecTy)->getNumElements();
19556 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19557 "Cannot expand getelementptr.");
19558 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19559 SmallVector<Constant *> Indices(VecTyNumElements);
19560 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19561 return Builder.getInt64(I % ScalarTyNumElements);
19562 });
19563 VecPtr = Builder.CreateGEP(
19564 VecTy->getElementType(),
19565 Builder.CreateShuffleVector(
19566 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19567 ConstantVector::get(Indices));
19568 }
19569 // Use the minimum alignment of the gathered loads.
19570 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19571 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19572 }
19573 Value *V = E->State == TreeEntry::CompressVectorize
19574 ? NewLI
19575 : ::propagateMetadata(NewLI, E->Scalars);
19576
19577 V = FinalShuffle(V, E);
19578 E->VectorizedValue = V;
19579 ++NumVectorInstructions;
19580 return V;
19581 }
19582 case Instruction::Store: {
19583 auto *SI = cast<StoreInst>(VL0);
19584
19585 setInsertPointAfterBundle(E);
19586
19587 Value *VecValue = vectorizeOperand(E, 0);
19588 if (VecValue->getType() != VecTy)
19589 VecValue =
19590 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19591 VecValue = FinalShuffle(VecValue, E);
19592
19593 Value *Ptr = SI->getPointerOperand();
19594 Instruction *ST;
19595 if (E->State == TreeEntry::Vectorize) {
19596 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19597 } else {
19598 assert(E->State == TreeEntry::StridedVectorize &&
19599 "Expected either strided or consecutive stores.");
19600 if (!E->ReorderIndices.empty()) {
19601 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19602 Ptr = SI->getPointerOperand();
19603 }
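// The stride is the constant -sizeof(element), so the lanes are written to
// consecutive decreasing addresses; this path handles the reversed
// consecutive-store case.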
19604 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19605 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19606 auto *Inst = Builder.CreateIntrinsic(
19607 Intrinsic::experimental_vp_strided_store,
19608 {VecTy, Ptr->getType(), StrideTy},
19609 {VecValue, Ptr,
19610 ConstantInt::get(
19611 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19612 Builder.getAllOnesMask(VecTy->getElementCount()),
19613 Builder.getInt32(E->Scalars.size())});
19614 Inst->addParamAttr(
19615 /*ArgNo=*/1,
19616 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19617 ST = Inst;
19618 }
19619
19620 Value *V = ::propagateMetadata(ST, E->Scalars);
19621
19622 E->VectorizedValue = V;
19623 ++NumVectorInstructions;
19624 return V;
19625 }
19626 case Instruction::GetElementPtr: {
19627 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19628 setInsertPointAfterBundle(E);
19629
19630 Value *Op0 = vectorizeOperand(E, 0);
19631
19632 SmallVector<Value *> OpVecs;
19633 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19634 Value *OpVec = vectorizeOperand(E, J);
19635 OpVecs.push_back(OpVec);
19636 }
19637
19638 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19639 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19640 SmallVector<Value *> GEPs;
19641 for (Value *V : E->Scalars) {
19642 if (isa<GetElementPtrInst>(V))
19643 GEPs.push_back(V);
19644 }
19645 V = ::propagateMetadata(I, GEPs);
19646 }
19647
19648 V = FinalShuffle(V, E);
19649
19650 E->VectorizedValue = V;
19651 ++NumVectorInstructions;
19652
19653 return V;
19654 }
19655 case Instruction::Call: {
19656 CallInst *CI = cast<CallInst>(VL0);
19657 setInsertPointAfterBundle(E);
19658
19659 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19660
19661 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19662 CI, ID, VecTy->getNumElements(),
19663 It != MinBWs.end() ? It->second.first : 0, TTI);
19664 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19665 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19666 VecCallCosts.first <= VecCallCosts.second;
19667
19668 Value *ScalarArg = nullptr;
19669 SmallVector<Value *> OpVecs;
19670 SmallVector<Type *, 2> TysForDecl;
19671 // Add return type if intrinsic is overloaded on it.
19672 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19673 TysForDecl.push_back(VecTy);
19674 auto *CEI = cast<CallInst>(VL0);
19675 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19676 // Some intrinsics have scalar arguments. This argument should not be
19677 // vectorized.
19678 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19679 ScalarArg = CEI->getArgOperand(I);
19680 // If we decided to reduce the bitwidth of the abs intrinsic, its second argument
19681 // must be set to false (do not return poison if the value is the signed min).
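// For example, a value narrowed from i32 to i8 may now equal the i8 signed
// minimum; with the flag set the narrowed abs would be poison, so the flag
// is cleared to avoid introducing poison the original wide abs did not have.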
19682 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19683 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19684 ScalarArg = Builder.getFalse();
19685 OpVecs.push_back(ScalarArg);
19686 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19687 TysForDecl.push_back(ScalarArg->getType());
19688 continue;
19689 }
19690
19691 Value *OpVec = vectorizeOperand(E, I);
19692 ScalarArg = CEI->getArgOperand(I);
19693 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19694 ScalarArg->getType()->getScalarType() &&
19695 It == MinBWs.end()) {
19696 auto *CastTy =
19697 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19698 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19699 } else if (It != MinBWs.end()) {
19700 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19701 }
19702 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19703 OpVecs.push_back(OpVec);
19704 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19705 TysForDecl.push_back(OpVec->getType());
19706 }
19707
19708 Function *CF;
19709 if (!UseIntrinsic) {
19710 VFShape Shape =
19711 VFShape::get(CI->getFunctionType(),
19712 ElementCount::getFixed(VecTy->getNumElements()),
19713 false /*HasGlobalPred*/);
19714 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19715 } else {
19716 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19717 }
19718
19719 SmallVector<OperandBundleDef, 1> OpBundles;
19720 CI->getOperandBundlesAsDefs(OpBundles);
19721 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19722
19723 propagateIRFlags(V, E->Scalars, VL0);
19724 V = FinalShuffle(V, E);
19725
19726 E->VectorizedValue = V;
19727 ++NumVectorInstructions;
19728 return V;
19729 }
19730 case Instruction::ShuffleVector: {
19731 Value *V;
19732 if (SLPReVec && !E->isAltShuffle()) {
19733 setInsertPointAfterBundle(E);
19734 Value *Src = vectorizeOperand(E, 0);
19735 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19736 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19737 SmallVector<int> NewMask(ThisMask.size());
19738 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19739 return SVSrc->getShuffleMask()[Mask];
19740 });
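// This composes the two masks so that a shuffle of a shuffle is folded into
// a single shuffle of SVSrc's original operands.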
19741 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19742 SVSrc->getOperand(1), NewMask);
19743 } else {
19744 V = Builder.CreateShuffleVector(Src, ThisMask);
19745 }
19746 propagateIRFlags(V, E->Scalars, VL0);
19747 if (auto *I = dyn_cast<Instruction>(V))
19748 V = ::propagateMetadata(I, E->Scalars);
19749 V = FinalShuffle(V, E);
19750 } else {
19751 assert(E->isAltShuffle() &&
19752 ((Instruction::isBinaryOp(E->getOpcode()) &&
19753 Instruction::isBinaryOp(E->getAltOpcode())) ||
19754 (Instruction::isCast(E->getOpcode()) &&
19755 Instruction::isCast(E->getAltOpcode())) ||
19756 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19757 "Invalid Shuffle Vector Operand");
19758
19759 Value *LHS = nullptr, *RHS = nullptr;
19760 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19761 setInsertPointAfterBundle(E);
19762 LHS = vectorizeOperand(E, 0);
19763 RHS = vectorizeOperand(E, 1);
19764 } else {
19765 setInsertPointAfterBundle(E);
19766 LHS = vectorizeOperand(E, 0);
19767 }
19768 if (LHS && RHS &&
19769 ((Instruction::isBinaryOp(E->getOpcode()) &&
19770 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19771 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19772 assert((It != MinBWs.end() ||
19773 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19774 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19775 MinBWs.contains(getOperandEntry(E, 0)) ||
19776 MinBWs.contains(getOperandEntry(E, 1))) &&
19777 "Expected item in MinBWs.");
19778 Type *CastTy = VecTy;
19779 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19780 if (cast<VectorType>(LHS->getType())
19781 ->getElementType()
19782 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19783 ->getElementType()
19784 ->getIntegerBitWidth())
19785 CastTy = RHS->getType();
19786 else
19787 CastTy = LHS->getType();
19788 }
19789 if (LHS->getType() != CastTy)
19790 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19791 if (RHS->getType() != CastTy)
19792 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19793 }
19794
19795 Value *V0, *V1;
19796 if (Instruction::isBinaryOp(E->getOpcode())) {
19797 V0 = Builder.CreateBinOp(
19798 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19799 V1 = Builder.CreateBinOp(
19800 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19801 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19802 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19803 auto *AltCI = cast<CmpInst>(E->getAltOp());
19804 CmpInst::Predicate AltPred = AltCI->getPredicate();
19805 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19806 } else {
19807 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19808 unsigned SrcBWSz = DL->getTypeSizeInBits(
19809 cast<VectorType>(LHS->getType())->getElementType());
19810 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19811 if (BWSz <= SrcBWSz) {
19812 if (BWSz < SrcBWSz)
19813 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
19814 assert(LHS->getType() == VecTy &&
19815 "Expected same type as operand.");
19816 if (auto *I = dyn_cast<Instruction>(LHS))
19817 LHS = ::propagateMetadata(I, E->Scalars);
19818 LHS = FinalShuffle(LHS, E);
19819 E->VectorizedValue = LHS;
19820 ++NumVectorInstructions;
19821 return LHS;
19822 }
19823 }
19824 V0 = Builder.CreateCast(
19825 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
19826 V1 = Builder.CreateCast(
19827 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
19828 }
19829 // Add V0 and V1 to later analysis to try to find and remove matching
19830 // instruction, if any.
19831 for (Value *V : {V0, V1}) {
19832 if (auto *I = dyn_cast<Instruction>(V)) {
19833 GatherShuffleExtractSeq.insert(I);
19834 CSEBlocks.insert(I->getParent());
19835 }
19836 }
19837
19838 // Create shuffle to take alternate operations from the vector.
19839 // Also, gather up main and alt scalar ops to propagate IR flags to
19840 // each vector operation.
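// Illustrative example: for scalars {a+b, c-d, e+f, g-h}, V0 is the vector
// add, V1 the vector sub, and the mask selects lanes <0, 5, 2, 7>.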
19841 ValueList OpScalars, AltScalars;
19842 SmallVector<int> Mask;
19843 E->buildAltOpShuffleMask(
19844 [E, this](Instruction *I) {
19845 assert(E->getMatchingMainOpOrAltOp(I) &&
19846 "Unexpected main/alternate opcode");
19847 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
19848 *TLI);
19849 },
19850 Mask, &OpScalars, &AltScalars);
19851
19852 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
19853 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
19854 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
19855 // Drop nuw flags for abs(sub(commutative), true).
19856 if (auto *I = dyn_cast<Instruction>(Vec);
19857 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
19858 any_of(E->Scalars, [](Value *V) {
19859 if (isa<PoisonValue>(V))
19860 return false;
19861 auto *IV = cast<Instruction>(V);
19862 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19863 }))
19864 I->setHasNoUnsignedWrap(/*b=*/false);
19865 };
19866 DropNuwFlag(V0, E->getOpcode());
19867 DropNuwFlag(V1, E->getAltOpcode());
19868
19869 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19870 assert(SLPReVec && "FixedVectorType is not expected.");
19871 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
19872 }
19873 V = Builder.CreateShuffleVector(V0, V1, Mask);
19874 if (auto *I = dyn_cast<Instruction>(V)) {
19875 V = ::propagateMetadata(I, E->Scalars);
19876 GatherShuffleExtractSeq.insert(I);
19877 CSEBlocks.insert(I->getParent());
19878 }
19879 }
19880
19881 E->VectorizedValue = V;
19882 ++NumVectorInstructions;
19883
19884 return V;
19885 }
19886 default:
19887 llvm_unreachable("unknown inst");
19888 }
19889 return nullptr;
19890}
19891
19892 Value *BoUpSLP::vectorizeTree() {
19893 ExtraValueToDebugLocsMap ExternallyUsedValues;
19894 return vectorizeTree(ExternallyUsedValues);
19895}
19896
19897 Value *BoUpSLP::vectorizeTree(
19898 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
19899 Instruction *ReductionRoot,
19900 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
19901 // Clean the Entry-to-LastInstruction table. It can be affected by scheduling,
19902 // so we need to rebuild it.
19903 EntryToLastInstruction.clear();
19904 // All blocks must be scheduled before any instructions are inserted.
19905 for (auto &BSIter : BlocksSchedules)
19906 scheduleBlock(*this, BSIter.second.get());
19907 // Cache last instructions for the nodes to avoid side effects, which may
19908 // appear during vectorization, like extra uses, etc.
19909 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19910 if (TE->isGather())
19911 continue;
19912 (void)getLastInstructionInBundle(TE.get());
19913 }
19914
19915 if (ReductionRoot)
19916 Builder.SetInsertPoint(ReductionRoot->getParent(),
19917 ReductionRoot->getIterator());
19918 else
19919 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
19920
19921 // Vectorize gather operands of the nodes with the external uses only.
19922 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
19923 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19924 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
19925 TE->UserTreeIndex.UserTE->hasState() &&
19926 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
19927 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
19928 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
19929 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
19930 all_of(TE->UserTreeIndex.UserTE->Scalars,
19931 [](Value *V) { return isUsedOutsideBlock(V); })) {
19932 Instruction &LastInst =
19933 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
19934 GatherEntries.emplace_back(TE.get(), &LastInst);
19935 }
19936 }
19937 for (auto &Entry : GatherEntries) {
19938 IRBuilderBase::InsertPointGuard Guard(Builder);
19939 Builder.SetInsertPoint(Entry.second);
19940 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
19941 (void)vectorizeTree(Entry.first);
19942 }
19943 // Emit gathered loads first to generate better code for the users of those
19944 // gathered loads.
19945 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19946 if (GatheredLoadsEntriesFirst.has_value() &&
19947 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
19948 (!TE->isGather() || TE->UserTreeIndex)) {
19949 assert((TE->UserTreeIndex ||
19950 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
19951 "Expected gathered load node.");
19952 (void)vectorizeTree(TE.get());
19953 }
19954 }
19955 (void)vectorizeTree(VectorizableTree[0].get());
19956 // Run through the list of postponed gathers and emit them, replacing the temp
19957 // emitted allocas with actual vector instructions.
19958 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
19959 SmallDenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
19960 for (const TreeEntry *E : PostponedNodes) {
19961 auto *TE = const_cast<TreeEntry *>(E);
19962 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
19963 TE->VectorizedValue = nullptr;
19964 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
19965 // If the user is a PHI node, its vector code has to be inserted right before
19966 // the block terminator. Since the node was delayed, there were some unresolved
19967 // dependencies at the moment when the stub instruction was emitted. If any of
19968 // these dependencies turns out to be an operand of another PHI coming from
19969 // this same block, the position of the stub instruction becomes invalid.
19970 // This is because the source vector that is supposed to feed this gather node
19971 // was inserted at the end of the block [after the stub instruction]. So we
19972 // need to adjust the insertion point again to the end of the block.
19973 if (isa<PHINode>(UserI)) {
19974 // Insert before all users.
19975 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
19976 for (User *U : PrevVec->users()) {
19977 if (U == UserI)
19978 continue;
19979 auto *UI = dyn_cast<Instruction>(U);
19980 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
19981 continue;
19982 if (UI->comesBefore(InsertPt))
19983 InsertPt = UI;
19984 }
19985 Builder.SetInsertPoint(InsertPt);
19986 } else {
19987 Builder.SetInsertPoint(PrevVec);
19988 }
19989 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
19990 Value *Vec = vectorizeTree(TE);
19991 if (auto *VecI = dyn_cast<Instruction>(Vec);
19992 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
19993 Builder.GetInsertPoint()->comesBefore(VecI))
19994 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
19995 Builder.GetInsertPoint());
19996 if (Vec->getType() != PrevVec->getType()) {
19997 assert(Vec->getType()->isIntOrIntVectorTy() &&
19998 PrevVec->getType()->isIntOrIntVectorTy() &&
19999 "Expected integer vector types only.");
20000 std::optional<bool> IsSigned;
20001 for (Value *V : TE->Scalars) {
20002 if (isVectorized(V)) {
20003 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20004 auto It = MinBWs.find(MNTE);
20005 if (It != MinBWs.end()) {
20006 IsSigned = IsSigned.value_or(false) || It->second.second;
20007 if (*IsSigned)
20008 break;
20009 }
20010 }
20011 if (IsSigned.value_or(false))
20012 break;
20013 // Scan through gather nodes.
20014 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20015 auto It = MinBWs.find(BVE);
20016 if (It != MinBWs.end()) {
20017 IsSigned = IsSigned.value_or(false) || It->second.second;
20018 if (*IsSigned)
20019 break;
20020 }
20021 }
20022 if (IsSigned.value_or(false))
20023 break;
20024 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20025 IsSigned =
20026 IsSigned.value_or(false) ||
20027 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20028 continue;
20029 }
20030 if (IsSigned.value_or(false))
20031 break;
20032 }
20033 }
20034 if (IsSigned.value_or(false)) {
20035 // Final attempt - check user node.
20036 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20037 if (It != MinBWs.end())
20038 IsSigned = It->second.second;
20039 }
20040 assert(IsSigned &&
20041 "Expected user node or perfect diamond match in MinBWs.");
20042 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20043 }
20044 PrevVec->replaceAllUsesWith(Vec);
20045 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20046 // Replace the stub vector node, if it was already used for one of the
20047 // buildvector nodes.
20048 auto It = PostponedValues.find(PrevVec);
20049 if (It != PostponedValues.end()) {
20050 for (TreeEntry *VTE : It->getSecond())
20051 VTE->VectorizedValue = Vec;
20052 }
20053 eraseInstruction(PrevVec);
20054 }
20055
20056 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20057 << " values .\n");
20058
20059 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20060 // Maps vector instruction to original insertelement instruction
20061 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20062 // Maps extract Scalar to the corresponding extractelement instruction in the
20063 // basic block. Only one extractelement per block should be emitted.
20064 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20065 ScalarToEEs;
20066 SmallDenseSet<Value *, 4> UsedInserts;
20067 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20068 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20069 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20070 // Extract all of the elements with the external uses.
20071 for (const auto &ExternalUse : ExternalUses) {
20072 Value *Scalar = ExternalUse.Scalar;
20073 llvm::User *User = ExternalUse.User;
20074
20075 // Skip users that we already RAUW. This happens when one instruction
20076 // has multiple uses of the same value.
20077 if (User && !is_contained(Scalar->users(), User))
20078 continue;
20079 const TreeEntry *E = &ExternalUse.E;
20080 assert(E && "Invalid scalar");
20081 assert(!E->isGather() && "Extracting from a gather list");
20082 // Non-instruction pointers are not deleted, just skip them.
20083 if (E->getOpcode() == Instruction::GetElementPtr &&
20084 !isa<GetElementPtrInst>(Scalar))
20085 continue;
20086
20087 Value *Vec = E->VectorizedValue;
20088 assert(Vec && "Can't find vectorizable value");
20089
20090 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20091 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20092 if (Scalar->getType() != Vec->getType()) {
20093 Value *Ex = nullptr;
20094 Value *ExV = nullptr;
20095 auto *Inst = dyn_cast<Instruction>(Scalar);
20096 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20097 auto It = ScalarToEEs.find(Scalar);
20098 if (It != ScalarToEEs.end()) {
20099 // No need to emit many extracts, just move the only one in the
20100 // current block.
20101 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20102 : Builder.GetInsertBlock());
20103 if (EEIt != It->second.end()) {
20104 Value *PrevV = EEIt->second.first;
20105 if (auto *I = dyn_cast<Instruction>(PrevV);
20106 I && !ReplaceInst &&
20107 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20108 Builder.GetInsertPoint()->comesBefore(I)) {
20109 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20110 Builder.GetInsertPoint());
20111 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20112 CI->moveAfter(I);
20113 }
20114 Ex = PrevV;
20115 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20116 }
20117 }
20118 if (!Ex) {
20119 // "Reuse" the existing extract to improve final codegen.
20120 if (ReplaceInst) {
20121 // Leave the instruction as is if it is a cheaper extract and all of its
20122 // operands are scalar.
20123 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20124 IgnoredExtracts.insert(EE);
20125 Ex = EE;
20126 } else {
20127 auto *CloneInst = Inst->clone();
20128 CloneInst->insertBefore(Inst->getIterator());
20129 if (Inst->hasName())
20130 CloneInst->takeName(Inst);
20131 Ex = CloneInst;
20132 }
20133 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20134 ES && isa<Instruction>(Vec)) {
20135 Value *V = ES->getVectorOperand();
20136 auto *IVec = cast<Instruction>(Vec);
20137 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20138 V = ETEs.front()->VectorizedValue;
20139 if (auto *IV = dyn_cast<Instruction>(V);
20140 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20141 IV->comesBefore(IVec))
20142 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20143 else
20144 Ex = Builder.CreateExtractElement(Vec, Lane);
20145 } else if (auto *VecTy =
20146 dyn_cast<FixedVectorType>(Scalar->getType())) {
20147 assert(SLPReVec && "FixedVectorType is not expected.");
20148 unsigned VecTyNumElements = VecTy->getNumElements();
20149 // When REVEC is enabled, we need to extract a vector.
20150 // Note: The element size of Scalar may be different from the
20151 // element size of Vec.
20152 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20153 ExternalUse.Lane * VecTyNumElements);
20154 } else {
20155 Ex = Builder.CreateExtractElement(Vec, Lane);
20156 }
20157 // If necessary, sign-extend or zero-extend ScalarRoot
20158 // to the larger type.
20159 ExV = Ex;
20160 if (Scalar->getType() != Ex->getType())
20161 ExV = Builder.CreateIntCast(
20162 Ex, Scalar->getType(),
20163 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20164 auto *I = dyn_cast<Instruction>(Ex);
20165 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20166 : &F->getEntryBlock(),
20167 std::make_pair(Ex, ExV));
20168 }
20169 // The then branch of the previous if may produce constants, since
20170 // operand 0 might be a constant.
20171 if (auto *ExI = dyn_cast<Instruction>(Ex);
20172 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20173 GatherShuffleExtractSeq.insert(ExI);
20174 CSEBlocks.insert(ExI->getParent());
20175 }
20176 return ExV;
20177 }
20178 assert(isa<FixedVectorType>(Scalar->getType()) &&
20179 isa<InsertElementInst>(Scalar) &&
20180 "In-tree scalar of vector type is not insertelement?");
20181 auto *IE = cast<InsertElementInst>(Scalar);
20182 VectorToInsertElement.try_emplace(Vec, IE);
20183 return Vec;
20184 };
20185 // If User == nullptr, the Scalar remains as a scalar in the vectorized
20186 // instructions or is used as an extra argument. Generate an ExtractElement
20187 // instruction and update the record for this scalar in ExternallyUsedValues.
20188 if (!User) {
20189 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20190 continue;
20191 assert(
20192 (ExternallyUsedValues.count(Scalar) ||
20193 ExternalUsesWithNonUsers.count(Scalar) ||
20194 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20195 any_of(
20196 Scalar->users(),
20197 [&, TTI = TTI](llvm::User *U) {
20198 if (ExternalUsesAsOriginalScalar.contains(U))
20199 return true;
20200 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20201 return !UseEntries.empty() &&
20202 (E->State == TreeEntry::Vectorize ||
20203 E->State == TreeEntry::StridedVectorize ||
20204 E->State == TreeEntry::CompressVectorize) &&
20205 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20206 return (UseEntry->State == TreeEntry::Vectorize ||
20207 UseEntry->State ==
20208 TreeEntry::StridedVectorize ||
20209 UseEntry->State ==
20210 TreeEntry::CompressVectorize) &&
20211 doesInTreeUserNeedToExtract(
20212 Scalar, getRootEntryInstruction(*UseEntry),
20213 TLI, TTI);
20214 });
20215 })) &&
20216 "Scalar with nullptr User must be registered in "
20217 "ExternallyUsedValues map or remain as scalar in vectorized "
20218 "instructions");
20219 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20220 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20221 if (PHI->getParent()->isLandingPad())
20222 Builder.SetInsertPoint(
20223 PHI->getParent(),
20224 std::next(
20225 PHI->getParent()->getLandingPadInst()->getIterator()));
20226 else
20227 Builder.SetInsertPoint(PHI->getParent(),
20228 PHI->getParent()->getFirstNonPHIIt());
20229 } else {
20230 Builder.SetInsertPoint(VecI->getParent(),
20231 std::next(VecI->getIterator()));
20232 }
20233 } else {
20234 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20235 }
20236 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20237 // Required to update internally referenced instructions.
20238 if (Scalar != NewInst) {
20239 assert((!isa<ExtractElementInst>(Scalar) ||
20240 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20241 "Extractelements should not be replaced.");
20242 Scalar->replaceAllUsesWith(NewInst);
20243 }
20244 continue;
20245 }
20246
20247 if (auto *VU = dyn_cast<InsertElementInst>(User);
20248 VU && VU->getOperand(1) == Scalar) {
20249 // Skip if the scalar is another vector op or Vec is not an instruction.
20250 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20251 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20252 if (!UsedInserts.insert(VU).second)
20253 continue;
20254 // Need to use original vector, if the root is truncated.
20255 auto BWIt = MinBWs.find(E);
20256 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20257 auto *ScalarTy = FTy->getElementType();
20258 auto Key = std::make_pair(Vec, ScalarTy);
20259 auto VecIt = VectorCasts.find(Key);
20260 if (VecIt == VectorCasts.end()) {
20261 IRBuilderBase::InsertPointGuard Guard(Builder);
20262 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20263 if (IVec->getParent()->isLandingPad())
20264 Builder.SetInsertPoint(IVec->getParent(),
20265 std::next(IVec->getParent()
20266 ->getLandingPadInst()
20267 ->getIterator()));
20268 else
20269 Builder.SetInsertPoint(
20270 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20271 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20272 Builder.SetInsertPoint(IVec->getNextNode());
20273 }
20274 Vec = Builder.CreateIntCast(
20275 Vec,
20276 getWidenedType(
20277 ScalarTy,
20278 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20279 BWIt->second.second);
20280 VectorCasts.try_emplace(Key, Vec);
20281 } else {
20282 Vec = VecIt->second;
20283 }
20284 }
20285
20286 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20287 if (InsertIdx) {
20288 auto *It = find_if(
20289 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20290 // Checks if 2 insertelements are from the same buildvector.
20291 InsertElementInst *VecInsert = Data.InsertElements.front();
20292 return areTwoInsertFromSameBuildVector(
20293 VU, VecInsert,
20294 [](InsertElementInst *II) { return II->getOperand(0); });
20295 });
20296 unsigned Idx = *InsertIdx;
20297 if (It == ShuffledInserts.end()) {
20298 (void)ShuffledInserts.emplace_back();
20299 It = std::next(ShuffledInserts.begin(),
20300 ShuffledInserts.size() - 1);
20301 }
20302 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20303 if (Mask.empty())
20304 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20305 Mask[Idx] = ExternalUse.Lane;
20306 It->InsertElements.push_back(cast<InsertElementInst>(User));
20307 continue;
20308 }
20309 }
20310 }
20311 }
20312
20313 // Generate extracts for out-of-tree users.
20314 // Find the insertion point for the extractelement lane.
20315 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20316 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20317 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20318 if (PH->getIncomingValue(I) == Scalar) {
20319 Instruction *IncomingTerminator =
20320 PH->getIncomingBlock(I)->getTerminator();
20321 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20322 Builder.SetInsertPoint(VecI->getParent(),
20323 std::next(VecI->getIterator()));
20324 } else {
20325 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20326 }
20327 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20328 PH->setOperand(I, NewInst);
20329 }
20330 }
20331 } else {
20332 Builder.SetInsertPoint(cast<Instruction>(User));
20333 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20334 User->replaceUsesOfWith(Scalar, NewInst);
20335 }
20336 } else {
20337 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20338 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20339 User->replaceUsesOfWith(Scalar, NewInst);
20340 }
20341
20342 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20343 }
20344
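// Helper that splits a combined two-source shuffle mask into per-operand
// masks: indices below VF pick from V1, indices >= VF are rebased into V2
// (e.g. <0,5,2,7> with VF = 4 becomes <0,poison,2,poison> and
// <poison,1,poison,3>).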
20345 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20346 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20347 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20348 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20349 for (int I = 0, E = Mask.size(); I < E; ++I) {
20350 if (Mask[I] < VF)
20351 CombinedMask1[I] = Mask[I];
20352 else
20353 CombinedMask2[I] = Mask[I] - VF;
20354 }
20355 ShuffleInstructionBuilder ShuffleBuilder(
20356 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20357 ShuffleBuilder.add(V1, CombinedMask1);
20358 if (V2)
20359 ShuffleBuilder.add(V2, CombinedMask2);
20360 return ShuffleBuilder.finalize({}, {}, {});
20361 };
20362
20363 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20364 bool ForSingleMask) {
20365 unsigned VF = Mask.size();
20366 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20367 if (VF != VecVF) {
20368 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20369 Vec = CreateShuffle(Vec, nullptr, Mask);
20370 return std::make_pair(Vec, true);
20371 }
20372 if (!ForSingleMask) {
20373 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20374 for (unsigned I = 0; I < VF; ++I) {
20375 if (Mask[I] != PoisonMaskElem)
20376 ResizeMask[Mask[I]] = Mask[I];
20377 }
20378 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20379 }
20380 }
20381
20382 return std::make_pair(Vec, false);
20383 };
20384 // Perform shuffling of the vectorize tree entries for better handling of
20385 // external extracts.
20386 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20387 // Find the first and the last instruction in the list of insertelements.
20388 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20389 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20390 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20391 Builder.SetInsertPoint(LastInsert);
20392 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20393 Value *NewInst = performExtractsShuffleAction<Value>(
20394 MutableArrayRef(Vector.data(), Vector.size()),
20395 FirstInsert->getOperand(0),
20396 [](Value *Vec) {
20397 return cast<VectorType>(Vec->getType())
20398 ->getElementCount()
20399 .getKnownMinValue();
20400 },
20401 ResizeToVF,
20402 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20403 ArrayRef<Value *> Vals) {
20404 assert((Vals.size() == 1 || Vals.size() == 2) &&
20405 "Expected exactly 1 or 2 input values.");
20406 if (Vals.size() == 1) {
20407 // Do not create shuffle if the mask is a simple identity
20408 // non-resizing mask.
20409 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20410 ->getNumElements() ||
20411 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20412 return CreateShuffle(Vals.front(), nullptr, Mask);
20413 return Vals.front();
20414 }
20415 return CreateShuffle(Vals.front() ? Vals.front()
20416 : FirstInsert->getOperand(0),
20417 Vals.back(), Mask);
20418 });
20419 auto It = ShuffledInserts[I].InsertElements.rbegin();
20420 // Rebuild buildvector chain.
20421 InsertElementInst *II = nullptr;
20422 if (It != ShuffledInserts[I].InsertElements.rend())
20423 II = *It;
20424 SmallVector<Instruction *> Inserts;
20425 while (It != ShuffledInserts[I].InsertElements.rend()) {
20426 assert(II && "Must be an insertelement instruction.");
20427 if (*It == II)
20428 ++It;
20429 else
20430 Inserts.push_back(cast<Instruction>(II));
20431 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20432 }
20433 for (Instruction *II : reverse(Inserts)) {
20434 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20435 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20436 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20437 II->moveAfter(NewI);
20438 NewInst = II;
20439 }
20440 LastInsert->replaceAllUsesWith(NewInst);
20441 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20442 IE->replaceUsesOfWith(IE->getOperand(0),
20443 PoisonValue::get(IE->getOperand(0)->getType()));
20444 IE->replaceUsesOfWith(IE->getOperand(1),
20445 PoisonValue::get(IE->getOperand(1)->getType()));
20446 eraseInstruction(IE);
20447 }
20448 CSEBlocks.insert(LastInsert->getParent());
20449 }
20450
20451 SmallVector<Instruction *> RemovedInsts;
20452 // For each vectorized value:
20453 for (auto &TEPtr : VectorizableTree) {
20454 TreeEntry *Entry = TEPtr.get();
20455
20456 // No need to handle users of gathered values.
20457 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20458 continue;
20459
20460 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20461
20462 // For each lane:
20463 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20464 Value *Scalar = Entry->Scalars[Lane];
20465
20466 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20467 !isa<GetElementPtrInst>(Scalar))
20468 continue;
20469 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20470 EE && IgnoredExtracts.contains(EE))
20471 continue;
20472 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20473 continue;
20474#ifndef NDEBUG
20475 Type *Ty = Scalar->getType();
20476 if (!Ty->isVoidTy()) {
20477 for (User *U : Scalar->users()) {
20478 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20479
20480 // It is legal to delete users in the ignorelist.
20481 assert((isVectorized(U) ||
20482 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20483 (isa_and_nonnull<Instruction>(U) &&
20484 isDeleted(cast<Instruction>(U)))) &&
20485 "Deleting out-of-tree value");
20486 }
20487 }
20488#endif
20489 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20490 auto *I = cast<Instruction>(Scalar);
20491 RemovedInsts.push_back(I);
20492 }
20493 }
20494
20495 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20496 // new vector instruction.
20497 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20498 V->mergeDIAssignID(RemovedInsts);
20499
20500 // Clear up reduction references, if any.
20501 if (UserIgnoreList) {
20502 for (Instruction *I : RemovedInsts) {
20503 const TreeEntry *IE = getTreeEntries(I).front();
20504 if (IE->Idx != 0 &&
20505 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20506 (ValueToGatherNodes.lookup(I).contains(
20507 VectorizableTree.front().get()) ||
20508 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20509 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20510 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20511 IE->UserTreeIndex &&
20512 is_contained(VectorizableTree.front()->Scalars, I)) &&
20513 !(GatheredLoadsEntriesFirst.has_value() &&
20514 IE->Idx >= *GatheredLoadsEntriesFirst &&
20515 VectorizableTree.front()->isGather() &&
20516 is_contained(VectorizableTree.front()->Scalars, I)) &&
20517 !(!VectorizableTree.front()->isGather() &&
20518 VectorizableTree.front()->isCopyableElement(I)))
20519 continue;
20520 SmallVector<SelectInst *> LogicalOpSelects;
20521 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20522 // Do not replace condition of the logical op in form select <cond>.
20523 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20524 (match(U.getUser(), m_LogicalAnd()) ||
20525 match(U.getUser(), m_LogicalOr())) &&
20526 U.getOperandNo() == 0;
20527 if (IsPoisoningLogicalOp) {
20528 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20529 return false;
20530 }
20531 return UserIgnoreList->contains(U.getUser());
20532 });
20533 // Replace conditions of the poisoning logical ops with the non-poison
20534 // constant value.
20535 for (SelectInst *SI : LogicalOpSelects)
20536 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20537 }
20538 }
20539 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20540 // cache correctness.
20541 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
20542 // - instructions are not deleted until later.
20543 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20544
20545 Builder.ClearInsertionPoint();
20546 InstrElementSize.clear();
20547
20548 const TreeEntry &RootTE = *VectorizableTree.front();
20549 Value *Vec = RootTE.VectorizedValue;
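// If the root node was demoted to a bit width different from the one the
// reduction expects, emit one final cast to the reduction bit width
// (signedness taken from the MinBWs entry).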
20550 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20551 It != MinBWs.end() &&
20552 ReductionBitWidth != It->second.first) {
20553 IRBuilder<>::InsertPointGuard Guard(Builder);
20554 Builder.SetInsertPoint(ReductionRoot->getParent(),
20555 ReductionRoot->getIterator());
20556 Vec = Builder.CreateIntCast(
20557 Vec,
20558 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20559 cast<VectorType>(Vec->getType())->getElementCount()),
20560 It->second.second);
20561 }
20562 return Vec;
20563}
20564
20565 void BoUpSLP::optimizeGatherSequence() {
20566 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20567 << " gather sequences instructions.\n");
20568 // LICM InsertElementInst sequences.
20569 for (Instruction *I : GatherShuffleExtractSeq) {
20570 if (isDeleted(I))
20571 continue;
20572
20573 // Check if this block is inside a loop.
20574 Loop *L = LI->getLoopFor(I->getParent());
20575 if (!L)
20576 continue;
20577
20578 // Check if it has a preheader.
20579 BasicBlock *PreHeader = L->getLoopPreheader();
20580 if (!PreHeader)
20581 continue;
20582
20583 // If the vector or the element that we insert into it are
20584 // instructions that are defined in this basic block then we can't
20585 // hoist this instruction.
20586 if (any_of(I->operands(), [L](Value *V) {
20587 auto *OpI = dyn_cast<Instruction>(V);
20588 return OpI && L->contains(OpI);
20589 }))
20590 continue;
20591
20592 // We can hoist this instruction. Move it to the pre-header.
20593 I->moveBefore(PreHeader->getTerminator()->getIterator());
20594 CSEBlocks.insert(PreHeader);
20595 }
20596
20597 // Make a list of all reachable blocks in our CSE queue.
20598 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20599 CSEWorkList.reserve(CSEBlocks.size());
20600 for (BasicBlock *BB : CSEBlocks)
20601 if (DomTreeNode *N = DT->getNode(BB)) {
20602 assert(DT->isReachableFromEntry(N));
20603 CSEWorkList.push_back(N);
20604 }
20605
20606 // Sort blocks by domination. This ensures we visit a block after all blocks
20607 // dominating it are visited.
20608 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20609 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20610 "Different nodes should have different DFS numbers");
20611 return A->getDFSNumIn() < B->getDFSNumIn();
20612 });
20613
20614 // Less defined shuffles can be replaced by the more defined copies.
20615 // Between two shuffles one is less defined if it has the same vector operands
20616 // and its mask indices are the same as in the first one or undefs. E.g.
20617 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20618 // poison, <0, 0, 0, 0>.
20619 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20620 Instruction *I2,
20621 SmallVectorImpl<int> &NewMask) {
20622 if (I1->getType() != I2->getType())
20623 return false;
20624 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20625 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20626 if (!SI1 || !SI2)
20627 return I1->isIdenticalTo(I2);
20628 if (SI1->isIdenticalTo(SI2))
20629 return true;
20630 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20631 if (SI1->getOperand(I) != SI2->getOperand(I))
20632 return false;
20633 // Check if the second instruction is more defined than the first one.
20634 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20635 ArrayRef<int> SM1 = SI1->getShuffleMask();
20636 // Count trailing undefs in the mask to check the final number of used
20637 // registers.
20638 unsigned LastUndefsCnt = 0;
20639 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20640 if (SM1[I] == PoisonMaskElem)
20641 ++LastUndefsCnt;
20642 else
20643 LastUndefsCnt = 0;
20644 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20645 NewMask[I] != SM1[I])
20646 return false;
20647 if (NewMask[I] == PoisonMaskElem)
20648 NewMask[I] = SM1[I];
20649 }
20650 // Check if the last undefs actually change the final number of used vector
20651 // registers.
20652 return SM1.size() - LastUndefsCnt > 1 &&
20653 ::getNumberOfParts(*TTI, SI1->getType()) ==
20654 ::getNumberOfParts(
20655 *TTI, getWidenedType(SI1->getType()->getElementType(),
20656 SM1.size() - LastUndefsCnt));
20657 };
20658 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20659 // instructions. TODO: We can further optimize this scan if we split the
20660 // instructions into different buckets based on the insert lane.
20661 SmallVector<Instruction *> Visited;
20662 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20663 assert(*I &&
20664 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20665 "Worklist not sorted properly!");
20666 BasicBlock *BB = (*I)->getBlock();
20667 // For all instructions in blocks containing gather sequences:
20668 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20669 if (isDeleted(&In))
20670 continue;
20671 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20672 !GatherShuffleExtractSeq.contains(&In))
20673 continue;
20674
20675 // Check if we can replace this instruction with any of the
20676 // visited instructions.
20677 bool Replaced = false;
20678 for (Instruction *&V : Visited) {
20679 SmallVector<int> NewMask;
20680 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20681 DT->dominates(V->getParent(), In.getParent())) {
20682 In.replaceAllUsesWith(V);
20683 eraseInstruction(&In);
20684 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20685 if (!NewMask.empty())
20686 SI->setShuffleMask(NewMask);
20687 Replaced = true;
20688 break;
20689 }
20690 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20691 GatherShuffleExtractSeq.contains(V) &&
20692 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20693 DT->dominates(In.getParent(), V->getParent())) {
20694 In.moveAfter(V);
20695 V->replaceAllUsesWith(&In);
20696 eraseInstruction(V);
20697 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20698 if (!NewMask.empty())
20699 SI->setShuffleMask(NewMask);
20700 V = &In;
20701 Replaced = true;
20702 break;
20703 }
20704 }
20705 if (!Replaced) {
20706 assert(!is_contained(Visited, &In));
20707 Visited.push_back(&In);
20708 }
20709 }
20710 }
20711 CSEBlocks.clear();
20712 GatherShuffleExtractSeq.clear();
20713}
20714
20715BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20716 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20717 auto &BundlePtr =
20718 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20719 for (Value *V : VL) {
20720 if (S.isNonSchedulable(V))
20721 continue;
20722 auto *I = cast<Instruction>(V);
20723 if (S.isCopyableElement(V)) {
20724 // Add a copyable element model.
20725 ScheduleCopyableData &SD =
20726 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20727 // Group the instructions to a bundle.
20728 BundlePtr->add(&SD);
20729 continue;
20730 }
20731 ScheduleData *BundleMember = getScheduleData(V);
20732 assert(BundleMember && "no ScheduleData for bundle member "
20733 "(maybe not in same basic block)");
20734 // Group the instructions to a bundle.
20735 BundlePtr->add(BundleMember);
20736 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20737 BundlePtr.get());
20738 }
20739 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20740 return *BundlePtr;
20741}
20742
20743// Groups the instructions to a bundle (which is then a single scheduling entity)
20744// and schedules instructions until the bundle gets ready.
20745std::optional<BoUpSLP::ScheduleBundle *>
20746BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20747 const InstructionsState &S,
20748 const EdgeInfo &EI) {
20749 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20750 // instructions.
20751 bool HasCopyables = S.areInstructionsWithCopyableElements();
20752 if (isa<PHINode>(S.getMainOp()) ||
20753 isVectorLikeInstWithConstOps(S.getMainOp()) ||
20754 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
20755 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
20756 return nullptr;
20757
20758 // Initialize the instruction bundle.
20759 Instruction *OldScheduleEnd = ScheduleEnd;
20760 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20761
20762 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20763 // Clear deps or recalculate the region, if the memory instruction is a
20764 // copyable. It may have memory deps, which must be recalculated.
20765 SmallVector<ScheduleData *> ControlDependentMembers;
20766 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20767 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20768 for (ScheduleEntity *SE : Bundle.getBundle()) {
20769 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20770 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20771 BundleMember && BundleMember->hasValidDependencies()) {
20772 BundleMember->clearDirectDependencies();
20773 if (RegionHasStackSave ||
20775 BundleMember->getInst()))
20776 ControlDependentMembers.push_back(BundleMember);
20777 }
20778 continue;
20779 }
20780 auto *SD = cast<ScheduleData>(SE);
20781 for (const Use &U : SD->getInst()->operands()) {
20782 unsigned &NumOps =
20783 UserOpToNumOps
20784 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
20785 .first->getSecond();
20786 ++NumOps;
20787 if (auto *Op = dyn_cast<Instruction>(U.get());
20788 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
20789 *SLP, NumOps)) {
20790 if (ScheduleData *OpSD = getScheduleData(Op);
20791 OpSD && OpSD->hasValidDependencies()) {
20792 OpSD->clearDirectDependencies();
20793 if (RegionHasStackSave ||
20795 ControlDependentMembers.push_back(OpSD);
20796 }
20797 }
20798 }
20799 }
20800 };
20801 // The scheduling region got new instructions at the lower end (or it is a
20802 // new region for the first bundle). This makes it necessary to
20803 // recalculate all dependencies.
20804 // It is seldom that this needs to be done a second time after adding the
20805 // initial bundle to the region.
20806 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
20807 for_each(ScheduleDataMap, [&](auto &P) {
20808 if (BB != P.first->getParent())
20809 return;
20810 ScheduleData *SD = P.second;
20811 if (isInSchedulingRegion(*SD))
20812 SD->clearDependencies();
20813 });
20814 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
20815 for_each(P.second, [&](ScheduleCopyableData *SD) {
20816 if (isInSchedulingRegion(*SD))
20817 SD->clearDependencies();
20818 });
20819 });
20820 ReSchedule = true;
20821 }
20822 // Check if the bundle data already has deps for copyable elements. In
20823 // this case we need to reset the deps and recalculate them.
20824 if (Bundle && !Bundle.getBundle().empty()) {
20825 if (S.areInstructionsWithCopyableElements() ||
20826 !ScheduleCopyableDataMap.empty())
20827 CheckIfNeedToClearDeps(Bundle);
20828 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
20829 << BB->getName() << "\n");
20830 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
20831 ControlDependentMembers);
20832 } else if (!ControlDependentMembers.empty()) {
20833 ScheduleBundle Invalid = ScheduleBundle::invalid();
20834 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
20835 ControlDependentMembers);
20836 }
20837
20838 if (ReSchedule) {
20839 resetSchedule();
20840 initialFillReadyList(ReadyInsts);
20841 }
20842
20843 // Now try to schedule the new bundle or (if no bundle) just calculate
20844 // dependencies. As soon as the bundle is "ready" it means that there are no
20845 // cyclic dependencies and we can schedule it. Note that it's important that we
20846 // don't "schedule" the bundle yet.
20847 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
20848 !ReadyInsts.empty()) {
20849 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
20850 assert(Picked->isReady() && "must be ready to schedule");
20851 schedule(*SLP, S, EI, Picked, ReadyInsts);
20852 if (Picked == &Bundle)
20853 break;
20854 }
20855 };
20856
20857 // Make sure that the scheduling region contains all
20858 // instructions of the bundle.
20859 for (Value *V : VL) {
20860 if (S.isNonSchedulable(V))
20861 continue;
20862 if (!extendSchedulingRegion(V, S)) {
20863 // If the scheduling region got new instructions at the lower end (or it
20864 // is a new region for the first bundle), it is necessary to
20865 // recalculate all dependencies.
20866 // Otherwise the compiler may crash trying to calculate the
20867 // dependencies incorrectly and emit instructions in the wrong order at the
20868 // actual scheduling.
20869 ScheduleBundle Invalid = ScheduleBundle::invalid();
20870 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
20871 return std::nullopt;
20872 }
20873 }
20874
20875 bool ReSchedule = false;
20876 for (Value *V : VL) {
20877 if (S.isNonSchedulable(V))
20878 continue;
20879 SmallVector<ScheduleCopyableData *> CopyableData =
20880 getScheduleCopyableData(cast<Instruction>(V));
20881 if (!CopyableData.empty()) {
20882 for (ScheduleCopyableData *SD : CopyableData)
20883 ReadyInsts.remove(SD);
20884 }
20885 ScheduleData *BundleMember = getScheduleData(V);
20886 assert((BundleMember || S.isCopyableElement(V)) &&
20887 "no ScheduleData for bundle member (maybe not in same basic block)");
20888 if (!BundleMember)
20889 continue;
20890
20891 // Make sure we don't leave the pieces of the bundle in the ready list when
20892 // the whole bundle might not be ready.
20893 ReadyInsts.remove(BundleMember);
20894 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
20895 !Bundles.empty()) {
20896 for (ScheduleBundle *B : Bundles)
20897 ReadyInsts.remove(B);
20898 }
20899
20900 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
20901 continue;
20902 // A bundle member was scheduled as a single instruction before and now
20903 // needs to be scheduled as part of the bundle. We just get rid of the
20904 // existing schedule.
20905 // A bundle member may also have had its deps calculated before it became a
20906 // copyable element - it needs to be rescheduled.
20907 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
20908 << " was already scheduled\n");
20909 ReSchedule = true;
20910 }
20911
20912 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
20913 TryScheduleBundleImpl(ReSchedule, Bundle);
20914 if (!Bundle.isReady()) {
20915 for (ScheduleEntity *BD : Bundle.getBundle()) {
20916 // Copyable data scheduling is just removed.
20917 if (isa<ScheduleCopyableData>(BD))
20918 continue;
20919 if (BD->isReady()) {
20920 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
20921 if (Bundles.empty()) {
20922 ReadyInsts.insert(BD);
20923 continue;
20924 }
20925 for (ScheduleBundle *B : Bundles)
20926 if (B->isReady())
20927 ReadyInsts.insert(B);
20928 }
20929 }
20930 ScheduledBundlesList.pop_back();
20931 SmallVector<ScheduleData *> ControlDependentMembers;
20933 for (Value *V : VL) {
20934 if (S.isNonSchedulable(V))
20935 continue;
20936 auto *I = cast<Instruction>(V);
20937 if (S.isCopyableElement(I)) {
20938 // Remove the copyable data from the scheduling region and restore
20939 // previous mappings.
20940 auto KV = std::make_pair(EI, I);
20941 assert(ScheduleCopyableDataMap.contains(KV) &&
20942 "no ScheduleCopyableData for copyable element");
20943 ScheduleCopyableData *SD =
20944 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
20945 ScheduleCopyableDataMapByUsers[I].remove(SD);
20946 if (EI.UserTE) {
20947 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
20948 const auto *It = find(Op, I);
20949 assert(It != Op.end() && "Lane not set");
20950 SmallPtrSet<Instruction *, 4> Visited;
20951 do {
20952 int Lane = std::distance(Op.begin(), It);
20953 assert(Lane >= 0 && "Lane not set");
20954 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
20955 !EI.UserTE->ReorderIndices.empty())
20956 Lane = EI.UserTE->ReorderIndices[Lane];
20957 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
20958 "Couldn't find extract lane");
20959 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
20960 if (!Visited.insert(In).second) {
20961 It = find(make_range(std::next(It), Op.end()), I);
20962 break;
20963 }
20964 ScheduleCopyableDataMapByInstUser
20965 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
20966 .pop_back();
20967 It = find(make_range(std::next(It), Op.end()), I);
20968 } while (It != Op.end());
20969 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
20970 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
20971 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
20972 }
20973 if (ScheduleCopyableDataMapByUsers[I].empty())
20974 ScheduleCopyableDataMapByUsers.erase(I);
20975 ScheduleCopyableDataMap.erase(KV);
20976 // Need to recalculate dependencies for the actual schedule data.
20977 if (ScheduleData *OpSD = getScheduleData(I);
20978 OpSD && OpSD->hasValidDependencies()) {
20979 OpSD->clearDirectDependencies();
20980 if (RegionHasStackSave ||
20981 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20982 ControlDependentMembers.push_back(OpSD);
20983 }
20984 continue;
20985 }
20986 ScheduledBundles.find(I)->getSecond().pop_back();
20987 }
20988 if (!ControlDependentMembers.empty()) {
20989 ScheduleBundle Invalid = ScheduleBundle::invalid();
20990 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
20991 ControlDependentMembers);
20992 }
20993 return std::nullopt;
20994 }
20995 return &Bundle;
20996}
20997
20998BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
20999 // Allocate a new ScheduleData for the instruction.
21000 if (ChunkPos >= ChunkSize) {
21001 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21002 ChunkPos = 0;
21003 }
21004 return &(ScheduleDataChunks.back()[ChunkPos++]);
21005}
21006
21007bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21008 Value *V, const InstructionsState &S) {
21009 Instruction *I = dyn_cast<Instruction>(V);
21010 assert(I && "bundle member must be an instruction");
21011 if (getScheduleData(I))
21012 return true;
21013 if (!ScheduleStart) {
21014 // It's the first instruction in the new region.
21015 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21016 ScheduleStart = I;
21017 ScheduleEnd = I->getNextNode();
21018 assert(ScheduleEnd && "tried to vectorize a terminator?");
21019 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21020 return true;
21021 }
21022 // Search up and down at the same time, because we don't know if the new
21023 // instruction is above or below the existing scheduling region.
21024 // Ignore debug info (and other "AssumeLike" intrinsics) so they're not counted
21025 // against the budget. Otherwise debug info could affect codegen.
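// Walking both directions simultaneously keeps the cost proportional to the
// distance between the new instruction and the current region (capped by
// ScheduleRegionSizeLimit) instead of to the size of the whole basic block.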
21026 BasicBlock::reverse_iterator UpIter =
21027 ++ScheduleStart->getIterator().getReverse();
21028 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21029 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21030 BasicBlock::iterator LowerEnd = BB->end();
21031 auto IsAssumeLikeIntr = [](const Instruction &I) {
21032 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21033 return II->isAssumeLikeIntrinsic();
21034 return false;
21035 };
21036 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21037 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21038 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21039 &*DownIter != I) {
21040 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21041 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21042 return false;
21043 }
21044
21045 ++UpIter;
21046 ++DownIter;
21047
21048 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21049 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21050 }
21051 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21052 assert(I->getParent() == ScheduleStart->getParent() &&
21053 "Instruction is in wrong basic block.");
21054 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21055 ScheduleStart = I;
21056 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21057 << "\n");
21058 return true;
21059 }
21060 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21061 "Expected to reach top of the basic block or instruction down the "
21062 "lower end.");
21063 assert(I->getParent() == ScheduleEnd->getParent() &&
21064 "Instruction is in wrong basic block.");
21065 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21066 nullptr);
21067 ScheduleEnd = I->getNextNode();
21068 assert(ScheduleEnd && "tried to vectorize a terminator?");
21069 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21070 return true;
21071}
21072
21073void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21074 Instruction *ToI,
21075 ScheduleData *PrevLoadStore,
21076 ScheduleData *NextLoadStore) {
21077 ScheduleData *CurrentLoadStore = PrevLoadStore;
21078 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21079 // No need to allocate data for non-schedulable instructions.
21080 if (isa<PHINode>(I))
21081 continue;
21082 ScheduleData *SD = ScheduleDataMap.lookup(I);
21083 if (!SD) {
21084 SD = allocateScheduleDataChunks();
21085 ScheduleDataMap[I] = SD;
21086 }
21087 assert(!isInSchedulingRegion(*SD) &&
21088 "new ScheduleData already in scheduling region");
21089 SD->init(SchedulingRegionID, I);
21090
21091 if (I->mayReadOrWriteMemory() &&
21092 (!isa<IntrinsicInst>(I) ||
21093 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21094 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21095 Intrinsic::pseudoprobe))) {
21096 // Update the linked list of memory accessing instructions.
21097 if (CurrentLoadStore) {
21098 CurrentLoadStore->setNextLoadStore(SD);
21099 } else {
21100 FirstLoadStoreInRegion = SD;
21101 }
21102 CurrentLoadStore = SD;
21103 }
21104
21105 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21106 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21107 RegionHasStackSave = true;
21108 }
21109 if (NextLoadStore) {
21110 if (CurrentLoadStore)
21111 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21112 } else {
21113 LastLoadStoreInRegion = CurrentLoadStore;
21114 }
21115}
21116
21117void BoUpSLP::BlockScheduling::calculateDependencies(
21118 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21119 ArrayRef<ScheduleData *> ControlDeps) {
21120 SmallVector<ScheduleEntity *> WorkList;
21121 auto ProcessNode = [&](ScheduleEntity *SE) {
21122 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21123 if (CD->hasValidDependencies())
21124 return;
21125 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21126 CD->initDependencies();
21127 CD->resetUnscheduledDeps();
21128 const EdgeInfo &EI = CD->getEdgeInfo();
21129 if (EI.UserTE) {
21130 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21131 const auto *It = find(Op, CD->getInst());
21132 assert(It != Op.end() && "Lane not set");
21133 SmallPtrSet<Instruction *, 4> Visited;
21134 do {
21135 int Lane = std::distance(Op.begin(), It);
21136 assert(Lane >= 0 && "Lane not set");
21137 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21138 !EI.UserTE->ReorderIndices.empty())
21139 Lane = EI.UserTE->ReorderIndices[Lane];
21140 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21141 "Couldn't find extract lane");
21142 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21143 if (EI.UserTE->isCopyableElement(In)) {
21144 // We may not have related copyable scheduling data if the
21145 // instruction is non-schedulable.
21146 if (ScheduleCopyableData *UseSD =
21147 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21148 CD->incDependencies();
21149 if (!UseSD->isScheduled())
21150 CD->incrementUnscheduledDeps(1);
21151 if (!UseSD->hasValidDependencies() ||
21152 (InsertInReadyList && UseSD->isReady()))
21153 WorkList.push_back(UseSD);
21154 }
21155 } else if (Visited.insert(In).second) {
21156 if (ScheduleData *UseSD = getScheduleData(In)) {
21157 CD->incDependencies();
21158 if (!UseSD->isScheduled())
21159 CD->incrementUnscheduledDeps(1);
21160 if (!UseSD->hasValidDependencies() ||
21161 (InsertInReadyList && UseSD->isReady()))
21162 WorkList.push_back(UseSD);
21163 }
21164 }
21165 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21166 } while (It != Op.end());
21167 if (CD->isReady() && CD->getDependencies() == 0 &&
21168 (EI.UserTE->hasState() &&
21169 (EI.UserTE->getMainOp()->getParent() !=
21170 CD->getInst()->getParent() ||
21171 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21172 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21173 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21174 auto *IU = dyn_cast<Instruction>(U);
21175 if (!IU)
21176 return true;
21177 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21178 })))))) {
21179 // If there are no uses in the block - mark as having a pseudo-use, which
21180 // cannot be scheduled.
21181 // This prevents incorrect def-use tracking between an external user and
21182 // the actual instruction.
21183 CD->incDependencies();
21184 CD->incrementUnscheduledDeps(1);
21185 }
21186 }
21187 return;
21188 }
21189 auto *BundleMember = cast<ScheduleData>(SE);
21190 if (BundleMember->hasValidDependencies())
21191 return;
21192 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21193 BundleMember->initDependencies();
21194 BundleMember->resetUnscheduledDeps();
21195 // Handle def-use chain dependencies.
21196 SmallDenseMap<User *, unsigned> UserToNumOps;
21197 for (User *U : BundleMember->getInst()->users()) {
21198 if (isa<PHINode>(U))
21199 continue;
21200 if (ScheduleData *UseSD = getScheduleData(U)) {
21201 // The operand is a copyable element - skip.
21202 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21203 ++NumOps;
21204 if (areAllOperandsReplacedByCopyableData(
21205 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21206 continue;
21207 BundleMember->incDependencies();
21208 if (!UseSD->isScheduled())
21209 BundleMember->incrementUnscheduledDeps(1);
21210 if (!UseSD->hasValidDependencies() ||
21211 (InsertInReadyList && UseSD->isReady()))
21212 WorkList.push_back(UseSD);
21213 }
21214 }
21215 for (ScheduleCopyableData *UseSD :
21216 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21217 BundleMember->incDependencies();
21218 if (!UseSD->isScheduled())
21219 BundleMember->incrementUnscheduledDeps(1);
21220 if (!UseSD->hasValidDependencies() ||
21221 (InsertInReadyList && UseSD->isReady()))
21222 WorkList.push_back(UseSD);
21223 }
21224
21225 SmallPtrSet<Instruction *, 4> Visited;
21226 auto MakeControlDependent = [&](Instruction *I) {
21227 // Do not mark control dependent twice.
21228 if (!Visited.insert(I).second)
21229 return;
21230 auto *DepDest = getScheduleData(I);
21231 assert(DepDest && "must be in schedule window");
21232 DepDest->addControlDependency(BundleMember);
21233 BundleMember->incDependencies();
21234 if (!DepDest->isScheduled())
21235 BundleMember->incrementUnscheduledDeps(1);
21236 if (!DepDest->hasValidDependencies() ||
21237 (InsertInReadyList && DepDest->isReady()))
21238 WorkList.push_back(DepDest);
21239 };
21240
21241 // Any instruction which isn't safe to speculate at the beginning of the
21242 // block is control dependent on any early exit or non-willreturn call
21243 // which precedes it.
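// E.g., a store that follows a call which may throw or not return must stay
// below that call; the extra dependency edge makes the scheduler preserve
// that ordering.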
21244 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21245 for (Instruction *I = BundleMember->getInst()->getNextNode();
21246 I != ScheduleEnd; I = I->getNextNode()) {
21247 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21248 continue;
21249
21250 // Add the dependency
21251 MakeControlDependent(I);
21252
21253 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21254 // Everything past here must be control dependent on I.
21255 break;
21256 }
21257 }
21258
21259 if (RegionHasStackSave) {
21260 // If we have an inalloca alloca instruction, it needs to be scheduled
21261 // after any preceding stacksave. We also need to prevent any alloca
21262 // from reordering above a preceding stackrestore.
21263 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21264 match(BundleMember->getInst(),
21265 m_Intrinsic<Intrinsic::stackrestore>())) {
21266 for (Instruction *I = BundleMember->getInst()->getNextNode();
21267 I != ScheduleEnd; I = I->getNextNode()) {
21268 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21269 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21270 // Any allocas past here must be control dependent on I, and I
21271 // must be memory dependent on BundleMember->Inst.
21272 break;
21273
21274 if (!isa<AllocaInst>(I))
21275 continue;
21276
21277 // Add the dependency
21278 MakeControlDependent(I);
21279 }
21280 }
21281
21282 // In addition to the cases handled just above, we need to prevent
21283 // allocas and loads/stores from moving below a stacksave or a
21284 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21285 // thought to be conservative. Moving loads/stores below a stackrestore
21286 // can lead to incorrect code.
21287 if (isa<AllocaInst>(BundleMember->getInst()) ||
21288 BundleMember->getInst()->mayReadOrWriteMemory()) {
21289 for (Instruction *I = BundleMember->getInst()->getNextNode();
21290 I != ScheduleEnd; I = I->getNextNode()) {
21291 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21292 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21293 continue;
21294
21295 // Add the dependency
21296 MakeControlDependent(I);
21297 break;
21298 }
21299 }
21300 }
21301
21302 // Handle the memory dependencies (if any).
21303 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21304 if (!NextLoadStore)
21305 return;
21306 Instruction *SrcInst = BundleMember->getInst();
21307 assert(SrcInst->mayReadOrWriteMemory() &&
21308 "NextLoadStore list for non memory effecting bundle?");
21309 MemoryLocation SrcLoc = getLocation(SrcInst);
21310 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21311 unsigned NumAliased = 0;
21312 unsigned DistToSrc = 1;
21313 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21314
21315 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21316 DepDest = DepDest->getNextLoadStore()) {
21317 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21318
21319 // We have two limits to reduce the complexity:
21320 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21321 // SLP->isAliased (which is the expensive part in this loop).
21322 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21323 // the whole loop (even if the loop is fast, it's quadratic).
21324 // It's important for the loop break condition (see below) to
21325 // check this limit even between two read-only instructions.
21326 if (DistToSrc >= MaxMemDepDistance ||
21327 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21328 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21329 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21330
21331 // We increment the counter only if the locations are aliased
21332 // (instead of counting all alias checks). This gives a better
21333 // balance between reduced runtime and accurate dependencies.
21334 NumAliased++;
21335
21336 DepDest->addMemoryDependency(BundleMember);
21337 BundleMember->incDependencies();
21338 if (!DepDest->isScheduled())
21339 BundleMember->incrementUnscheduledDeps(1);
21340 if (!DepDest->hasValidDependencies() ||
21341 (InsertInReadyList && DepDest->isReady()))
21342 WorkList.push_back(DepDest);
21343 }
21344
21345 // Example, explaining the loop break condition: Let's assume our
21346 // starting instruction is i0 and MaxMemDepDistance = 3.
21347 //
21348 // +--------v--v--v
21349 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21350 // +--------^--^--^
21351 //
21352 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21353 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21354 // Previously we already added dependencies from i3 to i6,i7,i8
21355 // (because of MaxMemDepDistance). As we added a dependency from
21356 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21357 // and we can abort this loop at i6.
21358 if (DistToSrc >= 2 * MaxMemDepDistance)
21359 break;
21360 DistToSrc++;
21361 }
21362 };
21363
21364 assert((Bundle || !ControlDeps.empty()) &&
21365 "expected at least one instruction to schedule");
21366 if (Bundle)
21367 WorkList.push_back(Bundle.getBundle().front());
21368 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21369 SmallPtrSet<ScheduleBundle *, 16> Visited;
21370 while (!WorkList.empty()) {
21371 ScheduleEntity *SD = WorkList.pop_back_val();
21372 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21373 ArrayRef<ScheduleBundle *> Bundles;
21374 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21375 CopyableBundle.push_back(&CD->getBundle());
21376 Bundles = CopyableBundle;
21377 } else {
21378 Bundles = getScheduleBundles(SD->getInst());
21379 }
21380 if (Bundles.empty()) {
21381 if (!SD->hasValidDependencies())
21382 ProcessNode(SD);
21383 if (InsertInReadyList && SD->isReady()) {
21384 ReadyInsts.insert(SD);
21385 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21386 }
21387 continue;
21388 }
21389 for (ScheduleBundle *Bundle : Bundles) {
21390 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21391 continue;
21392 assert(isInSchedulingRegion(*Bundle) &&
21393 "ScheduleData not in scheduling region");
21394 for_each(Bundle->getBundle(), ProcessNode);
21395 }
21396 if (InsertInReadyList && SD->isReady()) {
21397 for (ScheduleBundle *Bundle : Bundles) {
21398 assert(isInSchedulingRegion(*Bundle) &&
21399 "ScheduleData not in scheduling region");
21400 if (!Bundle->isReady())
21401 continue;
21402 ReadyInsts.insert(Bundle);
21403 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21404 << "\n");
21405 }
21406 }
21407 }
21408}
21409
21410void BoUpSLP::BlockScheduling::resetSchedule() {
21411 assert(ScheduleStart &&
21412 "tried to reset schedule on block which has not been scheduled");
21413 for_each(ScheduleDataMap, [&](auto &P) {
21414 if (BB != P.first->getParent())
21415 return;
21416 ScheduleData *SD = P.second;
21417 if (isInSchedulingRegion(*SD)) {
21418 SD->setScheduled(/*Scheduled=*/false);
21419 SD->resetUnscheduledDeps();
21420 }
21421 });
21422 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21423 for_each(P.second, [&](ScheduleCopyableData *SD) {
21424 if (isInSchedulingRegion(*SD)) {
21425 SD->setScheduled(/*Scheduled=*/false);
21426 SD->resetUnscheduledDeps();
21427 }
21428 });
21429 });
21430 for_each(ScheduledBundles, [&](auto &P) {
21431 for_each(P.second, [&](ScheduleBundle *Bundle) {
21432 if (isInSchedulingRegion(*Bundle))
21433 Bundle->setScheduled(/*Scheduled=*/false);
21434 });
21435 });
21436 // Reset schedule data for copyable elements.
21437 for (auto &P : ScheduleCopyableDataMap) {
21438 if (isInSchedulingRegion(*P.second)) {
21439 P.second->setScheduled(/*Scheduled=*/false);
21440 P.second->resetUnscheduledDeps();
21441 }
21442 }
21443 ReadyInsts.clear();
21444}
21445
21446void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21447 if (!BS->ScheduleStart)
21448 return;
21449
21450 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21451
21452 // A key point - if we got here, pre-scheduling was able to find a valid
21453 // scheduling of the sub-graph of the scheduling window which consists
21454 // of all vector bundles and their transitive users. As such, we do not
21455 // need to reschedule anything *outside of* that subgraph.
21456
21457 BS->resetSchedule();
21458
21459 // For the real scheduling we use a more sophisticated ready-list: it is
21460 // sorted by the original instruction location. This lets the final schedule
21461 // be as close as possible to the original instruction order.
21462 // WARNING: If changing this order causes a correctness issue, that means
21463 // there is some missing dependence edge in the schedule data graph.
21464 struct ScheduleDataCompare {
21465 bool operator()(const ScheduleEntity *SD1,
21466 const ScheduleEntity *SD2) const {
21467 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21468 }
21469 };
21470 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21471
21472 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21473 // and fill the ready-list with initial instructions.
21474 int Idx = 0;
21475 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21476 I = I->getNextNode()) {
21477 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21478 if (!Bundles.empty()) {
21479 for (ScheduleBundle *Bundle : Bundles) {
21480 Bundle->setSchedulingPriority(Idx++);
21481 if (!Bundle->hasValidDependencies())
21482 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21483 }
21484 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21485 for (ScheduleCopyableData *SD : reverse(SDs)) {
21486 ScheduleBundle &Bundle = SD->getBundle();
21487 Bundle.setSchedulingPriority(Idx++);
21488 if (!Bundle.hasValidDependencies())
21489 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21490 }
21491 continue;
21492 }
21493 SmallVector<ScheduleCopyableData *> CopyableData =
21494 BS->getScheduleCopyableDataUsers(I);
21495 if (ScheduleData *SD = BS->getScheduleData(I)) {
21496 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21497 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21498 SDTEs.front()->doesNotNeedToSchedule() ||
21500 "scheduler and vectorizer bundle mismatch");
21501 SD->setSchedulingPriority(Idx++);
21502 if (!SD->hasValidDependencies() &&
21503 (!CopyableData.empty() ||
21504 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21505 assert(TE->isGather() && "expected gather node");
21506 return TE->hasState() && TE->hasCopyableElements() &&
21507 TE->isCopyableElement(I);
21508 }))) {
21509 // Need to calculate deps for these nodes to correctly handle copyable
21510 // dependencies, even if they were cancelled.
21511 // If the copyable bundle was cancelled, the deps were cleared and need to
21512 // be recalculated.
21513 ScheduleBundle Bundle;
21514 Bundle.add(SD);
21515 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21516 }
21517 }
21518 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21519 ScheduleBundle &Bundle = SD->getBundle();
21520 Bundle.setSchedulingPriority(Idx++);
21521 if (!Bundle.hasValidDependencies())
21522 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21523 }
21524 }
21525 BS->initialFillReadyList(ReadyInsts);
21526
21527 Instruction *LastScheduledInst = BS->ScheduleEnd;
21528
21529 // Do the "real" scheduling.
21530 SmallPtrSet<Instruction *, 16> Scheduled;
21531 while (!ReadyInsts.empty()) {
21532 auto *Picked = *ReadyInsts.begin();
21533 ReadyInsts.erase(ReadyInsts.begin());
21534
21535 // Move the scheduled instruction(s) to their dedicated places, if not
21536 // there yet.
21537 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21538 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21539 Instruction *PickedInst = BundleMember->getInst();
21540 // If a copyable must be scheduled as part of something else, skip it.
21541 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21542 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21543 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21544 continue;
21545 if (PickedInst->getNextNode() != LastScheduledInst)
21546 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21547 LastScheduledInst = PickedInst;
21548 }
21549 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21550 LastScheduledInst);
21551 } else {
21552 auto *SD = cast<ScheduleData>(Picked);
21553 Instruction *PickedInst = SD->getInst();
21554 if (PickedInst->getNextNode() != LastScheduledInst)
21555 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21556 LastScheduledInst = PickedInst;
21557 }
21558 auto Invalid = InstructionsState::invalid();
21559 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21560 }
21561
21562 // Check that we didn't break any of our invariants.
21563#ifdef EXPENSIVE_CHECKS
21564 BS->verify();
21565#endif
21566
21567#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21568 // Check that all schedulable entities got scheduled
21569 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21570 I = I->getNextNode()) {
21571 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21572 assert(all_of(Bundles,
21573 [](const ScheduleBundle *Bundle) {
21574 return Bundle->isScheduled();
21575 }) &&
21576 "must be scheduled at this point");
21577 }
21578#endif
21579
21580 // Avoid duplicate scheduling of the block.
21581 BS->ScheduleStart = nullptr;
21582}
21583
21584 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21585 // If V is a store, just return the width of the stored value (or value
21586 // truncated just before storing) without traversing the expression tree.
21587 // This is the common case.
21588 if (auto *Store = dyn_cast<StoreInst>(V))
21589 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21590
21591 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21592 return getVectorElementSize(IEI->getOperand(1));
21593
21594 auto E = InstrElementSize.find(V);
21595 if (E != InstrElementSize.end())
21596 return E->second;
21597
21598 // If V is not a store, we can traverse the expression tree to find loads
21599 // that feed it. The type of the loaded value may indicate a more suitable
21600 // width than V's type. We want to base the vector element size on the width
21601 // of memory operations where possible.
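// For example, if an i32 computation rooted at V is ultimately fed by i64
// loads, we return 64 rather than 32.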
21602 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21603 SmallPtrSet<Instruction *, 16> Visited;
21604 if (auto *I = dyn_cast<Instruction>(V)) {
21605 Worklist.emplace_back(I, I->getParent(), 0);
21606 Visited.insert(I);
21607 }
21608
21609 // Traverse the expression tree in bottom-up order looking for loads. If we
21610 // encounter an instruction we don't yet handle, we give up.
21611 auto Width = 0u;
21612 Value *FirstNonBool = nullptr;
21613 while (!Worklist.empty()) {
21614 auto [I, Parent, Level] = Worklist.pop_back_val();
21615
21616 // We should only be looking at scalar instructions here. If the current
21617 // instruction has a vector type, skip.
21618 auto *Ty = I->getType();
21619 if (isa<VectorType>(Ty))
21620 continue;
21621 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21622 FirstNonBool = I;
21623 if (Level > RecursionMaxDepth)
21624 continue;
21625
21626 // If the current instruction is a load, update MaxWidth to reflect the
21627 // width of the loaded value.
21628 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
21629 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21630
21631 // Otherwise, we need to visit the operands of the instruction. We only
21632 // handle the interesting cases from buildTree here. If an operand is an
21633 // instruction we haven't yet visited and from the same basic block as the
21634 // user or the use is a PHI node, we add it to the worklist.
21635 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
21636 BinaryOperator, UnaryOperator>(I)) {
21637 for (Use &U : I->operands()) {
21638 if (auto *J = dyn_cast<Instruction>(U.get()))
21639 if (Visited.insert(J).second &&
21640 (isa<PHINode>(I) || J->getParent() == Parent)) {
21641 Worklist.emplace_back(J, J->getParent(), Level + 1);
21642 continue;
21643 }
21644 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21645 FirstNonBool = U.get();
21646 }
21647 } else {
21648 break;
21649 }
21650 }
21651
21652 // If we didn't encounter a memory access in the expression tree, or if we
21653 // gave up for some reason, just return the width of V. Otherwise, return the
21654 // maximum width we found.
21655 if (!Width) {
21656 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21657 V = FirstNonBool;
21658 Width = DL->getTypeSizeInBits(V->getType());
21659 }
21660
21661 for (Instruction *I : Visited)
21662 InstrElementSize[I] = Width;
21663
21664 return Width;
21665}
21666
21667bool BoUpSLP::collectValuesToDemote(
21668 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21669 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21670 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21671 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21672 // We can always demote constants.
21673 if (all_of(E.Scalars, IsaPred<Constant>))
21674 return true;
21675
21676 unsigned OrigBitWidth =
21677 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21678 if (OrigBitWidth == BitWidth) {
21679 MaxDepthLevel = 1;
21680 return true;
21681 }
21682
21683 // Check if the node was analyzed already and must keep its original bitwidth.
21684 if (NodesToKeepBWs.contains(E.Idx))
21685 return false;
21686
21687 // If the value is not a vectorized instruction in the expression and not used
21688 // by the insertelement instruction and not used in multiple vector nodes, it
21689 // cannot be demoted.
21690 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21691 if (isa<PoisonValue>(R))
21692 return false;
21693 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21694 });
21695 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21696 if (isa<PoisonValue>(V))
21697 return true;
21698 if (getTreeEntries(V).size() > 1)
21699 return false;
21700 // For the last shuffle of sext/zext with many uses we need to check the extra
21701 // bit for unsigned values, otherwise we may have incorrect casting for reused
21702 // scalars.
21703 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21704 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21705 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21706 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21707 return true;
21708 }
21709 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21710 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21711 if (IsSignedNode)
21712 ++BitWidth1;
21713 if (auto *I = dyn_cast<Instruction>(V)) {
21714 APInt Mask = DB->getDemandedBits(I);
21715 unsigned BitWidth2 =
21716 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21717 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21718 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21719 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21720 break;
21721 BitWidth2 *= 2;
21722 }
21723 BitWidth1 = std::min(BitWidth1, BitWidth2);
21724 }
21725 BitWidth = std::max(BitWidth, BitWidth1);
21726 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21727 };
21728 auto FinalAnalysis = [&, TTI = TTI]() {
21729 if (!IsProfitableToDemote)
21730 return false;
21731 bool Res = all_of(
21732 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21733 // Demote gathers.
21734 if (Res && E.isGather()) {
21735 if (E.hasState()) {
21736 if (const TreeEntry *SameTE =
21737 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21738 SameTE)
21739 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21740 ToDemote, Visited, NodesToKeepBWs,
21741 MaxDepthLevel, IsProfitableToDemote,
21742 IsTruncRoot)) {
21743 ToDemote.push_back(E.Idx);
21744 return true;
21745 }
21746 }
21747 // Check possible extractelement instructions bases and final vector
21748 // length.
21749 SmallPtrSet<Value *, 4> UniqueBases;
21750 for (Value *V : E.Scalars) {
21751 auto *EE = dyn_cast<ExtractElementInst>(V);
21752 if (!EE)
21753 continue;
21754 UniqueBases.insert(EE->getVectorOperand());
21755 }
21756 const unsigned VF = E.Scalars.size();
21757 Type *OrigScalarTy = E.Scalars.front()->getType();
21758 if (UniqueBases.size() <= 2 ||
21759 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21760 ::getNumberOfParts(
21761 *TTI,
21762 getWidenedType(
21763 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21764 VF))) {
21765 ToDemote.push_back(E.Idx);
21766 return true;
21767 }
21768 }
21769 return Res;
21770 };
21771 if (E.isGather() || !Visited.insert(&E).second ||
21772 any_of(E.Scalars, [&](Value *V) {
21773 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
21774 return isa<InsertElementInst>(U) && !isVectorized(U);
21775 });
21776 }))
21777 return FinalAnalysis();
21778
21779 if (any_of(E.Scalars, [&](Value *V) {
21780 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
21781 return isVectorized(U) ||
21782 (E.Idx == 0 && UserIgnoreList &&
21783 UserIgnoreList->contains(U)) ||
21784 (!isa<CmpInst>(U) && U->getType()->isSized() &&
21785 !U->getType()->isScalableTy() &&
21786 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
21787 }) && !IsPotentiallyTruncated(V, BitWidth);
21788 }))
21789 return false;
21790
21791 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
21792 bool &NeedToExit) {
21793 NeedToExit = false;
21794 unsigned InitLevel = MaxDepthLevel;
21795 for (const TreeEntry *Op : Operands) {
21796 unsigned Level = InitLevel;
21797 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
21798 ToDemote, Visited, NodesToKeepBWs, Level,
21799 IsProfitableToDemote, IsTruncRoot)) {
21800 if (!IsProfitableToDemote)
21801 return false;
21802 NeedToExit = true;
21803 if (!FinalAnalysis())
21804 return false;
21805 continue;
21806 }
21807 MaxDepthLevel = std::max(MaxDepthLevel, Level);
21808 }
21809 return true;
21810 };
21811 auto AttemptCheckBitwidth =
21812 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
21813 // Try all bitwidth < OrigBitWidth.
21814 NeedToExit = false;
21815 unsigned BestFailBitwidth = 0;
21816 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
21817 if (Checker(BitWidth, OrigBitWidth))
21818 return true;
21819 if (BestFailBitwidth == 0 && FinalAnalysis())
21820 BestFailBitwidth = BitWidth;
21821 }
21822 if (BitWidth >= OrigBitWidth) {
21823 if (BestFailBitwidth == 0) {
21824 BitWidth = OrigBitWidth;
21825 return false;
21826 }
21827 MaxDepthLevel = 1;
21828 BitWidth = BestFailBitwidth;
21829 NeedToExit = true;
21830 return true;
21831 }
21832 return false;
21833 };
21834 auto TryProcessInstruction =
21835 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
21836 function_ref<bool(unsigned, unsigned)> Checker = {}) {
21837 if (Operands.empty()) {
21838 if (!IsTruncRoot)
21839 MaxDepthLevel = 1;
21840 for (Value *V : E.Scalars)
21841 (void)IsPotentiallyTruncated(V, BitWidth);
21842 } else {
21843 // Several vectorized uses? Check if we can truncate it, otherwise -
21844 // exit.
21845 if (any_of(E.Scalars, [&](Value *V) {
21846 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
21847 }))
21848 return false;
21849 bool NeedToExit = false;
21850 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
21851 return false;
21852 if (NeedToExit)
21853 return true;
21854 if (!ProcessOperands(Operands, NeedToExit))
21855 return false;
21856 if (NeedToExit)
21857 return true;
21858 }
21859
21860 ++MaxDepthLevel;
21861 // Record the entry that we can demote.
21862 ToDemote.push_back(E.Idx);
21863 return IsProfitableToDemote;
21864 };
21865
21866 if (E.State == TreeEntry::SplitVectorize)
21867 return TryProcessInstruction(
21868 BitWidth,
21869 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
21870 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
21871
21872 switch (E.getOpcode()) {
21873
21874 // We can always demote truncations and extensions. Since truncations can
21875 // seed additional demotion, we save the truncated value.
21876 case Instruction::Trunc:
21877 if (IsProfitableToDemoteRoot)
21878 IsProfitableToDemote = true;
21879 return TryProcessInstruction(BitWidth);
21880 case Instruction::ZExt:
21881 case Instruction::SExt:
21882 IsProfitableToDemote = true;
21883 return TryProcessInstruction(BitWidth);
21884
21885 // We can demote certain binary operations if we can demote both of their
21886 // operands.
21887 case Instruction::Add:
21888 case Instruction::Sub:
21889 case Instruction::Mul:
21890 case Instruction::And:
21891 case Instruction::Or:
21892 case Instruction::Xor: {
21893 return TryProcessInstruction(
21894 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
21895 }
21896 case Instruction::Freeze:
21897 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
21898 case Instruction::Shl: {
21899 // If we are truncating the result of this SHL, and if it's a shift of an
21900 // in-range amount, we can always perform a SHL in a smaller type.
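// E.g., (trunc i32 (shl i32 %x, 3) to i16) produces the same bits as
// (shl i16 (trunc i32 %x to i16), 3) because the shift amount 3 is < 16.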
21901 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
21902 return all_of(E.Scalars, [&](Value *V) {
21903 if (isa<PoisonValue>(V))
21904 return true;
21905 auto *I = cast<Instruction>(V);
21906 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21907 return AmtKnownBits.getMaxValue().ult(BitWidth);
21908 });
21909 };
21910 return TryProcessInstruction(
21911 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
21912 }
21913 case Instruction::LShr: {
21914 // If this is a truncate of a logical shr, we can truncate it to a smaller
21915 // lshr iff we know that the bits we would otherwise be shifting in are
21916 // already zeros.
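// E.g., (trunc i32 (lshr i32 %x, 4) to i8) can be performed as an i8 lshr
// when bits 8..31 of %x are known to be zero, since then no set bits are
// shifted into the narrow result.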
21917 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21918 return all_of(E.Scalars, [&](Value *V) {
21919 if (isa<PoisonValue>(V))
21920 return true;
21921 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21922 if (E.isCopyableElement(V))
21923 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
21924 auto *I = cast<Instruction>(V);
21925 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21926 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
21927 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
21928 SimplifyQuery(*DL));
21929 });
21930 };
21931 return TryProcessInstruction(
21932 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
21933 LShrChecker);
21934 }
21935 case Instruction::AShr: {
21936 // If this is a truncate of an arithmetic shr, we can truncate it to a
21937 // smaller ashr iff we know that all the bits from the sign bit of the
21938 // original type and the sign bit of the truncate type are similar.
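// E.g., demoting an i32 ashr to i16 is safe when the shift amount is in range
// and the first operand has at least 17 sign bits, so the 16 dropped bits are
// all copies of the sign bit of the narrowed value.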
21939 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21940 return all_of(E.Scalars, [&](Value *V) {
21941 if (isa<PoisonValue>(V))
21942 return true;
21943 auto *I = cast<Instruction>(V);
21944 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21945 unsigned ShiftedBits = OrigBitWidth - BitWidth;
21946 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
21947 ShiftedBits <
21948 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
21949 });
21950 };
21951 return TryProcessInstruction(
21952 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
21953 AShrChecker);
21954 }
21955 case Instruction::UDiv:
21956 case Instruction::URem: {
21957 // UDiv and URem can be truncated if all the truncated bits are zero.
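// E.g., an i32 udiv/urem whose operands both fit in the low 16 bits produces
// a result that also fits in 16 bits, so the operation can be done in i16.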
21958 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21959 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
21960 return all_of(E.Scalars, [&](Value *V) {
21961 auto *I = cast<Instruction>(V);
21962 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21963 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
21964 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
21965 });
21966 };
21967 return TryProcessInstruction(
21968 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
21969 }
21970
21971 // We can demote selects if we can demote their true and false values.
21972 case Instruction::Select: {
21973 return TryProcessInstruction(
21974 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
21975 }
21976
21977 // We can demote phis if we can demote all their incoming operands.
21978 case Instruction::PHI: {
21979 const unsigned NumOps = E.getNumOperands();
21980 SmallVector<const TreeEntry *> Ops(NumOps);
21981 transform(seq<unsigned>(0, NumOps), Ops.begin(),
21982 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
21983
21984 return TryProcessInstruction(BitWidth, Ops);
21985 }
21986
21987 case Instruction::Call: {
21988 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
21989 if (!IC)
21990 break;
21991 Intrinsic::ID ID = IC->getIntrinsicID();
21992 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
21993 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
21994 break;
21995 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
21996 function_ref<bool(unsigned, unsigned)> CallChecker;
21997 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21998 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
21999 return all_of(E.Scalars, [&](Value *V) {
22000 auto *I = cast<Instruction>(V);
22001 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22002 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22003 return MaskedValueIsZero(I->getOperand(0), Mask,
22004 SimplifyQuery(*DL)) &&
22005 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22006 }
22007 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22008 "Expected min/max intrinsics only.");
22009 unsigned SignBits = OrigBitWidth - BitWidth;
22010 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22011 unsigned Op0SignBits =
22012 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22013 unsigned Op1SignBits =
22014 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22015 return SignBits <= Op0SignBits &&
22016 ((SignBits != Op0SignBits &&
22017 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22018 MaskedValueIsZero(I->getOperand(0), Mask,
22019 SimplifyQuery(*DL))) &&
22020 SignBits <= Op1SignBits &&
22021 ((SignBits != Op1SignBits &&
22022 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22023 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22024 });
22025 };
22026 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22027 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22028 return all_of(E.Scalars, [&](Value *V) {
22029 auto *I = cast<Instruction>(V);
22030 unsigned SignBits = OrigBitWidth - BitWidth;
22031 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22032 unsigned Op0SignBits =
22033 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22034 return SignBits <= Op0SignBits &&
22035 ((SignBits != Op0SignBits &&
22036 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22037 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22038 });
22039 };
22040 if (ID != Intrinsic::abs) {
22041 Operands.push_back(getOperandEntry(&E, 1));
22042 CallChecker = CompChecker;
22043 } else {
22044 CallChecker = AbsChecker;
22045 }
22046 InstructionCost BestCost =
22047 std::numeric_limits<InstructionCost::CostType>::max();
22048 unsigned BestBitWidth = BitWidth;
22049 unsigned VF = E.Scalars.size();
22050 // Choose the best bitwidth based on cost estimations.
22051 auto Checker = [&](unsigned BitWidth, unsigned) {
22052 unsigned MinBW = PowerOf2Ceil(BitWidth);
22053 SmallVector<Type *> ArgTys =
22054 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22055 auto VecCallCosts = getVectorCallCosts(
22056 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22057 TTI, TLI, ArgTys);
22058 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22059 if (Cost < BestCost) {
22060 BestCost = Cost;
22061 BestBitWidth = BitWidth;
22062 }
22063 return false;
22064 };
22065 [[maybe_unused]] bool NeedToExit;
22066 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22067 BitWidth = BestBitWidth;
22068 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22069 }
22070
22071 // Otherwise, conservatively give up.
22072 default:
22073 break;
22074 }
22075 MaxDepthLevel = 1;
22076 return FinalAnalysis();
22077}
22078
22079static RecurKind getRdxKind(Value *V);
22080
22081 void BoUpSLP::computeMinimumValueSizes() {
22082 // We only attempt to truncate integer expressions.
22083 bool IsStoreOrInsertElt =
22084 VectorizableTree.front()->hasState() &&
22085 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22086 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22087 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22088 ExtraBitWidthNodes.size() <= 1 &&
22089 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22090 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22091 return;
22092
22093 unsigned NodeIdx = 0;
22094 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22095 NodeIdx = 1;
22096
22097 // Ensure the roots of the vectorizable tree don't form a cycle.
22098 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22099 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22100 "Unexpected tree is graph.");
22101
22102 // Is the first value node for store/insertelement a sext/zext/trunc? Skip it,
22103 // resize to the final type.
22104 bool IsTruncRoot = false;
22105 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22106 SmallVector<unsigned> RootDemotes;
22107 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22108 if (NodeIdx != 0 &&
22109 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22110 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22111 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22112 IsTruncRoot = true;
22113 RootDemotes.push_back(NodeIdx);
22114 IsProfitableToDemoteRoot = true;
22115 ++NodeIdx;
22116 }
22117
22118 // Analyzed the reduction already and not profitable - exit.
22119 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22120 return;
22121
22122 SmallVector<unsigned> ToDemote;
22123 auto ComputeMaxBitWidth =
22124 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22125 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22126 ToDemote.clear();
22127 // Check if the root is trunc and the next node is gather/buildvector, then
22128 // keep trunc in scalars, which is free in most cases.
22129 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22130 !NodesToKeepBWs.contains(E.Idx) &&
22131 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22132 all_of(E.Scalars, [&](Value *V) {
22133 return V->hasOneUse() || isa<Constant>(V) ||
22134 (!V->hasNUsesOrMore(UsesLimit) &&
22135 none_of(V->users(), [&](User *U) {
22136 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22137 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22138 if (TEs.empty() || is_contained(TEs, UserTE))
22139 return false;
22140 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22141 SelectInst>(U) ||
22142 isa<SIToFPInst, UIToFPInst>(U) ||
22143 (UserTE->hasState() &&
22144 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22145 SelectInst>(UserTE->getMainOp()) ||
22146 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22147 return true;
22148 unsigned UserTESz = DL->getTypeSizeInBits(
22149 UserTE->Scalars.front()->getType());
22150 if (all_of(TEs, [&](const TreeEntry *TE) {
22151 auto It = MinBWs.find(TE);
22152 return It != MinBWs.end() &&
22153 It->second.first > UserTESz;
22154 }))
22155 return true;
22156 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22157 }));
22158 })) {
22159 ToDemote.push_back(E.Idx);
22160 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22161 auto It = MinBWs.find(UserTE);
22162 if (It != MinBWs.end())
22163 return It->second.first;
22164 unsigned MaxBitWidth =
22165 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22166 MaxBitWidth = bit_ceil(MaxBitWidth);
22167 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22168 MaxBitWidth = 8;
22169 return MaxBitWidth;
22170 }
22171
22172 if (!E.hasState())
22173 return 0u;
22174
22175 unsigned VF = E.getVectorFactor();
22176 Type *ScalarTy = E.Scalars.front()->getType();
22177 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22178 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22179 if (!TreeRootIT)
22180 return 0u;
22181
22182 if (any_of(E.Scalars,
22183 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22184 return 0u;
22185
22186 unsigned NumParts = ::getNumberOfParts(
22187 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22188
22189 // The maximum bit width required to represent all the values that can be
22190 // demoted without loss of precision. It would be safe to truncate the roots
22191 // of the expression to this width.
22192 unsigned MaxBitWidth = 1u;
22193
22194 // True if the roots can be zero-extended back to their original type,
22195 // rather than sign-extended. We know that if the leading bits are not
22196 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22197 // True.
22198 // Determine if the sign bit of all the roots is known to be zero. If not,
22199 // IsKnownPositive is set to False.
22200 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22201 if (isa<PoisonValue>(R))
22202 return true;
22203 KnownBits Known = computeKnownBits(R, *DL);
22204 return Known.isNonNegative();
22205 });
22206
22207 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22208 E.UserTreeIndex.UserTE->hasState() &&
22209 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22210 MaxBitWidth =
22211 std::min(DL->getTypeSizeInBits(
22212 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22213 DL->getTypeSizeInBits(ScalarTy));
22214
22215 // We first check if all the bits of the roots are demanded. If they're not,
22216 // we can truncate the roots to this narrower type.
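// For example, an i32 root with 25 known sign bits needs only 32 - 25 = 7
// bits (plus one more if the sign bit may be set), so an i8 element type is
// sufficient.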
22217 for (Value *Root : E.Scalars) {
22218 if (isa<PoisonValue>(Root))
22219 continue;
22220 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22221 TypeSize NumTypeBits =
22222 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22223 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22224 // If we can't prove that the sign bit is zero, we must add one to the
22225 // maximum bit width to account for the unknown sign bit. This preserves
22226 // the existing sign bit so we can safely sign-extend the root back to the
22227 // original type. Otherwise, if we know the sign bit is zero, we will
22228 // zero-extend the root instead.
22229 //
22230 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22231 // one to the maximum bit width will yield a larger-than-necessary
22232 // type. In general, we need to add an extra bit only if we can't
22233 // prove that the upper bit of the original type is equal to the
22234 // upper bit of the proposed smaller type. If these two bits are
22235 // the same (either zero or one) we know that sign-extending from
22236 // the smaller type will result in the same value. Here, since we
22237 // can't yet prove this, we are just making the proposed smaller
22238 // type larger to ensure correctness.
22239 if (!IsKnownPositive)
22240 ++BitWidth1;
22241
22242 auto *I = dyn_cast<Instruction>(Root);
22243 if (!I) {
22244 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22245 continue;
22246 }
22247 APInt Mask = DB->getDemandedBits(I);
22248 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22249 MaxBitWidth =
22250 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22251 }
22252
22253 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22254 MaxBitWidth = 8;
22255
22256 // If the original type is large but the reduced type does not improve the
22257 // register use - ignore it.
22258 if (NumParts > 1 &&
22259 NumParts ==
22260 ::getNumberOfParts(
22261 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22262 bit_ceil(MaxBitWidth)),
22263 VF)))
22264 return 0u;
22265
22266 unsigned Opcode = E.getOpcode();
22267 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22268 Opcode == Instruction::SExt ||
22269 Opcode == Instruction::ZExt || NumParts > 1;
22270 // Conservatively determine if we can actually truncate the roots of the
22271 // expression. Collect the values that can be demoted in ToDemote and
22272 // additional roots that require investigating in Roots.
22273 DenseSet<const TreeEntry *> Visited;
22274 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22275 bool NeedToDemote = IsProfitableToDemote;
22276
22277 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22278 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22279 NeedToDemote, IsTruncRoot) ||
22280 (MaxDepthLevel <= Limit &&
22281 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22282 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22283 DL->getTypeSizeInBits(TreeRootIT) /
22284 DL->getTypeSizeInBits(
22285 E.getMainOp()->getOperand(0)->getType()) >
22286 2)))))
22287 return 0u;
22288 // Round MaxBitWidth up to the next power-of-two.
22289 MaxBitWidth = bit_ceil(MaxBitWidth);
22290
22291 return MaxBitWidth;
22292 };
22293
22294 // If we can truncate the root, we must collect additional values that might
22295 // be demoted as a result. That is, those seeded by truncations we will
22296 // modify.
22297 // Add reduction ops sizes, if any.
22298 if (UserIgnoreList &&
22299 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22300 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22301 // x i1> to iN)).
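// E.g. summing the i1 elements is the same as counting the set bits of the
// bitcast mask, so a reduction bit width of 1 is sufficient here.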
22302 if (all_of(*UserIgnoreList,
22303 [](Value *V) {
22304 return isa<PoisonValue>(V) ||
22305 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22306 }) &&
22307 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22308 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22309 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22310 Builder.getInt1Ty()) {
22311 ReductionBitWidth = 1;
22312 } else {
22313 for (Value *V : *UserIgnoreList) {
22314 if (isa<PoisonValue>(V))
22315 continue;
22316 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22317 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22318 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22319 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22320 ++BitWidth1;
22321 unsigned BitWidth2 = BitWidth1;
22323 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22324 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22325 }
22326 ReductionBitWidth =
22327 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22328 }
22329 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22330 ReductionBitWidth = 8;
22331
22332 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22333 }
22334 }
22335 bool IsTopRoot = NodeIdx == 0;
22336 while (NodeIdx < VectorizableTree.size() &&
22337 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22338 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22339 RootDemotes.push_back(NodeIdx);
22340 ++NodeIdx;
22341 IsTruncRoot = true;
22342 }
22343 bool IsSignedCmp = false;
22344 if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
22345 return match(V, m_SMin(m_Value(), m_Value())) ||
22346 match(V, m_SMax(m_Value(), m_Value()));
22347 }))
22348 IsSignedCmp = true;
22349 while (NodeIdx < VectorizableTree.size()) {
22350 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22351 unsigned Limit = 2;
22352 if (IsTopRoot &&
22353 ReductionBitWidth ==
22354 DL->getTypeSizeInBits(
22355 VectorizableTree.front()->Scalars.front()->getType()))
22356 Limit = 3;
22357 unsigned MaxBitWidth = ComputeMaxBitWidth(
22358 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22359 IsTruncRoot, IsSignedCmp);
22360 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22361 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22362 ReductionBitWidth = bit_ceil(MaxBitWidth);
22363 else if (MaxBitWidth == 0)
22364 ReductionBitWidth = 0;
22365 }
22366
22367 for (unsigned Idx : RootDemotes) {
22368 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22369 uint32_t OrigBitWidth =
22370 DL->getTypeSizeInBits(V->getType()->getScalarType());
22371 if (OrigBitWidth > MaxBitWidth) {
22372 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22373 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22374 }
22375 return false;
22376 }))
22377 ToDemote.push_back(Idx);
22378 }
22379 RootDemotes.clear();
22380 IsTopRoot = false;
22381 IsProfitableToDemoteRoot = true;
22382
22383 if (ExtraBitWidthNodes.empty()) {
22384 NodeIdx = VectorizableTree.size();
22385 } else {
22386 unsigned NewIdx = 0;
22387 do {
22388 NewIdx = *ExtraBitWidthNodes.begin();
22389 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22390 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22391 NodeIdx = NewIdx;
22392 IsTruncRoot =
22393 NodeIdx < VectorizableTree.size() &&
22394 VectorizableTree[NodeIdx]->UserTreeIndex &&
22395 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22396 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22397 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22398 Instruction::Trunc &&
22399 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22400 IsSignedCmp =
22401 NodeIdx < VectorizableTree.size() &&
22402 VectorizableTree[NodeIdx]->UserTreeIndex &&
22403 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22404 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22405 Instruction::ICmp &&
22406 any_of(
22407 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22408 [&](Value *V) {
22409 auto *IC = dyn_cast<ICmpInst>(V);
22410 return IC && (IC->isSigned() ||
22411 !isKnownNonNegative(IC->getOperand(0),
22412 SimplifyQuery(*DL)) ||
22413 !isKnownNonNegative(IC->getOperand(1),
22414 SimplifyQuery(*DL)));
22415 });
22416 }
22417
22418 // If the maximum bit width we compute is less than the width of the roots'
22419 // type, we can proceed with the narrowing. Otherwise, do nothing.
22420 if (MaxBitWidth == 0 ||
22421 MaxBitWidth >=
22422 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22423 ->getBitWidth()) {
22424 if (UserIgnoreList)
22425 AnalyzedMinBWVals.insert_range(TreeRoot);
22426 NodesToKeepBWs.insert_range(ToDemote);
22427 continue;
22428 }
22429
22430 // Finally, map the values we can demote to the maximum bit width we
22431 // computed.
22432 for (unsigned Idx : ToDemote) {
22433 TreeEntry *TE = VectorizableTree[Idx].get();
22434 if (MinBWs.contains(TE))
22435 continue;
22436 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22437 if (isa<PoisonValue>(R))
22438 return false;
22439 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22440 });
22441 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22442 }
22443 }
22444}
22445
22446 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22447 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22448 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22449 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22450 auto *AA = &AM.getResult<AAManager>(F);
22451 auto *LI = &AM.getResult<LoopAnalysis>(F);
22452 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22453 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22454 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22455 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22456
22457 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22458 if (!Changed)
22459 return PreservedAnalyses::all();
22460
22461 PreservedAnalyses PA;
22462 PA.preserveSet<CFGAnalyses>();
22463 return PA;
22464}
22465
22466 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22467 TargetTransformInfo *TTI_,
22468 TargetLibraryInfo *TLI_, AAResults *AA_,
22469 LoopInfo *LI_, DominatorTree *DT_,
22470 AssumptionCache *AC_, DemandedBits *DB_,
22471 OptimizationRemarkEmitter *ORE_) {
22472 if (!RunSLPVectorization)
22473 return false;
22474 SE = SE_;
22475 TTI = TTI_;
22476 TLI = TLI_;
22477 AA = AA_;
22478 LI = LI_;
22479 DT = DT_;
22480 AC = AC_;
22481 DB = DB_;
22482 DL = &F.getDataLayout();
22483
22484 Stores.clear();
22485 GEPs.clear();
22486 bool Changed = false;
22487
22488 // If the target claims to have no vector registers don't attempt
22489 // vectorization.
22490 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22491 LLVM_DEBUG(
22492 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22493 return false;
22494 }
22495
22496 // Don't vectorize when the attribute NoImplicitFloat is used.
22497 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22498 return false;
22499
22500 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22501
22502 // Use the bottom up slp vectorizer to construct chains that start with
22503 // store instructions.
22504 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22505
22506 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22507 // delete instructions.
22508
22509 // Update DFS numbers now so that we can use them for ordering.
22510 DT->updateDFSNumbers();
22511
22512 // Scan the blocks in the function in post order.
22513 for (auto *BB : post_order(&F.getEntryBlock())) {
22514 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
22515 continue;
22516
22517 // Start new block - clear the list of reduction roots.
22518 R.clearReductionData();
22519 collectSeedInstructions(BB);
22520
22521 // Vectorize trees that end at stores.
22522 if (!Stores.empty()) {
22523 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22524 << " underlying objects.\n");
22525 Changed |= vectorizeStoreChains(R);
22526 }
22527
22528 // Vectorize trees that end at reductions.
22529 Changed |= vectorizeChainsInBlock(BB, R);
22530
22531 // Vectorize the index computations of getelementptr instructions. This
22532 // is primarily intended to catch gather-like idioms ending at
22533 // non-consecutive loads.
22534 if (!GEPs.empty()) {
22535 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22536 << " underlying objects.\n");
22537 Changed |= vectorizeGEPIndices(BB, R);
22538 }
22539 }
22540
22541 if (Changed) {
22542 R.optimizeGatherSequence();
22543 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22544 }
22545 return Changed;
22546}
22547
22548std::optional<bool>
22549SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22550 unsigned Idx, unsigned MinVF,
22551 unsigned &Size) {
22552 Size = 0;
22553 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22554 << "\n");
22555 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22556 unsigned VF = Chain.size();
22557
22558 if (!has_single_bit(Sz) ||
22559 !hasFullVectorsOrPowerOf2(
22560 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22561 VF) ||
22562 VF < 2 || VF < MinVF) {
22563 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22564 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22565 // all vector lanes are used.
22566 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22567 return false;
22568 }
22569
22570 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22571 << "\n");
22572
22573 SetVector<Value *> ValOps;
22574 for (Value *V : Chain)
22575 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22576 // Bail out if the operands do not share a main/alternate opcode or the unique values are not a power-of-2 count.
22577 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22578 InstructionsState S = Analysis.buildInstructionsState(
22579 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22580 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22581 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22582 bool IsAllowedSize =
22583 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22584 ValOps.size()) ||
22585 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22586 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22587 (!S.getMainOp()->isSafeToRemove() ||
22588 any_of(ValOps.getArrayRef(),
22589 [&](Value *V) {
22590 return !isa<ExtractElementInst>(V) &&
22591 (V->getNumUses() > Chain.size() ||
22592 any_of(V->users(), [&](User *U) {
22593 return !Stores.contains(U);
22594 }));
22595 }))) ||
22596 (ValOps.size() > Chain.size() / 2 && !S)) {
22597 Size = (!IsAllowedSize && S) ? 1 : 2;
22598 return false;
22599 }
22600 }
22601 if (R.isLoadCombineCandidate(Chain))
22602 return true;
22603 R.buildTree(Chain);
22604 // Check if the tree is tiny and the store itself or its stored value was not vectorized.
22605 if (R.isTreeTinyAndNotFullyVectorizable()) {
22606 if (R.isGathered(Chain.front()) ||
22607 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22608 return std::nullopt;
22609 Size = R.getCanonicalGraphSize();
22610 return false;
22611 }
22612 if (R.isProfitableToReorder()) {
22613 R.reorderTopToBottom();
22614 R.reorderBottomToTop();
22615 }
22616 R.transformNodes();
22617 R.buildExternalUses();
22618
22619 R.computeMinimumValueSizes();
22620
22621 Size = R.getCanonicalGraphSize();
22622 if (S && S.getOpcode() == Instruction::Load)
22623 Size = 2; // cut off masked gather small trees
22624 InstructionCost Cost = R.getTreeCost();
22625
22626 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22627 if (Cost < -SLPCostThreshold) {
22628 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22629
22630 using namespace ore;
22631
22632 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22633 cast<StoreInst>(Chain[0]))
22634 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22635 << " and with tree size "
22636 << NV("TreeSize", R.getTreeSize()));
22637
22638 R.vectorizeTree();
22639 return true;
22640 }
22641
22642 return false;
22643}
22644
22645 /// Checks that the recorded tree sizes are roughly uniform: their quadratic mean deviation must be small compared to the mean size.
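/// Concretely, with Dev being the mean of the squared deviations from the
/// mean, this returns true when Dev * 96 < Mean * Mean (integer division),
/// i.e. the standard deviation is roughly within 10% of the mean.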
22646static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22647 bool First) {
22648 unsigned Num = 0;
22649 uint64_t Sum = std::accumulate(
22650 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22651 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22652 unsigned Size = First ? Val.first : Val.second;
22653 if (Size == 1)
22654 return V;
22655 ++Num;
22656 return V + Size;
22657 });
22658 if (Num == 0)
22659 return true;
22660 uint64_t Mean = Sum / Num;
22661 if (Mean == 0)
22662 return true;
22663 uint64_t Dev = std::accumulate(
22664 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22665 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22666 unsigned P = First ? Val.first : Val.second;
22667 if (P == 1)
22668 return V;
22669 return V + (P - Mean) * (P - Mean);
22670 }) /
22671 Num;
22672 return Dev * 96 / (Mean * Mean) == 0;
22673}
22674
22675namespace {
22676
22677/// A group of stores that we'll try to bundle together using vector ops.
22678/// They are ordered using the signed distance of their address operand to the
22679/// address of this group's BaseInstr.
22680class RelatedStoreInsts {
22681public:
22682 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22683 : AllStores(AllStores) {
22684 reset(BaseInstrIdx);
22685 }
22686
22687 void reset(unsigned NewBaseInstr) {
22688 assert(NewBaseInstr < AllStores.size() &&
22689 "Instruction index out of bounds");
22690 BaseInstrIdx = NewBaseInstr;
22691 Instrs.clear();
22692 insertOrLookup(NewBaseInstr, 0);
22693 }
22694
22695 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22696 /// \p PtrDist.
22697 /// Does nothing if there is already a store with that \p PtrDist.
22698 /// \returns The previously associated instruction index, or std::nullopt if the insertion succeeded.
22699 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22700 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22701 return Inserted ? std::nullopt : std::make_optional(It->second);
22702 }
22703
22704 using DistToInstMap = std::map<int64_t, unsigned>;
22705 const DistToInstMap &getStores() const { return Instrs; }
22706
22707 /// If \p SI is related to this group of stores, return the distance of its
22708 /// pointer operand to that of the group's BaseInstr.
22709 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22710 ScalarEvolution &SE) const {
22711 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22712 return getPointersDiff(
22713 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22714 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22715 /*StrictCheck=*/true);
22716 }
22717
22718 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22719 /// Stores whose index is less than \p MinSafeIdx will be dropped.
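/// For example, if the new base lies at distance \p DistFromCurBase from the
/// current base, a store previously recorded at distance P is re-inserted at
/// distance P - DistFromCurBase.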
22720 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22721 int64_t DistFromCurBase) {
22722 DistToInstMap PrevSet = std::move(Instrs);
22723 reset(NewBaseInstIdx);
22724
22725 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22726 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22727 // reference.
22728 for (auto [Dist, InstIdx] : PrevSet) {
22729 if (InstIdx >= MinSafeIdx)
22730 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22731 }
22732 }
22733
22734 /// Remove all stores that have been vectorized from this group.
22735 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22736 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22737 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22738 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22739 });
22740
22741 // Get a forward iterator pointing after the last vectorized store and erase
22742 // all stores before it so we don't try to vectorize them again.
22743 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22744 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22745 }
22746
22747private:
22748 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22749 unsigned BaseInstrIdx;
22750
22751 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22752 DistToInstMap Instrs;
22753
22754 /// Reference to all the stores in the BB being analyzed.
22755 ArrayRef<StoreInst *> AllStores;
22756};
22757
22758} // end anonymous namespace
22759
22760bool SLPVectorizerPass::vectorizeStores(
22761 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22762 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
22763 &Visited) {
22764 // We may run into multiple chains that merge into a single chain. We mark the
22765 // stores that we vectorized so that we don't visit the same store twice.
22766 BoUpSLP::ValueSet VectorizedStores;
22767 bool Changed = false;
22768
22769 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
22770 int64_t PrevDist = -1;
22771 SmallVector<Value *> Operands;
22772 // Collect the chain into a list.
22773 for (auto [Idx, Data] : enumerate(StoreSeq)) {
22774 auto &[Dist, InstIdx] = Data;
22775 if (Operands.empty() || Dist - PrevDist == 1) {
22776 Operands.push_back(Stores[InstIdx]);
22777 PrevDist = Dist;
22778 if (Idx != StoreSeq.size() - 1)
22779 continue;
22780 }
22781 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
22782 Operands.clear();
22783 Operands.push_back(Stores[InstIdx]);
22784 PrevDist = Dist;
22785 });
22786
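// A candidate chain is identified by its two end stores, the values stored
// at both ends, and its length; chains already present in Visited are
// skipped so the same range of stores is not re-analyzed.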
22787 if (Operands.size() <= 1 ||
22788 !Visited
22789 .insert({Operands.front(),
22790 cast<StoreInst>(Operands.front())->getValueOperand(),
22791 Operands.back(),
22792 cast<StoreInst>(Operands.back())->getValueOperand(),
22793 Operands.size()})
22794 .second)
22795 continue;
22796
22797 unsigned MaxVecRegSize = R.getMaxVecRegSize();
22798 unsigned EltSize = R.getVectorElementSize(Operands[0]);
22799 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
22800
22801 unsigned MaxVF =
22802 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
22803 auto *Store = cast<StoreInst>(Operands[0]);
22804 Type *StoreTy = Store->getValueOperand()->getType();
22805 Type *ValueTy = StoreTy;
22806 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
22807 ValueTy = Trunc->getSrcTy();
22808 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType, but
22809 // getStoreMinimumVF only supports scalar types as arguments. As a result,
22810 // we need to use the element types of StoreTy and ValueTy to retrieve the
22811 // VF and then transform it back.
22812 // Remember: VF is defined as the number of values we want to vectorize,
22813 // not the number of elements in the final vector.
22814 Type *StoreScalarTy = StoreTy->getScalarType();
22815 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
22816 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
22817 ValueTy->getScalarType()));
22818 MinVF /= getNumElements(StoreTy);
22819 MinVF = std::max<unsigned>(2, MinVF);
22820
22821 if (MaxVF < MinVF) {
22822 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22823 << ") < "
22824 << "MinVF (" << MinVF << ")\n");
22825 continue;
22826 }
22827
22828 unsigned NonPowerOf2VF = 0;
22829 if (VectorizeNonPowerOf2) {
22830 // First try vectorizing with a non-power-of-2 VF. At the moment, only
22831 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
22832 // lanes are used.
22833 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
22834 if (has_single_bit(CandVF + 1)) {
22835 NonPowerOf2VF = CandVF;
22836 assert(NonPowerOf2VF != MaxVF &&
22837 "Non-power-of-2 VF should not be equal to MaxVF");
22838 }
22839 }
22840
22841 // MaxRegVF represents the number of instructions (scalar, or vector in
22842 // case of revec) that can be vectorized to naturally fit in a vector
22843 // register.
22844 unsigned MaxRegVF = MaxVF;
22845
22846 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
22847 if (MaxVF < MinVF) {
22848 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22849 << ") < "
22850 << "MinVF (" << MinVF << ")\n");
22851 continue;
22852 }
22853
22854 SmallVector<unsigned> CandidateVFs;
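// E.g. with MinVF = 2 and MaxVF = 16 the candidates are 16, 8, 4, 2; a
// non-power-of-2 start such as 7 gives 7, 4, 2.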
22855 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
22856 VF = divideCeil(VF, 2))
22857 CandidateVFs.push_back(VF);
22858
22859 unsigned End = Operands.size();
22860 unsigned Repeat = 0;
22861 constexpr unsigned MaxAttempts = 4;
22862 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
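// Per-store bookkeeping: a zero entry means the store has been vectorized,
// a non-zero entry records the largest vectorization tree size seen for it
// so far (initially 1); the two fields track attempts with VFs below and
// at/above MaxRegVF separately.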
22863 for (std::pair<unsigned, unsigned> &P : RangeSizes)
22864 P.first = P.second = 1;
22865 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
22866 auto IsNotVectorized = [](bool First,
22867 const std::pair<unsigned, unsigned> &P) {
22868 return First ? P.first > 0 : P.second > 0;
22869 };
22870 auto IsVectorized = [](bool First,
22871 const std::pair<unsigned, unsigned> &P) {
22872 return First ? P.first == 0 : P.second == 0;
22873 };
22874 auto VFIsProfitable = [](bool First, unsigned Size,
22875 const std::pair<unsigned, unsigned> &P) {
22876 return First ? Size >= P.first : Size >= P.second;
22877 };
22878 auto FirstSizeSame = [](unsigned Size,
22879 const std::pair<unsigned, unsigned> &P) {
22880 return Size == P.first;
22881 };
22882 while (true) {
22883 ++Repeat;
22884 bool RepeatChanged = false;
22885 bool AnyProfitableGraph = false;
22886 for (unsigned VF : CandidateVFs) {
22887 AnyProfitableGraph = false;
22888 unsigned FirstUnvecStore =
22889 std::distance(RangeSizes.begin(),
22890 find_if(RangeSizes, std::bind(IsNotVectorized,
22891 VF >= MaxRegVF, _1)));
22892
22893 // Form slices of size VF starting from FirstUnvecStore and try to
22894 // vectorize them.
22895 while (FirstUnvecStore < End) {
22896 unsigned FirstVecStore = std::distance(
22897 RangeSizes.begin(),
22898 find_if(RangeSizes.drop_front(FirstUnvecStore),
22899 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
22900 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
22901 for (unsigned SliceStartIdx = FirstUnvecStore;
22902 SliceStartIdx + VF <= MaxSliceEnd;) {
22903 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
22904 VF >= MaxRegVF)) {
22905 ++SliceStartIdx;
22906 continue;
22907 }
22908 ArrayRef<Value *> Slice =
22909 ArrayRef(Operands).slice(SliceStartIdx, VF);
22910 assert(all_of(Slice,
22911 [&](Value *V) {
22912 return cast<StoreInst>(V)
22913 ->getValueOperand()
22914 ->getType() ==
22915 cast<StoreInst>(Slice.front())
22916 ->getValueOperand()
22917 ->getType();
22918 }) &&
22919 "Expected all operands of same type.");
22920 if (!NonSchedulable.empty()) {
22921 auto [NonSchedSizeMax, NonSchedSizeMin] =
22922 NonSchedulable.lookup(Slice.front());
22923 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
22924 // VF is too ambitious. Try to vectorize another slice before
22925 // trying a smaller VF.
22926 SliceStartIdx += NonSchedSizeMax;
22927 continue;
22928 }
22929 }
22930 unsigned TreeSize;
22931 std::optional<bool> Res =
22932 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
22933 if (!Res) {
22934 // Update the range of non schedulable VFs for slices starting
22935 // at SliceStartIdx.
22936 NonSchedulable
22937 .try_emplace(Slice.front(), std::make_pair(VF, VF))
22938 .first->getSecond()
22939 .second = VF;
22940 } else if (*Res) {
22941 // Mark the vectorized stores so that we don't vectorize them
22942 // again.
22943 VectorizedStores.insert_range(Slice);
22944 // Record that this attempt found a profitable graph and that the
22945 // function was changed.
22946 AnyProfitableGraph = RepeatChanged = Changed = true;
22947 // If we vectorized initial block, no need to try to vectorize
22948 // it again.
22949 for (std::pair<unsigned, unsigned> &P :
22950 RangeSizes.slice(SliceStartIdx, VF))
22951 P.first = P.second = 0;
22952 if (SliceStartIdx < FirstUnvecStore + MinVF) {
22953 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
22954 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
22955 P.first = P.second = 0;
22956 FirstUnvecStore = SliceStartIdx + VF;
22957 }
22958 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
22959 for (std::pair<unsigned, unsigned> &P :
22960 RangeSizes.slice(SliceStartIdx + VF,
22961 MaxSliceEnd - (SliceStartIdx + VF)))
22962 P.first = P.second = 0;
22963 if (MaxSliceEnd == End)
22964 End = SliceStartIdx;
22965 MaxSliceEnd = SliceStartIdx;
22966 }
22967 SliceStartIdx += VF;
22968 continue;
22969 }
22970 if (VF > 2 && Res &&
22971 !all_of(RangeSizes.slice(SliceStartIdx, VF),
22972 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
22973 _1))) {
22974 SliceStartIdx += VF;
22975 continue;
22976 }
22977 // For very large VFs, check that we are not just rebuilding the
22978 // same trees with a larger number of elements.
22979 if (VF > MaxRegVF && TreeSize > 1 &&
22980 all_of(RangeSizes.slice(SliceStartIdx, VF),
22981 std::bind(FirstSizeSame, TreeSize, _1))) {
22982 SliceStartIdx += VF;
22983 while (SliceStartIdx != MaxSliceEnd &&
22984 RangeSizes[SliceStartIdx].first == TreeSize)
22985 ++SliceStartIdx;
22986 continue;
22987 }
22988 if (TreeSize > 1) {
22989 for (std::pair<unsigned, unsigned> &P :
22990 RangeSizes.slice(SliceStartIdx, VF)) {
22991 if (VF >= MaxRegVF)
22992 P.second = std::max(P.second, TreeSize);
22993 else
22994 P.first = std::max(P.first, TreeSize);
22995 }
22996 }
22997 ++SliceStartIdx;
22998 AnyProfitableGraph = true;
22999 }
23000 if (FirstUnvecStore >= End)
23001 break;
23002 if (MaxSliceEnd - FirstUnvecStore < VF &&
23003 MaxSliceEnd - FirstUnvecStore >= MinVF)
23004 AnyProfitableGraph = true;
23005 FirstUnvecStore = std::distance(
23006 RangeSizes.begin(),
23007 find_if(RangeSizes.drop_front(MaxSliceEnd),
23008 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23009 }
23010 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23011 break;
23012 }
23013 // All values vectorized - exit.
23014 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23015 return P.first == 0 && P.second == 0;
23016 }))
23017 break;
23018 // Stop if all attempts were tried or the remaining attempts cannot help.
23019 if (Repeat >= MaxAttempts ||
23020 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23021 break;
23022 constexpr unsigned StoresLimit = 64;
23023 const unsigned MaxTotalNum = std::min<unsigned>(
23024 Operands.size(),
23025 static_cast<unsigned>(
23026 End -
23027 std::distance(
23028 RangeSizes.begin(),
23029 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23030 1));
23031 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23032 unsigned Limit =
23033 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23034 CandidateVFs.clear();
23035 if (bit_floor(Limit) == VF)
23036 CandidateVFs.push_back(Limit);
23037 if (VF > MaxTotalNum || VF >= StoresLimit)
23038 break;
23039 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23040 if (P.first != 0)
23041 P.first = std::max(P.second, P.first);
23042 }
23043 // As a last attempt, try to vectorize the maximum number of elements
23044 // in case all previous attempts failed only because of cost.
23045 CandidateVFs.push_back(VF);
23046 }
23047 }
23048 };
23049
23050 /// Groups of stores to vectorize
23051 SmallVector<RelatedStoreInsts> SortedStores;
23052
23053 // Inserts the specified store SI with the given index Idx into the set of
23054 // stores. If a store with the same distance is already present, stop
23055 // inserting and try to vectorize the stores collected so far. If some stores
23056 // from this sequence were not vectorized, try to vectorize them together
23057 // with the new store later, but only the stores that come before the
23058 // previous store with the same distance.
23059 // Example:
23060 // 1. store x, %p
23061 // 2. store y, %p+1
23062 // 3. store z, %p+2
23063 // 4. store a, %p
23064 // 5. store b, %p+3
23065 // - Scan this from the last to first store. The very first bunch of stores is
23066 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23067 // vector).
23068 // - The next store in the list - #1 - has the same distance from store #5 as
23069 // the store #4.
23070 // - Try to vectorize sequence of stores 4,2,3,5.
23071 // - If all these stores are vectorized - just drop them.
23072 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23073 // - Start new stores sequence.
23074 // The new bunch of stores is {1, {1, 0}}.
23075 // - Add the stores from previous sequence, that were not vectorized.
23076 // Here we consider the stores in reverse order relative to how they appear
23077 // in the IR (Stores are reversed already, see vectorizeStoreChains()).
23078 // Store #3 can be added -> comes after store #4 with the same distance as
23079 // store #1.
23080 // Store #5 cannot be added - comes before store #4.
23081 // This logic improves compile time: stores that come after the previous
23082 // store with the same distance most likely have memory dependencies, so
23083 // there is no need to spend compile time trying to vectorize them.
23084 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23085 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23086 std::optional<int64_t> PtrDist;
23087 auto *RelatedStores = find_if(
23088 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23089 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23090 return PtrDist.has_value();
23091 });
23092
23093 // We did not find a comparable store, start a new group.
23094 if (RelatedStores == SortedStores.end()) {
23095 SortedStores.emplace_back(Idx, Stores);
23096 return;
23097 }
23098
23099 // If there is already a store in the group with the same PtrDiff, try to
23100 // vectorize the existing instructions before adding the current store.
23101 // Otherwise, insert this store and keep collecting.
23102 if (std::optional<unsigned> PrevInst =
23103 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23104 TryToVectorize(RelatedStores->getStores());
23105 RelatedStores->clearVectorizedStores(VectorizedStores);
23106 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23107 /*NewBaseInstIdx=*/Idx,
23108 /*DistFromCurBase=*/*PtrDist);
23109 }
23110 };
23111 Type *PrevValTy = nullptr;
23112 for (auto [I, SI] : enumerate(Stores)) {
23113 if (R.isDeleted(SI))
23114 continue;
23115 if (!PrevValTy)
23116 PrevValTy = SI->getValueOperand()->getType();
23117 // Check that we do not try to vectorize stores of different types.
23118 if (PrevValTy != SI->getValueOperand()->getType()) {
23119 for (RelatedStoreInsts &StoreSeq : SortedStores)
23120 TryToVectorize(StoreSeq.getStores());
23121 SortedStores.clear();
23122 PrevValTy = SI->getValueOperand()->getType();
23123 }
23124 FillStoresSet(I, SI);
23125 }
23126
23127 // Final vectorization attempt.
23128 for (RelatedStoreInsts &StoreSeq : SortedStores)
23129 TryToVectorize(StoreSeq.getStores());
23130
23131 return Changed;
23132}
23133
23134void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23135 // Initialize the collections. We will make a single pass over the block.
23136 Stores.clear();
23137 GEPs.clear();
23138
23139 // Visit the store and getelementptr instructions in BB and organize them in
23140 // Stores and GEPs according to the underlying objects of their pointer
23141 // operands.
23142 for (Instruction &I : *BB) {
23143 // Ignore store instructions that are volatile or have a pointer operand
23144 // that doesn't point to a scalar type.
23145 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23146 if (!SI->isSimple())
23147 continue;
23148 if (!isValidElementType(SI->getValueOperand()->getType()))
23149 continue;
23150 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23151 }
23152
23153 // Ignore getelementptr instructions that have more than one index, a
23154 // constant index, or a pointer operand that doesn't point to a scalar
23155 // type.
23156 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23157 if (GEP->getNumIndices() != 1)
23158 continue;
23159 Value *Idx = GEP->idx_begin()->get();
23160 if (isa<Constant>(Idx))
23161 continue;
23162 if (!isValidElementType(Idx->getType()))
23163 continue;
23164 if (GEP->getType()->isVectorTy())
23165 continue;
23166 GEPs[GEP->getPointerOperand()].push_back(GEP);
23167 }
23168 }
23169}
23170
23171bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23172 bool MaxVFOnly) {
23173 if (VL.size() < 2)
23174 return false;
23175
23176 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23177 << VL.size() << ".\n");
23178
23179 // Check that all of the parts are instructions of the same type,
23180 // we permit an alternate opcode via InstructionsState.
23181 InstructionsState S = getSameOpcode(VL, *TLI);
23182 if (!S)
23183 return false;
23184
23185 Instruction *I0 = S.getMainOp();
23186 // Make sure invalid types (including vector type) are rejected before
23187 // determining vectorization factor for scalar instructions.
23188 for (Value *V : VL) {
23189 Type *Ty = V->getType();
23190 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
23191 // NOTE: the following prints the internal LLVM type name, which may not
23192 // be meaningful to the user.
23193 R.getORE()->emit([&]() {
23194 std::string TypeStr;
23196 Ty->print(OS);
23197 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23198 << "Cannot SLP vectorize list: type "
23199 << TypeStr + " is unsupported by vectorizer";
23200 });
23201 return false;
23202 }
23203 }
23204
23205 Type *ScalarTy = getValueType(VL[0]);
23206 unsigned Sz = R.getVectorElementSize(I0);
23207 unsigned MinVF = R.getMinVF(Sz);
23208 unsigned MaxVF = std::max<unsigned>(
23209 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23210 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23211 if (MaxVF < 2) {
23212 R.getORE()->emit([&]() {
23213 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23214 << "Cannot SLP vectorize list: vectorization factor "
23215 << "less than 2 is not supported";
23216 });
23217 return false;
23218 }
23219
23220 bool Changed = false;
23221 bool CandidateFound = false;
23222 InstructionCost MinCost = SLPCostThreshold.getValue();
23223
23224 unsigned NextInst = 0, MaxInst = VL.size();
23225 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23226 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23227 // No actual vectorization should happen if the number of parts equals the
23228 // provided vectorization factor (i.e. the scalar type would effectively be
23229 // used for the vector code during codegen).
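// E.g. if the widened type would be split back into VF independent parts by
// the target, the "vector" code is really scalar code plus shuffles.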
23230 auto *VecTy = getWidenedType(ScalarTy, VF);
23231 if (TTI->getNumberOfParts(VecTy) == VF)
23232 continue;
23233 for (unsigned I = NextInst; I < MaxInst; ++I) {
23234 unsigned ActualVF = std::min(MaxInst - I, VF);
23235
23236 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23237 continue;
23238
23239 if (MaxVFOnly && ActualVF < MaxVF)
23240 break;
23241 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23242 break;
23243
23244 SmallVector<Value *> Ops(ActualVF, nullptr);
23245 unsigned Idx = 0;
23246 for (Value *V : VL.drop_front(I)) {
23247 // Check that a previous iteration of this loop did not delete the
23248 // Value.
23249 if (auto *Inst = dyn_cast<Instruction>(V);
23250 !Inst || !R.isDeleted(Inst)) {
23251 Ops[Idx] = V;
23252 ++Idx;
23253 if (Idx == ActualVF)
23254 break;
23255 }
23256 }
23257 // Not enough vectorizable instructions - exit.
23258 if (Idx != ActualVF)
23259 break;
23260
23261 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23262 << "\n");
23263
23264 R.buildTree(Ops);
23265 if (R.isTreeTinyAndNotFullyVectorizable())
23266 continue;
23267 if (R.isProfitableToReorder()) {
23268 R.reorderTopToBottom();
23269 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23270 }
23271 R.transformNodes();
23272 R.buildExternalUses();
23273
23274 R.computeMinimumValueSizes();
23275 InstructionCost Cost = R.getTreeCost();
23276 CandidateFound = true;
23277 MinCost = std::min(MinCost, Cost);
23278
23279 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23280 << " for VF=" << ActualVF << "\n");
23281 if (Cost < -SLPCostThreshold) {
23282 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23283 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23284 cast<Instruction>(Ops[0]))
23285 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23286 << " and with tree size "
23287 << ore::NV("TreeSize", R.getTreeSize()));
23288
23289 R.vectorizeTree();
23290 // Move to the next bundle.
23291 I += VF - 1;
23292 NextInst = I + 1;
23293 Changed = true;
23294 }
23295 }
23296 }
23297
23298 if (!Changed && CandidateFound) {
23299 R.getORE()->emit([&]() {
23300 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23301 << "List vectorization was possible but not beneficial with cost "
23302 << ore::NV("Cost", MinCost) << " >= "
23303 << ore::NV("Treshold", -SLPCostThreshold);
23304 });
23305 } else if (!Changed) {
23306 R.getORE()->emit([&]() {
23307 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23308 << "Cannot SLP vectorize list: vectorization was impossible"
23309 << " with available vectorization factors";
23310 });
23311 }
23312 return Changed;
23313}
23314
23315namespace {
23316
23317/// Model horizontal reductions.
23318///
23319/// A horizontal reduction is a tree of reduction instructions that has values
23320/// that can be put into a vector as its leaves. For example:
23321///
23322/// mul mul mul mul
23323/// \ / \ /
23324/// + +
23325/// \ /
23326/// +
23327/// This tree has "mul" as its leaf values and "+" as its reduction
23328/// instructions. A reduction can feed into a store or a binary operation
23329/// feeding a phi.
23330/// ...
23331/// \ /
23332/// +
23333/// |
23334/// phi +=
23335///
23336/// Or:
23337/// ...
23338/// \ /
23339/// +
23340/// |
23341/// *p =
23342///
23343class HorizontalReduction {
23344 using ReductionOpsType = SmallVector<Value *, 16>;
23345 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23346 ReductionOpsListType ReductionOps;
23347 /// List of possibly reduced values.
23348 SmallVector<SmallVector<Value *>> ReducedVals;
23349 /// Maps reduced value to the corresponding reduction operation.
23350 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
23351 WeakTrackingVH ReductionRoot;
23352 /// The type of reduction operation.
23353 RecurKind RdxKind;
23354 /// Checks if the optimization of original scalar identity operations on
23355 /// matched horizontal reductions is enabled and allowed.
23356 bool IsSupportedHorRdxIdentityOp = false;
23357 /// The minimum number of the reduced values.
23358 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23359 /// Contains vector values for reduction including their scale factor and
23360 /// signedness.
23361 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23362
23363 static bool isCmpSelMinMax(Instruction *I) {
23364 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23365 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
23366 }
23367
23368 // And/or are potentially poison-safe logical patterns like:
23369 // select x, y, false
23370 // select x, true, y
23371 static bool isBoolLogicOp(Instruction *I) {
23372 return isa<SelectInst>(I) &&
23373 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23374 }
23375
23376 /// Checks if instruction is associative and can be vectorized.
23377 static bool isVectorizable(RecurKind Kind, Instruction *I,
23378 bool TwoElementReduction = false) {
23379 if (Kind == RecurKind::None)
23380 return false;
23381
23382 // Integer ops that map to select instructions or intrinsics are fine.
23383 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23384 isBoolLogicOp(I))
23385 return true;
23386
23387 // No need to check for associativity if there are only 2 reduced values.
23388 if (TwoElementReduction)
23389 return true;
23390
23391 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23392 // FP min/max are associative except for NaN and -0.0. We do not
23393 // have to rule out -0.0 here because the intrinsic semantics do not
23394 // specify a fixed result for it.
23395 return I->getFastMathFlags().noNaNs();
23396 }
23397
23398 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23399 return true;
23400
23401 return I->isAssociative();
23402 }
23403
23404 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23405 // Poison-safe 'or' takes the form: select X, true, Y
23406 // To make that work with the normal operand processing, we skip the
23407 // true value operand.
23408 // TODO: Change the code and data structures to handle this without a hack.
23409 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23410 return I->getOperand(2);
23411 return I->getOperand(Index);
23412 }
23413
23414 /// Creates reduction operation with the current opcode.
23415 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23416 Value *RHS, const Twine &Name, bool UseSelect) {
23417 Type *OpTy = LHS->getType();
23418 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23419 switch (Kind) {
23420 case RecurKind::Or: {
23421 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23422 return Builder.CreateSelect(
23423 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23424 RHS, Name);
23425 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23426 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23427 Name);
23428 }
23429 case RecurKind::And: {
23430 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23431 return Builder.CreateSelect(
23432 LHS, RHS,
23433 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
23434 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23435 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23436 Name);
23437 }
23438 case RecurKind::Add:
23439 case RecurKind::Mul:
23440 case RecurKind::Xor:
23441 case RecurKind::FAdd:
23442 case RecurKind::FMul: {
23443 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23444 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23445 Name);
23446 }
23447 case RecurKind::SMax:
23448 case RecurKind::SMin:
23449 case RecurKind::UMax:
23450 case RecurKind::UMin:
23451 if (UseSelect) {
23452 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23453 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23454 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
23455 }
23456 [[fallthrough]];
23457 case RecurKind::FMax:
23458 case RecurKind::FMin:
23459 case RecurKind::FMaximum:
23460 case RecurKind::FMinimum:
23461 case RecurKind::FMaximumNum:
23462 case RecurKind::FMinimumNum: {
23463 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
23464 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23465 }
23466 default:
23467 llvm_unreachable("Unknown reduction operation.");
23468 }
23469 }
23470
23471 /// Creates reduction operation with the current opcode with the IR flags
23472 /// from \p ReductionOps, dropping nuw/nsw flags.
23473 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23474 Value *RHS, const Twine &Name,
23475 const ReductionOpsListType &ReductionOps) {
23476 bool UseSelect = ReductionOps.size() == 2 ||
23477 // Logical or/and.
23478 (ReductionOps.size() == 1 &&
23479 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23480 assert((!UseSelect || ReductionOps.size() != 2 ||
23481 isa<SelectInst>(ReductionOps[1][0])) &&
23482 "Expected cmp + select pairs for reduction");
23483 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23484 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23485 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23486 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23487 /*IncludeWrapFlags=*/false);
23488 propagateIRFlags(Op, ReductionOps[1], nullptr,
23489 /*IncludeWrapFlags=*/false);
23490 return Op;
23491 }
23492 }
23493 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23494 return Op;
23495 }
23496
23497public:
23498 static RecurKind getRdxKind(Value *V) {
23499 auto *I = dyn_cast<Instruction>(V);
23500 if (!I)
23501 return RecurKind::None;
23502 if (match(I, m_Add(m_Value(), m_Value())))
23503 return RecurKind::Add;
23504 if (match(I, m_Mul(m_Value(), m_Value())))
23505 return RecurKind::Mul;
23506 if (match(I, m_And(m_Value(), m_Value())) ||
23507 match(I, m_LogicalAnd(m_Value(), m_Value())))
23508 return RecurKind::And;
23509 if (match(I, m_Or(m_Value(), m_Value())) ||
23510 match(I, m_LogicalOr(m_Value(), m_Value())))
23511 return RecurKind::Or;
23512 if (match(I, m_Xor(m_Value(), m_Value())))
23513 return RecurKind::Xor;
23514 if (match(I, m_FAdd(m_Value(), m_Value())))
23515 return RecurKind::FAdd;
23516 if (match(I, m_FMul(m_Value(), m_Value())))
23517 return RecurKind::FMul;
23518
23519 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
23520 return RecurKind::FMax;
23521 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
23522 return RecurKind::FMin;
23523
23524 if (match(I, m_FMaximum(m_Value(), m_Value())))
23525 return RecurKind::FMaximum;
23526 if (match(I, m_FMinimum(m_Value(), m_Value())))
23527 return RecurKind::FMinimum;
23528 // This matches either cmp+select or intrinsics. SLP is expected to handle
23529 // either form.
23530 // TODO: If we are canonicalizing to intrinsics, we can remove several
23531 // special-case paths that deal with selects.
23532 if (match(I, m_SMax(m_Value(), m_Value())))
23533 return RecurKind::SMax;
23534 if (match(I, m_SMin(m_Value(), m_Value())))
23535 return RecurKind::SMin;
23536 if (match(I, m_UMax(m_Value(), m_Value())))
23537 return RecurKind::UMax;
23538 if (match(I, m_UMin(m_Value(), m_Value())))
23539 return RecurKind::UMin;
23540
23541 if (auto *Select = dyn_cast<SelectInst>(I)) {
23542 // Try harder: look for min/max pattern based on instructions producing
23543 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23544 // During the intermediate stages of SLP, it's very common to have
23545 // pattern like this (since optimizeGatherSequence is run only once
23546 // at the end):
23547 // %1 = extractelement <2 x i32> %a, i32 0
23548 // %2 = extractelement <2 x i32> %a, i32 1
23549 // %cond = icmp sgt i32 %1, %2
23550 // %3 = extractelement <2 x i32> %a, i32 0
23551 // %4 = extractelement <2 x i32> %a, i32 1
23552 // %select = select i1 %cond, i32 %3, i32 %4
23553 CmpPredicate Pred;
23554 Instruction *L1;
23555 Instruction *L2;
23556
23557 Value *LHS = Select->getTrueValue();
23558 Value *RHS = Select->getFalseValue();
23559 Value *Cond = Select->getCondition();
23560
23561 // TODO: Support inverse predicates.
23562 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23563 if (!isa<ExtractElementInst>(RHS) ||
23564 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23565 return RecurKind::None;
23566 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23567 if (!isa<ExtractElementInst>(LHS) ||
23568 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23569 return RecurKind::None;
23570 } else {
23571 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23572 return RecurKind::None;
23573 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23574 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23575 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23576 return RecurKind::None;
23577 }
23578
23579 switch (Pred) {
23580 default:
23581 return RecurKind::None;
23582 case CmpInst::ICMP_SGT:
23583 case CmpInst::ICMP_SGE:
23584 return RecurKind::SMax;
23585 case CmpInst::ICMP_SLT:
23586 case CmpInst::ICMP_SLE:
23587 return RecurKind::SMin;
23588 case CmpInst::ICMP_UGT:
23589 case CmpInst::ICMP_UGE:
23590 return RecurKind::UMax;
23591 case CmpInst::ICMP_ULT:
23592 case CmpInst::ICMP_ULE:
23593 return RecurKind::UMin;
23594 }
23595 }
23596 return RecurKind::None;
23597 }
23598
23599 /// Get the index of the first operand.
23600 static unsigned getFirstOperandIndex(Instruction *I) {
23601 return isCmpSelMinMax(I) ? 1 : 0;
23602 }
23603
23604private:
23605 /// Total number of operands in the reduction operation.
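/// A cmp+select min/max spans three operands (condition, true value, false
/// value); every other supported reduction is treated as a binary operation.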
23606 static unsigned getNumberOfOperands(Instruction *I) {
23607 return isCmpSelMinMax(I) ? 3 : 2;
23608 }
23609
23610 /// Checks if the instruction is in basic block \p BB.
23611 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23612 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23613 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23614 auto *Sel = cast<SelectInst>(I);
23615 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23616 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23617 }
23618 return I->getParent() == BB;
23619 }
23620
23621 /// Expected number of uses for reduction operations/reduced values.
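/// E.g. in a select-based min/max chain the inner select feeds both the next
/// compare and the next select (two uses), while a plain binary reduction op
/// feeds only the next reduction op (one use).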
23622 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23623 if (IsCmpSelMinMax) {
23624 // The SelectInst must be used twice, while its condition must have a
23625 // single use only.
23626 if (auto *Sel = dyn_cast<SelectInst>(I))
23627 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23628 return I->hasNUses(2);
23629 }
23630
23631 // Arithmetic reduction operation must be used once only.
23632 return I->hasOneUse();
23633 }
23634
23635 /// Initializes the list of reduction operations.
23636 void initReductionOps(Instruction *I) {
23637 if (isCmpSelMinMax(I))
23638 ReductionOps.assign(2, ReductionOpsType());
23639 else
23640 ReductionOps.assign(1, ReductionOpsType());
23641 }
23642
23643 /// Add all reduction operations for the reduction instruction \p I.
23644 void addReductionOps(Instruction *I) {
23645 if (isCmpSelMinMax(I)) {
23646 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23647 ReductionOps[1].emplace_back(I);
23648 } else {
23649 ReductionOps[0].emplace_back(I);
23650 }
23651 }
23652
23653 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23654 int Sz = Data.size();
23655 auto *I = dyn_cast<Instruction>(Data.front());
23656 return Sz > 1 || isConstant(Data.front()) ||
23657 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23658 }
23659
23660public:
23661 HorizontalReduction() = default;
23662 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23663 : ReductionRoot(I), ReductionLimit(2) {
23664 RdxKind = HorizontalReduction::getRdxKind(I);
23665 ReductionOps.emplace_back().push_back(I);
23666 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23667 for (Value *V : Ops)
23668 ReducedValsToOps[V].push_back(I);
23669 }
23670
23671 bool matchReductionForOperands() const {
23672 // Analyze "regular" integer/FP types for reductions - no target-specific
23673 // types or pointers.
23674 assert(ReductionRoot && "Reduction root is not set!");
23675 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23676 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23677 return Ops.size() == 2;
23678 })))
23679 return false;
23680
23681 return true;
23682 }
23683
23684 /// Try to find a reduction tree.
23685 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23686 ScalarEvolution &SE, const DataLayout &DL,
23687 const TargetLibraryInfo &TLI) {
23688 RdxKind = HorizontalReduction::getRdxKind(Root);
23689 if (!isVectorizable(RdxKind, Root))
23690 return false;
23691
23692 // Analyze "regular" integer/FP types for reductions - no target-specific
23693 // types or pointers.
23694 Type *Ty = Root->getType();
23695 if (!isValidElementType(Ty) || Ty->isPointerTy())
23696 return false;
23697
23698 // Though the ultimate reduction may have multiple uses, its condition must
23699 // have only a single use.
23700 if (auto *Sel = dyn_cast<SelectInst>(Root))
23701 if (!Sel->getCondition()->hasOneUse())
23702 return false;
23703
23704 ReductionRoot = Root;
23705
23706 // Iterate through all the operands of the possible reduction tree and
23707 // gather all the reduced values, sorting them by their value id.
23708 BasicBlock *BB = Root->getParent();
23709 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23710 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23711 1, std::make_pair(Root, 0));
23712 // Checks if the operands of the \p TreeN instruction are also reduction
23713 // operations or should be treated as reduced values or an extra argument,
23714 // which is not part of the reduction.
23715 auto CheckOperands = [&](Instruction *TreeN,
23716 SmallVectorImpl<Value *> &PossibleReducedVals,
23717 SmallVectorImpl<Instruction *> &ReductionOps,
23718 unsigned Level) {
23719 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23720 getNumberOfOperands(TreeN)))) {
23721 Value *EdgeVal = getRdxOperand(TreeN, I);
23722 ReducedValsToOps[EdgeVal].push_back(TreeN);
23723 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23724 // If the edge is not an instruction, differs from the main reduction
23725 // opcode, or has too many uses, treat it as a possible reduced value.
23726 // Also, do not try to reduce constant values if the operation is not
23727 // foldable.
23728 if (!EdgeInst || Level > RecursionMaxDepth ||
23729 getRdxKind(EdgeInst) != RdxKind ||
23730 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23731 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23732 !isVectorizable(RdxKind, EdgeInst) ||
23733 (R.isAnalyzedReductionRoot(EdgeInst) &&
23734 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23735 PossibleReducedVals.push_back(EdgeVal);
23736 continue;
23737 }
23738 ReductionOps.push_back(EdgeInst);
23739 }
23740 };
23741 // Try to regroup the reduced values so that reducing them becomes more
23742 // profitable. Values are grouped by their value ids, instructions by their
23743 // opcode and/or alternate opcode, with extra analysis for loads (grouped by
23744 // the distance between their pointers) and compares (grouped by the
23745 // predicate).
23746 SmallMapVector<
23747 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23748 8>
23749 PossibleReducedVals;
23750 initReductionOps(Root);
23751 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23752 SmallSet<size_t, 2> LoadKeyUsed;
23753
23754 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23755 Key = hash_combine(hash_value(LI->getParent()), Key);
23756 Value *Ptr =
23757 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
23758 if (!LoadKeyUsed.insert(Key).second) {
23759 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23760 if (LIt != LoadsMap.end()) {
23761 for (LoadInst *RLI : LIt->second) {
23762 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
23763 LI->getType(), LI->getPointerOperand(), DL, SE,
23764 /*StrictCheck=*/true))
23765 return hash_value(RLI->getPointerOperand());
23766 }
23767 for (LoadInst *RLI : LIt->second) {
23768 if (arePointersCompatible(RLI->getPointerOperand(),
23769 LI->getPointerOperand(), TLI)) {
23770 hash_code SubKey = hash_value(RLI->getPointerOperand());
23771 return SubKey;
23772 }
23773 }
23774 if (LIt->second.size() > 2) {
23775 hash_code SubKey =
23776 hash_value(LIt->second.back()->getPointerOperand());
23777 return SubKey;
23778 }
23779 }
23780 }
23781 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
23782 .first->second.push_back(LI);
23783 return hash_value(LI->getPointerOperand());
23784 };
23785
23786 while (!Worklist.empty()) {
23787 auto [TreeN, Level] = Worklist.pop_back_val();
23788 SmallVector<Value *> PossibleRedVals;
23789 SmallVector<Instruction *> PossibleReductionOps;
23790 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23791 addReductionOps(TreeN);
23792 // Add reduction values. The values are sorted for better vectorization
23793 // results.
23794 for (Value *V : PossibleRedVals) {
23795 size_t Key, Idx;
23796 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
23797 /*AllowAlternate=*/false);
23798 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23799 }
23800 for (Instruction *I : reverse(PossibleReductionOps))
23801 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23802 }
23803 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23804 // Sort values by the total number of value kinds so that the reduction
23805 // starts from the longest possible sequences of reduced values.
23806 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23807 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23808 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
23809 for (auto &Slice : PossibleRedVals) {
23810 PossibleRedValsVect.emplace_back();
23811 auto RedValsVect = Slice.second.takeVector();
23812 stable_sort(RedValsVect, llvm::less_second());
23813 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23814 PossibleRedValsVect.back().append(Data.second, Data.first);
23815 }
23816 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23817 return P1.size() > P2.size();
23818 });
23819 bool First = true;
23820 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
23821 if (First) {
23822 First = false;
23823 ReducedVals.emplace_back();
23824 } else if (!isGoodForReduction(Data)) {
23825 auto *LI = dyn_cast<LoadInst>(Data.front());
23826 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
23827 if (!LI || !LastLI ||
23828 getUnderlyingObject(LI->getPointerOperand()) !=
23829 getUnderlyingObject(LastLI->getPointerOperand()))
23830 ReducedVals.emplace_back();
23831 }
23832 ReducedVals.back().append(Data.rbegin(), Data.rend());
23833 }
23834 }
23835 // Sort the reduced values by number of same/alternate opcode and/or pointer
23836 // operand.
23837 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
23838 return P1.size() > P2.size();
23839 });
23840 return true;
23841 }
23842
23843 /// Attempt to vectorize the tree found by matchAssociativeReduction.
23844 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23845 const TargetLibraryInfo &TLI, AssumptionCache *AC,
23846 DominatorTree &DT) {
23847 constexpr unsigned RegMaxNumber = 4;
23848 constexpr unsigned RedValsMaxNumber = 128;
23849 // If there are a sufficient number of reduction values, reduce
23850 // to a nearby power-of-2. We can safely generate oversized
23851 // vectors and rely on the backend to split them to legal sizes.
23852 if (unsigned NumReducedVals = std::accumulate(
23853 ReducedVals.begin(), ReducedVals.end(), 0,
23854 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
23855 if (!isGoodForReduction(Vals))
23856 return Num;
23857 return Num + Vals.size();
23858 });
23859 NumReducedVals < ReductionLimit &&
23860 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
23861 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
23862 })) {
23863 for (ReductionOpsType &RdxOps : ReductionOps)
23864 for (Value *RdxOp : RdxOps)
23865 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
23866 return nullptr;
23867 }
23868
23869 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
23870 TargetFolder(DL));
23871 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
23872
23873 // Track the reduced values in case they are replaced by extractelement
23874 // instructions because of the vectorization.
23875 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
23876 ReducedVals.front().size());
23877
23878 // The compare instruction of a min/max is the insertion point for new
23879 // instructions and may be replaced with a new compare instruction.
23880 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
23881 assert(isa<SelectInst>(RdxRootInst) &&
23882 "Expected min/max reduction to have select root instruction");
23883 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
23884 assert(isa<Instruction>(ScalarCond) &&
23885 "Expected min/max reduction to have compare condition");
23886 return cast<Instruction>(ScalarCond);
23887 };
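// E.g., for a min idiom such as
//   %cmp = icmp slt i32 %a, %b
//   %min = select i1 %cmp, i32 %a, i32 %b
// the select is the reduction root and %cmp is returned as the insertion
// point for the new instructions.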
23888
23889 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
23890 return isBoolLogicOp(cast<Instruction>(V));
23891 });
23892 // Return new VectorizedTree, based on previous value.
23893 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
23894 if (VectorizedTree) {
23895 // Update the final value in the reduction.
23896 Builder.SetCurrentDebugLocation(
23897 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
23898 if (AnyBoolLogicOp) {
23899 auto It = ReducedValsToOps.find(VectorizedTree);
23900 auto It1 = ReducedValsToOps.find(Res);
23901 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
23902 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
23903 (It != ReducedValsToOps.end() &&
23904 any_of(It->getSecond(), [&](Instruction *I) {
23905 return isBoolLogicOp(I) &&
23906 getRdxOperand(I, 0) == VectorizedTree;
23907 }))) {
23908 ;
23909 } else if (isGuaranteedNotToBePoison(Res, AC) ||
23910 (It1 != ReducedValsToOps.end() &&
23911 any_of(It1->getSecond(), [&](Instruction *I) {
23912 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
23913 }))) {
23914 std::swap(VectorizedTree, Res);
23915 } else {
23916 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
23917 }
23918 }
23919
23920 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
23921 ReductionOps);
23922 }
23923 // Initialize the final value in the reduction.
23924 return Res;
23925 };
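// E.g., a previous partial result %rdx and a new partial result %res are
// chained as "%op.rdx = add i32 %rdx, %res" (or the matching min/max op);
// for boolean logic the operands may be swapped or frozen first so that
// poison cannot leak through the newly created op.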
23926 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
23927 ReductionOps.front().size());
23928 for (ReductionOpsType &RdxOps : ReductionOps)
23929 for (Value *RdxOp : RdxOps) {
23930 if (!RdxOp)
23931 continue;
23932 IgnoreList.insert(RdxOp);
23933 }
23934 // Intersect the fast-math-flags from all reduction operations.
23935 FastMathFlags RdxFMF;
23936 RdxFMF.set();
23937 for (Value *U : IgnoreList)
23938 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
23939 RdxFMF &= FPMO->getFastMathFlags();
23940 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
23941
23942 // Need to track reduced vals, they may be changed during vectorization of
23943 // subvectors.
23944 for (ArrayRef<Value *> Candidates : ReducedVals)
23945 for (Value *V : Candidates)
23946 TrackedVals.try_emplace(V, V);
23947
23948 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
23949 Value *V) -> unsigned & {
23950 auto *It = MV.find(V);
23951 assert(It != MV.end() && "Unable to find given key.");
23952 return It->second;
23953 };
23954
23955 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
23956 // List of the values that were reduced in other trees as part of gather
23957 // nodes and thus require an extract if fully vectorized in other trees.
23958 SmallPtrSet<Value *, 4> RequiredExtract;
23959 WeakTrackingVH VectorizedTree = nullptr;
23960 bool CheckForReusedReductionOps = false;
23961 // Try to vectorize elements based on their type.
23962 SmallVector<InstructionsState> States;
23963 for (ArrayRef<Value *> RV : ReducedVals)
23964 States.push_back(getSameOpcode(RV, TLI));
23965 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
23966 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
23967 InstructionsState S = States[I];
23968 SmallVector<Value *> Candidates;
23969 Candidates.reserve(2 * OrigReducedVals.size());
23970 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
23971 for (Value *ReducedVal : OrigReducedVals) {
23972 Value *RdxVal = TrackedVals.at(ReducedVal);
23973 // Check whether the reduction value was overridden by an extractelement
23974 // instruction because of the vectorization, and exclude it if it is not
23975 // compatible with the other values.
23976 // Also check if the instruction was folded to constant/other value.
23977 auto *Inst = dyn_cast<Instruction>(RdxVal);
23978 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
23979 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
23980 (S && !Inst))
23981 continue;
23982 Candidates.push_back(RdxVal);
23983 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
23984 }
23985 bool ShuffledExtracts = false;
23986 // Try to handle shuffled extractelements.
23987 if (S && S.getOpcode() == Instruction::ExtractElement &&
23988 !S.isAltShuffle() && I + 1 < E) {
23989 SmallVector<Value *> CommonCandidates(Candidates);
23990 for (Value *RV : ReducedVals[I + 1]) {
23991 Value *RdxVal = TrackedVals.at(RV);
23992 // Check whether the reduction value was overridden by the
23993 // extractelement instruction because of the vectorization, and
23994 // exclude it if it is not compatible with the other values.
23995 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
23996 if (!Inst)
23997 continue;
23998 CommonCandidates.push_back(RdxVal);
23999 TrackedToOrig.try_emplace(RdxVal, RV);
24000 }
24001 SmallVector<int> Mask;
24002 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24003 ++I;
24004 Candidates.swap(CommonCandidates);
24005 ShuffledExtracts = true;
24006 }
24007 }
24008
24009 // Emit code for constant values.
24010 if (Candidates.size() > 1 && allConstant(Candidates)) {
24011 Value *Res = Candidates.front();
24012 Value *OrigV = TrackedToOrig.at(Candidates.front());
24013 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24014 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24015 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24016 Value *OrigV = TrackedToOrig.at(VC);
24017 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24018 if (auto *ResI = dyn_cast<Instruction>(Res))
24019 V.analyzedReductionRoot(ResI);
24020 }
24021 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24022 continue;
24023 }
24024
24025 unsigned NumReducedVals = Candidates.size();
24026 if (NumReducedVals < ReductionLimit &&
24027 (NumReducedVals < 2 || !isSplat(Candidates)))
24028 continue;
24029
24030 // Check if we support repeated scalar values processing (optimization of
24031 // original scalar identity operations on matched horizontal reductions).
24032 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24033 RdxKind != RecurKind::FMul &&
24034 RdxKind != RecurKind::FMulAdd;
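// E.g., x + x + x can be rewritten as x * 3 and x ^ x ^ x as x, but there
// is no such cheap identity rewrite for x * x * x, so mul/fmul/fmuladd are
// excluded here.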
24035 // Gather same values.
24036 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24037 if (IsSupportedHorRdxIdentityOp)
24038 for (Value *V : Candidates) {
24039 Value *OrigV = TrackedToOrig.at(V);
24040 ++SameValuesCounter.try_emplace(OrigV).first->second;
24041 }
24042 // Used to check if the reduced values are used the same number of times.
24043 // In this case the compiler may produce better code. E.g., if the reduced
24044 // values are aabbccdd (8 values), then the first node of the tree will be
24045 // a node for 4 x abcd plus a shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>,
24046 // and the final reduction will be performed on <8 x aabbccdd>.
24047 // Instead, the compiler may build the <4 x abcd> tree immediately and
24048 // compute reduction(4 x abcd) * 2.
24049 // Currently this only handles add/fadd/xor; and/or/min/max do not require
24050 // this analysis, and other operations may require an extra estimation of
24051 // the profitability.
24052 bool SameScaleFactor = false;
24053 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24054 SameValuesCounter.size() != Candidates.size();
24055 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24056 if (OptReusedScalars) {
24057 SameScaleFactor =
24058 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24059 RdxKind == RecurKind::Xor) &&
24060 all_of(drop_begin(SameValuesCounter),
24061 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24062 return P.second == SameValuesCounter.front().second;
24063 });
24064 Candidates.resize(SameValuesCounter.size());
24065 transform(SameValuesCounter, Candidates.begin(),
24066 [&](const auto &P) { return TrackedVals.at(P.first); });
24067 NumReducedVals = Candidates.size();
24068 // Have a reduction of the same element.
24069 if (NumReducedVals == 1) {
24070 Value *OrigV = TrackedToOrig.at(Candidates.front());
24071 unsigned Cnt = At(SameValuesCounter, OrigV);
24072 Value *RedVal =
24073 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24074 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24075 VectorizedVals.try_emplace(OrigV, Cnt);
24076 ExternallyUsedValues.insert(OrigV);
24077 continue;
24078 }
24079 }
24080
24081 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24082 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24083 const unsigned MaxElts = std::clamp<unsigned>(
24084 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24085 RegMaxNumber * RedValsMaxNumber);
24086
24087 unsigned ReduxWidth = NumReducedVals;
24088 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24089 unsigned NumParts, NumRegs;
24090 Type *ScalarTy = Candidates.front()->getType();
24091 ReduxWidth =
24092 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24093 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24094 NumParts = ::getNumberOfParts(TTI, Tp);
24095 NumRegs =
24096 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector=*/true, Tp));
24097 while (NumParts > NumRegs) {
24098 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24099 ReduxWidth = bit_floor(ReduxWidth - 1);
24100 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24101 NumParts = ::getNumberOfParts(TTI, Tp);
24102 NumRegs =
24103 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector=*/true, Tp));
24104 }
24105 if (NumParts > NumRegs / 2)
24106 ReduxWidth = bit_floor(ReduxWidth);
24107 return ReduxWidth;
24108 };
24109 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24110 ReduxWidth = GetVectorFactor(ReduxWidth);
24111 ReduxWidth = std::min(ReduxWidth, MaxElts);
24112
24113 unsigned Start = 0;
24114 unsigned Pos = Start;
24115 // Restarts vectorization attempt with lower vector factor.
24116 unsigned PrevReduxWidth = ReduxWidth;
24117 bool CheckForReusedReductionOpsLocal = false;
24118 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24119 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24120 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24121 // Check if any of the reduction ops are gathered. If so, it is worth
24122 // trying again with a smaller number of reduction ops.
24123 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24124 }
24125 ++Pos;
24126 if (Pos < NumReducedVals - ReduxWidth + 1)
24127 return IsAnyRedOpGathered;
24128 Pos = Start;
24129 --ReduxWidth;
24130 if (ReduxWidth > 1)
24131 ReduxWidth = GetVectorFactor(ReduxWidth);
24132 return IsAnyRedOpGathered;
24133 };
24134 bool AnyVectorized = false;
24135 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24136 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24137 ReduxWidth >= ReductionLimit) {
24138 // Dependency in tree of the reduction ops - drop this attempt, try
24139 // later.
24140 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24141 Start == 0) {
24142 CheckForReusedReductionOps = true;
24143 break;
24144 }
24145 PrevReduxWidth = ReduxWidth;
24146 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24147 // Been analyzed already - skip.
24148 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24149 (!has_single_bit(ReduxWidth) &&
24150 (IgnoredCandidates.contains(
24151 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24152 IgnoredCandidates.contains(
24153 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24154 bit_floor(ReduxWidth))))) ||
24155 V.areAnalyzedReductionVals(VL)) {
24156 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24157 continue;
24158 }
24159 // Early exit if any of the reduction values were deleted during
24160 // previous vectorization attempts.
24161 if (any_of(VL, [&V](Value *RedVal) {
24162 auto *RedValI = dyn_cast<Instruction>(RedVal);
24163 return RedValI && V.isDeleted(RedValI);
24164 }))
24165 break;
24166 V.buildTree(VL, IgnoreList);
24167 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24168 if (!AdjustReducedVals())
24169 V.analyzedReductionVals(VL);
24170 continue;
24171 }
24172 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24173 if (!AdjustReducedVals())
24174 V.analyzedReductionVals(VL);
24175 continue;
24176 }
24177 V.reorderTopToBottom();
24178 // No need to reorder the root node at all for reassociative reduction.
24179 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24180 VL.front()->getType()->isIntOrIntVectorTy() ||
24181 ReductionLimit > 2);
24182 // Keep extracted other reduction values, if they are used in the
24183 // vectorization trees.
24184 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24185 ExternallyUsedValues);
24186 // The reduction root is used as the insertion point for new
24187 // instructions, so set it as externally used to prevent it from being
24188 // deleted.
24189 LocalExternallyUsedValues.insert(ReductionRoot);
24190 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24191 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24192 continue;
24193 for (Value *V : ReducedVals[Cnt])
24194 if (isa<Instruction>(V))
24195 LocalExternallyUsedValues.insert(TrackedVals[V]);
24196 }
24197 if (!IsSupportedHorRdxIdentityOp) {
24198 // Number of uses of the candidates in the vector of values.
24199 assert(SameValuesCounter.empty() &&
24200 "Reused values counter map is not empty");
24201 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24202 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24203 continue;
24204 Value *V = Candidates[Cnt];
24205 Value *OrigV = TrackedToOrig.at(V);
24206 ++SameValuesCounter.try_emplace(OrigV).first->second;
24207 }
24208 }
24209 V.transformNodes();
24210 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
24211 // Gather externally used values.
24212 SmallPtrSet<Value *, 4> Visited;
24213 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24214 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24215 continue;
24216 Value *RdxVal = Candidates[Cnt];
24217 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24218 RdxVal = It->second;
24219 if (!Visited.insert(RdxVal).second)
24220 continue;
24221 // Check if the scalar was vectorized as part of the vectorization
24222 // tree but not the top node.
24223 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24224 LocalExternallyUsedValues.insert(RdxVal);
24225 continue;
24226 }
24227 Value *OrigV = TrackedToOrig.at(RdxVal);
24228 unsigned NumOps =
24229 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24230 if (NumOps != ReducedValsToOps.at(OrigV).size())
24231 LocalExternallyUsedValues.insert(RdxVal);
24232 }
24233 // Do not need the list of reused scalars in regular mode anymore.
24234 if (!IsSupportedHorRdxIdentityOp)
24235 SameValuesCounter.clear();
24236 for (Value *RdxVal : VL)
24237 if (RequiredExtract.contains(RdxVal))
24238 LocalExternallyUsedValues.insert(RdxVal);
24239 V.buildExternalUses(LocalExternallyUsedValues);
24240
24241 V.computeMinimumValueSizes();
24242
24243 // Estimate cost.
24244 InstructionCost ReductionCost =
24245 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24246 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24247 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24248 << " for reduction\n");
24249 if (!Cost.isValid())
24250 break;
24251 if (Cost >= -SLPCostThreshold) {
24252 V.getORE()->emit([&]() {
24253 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24254 ReducedValsToOps.at(VL[0]).front())
24255 << "Vectorizing horizontal reduction is possible "
24256 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24257 << " and threshold "
24258 << ore::NV("Threshold", -SLPCostThreshold);
24259 });
24260 if (!AdjustReducedVals()) {
24261 V.analyzedReductionVals(VL);
24262 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24263 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24264 // Add subvectors of VL to the list of the analyzed values.
24265 for (unsigned VF = getFloorFullVectorNumberOfElements(
24266 *TTI, VL.front()->getType(), ReduxWidth - 1);
24267 VF >= ReductionLimit;
24268 VF = getFloorFullVectorNumberOfElements(
24269 *TTI, VL.front()->getType(), VF - 1)) {
24270 if (has_single_bit(VF) &&
24271 V.getCanonicalGraphSize() != V.getTreeSize())
24272 continue;
24273 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24274 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24275 }
24276 }
24277 }
24278 continue;
24279 }
24280
24281 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24282 << Cost << ". (HorRdx)\n");
24283 V.getORE()->emit([&]() {
24284 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24285 ReducedValsToOps.at(VL[0]).front())
24286 << "Vectorized horizontal reduction with cost "
24287 << ore::NV("Cost", Cost) << " and with tree size "
24288 << ore::NV("TreeSize", V.getTreeSize());
24289 });
24290
24291 Builder.setFastMathFlags(RdxFMF);
24292
24293 // Emit a reduction. If the root is a select (min/max idiom), the insert
24294 // point is the compare condition of that select.
24295 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24296 Instruction *InsertPt = RdxRootInst;
24297 if (IsCmpSelMinMax)
24298 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24299
24300 // Vectorize a tree.
24301 Value *VectorizedRoot = V.vectorizeTree(
24302 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24303 // Update TrackedToOrig mapping, since the tracked values might be
24304 // updated.
24305 for (Value *RdxVal : Candidates) {
24306 Value *OrigVal = TrackedToOrig.at(RdxVal);
24307 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24308 if (TransformedRdxVal != RdxVal)
24309 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24310 }
24311
24312 Builder.SetInsertPoint(InsertPt);
24313
24314 // To prevent poison from leaking across what used to be sequential,
24315 // safe, scalar boolean logic operations, the reduction operand must be
24316 // frozen.
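// E.g., scalar code like "%c = select i1 %a, i1 %b, i1 false" does not
// propagate poison from %b when %a is false; once the chain is widened
// into a plain vector "and", that protection is gone, hence the freeze on
// the vector operand.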
24317 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24318 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24319
24320 // Emit code to correctly handle reused reduced values, if required.
24321 if (OptReusedScalars && !SameScaleFactor) {
24322 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24323 SameValuesCounter, TrackedToOrig);
24324 }
24325
24326 Type *ScalarTy = VL.front()->getType();
24327 Type *VecTy = VectorizedRoot->getType();
24328 Type *RedScalarTy = VecTy->getScalarType();
24329 VectorValuesAndScales.emplace_back(
24330 VectorizedRoot,
24331 OptReusedScalars && SameScaleFactor
24332 ? SameValuesCounter.front().second
24333 : 1,
24334 RedScalarTy != ScalarTy->getScalarType()
24335 ? V.isSignedMinBitwidthRootNode()
24336 : true);
24337
24338 // Count vectorized reduced values to exclude them from final reduction.
24339 for (Value *RdxVal : VL) {
24340 Value *OrigV = TrackedToOrig.at(RdxVal);
24341 if (IsSupportedHorRdxIdentityOp) {
24342 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24343 continue;
24344 }
24345 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24346 if (!V.isVectorized(RdxVal))
24347 RequiredExtract.insert(RdxVal);
24348 }
24349 Pos += ReduxWidth;
24350 Start = Pos;
24351 ReduxWidth = NumReducedVals - Pos;
24352 if (ReduxWidth > 1)
24353 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24354 AnyVectorized = true;
24355 }
24356 if (OptReusedScalars && !AnyVectorized) {
24357 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24358 Value *RdxVal = TrackedVals.at(P.first);
24359 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24360 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24361 VectorizedVals.try_emplace(P.first, P.second);
24362 }
24363 continue;
24364 }
24365 }
24366 if (!VectorValuesAndScales.empty())
24367 VectorizedTree = GetNewVectorizedTree(
24368 VectorizedTree,
24369 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24370 if (VectorizedTree) {
24371 // Reorder operands of bool logical op in the natural order to avoid
24372 // possible problem with poison propagation. If not possible to reorder
24373 // (both operands are originally RHS), emit an extra freeze instruction
24374 // for the LHS operand.
24375 // I.e., if we have original code like this:
24376 // RedOp1 = select i1 ?, i1 LHS, i1 false
24377 // RedOp2 = select i1 RHS, i1 ?, i1 false
24378
24379 // Then, we swap LHS/RHS to create a new op that matches the poison
24380 // semantics of the original code.
24381
24382 // If we have original code like this and both values could be poison:
24383 // RedOp1 = select i1 ?, i1 LHS, i1 false
24384 // RedOp2 = select i1 ?, i1 RHS, i1 false
24385
24386 // Then, we must freeze LHS in the new op.
24387 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
24388 Instruction *RedOp1,
24389 Instruction *RedOp2,
24390 bool InitStep) {
24391 if (!AnyBoolLogicOp)
24392 return;
24393 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24394 getRdxOperand(RedOp1, 0) == LHS ||
24395 isGuaranteedNotToBePoison(LHS, AC)))
24396 return;
24397 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24398 getRdxOperand(RedOp2, 0) == RHS ||
24399 isGuaranteedNotToBePoison(RHS, AC))) {
24400 std::swap(LHS, RHS);
24401 return;
24402 }
24403 if (LHS != VectorizedTree)
24404 LHS = Builder.CreateFreeze(LHS);
24405 };
24406 // Finish the reduction.
24407 // Need to add extra arguments and not vectorized possible reduction
24408 // values.
24409 // Try to avoid dependencies between the scalar remainders after
24410 // reductions.
24411 auto FinalGen =
24412 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24413 bool InitStep) {
24414 unsigned Sz = InstVals.size();
24415 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
24416 Sz % 2);
24417 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24418 Instruction *RedOp = InstVals[I + 1].first;
24419 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24420 Value *RdxVal1 = InstVals[I].second;
24421 Value *StableRdxVal1 = RdxVal1;
24422 auto It1 = TrackedVals.find(RdxVal1);
24423 if (It1 != TrackedVals.end())
24424 StableRdxVal1 = It1->second;
24425 Value *RdxVal2 = InstVals[I + 1].second;
24426 Value *StableRdxVal2 = RdxVal2;
24427 auto It2 = TrackedVals.find(RdxVal2);
24428 if (It2 != TrackedVals.end())
24429 StableRdxVal2 = It2->second;
24430 // To prevent poison from leaking across what used to be
24431 // sequential, safe, scalar boolean logic operations, the
24432 // reduction operand must be frozen.
24433 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24434 RedOp, InitStep);
24435 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24436 StableRdxVal2, "op.rdx", ReductionOps);
24437 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24438 }
24439 if (Sz % 2 == 1)
24440 ExtraReds[Sz / 2] = InstVals.back();
24441 return ExtraReds;
24442 };
24443 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24444 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24445 VectorizedTree);
24446 SmallPtrSet<Value *, 8> Visited;
24447 for (ArrayRef<Value *> Candidates : ReducedVals) {
24448 for (Value *RdxVal : Candidates) {
24449 if (!Visited.insert(RdxVal).second)
24450 continue;
24451 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24452 for (Instruction *RedOp :
24453 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24454 ExtraReductions.emplace_back(RedOp, RdxVal);
24455 }
24456 }
24457 // Iterate through all not-vectorized reduction values/extra arguments.
24458 bool InitStep = true;
24459 while (ExtraReductions.size() > 1) {
24460 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24461 FinalGen(ExtraReductions, InitStep);
24462 ExtraReductions.swap(NewReds);
24463 InitStep = false;
24464 }
24465 VectorizedTree = ExtraReductions.front().second;
24466
24467 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24468
24469 // The original scalar reduction is expected to have no remaining
24470 // uses outside the reduction tree itself. Assert that we got this
24471 // correct, replace internal uses with poison, and mark for eventual
24472 // deletion.
24473#ifndef NDEBUG
24474 SmallPtrSet<Value *, 4> IgnoreSet;
24475 for (ArrayRef<Value *> RdxOps : ReductionOps)
24476 IgnoreSet.insert_range(RdxOps);
24477#endif
24478 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24479 for (Value *Ignore : RdxOps) {
24480 if (!Ignore)
24481 continue;
24482#ifndef NDEBUG
24483 for (auto *U : Ignore->users()) {
24484 assert(IgnoreSet.count(U) &&
24485 "All users must be in the reduction ops list.");
24486 }
24487#endif
24488 if (!Ignore->use_empty()) {
24489 Value *P = PoisonValue::get(Ignore->getType());
24490 Ignore->replaceAllUsesWith(P);
24491 }
24492 }
24493 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24494 }
24495 } else if (!CheckForReusedReductionOps) {
24496 for (ReductionOpsType &RdxOps : ReductionOps)
24497 for (Value *RdxOp : RdxOps)
24498 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24499 }
24500 return VectorizedTree;
24501 }
24502
24503private:
24504 /// Creates the reduction from the given \p Vec vector value with the given
24505 /// scale \p Scale and signedness \p IsSigned.
24506 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24507 Value *Vec, unsigned Scale, bool IsSigned,
24508 Type *DestTy) {
24509 Value *Rdx;
24510 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24511 unsigned DestTyNumElements = getNumElements(VecTy);
24512 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24513 Rdx = PoisonValue::get(
24514 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24515 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24516 // Do reduction for each lane.
24517 // e.g., do reduce add for
24518 // VL[0] = <4 x Ty> <a, b, c, d>
24519 // VL[1] = <4 x Ty> <e, f, g, h>
24520 // Lane[0] = <2 x Ty> <a, e>
24521 // Lane[1] = <2 x Ty> <b, f>
24522 // Lane[2] = <2 x Ty> <c, g>
24523 // Lane[3] = <2 x Ty> <d, h>
24524 // result[0] = reduce add Lane[0]
24525 // result[1] = reduce add Lane[1]
24526 // result[2] = reduce add Lane[2]
24527 // result[3] = reduce add Lane[3]
24528 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24529 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24530 Rdx = Builder.CreateInsertElement(
24531 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24532 }
24533 } else {
24534 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24535 }
24536 if (Rdx->getType() != DestTy)
24537 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24538 // Improved analysis for add/fadd/xor reductions with same scale
24539 // factor for all operands of reductions. We can emit scalar ops for
24540 // them instead.
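// E.g., if every reduced scalar was matched Cnt == 3 times, the scalar
// reduction result %rdx is finished as "mul i32 %rdx, 3" (or the fmul/xor
// parity equivalents) instead of reducing a three times wider vector.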
24541 if (Scale > 1)
24542 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24543 return Rdx;
24544 }
24545
24546 /// Calculate the cost of a reduction.
24547 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24548 ArrayRef<Value *> ReducedVals,
24549 bool IsCmpSelMinMax, FastMathFlags FMF,
24550 const BoUpSLP &R, DominatorTree &DT,
24551 const DataLayout &DL,
24552 const TargetLibraryInfo &TLI) {
24553 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24554 Type *ScalarTy = ReducedVals.front()->getType();
24555 unsigned ReduxWidth = ReducedVals.size();
24556 FixedVectorType *VectorTy = R.getReductionType();
24557 InstructionCost VectorCost = 0, ScalarCost;
24558 // If all of the reduced values are constant, the vector cost is 0, since
24559 // the reduction value can be calculated at compile time.
24560 bool AllConsts = allConstant(ReducedVals);
24561 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24562 InstructionCost Cost = 0;
24563 // Scalar cost is repeated for N-1 elements.
24564 int Cnt = ReducedVals.size();
24565 for (Value *RdxVal : ReducedVals) {
24566 if (Cnt == 1)
24567 break;
24568 --Cnt;
24569 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24570 Cost += GenCostFn();
24571 continue;
24572 }
24573 InstructionCost ScalarCost = 0;
24574 for (User *U : RdxVal->users()) {
24575 auto *RdxOp = cast<Instruction>(U);
24576 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24577 if (RdxKind == RecurKind::FAdd) {
24578 InstructionCost FMACost = canConvertToFMA(
24579 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24580 if (FMACost.isValid()) {
24581 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24582 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24583 // Also, exclude scalar fmul cost.
24584 InstructionCost FMulCost =
24585 TTI->getInstructionCost(I, CostKind);
24586 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24587 FMACost -= FMulCost;
24588 }
24589 ScalarCost += FMACost;
24590 continue;
24591 }
24592 }
24593 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24594 continue;
24595 }
24596 ScalarCost = InstructionCost::getInvalid();
24597 break;
24598 }
24599 if (ScalarCost.isValid())
24600 Cost += ScalarCost;
24601 else
24602 Cost += GenCostFn();
24603 }
24604 return Cost;
24605 };
24606 // Require reduction cost if:
24607 // 1. This type is not a full register type and no other vectors with the
24608 // same type in the storage (first vector with small type).
24609 // 2. The storage does not have any vector with full vector use (first
24610 // vector with full register use).
24611 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24612 switch (RdxKind) {
24613 case RecurKind::Add:
24614 case RecurKind::Mul:
24615 case RecurKind::Or:
24616 case RecurKind::And:
24617 case RecurKind::Xor:
24618 case RecurKind::FAdd:
24619 case RecurKind::FMul: {
24620 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24621 if (!AllConsts) {
24622 if (DoesRequireReductionOp) {
24623 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24624 assert(SLPReVec && "FixedVectorType is not expected.");
24625 unsigned ScalarTyNumElements = VecTy->getNumElements();
24626 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24627 VectorCost += TTI->getShuffleCost(
24628 TTI::SK_PermuteSingleSrc,
24629 getWidenedType(ScalarTy->getScalarType(),
24630 ReducedVals.size()),
24631 VectorTy,
24632 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24633 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24634 FMF, CostKind);
24635 }
24636 VectorCost += TTI->getScalarizationOverhead(
24637 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24638 /*Extract*/ false, TTI::TCK_RecipThroughput);
24639 } else {
24640 Type *RedTy = VectorTy->getElementType();
24641 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24642 std::make_pair(RedTy, true));
24643 if (RType == RedTy) {
24644 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24645 FMF, CostKind);
24646 } else {
24647 VectorCost = TTI->getExtendedReductionCost(
24648 RdxOpcode, !IsSigned, RedTy,
24649 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24650 }
24651 }
24652 } else {
24653 Type *RedTy = VectorTy->getElementType();
24654 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24655 std::make_pair(RedTy, true));
24656 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24657 InstructionCost FMACost = InstructionCost::getInvalid();
24658 if (RdxKind == RecurKind::FAdd) {
24659 // Check if the reduction operands can be converted to FMA.
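// E.g., a chain of "fadd (fmul a, b)" terms can be costed as a single
// llvm.fmuladd per element instead of separate fmul + fadd, which is what
// the intrinsic cost query below models.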
24660 SmallVector<Value *> Ops;
24661 FastMathFlags FMF;
24662 FMF.set();
24663 for (Value *RdxVal : ReducedVals) {
24664 if (!RdxVal->hasOneUse()) {
24665 Ops.clear();
24666 break;
24667 }
24668 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24669 FMF &= FPCI->getFastMathFlags();
24670 Ops.push_back(RdxVal->user_back());
24671 }
24672 if (!Ops.empty()) {
24673 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24674 *TTI, TLI);
24675 if (FMACost.isValid()) {
24676 // Calculate actual FMAD cost.
24677 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
24678 {RVecTy, RVecTy, RVecTy}, FMF);
24679 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
24680
24681 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
24682 // Also, exclude vector fmul cost.
24683 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
24684 Instruction::FMul, RVecTy, CostKind);
24685 LLVM_DEBUG(dbgs()
24686 << "Minus vector FMul cost: " << FMulCost << "\n");
24687 FMACost -= FMulCost;
24688 }
24689 }
24690 }
24691 if (FMACost.isValid())
24692 VectorCost += FMACost;
24693 else
24694 VectorCost +=
24695 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24696 if (RType != RedTy) {
24697 unsigned Opcode = Instruction::Trunc;
24698 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24699 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24700 VectorCost += TTI->getCastInstrCost(
24701 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24702 }
24703 }
24704 }
24705 ScalarCost = EvaluateScalarCost([&]() {
24706 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24707 });
24708 break;
24709 }
24710 case RecurKind::FMax:
24711 case RecurKind::FMin:
24712 case RecurKind::FMaximum:
24713 case RecurKind::FMinimum:
24714 case RecurKind::SMax:
24715 case RecurKind::SMin:
24716 case RecurKind::UMax:
24717 case RecurKind::UMin: {
24718 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24719 if (!AllConsts) {
24720 if (DoesRequireReductionOp) {
24721 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24722 } else {
24723 // Check if the previous reduction already exists and account it as
24724 // series of operations + single reduction.
24725 Type *RedTy = VectorTy->getElementType();
24726 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24727 std::make_pair(RedTy, true));
24728 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24729 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24730 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24731 if (RType != RedTy) {
24732 unsigned Opcode = Instruction::Trunc;
24733 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24734 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24735 VectorCost += TTI->getCastInstrCost(
24736 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24737 }
24738 }
24739 }
24740 ScalarCost = EvaluateScalarCost([&]() {
24741 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24742 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24743 });
24744 break;
24745 }
24746 default:
24747 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24748 }
24749
24750 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24751 << " for reduction of " << shortBundleName(ReducedVals)
24752 << " (It is a splitting reduction)\n");
24753 return VectorCost - ScalarCost;
24754 }
24755
24756 /// Splits the values, stored in VectorValuesAndScales, into registers/free
24757 /// sub-registers, combines them with the given reduction operation as a
24758 /// vector operation and then performs single (small enough) reduction.
24759 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24760 Type *DestTy) {
24761 Value *ReducedSubTree = nullptr;
24762 // Creates reduction and combines with the previous reduction.
24763 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24764 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24765 if (ReducedSubTree)
24766 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24767 "op.rdx", ReductionOps);
24768 else
24769 ReducedSubTree = Rdx;
24770 };
24771 if (VectorValuesAndScales.size() == 1) {
24772 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24773 CreateSingleOp(Vec, Scale, IsSigned);
24774 return ReducedSubTree;
24775 }
24776 // Scales Vec using the given Cnt scale factor and then combines it with
24777 // the previous value of VecRes.
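// E.g., a <4 x i32> partial value that was matched twice in an add
// reduction is first scaled with "mul <4 x i32> %v, splat (i32 2)" and
// only then combined with the accumulated vector value.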
24778 Value *VecRes = nullptr;
24779 bool VecResSignedness = false;
24780 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24781 Type *ScalarTy = Vec->getType()->getScalarType();
24782 // Scale Vec using given Cnt scale factor.
24783 if (Cnt > 1) {
24784 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
24785 switch (RdxKind) {
24786 case RecurKind::Add: {
24787 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24788 unsigned VF = getNumElements(Vec->getType());
24789 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
24790 << ". (HorRdx)\n");
24791 SmallVector<int> Mask(VF * Cnt, PoisonMaskElem);
24792 for (unsigned I : seq<unsigned>(Cnt))
24793 std::iota(std::next(Mask.begin(), VF * I),
24794 std::next(Mask.begin(), VF * (I + 1)), 0);
24795 ++NumVectorInstructions;
24796 Vec = Builder.CreateShuffleVector(Vec, Mask);
24797 break;
24798 }
24799 // res = mul vv, n
24800 if (ScalarTy != DestTy->getScalarType())
24801 Vec = Builder.CreateIntCast(
24802 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24803 IsSigned);
24804 Value *Scale = ConstantVector::getSplat(
24805 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
24806 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24807 << ". (HorRdx)\n");
24808 ++NumVectorInstructions;
24809 Vec = Builder.CreateMul(Vec, Scale);
24810 break;
24811 }
24812 case RecurKind::Xor: {
24813 // res = n % 2 ? 0 : vv
24814 LLVM_DEBUG(dbgs()
24815 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24816 if (Cnt % 2 == 0)
24817 Vec = Constant::getNullValue(Vec->getType());
24818 break;
24819 }
24820 case RecurKind::FAdd: {
24821 // res = fmul v, n
24822 Value *Scale =
24823 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
24824 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24825 << ". (HorRdx)\n");
24826 ++NumVectorInstructions;
24827 Vec = Builder.CreateFMul(Vec, Scale);
24828 break;
24829 }
24830 case RecurKind::And:
24831 case RecurKind::Or:
24832 case RecurKind::SMax:
24833 case RecurKind::SMin:
24834 case RecurKind::UMax:
24835 case RecurKind::UMin:
24836 case RecurKind::FMax:
24837 case RecurKind::FMin:
24838 case RecurKind::FMaximum:
24839 case RecurKind::FMinimum:
24840 // res = vv
24841 break;
24842 case RecurKind::Sub:
24843 case RecurKind::AddChainWithSubs:
24844 case RecurKind::Mul:
24845 case RecurKind::FMul:
24846 case RecurKind::FMulAdd:
24847 case RecurKind::AnyOf:
24848 case RecurKind::FindFirstIVSMin:
24849 case RecurKind::FindFirstIVUMin:
24850 case RecurKind::FindLastIVSMax:
24851 case RecurKind::FindLastIVUMax:
24852 case RecurKind::FMaxNum:
24853 case RecurKind::FMinNum:
24854 case RecurKind::FMaximumNum:
24855 case RecurKind::FMinimumNum:
24856 case RecurKind::None:
24857 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24858 }
24859 }
24860 // Combine Vec with the previous VecOp.
24861 if (!VecRes) {
24862 VecRes = Vec;
24863 VecResSignedness = IsSigned;
24864 } else {
24865 ++NumVectorInstructions;
24866 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
24867 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
24868 // Handle ctpop.
24869 unsigned VecResVF = getNumElements(VecRes->getType());
24870 unsigned VecVF = getNumElements(Vec->getType());
24871 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
24872 std::iota(Mask.begin(), Mask.end(), 0);
24873 // Ensure that VecRes is always larger than Vec
24874 if (VecResVF < VecVF) {
24875 std::swap(VecRes, Vec);
24876 std::swap(VecResVF, VecVF);
24877 }
24878 if (VecResVF != VecVF) {
24879 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
24880 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
24881 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
24882 }
24883 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
24884 return;
24885 }
24886 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
24887 VecRes = Builder.CreateIntCast(
24888 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
24889 VecResSignedness);
24890 if (ScalarTy != DestTy->getScalarType())
24891 Vec = Builder.CreateIntCast(
24892 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24893 IsSigned);
24894 unsigned VecResVF = getNumElements(VecRes->getType());
24895 unsigned VecVF = getNumElements(Vec->getType());
24896 // Ensure that VecRes is always larger than Vec
24897 if (VecResVF < VecVF) {
24898 std::swap(VecRes, Vec);
24899 std::swap(VecResVF, VecVF);
24900 }
24901 // extract + op + insert
24902 Value *Op = VecRes;
24903 if (VecResVF != VecVF)
24904 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
24905 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
24906 if (VecResVF != VecVF)
24907 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
24908 VecRes = Op;
24909 }
24910 };
24911 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
24912 CreateVecOp(Vec, Scale, IsSigned);
24913 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
24914
24915 return ReducedSubTree;
24916 }
24917
24918 /// Emit a horizontal reduction of the vectorized value.
24919 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
24920 const TargetTransformInfo *TTI, Type *DestTy) {
24921 assert(VectorizedValue && "Need to have a vectorized tree node");
24922 assert(RdxKind != RecurKind::FMulAdd &&
24923 "A call to the llvm.fmuladd intrinsic is not handled yet");
24924
24925 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
24926 if (FTy->getScalarType() == Builder.getInt1Ty() &&
24927 RdxKind == RecurKind::Add &&
24928 DestTy->getScalarType() != FTy->getScalarType()) {
24929 // Convert vector_reduce_add(ZExt(<n x i1>)) to
24930 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
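// E.g. (illustrative types only):
//   %bc = bitcast <8 x i1> %v to i8
//   %ct = call i8 @llvm.ctpop.i8(i8 %bc)
//   %r = zext i8 %ct to i32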
24931 Value *V = Builder.CreateBitCast(
24932 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
24933 ++NumVectorInstructions;
24934 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
24935 }
24936 ++NumVectorInstructions;
24937 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
24938 }
24939
24940 /// Emits optimized code for unique scalar value reused \p Cnt times.
24941 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
24942 unsigned Cnt) {
24943 assert(IsSupportedHorRdxIdentityOp &&
24944 "The optimization of matched scalar identity horizontal reductions "
24945 "must be supported.");
24946 if (Cnt == 1)
24947 return VectorizedValue;
24948 switch (RdxKind) {
24949 case RecurKind::Add: {
24950 // res = mul vv, n
24951 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
24952 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
24953 << VectorizedValue << ". (HorRdx)\n");
24954 return Builder.CreateMul(VectorizedValue, Scale);
24955 }
24956 case RecurKind::Xor: {
24957 // res = n % 2 ? 0 : vv
24958 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
24959 << ". (HorRdx)\n");
24960 if (Cnt % 2 == 0)
24961 return Constant::getNullValue(VectorizedValue->getType());
24962 return VectorizedValue;
24963 }
24964 case RecurKind::FAdd: {
24965 // res = fmul v, n
24966 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
24967 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
24968 << VectorizedValue << ". (HorRdx)\n");
24969 return Builder.CreateFMul(VectorizedValue, Scale);
24970 }
24971 case RecurKind::And:
24972 case RecurKind::Or:
24973 case RecurKind::SMax:
24974 case RecurKind::SMin:
24975 case RecurKind::UMax:
24976 case RecurKind::UMin:
24977 case RecurKind::FMax:
24978 case RecurKind::FMin:
24979 case RecurKind::FMaximum:
24980 case RecurKind::FMinimum:
24981 // res = vv
24982 return VectorizedValue;
24983 case RecurKind::Sub:
24984 case RecurKind::AddChainWithSubs:
24985 case RecurKind::Mul:
24986 case RecurKind::FMul:
24987 case RecurKind::FMulAdd:
24988 case RecurKind::AnyOf:
24989 case RecurKind::FindFirstIVSMin:
24990 case RecurKind::FindFirstIVUMin:
24991 case RecurKind::FindLastIVSMax:
24992 case RecurKind::FindLastIVUMax:
24993 case RecurKind::FMaxNum:
24994 case RecurKind::FMinNum:
24995 case RecurKind::FMaximumNum:
24996 case RecurKind::FMinimumNum:
24997 case RecurKind::None:
24998 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24999 }
25000 return nullptr;
25001 }
25002
25003 /// Emits actual operation for the scalar identity values, found during
25004 /// horizontal reduction analysis.
25005 Value *
25006 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25007 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25008 const DenseMap<Value *, Value *> &TrackedToOrig) {
25009 assert(IsSupportedHorRdxIdentityOp &&
25010 "The optimization of matched scalar identity horizontal reductions "
25011 "must be supported.");
25012 ArrayRef<Value *> VL = R.getRootNodeScalars();
25013 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25014 if (VTy->getElementType() != VL.front()->getType()) {
25015 VectorizedValue = Builder.CreateIntCast(
25016 VectorizedValue,
25017 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25018 R.isSignedMinBitwidthRootNode());
25019 }
25020 switch (RdxKind) {
25021 case RecurKind::Add: {
25022 // root = mul prev_root, <1, 1, n, 1>
25023 SmallVector<Constant *> Vals;
25024 for (Value *V : VL) {
25025 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25026 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25027 }
25028 auto *Scale = ConstantVector::get(Vals);
25029 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25030 << VectorizedValue << ". (HorRdx)\n");
25031 return Builder.CreateMul(VectorizedValue, Scale);
25032 }
25033 case RecurKind::And:
25034 case RecurKind::Or:
25035 // No need for multiple or/and(s).
25036 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25037 << ". (HorRdx)\n");
25038 return VectorizedValue;
25039 case RecurKind::SMax:
25040 case RecurKind::SMin:
25041 case RecurKind::UMax:
25042 case RecurKind::UMin:
25043 case RecurKind::FMax:
25044 case RecurKind::FMin:
25045 case RecurKind::FMaximum:
25046 case RecurKind::FMinimum:
25047 // No need for multiple min/max(s) of the same value.
25048 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25049 << ". (HorRdx)\n");
25050 return VectorizedValue;
25051 case RecurKind::Xor: {
25052 // Replace values with even number of repeats with 0, since
25053 // x xor x = 0.
25054 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25055 // 7>, if the 4th and 6th elements have an even number of repeats.
25056 SmallVector<int> Mask(
25057 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25058 PoisonMaskElem);
25059 std::iota(Mask.begin(), Mask.end(), 0);
25060 bool NeedShuffle = false;
25061 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25062 Value *V = VL[I];
25063 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25064 if (Cnt % 2 == 0) {
25065 Mask[I] = VF;
25066 NeedShuffle = true;
25067 }
25068 }
25069 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25070 : Mask) dbgs()
25071 << I << " ";
25072 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25073 if (NeedShuffle)
25074 VectorizedValue = Builder.CreateShuffleVector(
25075 VectorizedValue,
25076 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25077 return VectorizedValue;
25078 }
25079 case RecurKind::FAdd: {
25080 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25081 SmallVector<Constant *> Vals;
25082 for (Value *V : VL) {
25083 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25084 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25085 }
25086 auto *Scale = ConstantVector::get(Vals);
25087 return Builder.CreateFMul(VectorizedValue, Scale);
25088 }
25089 case RecurKind::Sub:
25090 case RecurKind::AddChainWithSubs:
25091 case RecurKind::Mul:
25092 case RecurKind::FMul:
25093 case RecurKind::FMulAdd:
25094 case RecurKind::AnyOf:
25095 case RecurKind::FindFirstIVSMin:
25096 case RecurKind::FindFirstIVUMin:
25097 case RecurKind::FindLastIVSMax:
25098 case RecurKind::FindLastIVUMax:
25099 case RecurKind::FMaxNum:
25100 case RecurKind::FMinNum:
25101 case RecurKind::FMaximumNum:
25102 case RecurKind::FMinimumNum:
25103 case RecurKind::None:
25104 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25105 }
25106 return nullptr;
25107 }
25108};
25109} // end anonymous namespace
25110
25111/// Gets recurrence kind from the specified value.
25112 RecurKind llvm::getRdxKind(Value *V) {
25113 return HorizontalReduction::getRdxKind(V);
25114}
25115static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25116 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25117 return cast<FixedVectorType>(IE->getType())->getNumElements();
25118
25119 unsigned AggregateSize = 1;
25120 auto *IV = cast<InsertValueInst>(InsertInst);
25121 Type *CurrentType = IV->getType();
25122 do {
25123 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25124 for (auto *Elt : ST->elements())
25125 if (Elt != ST->getElementType(0)) // check homogeneity
25126 return std::nullopt;
25127 AggregateSize *= ST->getNumElements();
25128 CurrentType = ST->getElementType(0);
25129 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25130 AggregateSize *= AT->getNumElements();
25131 CurrentType = AT->getElementType();
25132 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25133 AggregateSize *= VT->getNumElements();
25134 return AggregateSize;
25135 } else if (CurrentType->isSingleValueType()) {
25136 return AggregateSize;
25137 } else {
25138 return std::nullopt;
25139 }
25140 } while (true);
25141}
25142
25143static void findBuildAggregateRec(Instruction *LastInsertInst,
25144 TargetTransformInfo *TTI,
25145 SmallVectorImpl<Value *> &BuildVectorOpds,
25146 SmallVectorImpl<Value *> &InsertElts,
25147 unsigned OperandOffset, const BoUpSLP &R) {
25148 do {
25149 Value *InsertedOperand = LastInsertInst->getOperand(1);
25150 std::optional<unsigned> OperandIndex =
25151 getElementIndex(LastInsertInst, OperandOffset);
25152 if (!OperandIndex || R.isDeleted(LastInsertInst))
25153 return;
25154 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25155 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25156 BuildVectorOpds, InsertElts, *OperandIndex, R);
25157
25158 } else {
25159 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25160 InsertElts[*OperandIndex] = LastInsertInst;
25161 }
25162 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25163 } while (LastInsertInst != nullptr &&
25164 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25165 LastInsertInst->hasOneUse());
25166}
25167
25168/// Recognize construction of vectors like
25169/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25170/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25171/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25172/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25173/// starting from the last insertelement or insertvalue instruction.
25174///
25175/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25176/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25177/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25178///
25179/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25180///
25181/// \return true if it matches.
25182static bool findBuildAggregate(Instruction *LastInsertInst,
25183 TargetTransformInfo *TTI,
25184 SmallVectorImpl<Value *> &BuildVectorOpds,
25185 SmallVectorImpl<Value *> &InsertElts,
25186 const BoUpSLP &R) {
25187
25188 assert((isa<InsertElementInst>(LastInsertInst) ||
25189 isa<InsertValueInst>(LastInsertInst)) &&
25190 "Expected insertelement or insertvalue instruction!");
25191
25192 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25193 "Expected empty result vectors!");
25194
25195 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25196 if (!AggregateSize)
25197 return false;
25198 BuildVectorOpds.resize(*AggregateSize);
25199 InsertElts.resize(*AggregateSize);
25200
25201 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25202 llvm::erase(BuildVectorOpds, nullptr);
25203 llvm::erase(InsertElts, nullptr);
25204 if (BuildVectorOpds.size() >= 2)
25205 return true;
25206
25207 return false;
25208}
25209
25210/// Try and get a reduction instruction from a phi node.
25211///
25212/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25213/// if they come from either \p ParentBB or a containing loop latch.
25214///
25215/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25216/// if not possible.
25217 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25218 BasicBlock *ParentBB, LoopInfo *LI) {
25219 // There are situations where the reduction value is not dominated by the
25220 // reduction phi. Vectorizing such cases has been reported to cause
25221 // miscompiles. See PR25787.
25222 auto DominatedReduxValue = [&](Value *R) {
25223 return isa<Instruction>(R) &&
25224 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25225 };
25226
25227 Instruction *Rdx = nullptr;
25228
25229 // Return the incoming value if it comes from the same BB as the phi node.
25230 if (P->getIncomingBlock(0) == ParentBB) {
25231 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25232 } else if (P->getIncomingBlock(1) == ParentBB) {
25233 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25234 }
25235
25236 if (Rdx && DominatedReduxValue(Rdx))
25237 return Rdx;
25238
25239 // Otherwise, check whether we have a loop latch to look at.
25240 Loop *BBL = LI->getLoopFor(ParentBB);
25241 if (!BBL)
25242 return nullptr;
25243 BasicBlock *BBLatch = BBL->getLoopLatch();
25244 if (!BBLatch)
25245 return nullptr;
25246
25247 // There is a loop latch, return the incoming value if it comes from
25248 // that. This reduction pattern occasionally turns up.
25249 if (P->getIncomingBlock(0) == BBLatch) {
25250 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25251 } else if (P->getIncomingBlock(1) == BBLatch) {
25252 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25253 }
25254
25255 if (Rdx && DominatedReduxValue(Rdx))
25256 return Rdx;
25257
25258 return nullptr;
25259}
25260
25261static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25262 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25263 return true;
25264 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25265 return true;
25266 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25267 return true;
25268 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25269 return true;
25270 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25271 return true;
25272 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
25273 return true;
25274 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
25275 return true;
25276 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
25277 return true;
25278 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
25279 return true;
25280 return false;
25281}
25282
25283/// We could have an initial reduction that is not an add.
25284/// r *= v1 + v2 + v3 + v4
25285/// In such a case start looking for a tree rooted in the first '+'.
25286/// \returns the new root if found, which may be nullptr if not an instruction.
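///
/// Editor's note (worked reading of the example above, not additional source
/// documentation): if \p Root is the multiply that updates 'r' and \p Phi is
/// the phi for 'r', the instruction returned is the add producing the sum
/// 'v1 + v2 + v3 + v4' (the non-phi operand of the multiply), which then
/// becomes the new reduction root.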
25287static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25288 Instruction *Root) {
25289 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25290 isa<IntrinsicInst>(Root)) &&
25291 "Expected binop, select, or intrinsic for reduction matching");
25292 Value *LHS =
25293 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25294 Value *RHS =
25295 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25296 if (LHS == Phi)
25297 return dyn_cast<Instruction>(RHS);
25298 if (RHS == Phi)
25299 return dyn_cast<Instruction>(LHS);
25300 return nullptr;
25301}
25302
25303/// \returns the first operand of \p I that does not match \p Phi, or
25304/// nullptr if that operand is not an instruction.
25305static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25306 Value *Op0 = nullptr;
25307 Value *Op1 = nullptr;
25308 if (!matchRdxBop(I, Op0, Op1))
25309 return nullptr;
25310 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25311}
25312
25313/// \returns true if \p I is a candidate instruction for reduction vectorization.
25314static bool isReductionCandidate(Instruction *I) {
25315 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25316 Value *B0 = nullptr, *B1 = nullptr;
25317 bool IsBinop = matchRdxBop(I, B0, B1);
25318 return IsBinop || IsSelect;
25319}
25320
25321bool SLPVectorizerPass::vectorizeHorReduction(
25322 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25323 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25324 if (!ShouldVectorizeHor)
25325 return false;
25326 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25327
25328 if (Root->getParent() != BB || isa<PHINode>(Root))
25329 return false;
25330
25331 // If we can find a secondary reduction root, use that instead.
25332 auto SelectRoot = [&]() {
25333 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25334 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25335 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25336 return NewRoot;
25337 return Root;
25338 };
25339
25340 // Start the analysis from the Root instruction. If a horizontal reduction
25341 // is found, try to vectorize it. If it is not a horizontal reduction, or
25342 // vectorization is not possible or not effective, and the currently
25343 // analyzed instruction is a binary operation, try to vectorize its
25344 // operands using a pre-order DFS traversal. If the operands were not
25345 // vectorized, repeat the same procedure, considering each operand as a
25346 // possible root of a horizontal reduction.
25347 // Interrupt the process if the Root instruction itself was vectorized or
25348 // all sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
25349 // If a horizontal reduction was not matched or vectorized, the instructions
25350 // are collected for possible later vectorization attempts.
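// Editor's illustrative sketch (hypothetical scalar pattern, not from the
// source): a store fed by a reduction that this traversal is meant to catch,
// with the final add used as the seed:
//   s[0] = a[0] + a[1] + a[2] + a[3];
// If no reduction is matched or vectorized, the analyzed instruction is
// recorded in PostponedInsts and its operands are queued as new candidate
// roots.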
25351 std::queue<std::pair<Instruction *, unsigned>> Stack;
25352 Stack.emplace(SelectRoot(), 0);
25353 SmallPtrSet<Value *, 8> VisitedInstrs;
25354 bool Res = false;
25355 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25356 if (R.isAnalyzedReductionRoot(Inst))
25357 return nullptr;
25358 if (!isReductionCandidate(Inst))
25359 return nullptr;
25360 HorizontalReduction HorRdx;
25361 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25362 return nullptr;
25363 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25364 };
25365 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25366 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25367 FutureSeed = getNonPhiOperand(Root, P);
25368 if (!FutureSeed)
25369 return false;
25370 }
25371 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25372 // analysis is done separately.
25373 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25374 PostponedInsts.push_back(FutureSeed);
25375 return true;
25376 };
25377
25378 while (!Stack.empty()) {
25379 Instruction *Inst;
25380 unsigned Level;
25381 std::tie(Inst, Level) = Stack.front();
25382 Stack.pop();
25383 // Do not try to analyze an instruction that has already been vectorized.
25384 // This may happen when we vectorize instruction operands on a previous
25385 // iteration, while the stack was populated before that happened.
25386 if (R.isDeleted(Inst))
25387 continue;
25388 if (Value *VectorizedV = TryToReduce(Inst)) {
25389 Res = true;
25390 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25391 // Try to find another reduction.
25392 Stack.emplace(I, Level);
25393 continue;
25394 }
25395 if (R.isDeleted(Inst))
25396 continue;
25397 } else {
25398 // We could not vectorize `Inst` so try to use it as a future seed.
25399 if (!TryAppendToPostponedInsts(Inst)) {
25400 assert(Stack.empty() && "Expected empty stack");
25401 break;
25402 }
25403 }
25404
25405 // Try to vectorize operands.
25406 // Continue analysis for the instruction from the same basic block only to
25407 // save compile time.
25408 if (++Level < RecursionMaxDepth)
25409 for (auto *Op : Inst->operand_values())
25410 if (VisitedInstrs.insert(Op).second)
25411 if (auto *I = dyn_cast<Instruction>(Op))
25412 // Do not try to vectorize CmpInst operands, this is done
25413 // separately.
25414 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
25415 !R.isDeleted(I) && I->getParent() == BB)
25416 Stack.emplace(I, Level);
25417 }
25418 return Res;
25419}
25420
25421bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25422 if (!I)
25423 return false;
25424
25425 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25426 return false;
25427 // Skip potential FMA candidates.
25428 if ((I->getOpcode() == Instruction::FAdd ||
25429 I->getOpcode() == Instruction::FSub) &&
25430 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25431 .isValid())
25432 return false;
25433
25434 Value *P = I->getParent();
25435
25436 // Vectorize in current basic block only.
25437 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25438 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25439 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25440 R.isDeleted(Op0) || R.isDeleted(Op1))
25441 return false;
25442
25443 // First collect all possible candidates.
25444 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25445 Candidates.emplace_back(Op0, Op1);
25446
25447 auto *A = dyn_cast<BinaryOperator>(Op0);
25448 auto *B = dyn_cast<BinaryOperator>(Op1);
25449 // Try to skip B.
25450 if (A && B && B->hasOneUse()) {
25451 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25452 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25453 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25454 Candidates.emplace_back(A, B0);
25455 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25456 Candidates.emplace_back(A, B1);
25457 }
25458 // Try to skip A.
25459 if (B && A && A->hasOneUse()) {
25460 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25461 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25462 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25463 Candidates.emplace_back(A0, B);
25464 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25465 Candidates.emplace_back(A1, B);
25466 }
25467
25468 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25469 ArrayRef<Value *> Ops) {
25470 if (!isReductionCandidate(Inst))
25471 return false;
25472 Type *Ty = Inst->getType();
25473 if (!isValidElementType(Ty) || Ty->isPointerTy())
25474 return false;
25475 HorizontalReduction HorRdx(Inst, Ops);
25476 if (!HorRdx.matchReductionForOperands())
25477 return false;
25478 // Check the cost of operations.
25479 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25480 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25481 InstructionCost ScalarCost =
25482 TTI.getScalarizationOverhead(
25483 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25484 /*Extract=*/true, CostKind) +
25485 TTI.getInstructionCost(Inst, CostKind);
25486 InstructionCost RedCost;
25487 switch (::getRdxKind(Inst)) {
25488 case RecurKind::Add:
25489 case RecurKind::Mul:
25490 case RecurKind::Or:
25491 case RecurKind::And:
25492 case RecurKind::Xor:
25493 case RecurKind::FAdd:
25494 case RecurKind::FMul: {
25495 FastMathFlags FMF;
25496 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25497 FMF = FPCI->getFastMathFlags();
25498 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25499 CostKind);
25500 break;
25501 }
25502 default:
25503 return false;
25504 }
25505 if (RedCost >= ScalarCost)
25506 return false;
25507
25508 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25509 };
25510 if (Candidates.size() == 1)
25511 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25512
25513 // We have multiple options. Try to pick the single best.
25514 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25515 if (!BestCandidate)
25516 return false;
25517 return (*BestCandidate == 0 &&
25518 TryToReduce(I, {Candidates[*BestCandidate].first,
25519 Candidates[*BestCandidate].second})) ||
25520 tryToVectorizeList({Candidates[*BestCandidate].first,
25521 Candidates[*BestCandidate].second},
25522 R);
25523}
25524
25525bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25526 BasicBlock *BB, BoUpSLP &R) {
25527 SmallVector<WeakTrackingVH> PostponedInsts;
25528 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25529 Res |= tryToVectorize(PostponedInsts, R);
25530 return Res;
25531}
25532
25533bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25534 BoUpSLP &R) {
25535 bool Res = false;
25536 for (Value *V : Insts)
25537 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25538 Res |= tryToVectorize(Inst, R);
25539 return Res;
25540}
25541
25542bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25543 BasicBlock *BB, BoUpSLP &R,
25544 bool MaxVFOnly) {
25545 if (!R.canMapToVector(IVI->getType()))
25546 return false;
25547
25548 SmallVector<Value *, 16> BuildVectorOpds;
25549 SmallVector<Value *, 16> BuildVectorInsts;
25550 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25551 return false;
25552
25553 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25554 R.getORE()->emit([&]() {
25555 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25556 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25557 "trying reduction first.";
25558 });
25559 return false;
25560 }
25561 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25562 // Aggregate value is unlikely to be processed in vector register.
25563 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25564}
25565
25566bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25567 BasicBlock *BB, BoUpSLP &R,
25568 bool MaxVFOnly) {
25569 SmallVector<Value *, 16> BuildVectorInsts;
25570 SmallVector<Value *, 16> BuildVectorOpds;
25571 SmallVector<int> Mask;
25572 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25573 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25574 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25575 return false;
25576
25577 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25578 R.getORE()->emit([&]() {
25579 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25580 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25581 "trying reduction first.";
25582 });
25583 return false;
25584 }
25585 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25586 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25587}
25588
25589template <typename T>
25590bool SLPVectorizerPass::tryToVectorizeSequence(
25591 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25592 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25593 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25594 bool MaxVFOnly, BoUpSLP &R) {
25595 bool Changed = false;
25596 // Sort by type, parent, operands.
25597 stable_sort(Incoming, Comparator);
25598
25599 // Try to vectorize elements based on their type.
25600 SmallVector<T *> Candidates;
25601 SmallVector<T *> VL;
25602 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25603 VL.clear()) {
25604 // Look for the next elements with the same type, parent and operand
25605 // kinds.
25606 auto *I = dyn_cast<Instruction>(*IncIt);
25607 if (!I || R.isDeleted(I)) {
25608 ++IncIt;
25609 continue;
25610 }
25611 auto *SameTypeIt = IncIt;
25612 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25613 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25614 AreCompatible(VL, *SameTypeIt))) {
25615 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25616 ++SameTypeIt;
25617 if (I && !R.isDeleted(I))
25618 VL.push_back(cast<T>(I));
25619 }
25620
25621 // Try to vectorize them.
25622 unsigned NumElts = VL.size();
25623 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25624 << NumElts << ")\n");
25625 // The vectorization is attempted in three stages:
25626 // 1. Try to vectorize instructions with the same/alternate opcodes, using
25627 // the maximal register size first.
25628 // 2. Try to vectorize the remaining instructions with the same type, if
25629 // possible. This may give better results than vectorizing only the
25630 // instructions with the same/alternate opcodes.
25631 // 3. As a final attempt, try to vectorize all instructions with the
25632 // same/alternate opcodes only; this may result in some extra final
25633 // vectorization.
25634 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25635 // Success, start over because instructions might have been changed.
25636 Changed = true;
25637 VL.swap(Candidates);
25638 Candidates.clear();
25639 for (T *V : VL) {
25640 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25641 Candidates.push_back(V);
25642 }
25643 } else {
25644 /// \returns the minimum number of elements that we will attempt to
25645 /// vectorize.
25646 auto GetMinNumElements = [&R](Value *V) {
25647 unsigned EltSize = R.getVectorElementSize(V);
25648 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25649 };
25650 if (NumElts < GetMinNumElements(*IncIt) &&
25651 (Candidates.empty() ||
25652 Candidates.front()->getType() == (*IncIt)->getType())) {
25653 for (T *V : VL) {
25654 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25655 Candidates.push_back(V);
25656 }
25657 }
25658 }
25659 // Final attempt to vectorize instructions with the same types.
25660 if (Candidates.size() > 1 &&
25661 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25662 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25663 // Success, start over because instructions might have been changed.
25664 Changed = true;
25665 } else if (MaxVFOnly) {
25666 // Try to vectorize using small vectors.
25667 SmallVector<T *> VL;
25668 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25669 VL.clear()) {
25670 auto *I = dyn_cast<Instruction>(*It);
25671 if (!I || R.isDeleted(I)) {
25672 ++It;
25673 continue;
25674 }
25675 auto *SameTypeIt = It;
25676 while (SameTypeIt != End &&
25677 (!isa<Instruction>(*SameTypeIt) ||
25678 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25679 AreCompatible(*SameTypeIt, *It))) {
25680 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25681 ++SameTypeIt;
25682 if (I && !R.isDeleted(I))
25683 VL.push_back(cast<T>(I));
25684 }
25685 unsigned NumElts = VL.size();
25686 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25687 /*MaxVFOnly=*/false))
25688 Changed = true;
25689 It = SameTypeIt;
25690 }
25691 }
25692 Candidates.clear();
25693 }
25694
25695 // Start over at the next instruction of a different type (or the end).
25696 IncIt = SameTypeIt;
25697 }
25698 return Changed;
25699}
25700
25701/// Compare two cmp instructions. If IsCompatibility is true, the function
25702/// returns true if the two cmps have the same/swapped predicates and
25703/// compatible corresponding operands. If IsCompatibility is false, the
25704/// function implements a strict weak ordering between the two cmp
25705/// instructions, returning true if the first instruction is "less" than the
25706/// second, i.e. its predicate is less than the predicate of the second or
25707/// the operand IDs are less than the operand IDs of the second cmp.
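///
/// Editor's illustrative sketch (hypothetical IR, not from the source): the
/// compares
/// \code
///   %c1 = icmp slt i32 %x, %y
///   %c2 = icmp sgt i32 %y, %x
/// \endcode
/// have swapped predicates with correspondingly swapped operands, so
/// compareCmp<true> treats them as compatible.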
25708template <bool IsCompatibility>
25709static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25710 const DominatorTree &DT) {
25711 assert(isValidElementType(V->getType()) &&
25712 isValidElementType(V2->getType()) &&
25713 "Expected valid element types only.");
25714 if (V == V2)
25715 return IsCompatibility;
25716 auto *CI1 = cast<CmpInst>(V);
25717 auto *CI2 = cast<CmpInst>(V2);
25718 if (CI1->getOperand(0)->getType()->getTypeID() <
25719 CI2->getOperand(0)->getType()->getTypeID())
25720 return !IsCompatibility;
25721 if (CI1->getOperand(0)->getType()->getTypeID() >
25722 CI2->getOperand(0)->getType()->getTypeID())
25723 return false;
25724 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25725 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25726 return !IsCompatibility;
25727 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25728 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25729 return false;
25730 CmpInst::Predicate Pred1 = CI1->getPredicate();
25731 CmpInst::Predicate Pred2 = CI2->getPredicate();
25732 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
25733 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
25734 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25735 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25736 if (BasePred1 < BasePred2)
25737 return !IsCompatibility;
25738 if (BasePred1 > BasePred2)
25739 return false;
25740 // Compare operands.
25741 bool CI1Preds = Pred1 == BasePred1;
25742 bool CI2Preds = Pred2 == BasePred1;
25743 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25744 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25745 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25746 if (Op1 == Op2)
25747 continue;
25748 if (Op1->getValueID() < Op2->getValueID())
25749 return !IsCompatibility;
25750 if (Op1->getValueID() > Op2->getValueID())
25751 return false;
25752 if (auto *I1 = dyn_cast<Instruction>(Op1))
25753 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25754 if (IsCompatibility) {
25755 if (I1->getParent() != I2->getParent())
25756 return false;
25757 } else {
25758 // Try to compare nodes with same parent.
25759 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25760 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25761 if (!NodeI1)
25762 return NodeI2 != nullptr;
25763 if (!NodeI2)
25764 return false;
25765 assert((NodeI1 == NodeI2) ==
25766 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25767 "Different nodes should have different DFS numbers");
25768 if (NodeI1 != NodeI2)
25769 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25770 }
25771 InstructionsState S = getSameOpcode({I1, I2}, TLI);
25772 if (S && (IsCompatibility || !S.isAltShuffle()))
25773 continue;
25774 if (IsCompatibility)
25775 return false;
25776 if (I1->getOpcode() != I2->getOpcode())
25777 return I1->getOpcode() < I2->getOpcode();
25778 }
25779 }
25780 return IsCompatibility;
25781}
25782
25783template <typename ItT>
25784bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
25785 BasicBlock *BB, BoUpSLP &R) {
25786 bool Changed = false;
25787 // Try to find reductions first.
25788 for (CmpInst *I : CmpInsts) {
25789 if (R.isDeleted(I))
25790 continue;
25791 for (Value *Op : I->operands())
25792 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
25793 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25794 if (R.isDeleted(I))
25795 break;
25796 }
25797 }
25798 // Try to vectorize operands as vector bundles.
25799 for (CmpInst *I : CmpInsts) {
25800 if (R.isDeleted(I))
25801 continue;
25802 Changed |= tryToVectorize(I, R);
25803 }
25804 // Try to vectorize list of compares.
25805 // Sort by type, compare predicate, etc.
25806 auto CompareSorter = [&](Value *V, Value *V2) {
25807 if (V == V2)
25808 return false;
25809 return compareCmp<false>(V, V2, *TLI, *DT);
25810 };
25811
25812 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
25813 if (VL.empty() || VL.back() == V1)
25814 return true;
25815 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
25816 };
25817
25818 SmallVector<Value *> Vals;
25819 for (Instruction *V : CmpInsts)
25820 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
25821 Vals.push_back(V);
25822 if (Vals.size() <= 1)
25823 return Changed;
25824 Changed |= tryToVectorizeSequence<Value>(
25825 Vals, CompareSorter, AreCompatibleCompares,
25826 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
25827 // Exclude possible reductions from other blocks.
25828 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25829 return any_of(V->users(), [V](User *U) {
25830 auto *Select = dyn_cast<SelectInst>(U);
25831 return Select &&
25832 Select->getParent() != cast<Instruction>(V)->getParent();
25833 });
25834 });
25835 if (ArePossiblyReducedInOtherBlock)
25836 return false;
25837 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25838 },
25839 /*MaxVFOnly=*/true, R);
25840 return Changed;
25841}
25842
25843bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25844 BasicBlock *BB, BoUpSLP &R) {
25845 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
25846 "This function only accepts Insert instructions");
25847 bool OpsChanged = false;
25848 SmallVector<WeakTrackingVH> PostponedInsts;
25849 for (auto *I : reverse(Instructions)) {
25850 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
25851 if (R.isDeleted(I) || isa<CmpInst>(I))
25852 continue;
25853 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25854 OpsChanged |=
25855 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
25856 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25857 OpsChanged |=
25858 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
25859 }
25860 // pass2 - try to vectorize reductions only
25861 if (R.isDeleted(I))
25862 continue;
25863 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
25864 if (R.isDeleted(I) || isa<CmpInst>(I))
25865 continue;
25866 // pass3 - try to match and vectorize a buildvector sequence.
25867 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25868 OpsChanged |=
25869 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
25870 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25871 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
25872 /*MaxVFOnly=*/false);
25873 }
25874 }
25875 // Now try to vectorize postponed instructions.
25876 OpsChanged |= tryToVectorize(PostponedInsts, R);
25877
25878 Instructions.clear();
25879 return OpsChanged;
25880}
25881
25882bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
25883 bool Changed = false;
25884 SmallVector<Value *, 4> Incoming;
25885 SmallPtrSet<Value *, 16> VisitedInstrs;
25886 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
25887 // node. This allows us to better identify the chains that can be
25888 // vectorized.
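// Editor's illustrative sketch (hypothetical IR, not from the source): for
//   %p = phi float [ %add1, %bb1 ], [ %add2, %bb2 ]
// the map records {%add1, %add2} for %p, so phis whose incoming chains end in
// the same kinds of non-phi instructions can be grouped and compared.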
25889 DenseMap<Value *, SmallVector<Value *>> PHIToOpcodes;
25890 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
25891 assert(isValidElementType(V1->getType()) &&
25892 isValidElementType(V2->getType()) &&
25893 "Expected vectorizable types only.");
25894 if (V1 == V2)
25895 return false;
25896 // It is fine to compare type IDs here, since we expect only vectorizable
25897 // types, like ints, floats and pointers; we don't care about other types.
25898 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
25899 return true;
25900 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
25901 return false;
25902 if (V1->getType()->getScalarSizeInBits() <
25903 V2->getType()->getScalarSizeInBits())
25904 return true;
25905 if (V1->getType()->getScalarSizeInBits() >
25906 V2->getType()->getScalarSizeInBits())
25907 return false;
25908 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
25909 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
25910 if (Opcodes1.size() < Opcodes2.size())
25911 return true;
25912 if (Opcodes1.size() > Opcodes2.size())
25913 return false;
25914 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
25915 {
25916 // Instructions come first.
25917 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
25918 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
25919 if (I1 && I2) {
25920 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
25921 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
25922 if (!NodeI1)
25923 return NodeI2 != nullptr;
25924 if (!NodeI2)
25925 return false;
25926 assert((NodeI1 == NodeI2) ==
25927 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25928 "Different nodes should have different DFS numbers");
25929 if (NodeI1 != NodeI2)
25930 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25931 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
25932 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
25933 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
25934 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
25935 if (!E1 || !E2)
25936 continue;
25937
25938 // Sort on ExtractElementInsts primarily by vector operands. Prefer
25939 // program order of the vector operands.
25940 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
25941 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
25942 if (V1 != V2) {
25943 if (V1 && !V2)
25944 return true;
25945 if (!V1 && V2)
25946 return false;
25947 DomTreeNodeBase<BasicBlock> *NodeI1 =
25948 DT->getNode(V1->getParent());
25949 DomTreeNodeBase<BasicBlock> *NodeI2 =
25950 DT->getNode(V2->getParent());
25951 if (!NodeI1)
25952 return NodeI2 != nullptr;
25953 if (!NodeI2)
25954 return false;
25955 assert((NodeI1 == NodeI2) ==
25956 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25957 "Different nodes should have different DFS numbers");
25958 if (NodeI1 != NodeI2)
25959 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25960 return V1->comesBefore(V2);
25961 }
25962 // If we have the same vector operand, try to sort by constant
25963 // index.
25964 std::optional<unsigned> Id1 = getExtractIndex(E1);
25965 std::optional<unsigned> Id2 = getExtractIndex(E2);
25966 // Bring constants to the top
25967 if (Id1 && !Id2)
25968 return true;
25969 if (!Id1 && Id2)
25970 return false;
25971 // First elements come first.
25972 if (Id1 && Id2)
25973 return *Id1 < *Id2;
25974
25975 continue;
25976 }
25977 if (I1->getOpcode() == I2->getOpcode())
25978 continue;
25979 return I1->getOpcode() < I2->getOpcode();
25980 }
25981 if (I1)
25982 return true;
25983 if (I2)
25984 return false;
25985 }
25986 {
25987 // Non-undef constants come next.
25988 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
25989 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
25990 if (C1 && C2)
25991 continue;
25992 if (C1)
25993 return true;
25994 if (C2)
25995 return false;
25996 }
25997 bool U1 = isa<UndefValue>(Opcodes1[I]);
25998 bool U2 = isa<UndefValue>(Opcodes2[I]);
25999 {
26000 // Non-constant non-instructions come next.
26001 if (!U1 && !U2) {
26002 auto ValID1 = Opcodes1[I]->getValueID();
26003 auto ValID2 = Opcodes2[I]->getValueID();
26004 if (ValID1 == ValID2)
26005 continue;
26006 if (ValID1 < ValID2)
26007 return true;
26008 if (ValID1 > ValID2)
26009 return false;
26010 }
26011 if (!U1)
26012 return true;
26013 if (!U2)
26014 return false;
26015 }
26016 // Undefs come last.
26017 assert(U1 && U2 && "The only thing left should be undef & undef.");
26018 }
26019 return false;
26020 };
26021 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26022 Value *V1) {
26023 if (VL.empty() || V1 == VL.back())
26024 return true;
26025 Value *V2 = VL.back();
26026 if (V1->getType() != V2->getType())
26027 return false;
26028 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26029 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26030 if (Opcodes1.size() != Opcodes2.size())
26031 return false;
26032 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26033 // Undefs are compatible with any other value.
26034 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26035 continue;
26036 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26037 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26038 if (R.isDeleted(I1) || R.isDeleted(I2))
26039 return false;
26040 if (I1->getParent() != I2->getParent())
26041 return false;
26042 if (getSameOpcode({I1, I2}, *TLI))
26043 continue;
26044 return false;
26045 }
26046 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26047 continue;
26048 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26049 return false;
26050 }
26051 return true;
26052 };
26053
26054 bool HaveVectorizedPhiNodes = false;
26055 do {
26056 // Collect the incoming values from the PHIs.
26057 Incoming.clear();
26058 for (Instruction &I : *BB) {
26059 auto *P = dyn_cast<PHINode>(&I);
26060 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26061 break;
26062
26063 // No need to analyze deleted, vectorized and non-vectorizable
26064 // instructions.
26065 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26066 isValidElementType(P->getType()))
26067 Incoming.push_back(P);
26068 }
26069
26070 if (Incoming.size() <= 1)
26071 break;
26072
26073 // Find the corresponding non-phi nodes for better matching when trying to
26074 // build the tree.
26075 for (Value *V : Incoming) {
26076 SmallVectorImpl<Value *> &Opcodes =
26077 PHIToOpcodes.try_emplace(V).first->getSecond();
26078 if (!Opcodes.empty())
26079 continue;
26080 SmallVector<Value *, 4> Nodes(1, V);
26081 SmallPtrSet<Value *, 4> Visited;
26082 while (!Nodes.empty()) {
26083 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26084 if (!Visited.insert(PHI).second)
26085 continue;
26086 for (Value *V : PHI->incoming_values()) {
26087 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26088 Nodes.push_back(PHI1);
26089 continue;
26090 }
26091 Opcodes.emplace_back(V);
26092 }
26093 }
26094 }
26095
26096 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26097 Incoming, PHICompare, AreCompatiblePHIs,
26098 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26099 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26100 },
26101 /*MaxVFOnly=*/true, R);
26102 Changed |= HaveVectorizedPhiNodes;
26103 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26104 auto *PHI = dyn_cast<PHINode>(P.first);
26105 return !PHI || R.isDeleted(PHI);
26106 }))
26107 PHIToOpcodes.clear();
26108 VisitedInstrs.insert_range(Incoming);
26109 } while (HaveVectorizedPhiNodes);
26110
26111 VisitedInstrs.clear();
26112
26113 InstSetVector PostProcessInserts;
26114 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26115 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26116 // also vectorizes `PostProcessCmps`.
26117 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26118 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26119 if (VectorizeCmps) {
26120 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26121 PostProcessCmps.clear();
26122 }
26123 PostProcessInserts.clear();
26124 return Changed;
26125 };
26126 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26127 auto IsInPostProcessInstrs = [&](Instruction *I) {
26128 if (auto *Cmp = dyn_cast<CmpInst>(I))
26129 return PostProcessCmps.contains(Cmp);
26130 return isa<InsertElementInst, InsertValueInst>(I) &&
26131 PostProcessInserts.contains(I);
26132 };
26133 // Returns true if `I` has no users and either produces no value (void type,
26134 // e.g. a terminator or a store) or is a call/invoke whose return value is
26135 // ignored.
26136 auto HasNoUsers = [](Instruction *I) {
26137 return I->use_empty() &&
26138 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26139 };
26140 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26141 // Skip instructions with a scalable type. The number of elements is unknown
26142 // at compile time for scalable types.
26143 if (isa<ScalableVectorType>(It->getType()))
26144 continue;
26145
26146 // Skip instructions marked for deletion.
26147 if (R.isDeleted(&*It))
26148 continue;
26149 // We may go through BB multiple times, so skip the ones we have already checked.
26150 if (!VisitedInstrs.insert(&*It).second) {
26151 if (HasNoUsers(&*It) &&
26152 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26153 // We would like to start over since some instructions are deleted
26154 // and the iterator may have become invalid.
26155 Changed = true;
26156 It = BB->begin();
26157 E = BB->end();
26158 }
26159 continue;
26160 }
26161
26162 // Try to vectorize reductions that use PHINodes.
26163 if (PHINode *P = dyn_cast<PHINode>(It)) {
26164 // Check that the PHI is a reduction PHI.
26165 if (P->getNumIncomingValues() == 2) {
26166 // Try to match and vectorize a horizontal reduction.
26167 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26168 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26169 Changed = true;
26170 It = BB->begin();
26171 E = BB->end();
26172 continue;
26173 }
26174 }
26175 // Try to vectorize the incoming values of the PHI, to catch reductions
26176 // that feed into PHIs.
26177 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26178 // Skip if the incoming block is the current BB for now. Also, bypass
26179 // unreachable IR for efficiency and to avoid crashing.
26180 // TODO: Collect the skipped incoming values and try to vectorize them
26181 // after processing BB.
26182 if (BB == P->getIncomingBlock(I) ||
26183 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26184 continue;
26185
26186 // Postponed instructions should not be vectorized here, delay their
26187 // vectorization.
26188 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26189 PI && !IsInPostProcessInstrs(PI)) {
26190 bool Res =
26191 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26192 Changed |= Res;
26193 if (Res && R.isDeleted(P)) {
26194 It = BB->begin();
26195 E = BB->end();
26196 break;
26197 }
26198 }
26199 }
26200 continue;
26201 }
26202
26203 if (HasNoUsers(&*It)) {
26204 bool OpsChanged = false;
26205 auto *SI = dyn_cast<StoreInst>(It);
26206 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26207 if (SI) {
26208 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26209 // Try to vectorize chain in store, if this is the only store to the
26210 // address in the block.
26211 // TODO: This is just a temporary solution to save compile time. We need
26212 // to investigate if we can safely turn on slp-vectorize-hor-store
26213 // instead to allow lookup for reduction chains in all non-vectorized
26214 // stores (need to check side effects and compile time).
26215 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26216 SI->getValueOperand()->hasOneUse();
26217 }
26218 if (TryToVectorizeRoot) {
26219 for (auto *V : It->operand_values()) {
26220 // Postponed instructions should not be vectorized here, delay their
26221 // vectorization.
26222 if (auto *VI = dyn_cast<Instruction>(V);
26223 VI && !IsInPostProcessInstrs(VI))
26224 // Try to match and vectorize a horizontal reduction.
26225 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26226 }
26227 }
26228 // Start vectorization of post-process list of instructions from the
26229 // top-tree instructions to try to vectorize as many instructions as
26230 // possible.
26231 OpsChanged |=
26232 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26233 if (OpsChanged) {
26234 // We would like to start over since some instructions are deleted
26235 // and the iterator may have become invalid.
26236 Changed = true;
26237 It = BB->begin();
26238 E = BB->end();
26239 continue;
26240 }
26241 }
26242
26243 if (isa<InsertElementInst, InsertValueInst>(It))
26244 PostProcessInserts.insert(&*It);
26245 else if (isa<CmpInst>(It))
26246 PostProcessCmps.insert(cast<CmpInst>(&*It));
26247 }
26248
26249 return Changed;
26250}
26251
26252bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26253 auto Changed = false;
26254 for (auto &Entry : GEPs) {
26255 // If the getelementptr list has fewer than two elements, there's nothing
26256 // to do.
26257 if (Entry.second.size() < 2)
26258 continue;
26259
26260 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26261 << Entry.second.size() << ".\n");
26262
26263 // Process the GEP list in chunks suitable for the target's supported
26264 // vector size. If a vector register can't hold 1 element, we are done. We
26265 // are trying to vectorize the index computations, so the maximum number of
26266 // elements is based on the size of the index expression, rather than the
26267 // size of the GEP itself (the target's pointer size).
26268 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26269 return !R.isDeleted(GEP);
26270 });
26271 if (It == Entry.second.end())
26272 continue;
26273 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26274 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26275 if (MaxVecRegSize < EltSize)
26276 continue;
26277
26278 unsigned MaxElts = MaxVecRegSize / EltSize;
26279 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26280 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26281 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26282
26283 // Initialize a set of candidate getelementptrs. Note that we use a
26284 // SetVector here to preserve program order. If the index computations
26285 // are vectorizable and begin with loads, we want to minimize the chance
26286 // of having to reorder them later.
26287 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26288
26289 // Some of the candidates may have already been vectorized after we
26290 // initially collected them, or their index has been optimized to a constant value.
26291 // If so, they are marked as deleted, so remove them from the set of
26292 // candidates.
26293 Candidates.remove_if([&R](Value *I) {
26294 return R.isDeleted(cast<Instruction>(I)) ||
26295 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26296 });
26297
26298 // Remove from the set of candidates all pairs of getelementptrs with
26299 // constant differences. Such getelementptrs are likely not good
26300 // candidates for vectorization in a bottom-up phase since one can be
26301 // computed from the other. We also ensure all candidate getelementptr
26302 // indices are unique.
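// Editor's illustrative note (hypothetical example, not from the source): for
//   %g0 = getelementptr float, ptr %base, i64 %i
//   %g1 = getelementptr float, ptr %base, i64 %j  ; with %j = add i64 %i, 4
// the SCEV difference of %g0 and %g1 is a constant (16 bytes), so both are
// removed from the candidate set.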
26303 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26304 auto *GEPI = GEPList[I];
26305 if (!Candidates.count(GEPI))
26306 continue;
26307 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26308 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26309 auto *GEPJ = GEPList[J];
26310 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26311 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26312 Candidates.remove(GEPI);
26313 Candidates.remove(GEPJ);
26314 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26315 Candidates.remove(GEPJ);
26316 }
26317 }
26318 }
26319
26320 // We break out of the above computation as soon as we know there are
26321 // fewer than two candidates remaining.
26322 if (Candidates.size() < 2)
26323 continue;
26324
26325 // Add the single, non-constant index of each candidate to the bundle. We
26326 // ensured the indices met these constraints when we originally collected
26327 // the getelementptrs.
26328 SmallVector<Value *, 16> Bundle(Candidates.size());
26329 auto BundleIndex = 0u;
26330 for (auto *V : Candidates) {
26331 auto *GEP = cast<GetElementPtrInst>(V);
26332 auto *GEPIdx = GEP->idx_begin()->get();
26333 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26334 Bundle[BundleIndex++] = GEPIdx;
26335 }
26336
26337 // Try and vectorize the indices. We are currently only interested in
26338 // gather-like cases of the form:
26339 //
26340 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26341 //
26342 // where the loads of "a", the loads of "b", and the subtractions can be
26343 // performed in parallel. It's likely that detecting this pattern in a
26344 // bottom-up phase will be simpler and less costly than building a
26345 // full-blown top-down phase beginning at the consecutive loads.
26346 Changed |= tryToVectorizeList(Bundle, R);
26347 }
26348 }
26349 return Changed;
26350}
26351
26352bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26353 bool Changed = false;
26354 // Sort by type, base pointer and value operand. Value operands must be
26355 // compatible (have the same opcode, same parent), otherwise it is
26356 // definitely not profitable to try to vectorize them.
26357 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26358 if (V->getValueOperand()->getType()->getTypeID() <
26359 V2->getValueOperand()->getType()->getTypeID())
26360 return true;
26361 if (V->getValueOperand()->getType()->getTypeID() >
26362 V2->getValueOperand()->getType()->getTypeID())
26363 return false;
26364 if (V->getPointerOperandType()->getTypeID() <
26365 V2->getPointerOperandType()->getTypeID())
26366 return true;
26367 if (V->getPointerOperandType()->getTypeID() >
26368 V2->getPointerOperandType()->getTypeID())
26369 return false;
26370 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26371 V2->getValueOperand()->getType()->getScalarSizeInBits())
26372 return true;
26373 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26374 V2->getValueOperand()->getType()->getScalarSizeInBits())
26375 return false;
26376 // UndefValues are compatible with all other values.
26377 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26378 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26379 DomTreeNodeBase<BasicBlock> *NodeI1 =
26380 DT->getNode(I1->getParent());
26381 DomTreeNodeBase<BasicBlock> *NodeI2 =
26382 DT->getNode(I2->getParent());
26383 assert(NodeI1 && "Should only process reachable instructions");
26384 assert(NodeI2 && "Should only process reachable instructions");
26385 assert((NodeI1 == NodeI2) ==
26386 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26387 "Different nodes should have different DFS numbers");
26388 if (NodeI1 != NodeI2)
26389 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26390 return I1->getOpcode() < I2->getOpcode();
26391 }
26392 return V->getValueOperand()->getValueID() <
26393 V2->getValueOperand()->getValueID();
26394 };
26395
26396 bool SameParent = true;
26397 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26398 if (VL.empty()) {
26399 SameParent = true;
26400 return true;
26401 }
26402 StoreInst *V2 = VL.back();
26403 if (V1 == V2)
26404 return true;
26405 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26406 return false;
26407 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26408 return false;
26409 // Undefs are compatible with any other value.
26410 if (isa<UndefValue>(V1->getValueOperand()) ||
26411 isa<UndefValue>(V2->getValueOperand()))
26412 return true;
26413 if (isa<Constant>(V1->getValueOperand()) &&
26414 isa<Constant>(V2->getValueOperand()))
26415 return true;
26416 // Check if the operands of the stores can be vectorized. They can be
26417 // vectorized if they have compatible operands, or operands that can be
26418 // vectorized as copyables.
26419 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26420 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26421 if (I1 || I2) {
26422 // Accept only tail-following non-compatible values for now.
26423 // TODO: investigate if it is possible to vectorize incompatible values,
26424 // if the copyables are first in the list.
26425 if (I1 && !I2)
26426 return false;
26427 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26428 SmallVector<Value *> NewVL(VL.size() + 1);
26429 for (auto [SI, V] : zip(VL, NewVL))
26430 V = SI->getValueOperand();
26431 NewVL.back() = V1->getValueOperand();
26432 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26433 InstructionsState S = Analysis.buildInstructionsState(
26434 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26435 /*SkipSameCodeCheck=*/!SameParent);
26436 if (S)
26437 return true;
26438 if (!SameParent)
26439 return false;
26440 }
26441 return V1->getValueOperand()->getValueID() ==
26442 V2->getValueOperand()->getValueID();
26443 };
26444
26445 // Attempt to sort and vectorize each of the store-groups.
26446 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26447 for (auto &Pair : Stores) {
26448 if (Pair.second.size() < 2)
26449 continue;
26450
26451 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26452 << Pair.second.size() << ".\n");
26453
26454 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26455 continue;
26456
26457 // Reverse the stores to do a bottom-to-top analysis. This is important if
26458 // values are stored to the same address several times; in that case we need
26459 // to follow the store order (reversed, to respect the memory dependencies).
26460 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26461 Pair.second.rend());
26462 Changed |= tryToVectorizeSequence<StoreInst>(
26463 ReversedStores, StoreSorter, AreCompatibleStores,
26464 [&](ArrayRef<StoreInst *> Candidates, bool) {
26465 return vectorizeStores(Candidates, R, Attempted);
26466 },
26467 /*MaxVFOnly=*/false, R);
26468 }
26469 return Changed;
26470}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:919
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:194
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
Early If Converter
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
Definition: ExpandFp.cpp:597
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1451
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const bool IsAnyPointerUsedOutGraph, const int64_t Diff)
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns the power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
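The value-bundle predicates above (isConstant, isSplat, allConstant) drive many of the early bail-outs. The following is only a sketch of what such predicates typically check, written against the public ADT headers and not taken from the pass itself (the in-tree isSplat, for instance, handles undef lanes more carefully); the function names are illustrative.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Sketch only: every lane of the candidate bundle VL is a Constant.
static bool allConstantSketch(ArrayRef<Value *> VL) {
  return all_of(VL, [](Value *V) { return isa<Constant>(V); });
}

// Sketch only: every lane is the same SSA value as lane 0.
static bool isSplatSketch(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         all_of(VL, [&](Value *V) { return V == VL.front(); });
}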
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
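addMask composes one shuffle mask on top of another. The stand-alone sketch below (the name is illustrative, and the in-tree routine additionally handles the ExtendingManyInputs case) shows the basic composition rule: lane I of the result reads Mask[SubMask[I]], and a -1 (poison) lane stays poison.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static void composeMaskSketch(SmallVectorImpl<int> &Mask,
                              ArrayRef<int> SubMask) {
  SmallVector<int> NewMask(SubMask.size(), -1);
  for (unsigned I = 0, E = SubMask.size(); I != E; ++I)
    if (SubMask[I] != -1)
      NewMask[I] = Mask[SubMask[I]]; // pick the already-shuffled lane
  Mask.swap(NewMask);
}

For example, Mask = {3, 2, 1, 0} followed by SubMask = {0, 0, 2, 2} yields {3, 3, 1, 1}.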
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using the default shuffle.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the order used for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
A private abstract base class describing the concept of an individual alias analysis implementation.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setBit(unsigned BitPosition)
Set the bit whose position is given by "bitPosition" to 1.
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1396
void negate()
Negate this APInt in place.
Definition: APInt.h:1468
unsigned logBase2() const
Definition: APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
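A short example of the APInt operations listed above, in the style of the demanded-element masks the vectorizer builds (the function name is illustrative):

#include "llvm/ADT/APInt.h"
using namespace llvm;

void demandedEltsExample() {
  APInt Demanded = APInt::getAllOnes(8); // 8 lanes, all demanded
  Demanded.clearBit(3);                  // lane 3 is not demanded
  bool All = Demanded.isAllOnes();       // false
  APInt One = APInt::getOneBitSet(8, 0); // only lane 0
  bool Pow2 = One.isPowerOf2();          // true
  (void)All;
  (void)Pow2;
}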
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:431
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:150
iterator end() const
Definition: ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:206
iterator begin() const
Definition: ArrayRef.h:135
ArrayRef< T > take_back(size_t N=1) const
Return a copy of *this with only the last N elements.
Definition: ArrayRef.h:231
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition: ArrayRef.h:162
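The ArrayRef views above are cheap, non-owning slices over existing storage; a self-contained example (illustrative names):

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

int arrayRefExample() {
  int Data[] = {0, 1, 2, 3, 4, 5, 6, 7};
  ArrayRef<int> VL(Data);
  ArrayRef<int> Head = VL.take_front(4); // {0, 1, 2, 3}
  ArrayRef<int> Tail = VL.drop_front(6); // {6, 7}
  ArrayRef<int> Mid = VL.slice(2, 3);    // {2, 3, 4}
  return Head.back() + Tail.front() + static_cast<int>(Mid.size()); // 3 + 6 + 3
}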
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
iterator end()
Definition: BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:172
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
reverse_iterator rend()
Definition: BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
Definition: BasicBlock.cpp:406
size_t size() const
Definition: BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition: BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2010
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1905
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2148
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2004
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1205
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1283
unsigned arg_size() const
Definition: InstrTypes.h:1290
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1506
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2001
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:666
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:829
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:767
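The predicate helpers above are what compatibility checks such as isCmpSameOrSwapped lean on; a small stand-alone example using the static forms of the helpers (illustrative function name):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

void predicateExample() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;
  CmpInst::Predicate Swapped = CmpInst::getSwappedPredicate(P); // ICMP_SGT
  CmpInst::Predicate Inverse = CmpInst::getInversePredicate(P); // ICMP_SGE
  (void)Swapped;
  (void)Inverse;
}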
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2314
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
Definition: Constants.cpp:2694
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:868
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:875
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1474
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:88
A debug info location.
Definition: DebugLoc.h:124
static DebugLoc getUnknown()
Definition: DebugLoc.h:162
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:104
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:187
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:229
bool erase(const KeyT &Val)
Definition: DenseMap.h:303
unsigned size() const
Definition: DenseMap.h:108
bool empty() const
Definition: DenseMap.h:107
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:161
iterator end()
Definition: DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:205
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:156
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:214
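A quick usage example for the DenseMap operations listed above (illustrative name):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

unsigned denseMapExample() {
  DenseMap<unsigned, unsigned> UseCount;
  UseCount.try_emplace(1, 0); // inserts {1, 0}
  UseCount[1] += 2;
  if (auto It = UseCount.find(1); It != UseCount.end())
    return It->second;        // 2
  return UseCount.lookup(42); // missing key: default-constructed 0
}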
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:334
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:135
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
void set()
Definition: FMF.h:61
bool allowReassoc() const
Flag queries.
Definition: FMF.h:64
bool allowContract() const
Definition: FMF.h:69
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:857
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:949
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:547
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2559
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:488
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:202
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2637
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2238
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:247
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:862
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1809
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:823
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:815
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:507
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1708
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:196
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1883
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2439
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1437
LLVM_ABI CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:538
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
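A minimal sketch of emitting a shuffle with the IRBuilder API above; it assumes BB is an existing, not-yet-terminated block and A/B are <4 x float> values (the function name is illustrative):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitBlend(BasicBlock *BB, Value *A, Value *B) {
  IRBuilder<> Builder(BB);
  // Lanes 0 and 1 come from A, lanes 2 and 3 from B (indices 4..7 address B).
  Value *Shuf = Builder.CreateShuffleVector(A, B, {0, 1, 6, 7}, "blend");
  return Builder.CreateFreeze(Shuf, "blend.fr");
}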
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:321
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:808
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
bool isShift() const
Definition: Instruction.h:320
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:318
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Definition: Instructions.h:180
Value * getPointerOperand()
Definition: Instructions.h:259
bool isSimple() const
Definition: Instructions.h:251
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:215
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:570
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:48
iterator find(const KeyT &Key)
Definition: MapVector.h:141
bool empty() const
Definition: MapVector.h:75
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:107
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:99
size_type size() const
Definition: MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:79
void clear()
Definition: MapVector.h:84
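MapVector pairs a map with insertion-order iteration, which gives deterministic traversal; a small example (illustrative name):

#include "llvm/ADT/MapVector.h"
using namespace llvm;

unsigned mapVectorExample() {
  MapVector<unsigned, unsigned> Buckets;
  Buckets.try_emplace(7, 1);
  Buckets.try_emplace(3, 2);
  // front() is the first pair that was inserted, not the smallest key.
  return Buckets.front().second + Buckets.lookup(3); // 1 + 2
}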
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
T & front() const
front - Get the first element.
Definition: ArrayRef.h:354
iterator end() const
Definition: ArrayRef.h:348
iterator begin() const
Definition: ArrayRef.h:347
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:381
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:454
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:115
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:141
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:167
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1885
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
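A hedged sketch of how the SCEV queries above are typically combined to test whether two pointers are a known constant number of bytes apart, which is the basis of consecutive-access detection (the function name is illustrative and error handling is omitted):

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static bool hasConstantByteDistance(ScalarEvolution &SE, Value *PtrA,
                                    Value *PtrB, int64_t &Dist) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (const auto *C = dyn_cast<SCEVConstant>(Diff)) {
    Dist = C->getAPInt().getSExtValue();
    return true;
  }
  return false;
}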
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:149
void insert_range(Range &&R)
Definition: SetVector.h:193
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition: SetVector.h:93
void clear()
Completely clear the SetVector.
Definition: SetVector.h:284
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:269
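SetVector combines set membership with insertion-order iteration; a minimal example (illustrative name):

#include "llvm/ADT/SetVector.h"
using namespace llvm;

unsigned setVectorExample() {
  SetVector<unsigned> Seen;
  Seen.insert(4);
  Seen.insert(2);
  bool Inserted = Seen.insert(4); // false: 4 is already present
  (void)Inserted;
  return Seen.front() + static_cast<unsigned>(Seen.size()); // 4 + 2
}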
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
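The static classifiers above work on plain integer masks, independent of any shufflevector instruction; for example (illustrative name):

#include "llvm/IR/Instructions.h"
using namespace llvm;

void maskKindExample() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[] = {3, 2, 1, 0};
  bool IsId = ShuffleVectorInst::isIdentityMask(Identity, 4); // true
  bool IsRev = ShuffleVectorInst::isReverseMask(Reverse, 4);  // true
  int Index = 0;
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Identity, 8, Index); // true, Index == 0
  (void)IsId;
  (void)IsRev;
  (void)IsExtract;
}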
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
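Iterating the set bits of a SmallBitVector with the queries listed above (illustrative name):

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

unsigned setBitsExample() {
  SmallBitVector Bits(8);
  Bits.set(1);
  Bits.set(5);
  unsigned N = 0;
  for (int I = Bits.find_first(); I != -1; I = Bits.find_next(I))
    ++N;
  return N; // 2, the same value Bits.count() returns
}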
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:418
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
iterator end() const
Definition: SmallPtrSet.h:499
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:176
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:227
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
size_type size() const
Definition: SmallSet.h:171
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:969
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
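SmallVector keeps a small number of elements inline before heap-allocating, and SmallVectorImpl is the size-erased interface functions take by reference; a short example (illustrative names):

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static void appendIdentityMask(SmallVectorImpl<int> &Mask, unsigned VF) {
  Mask.reserve(Mask.size() + VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(static_cast<int>(I));
}

int smallVectorExample() {
  SmallVector<int, 8> Mask; // up to 8 elements without heap allocation
  appendIdentityMask(Mask, 4);
  Mask.assign(2, -1);       // Mask is now {-1, -1}
  return static_cast<int>(Mask.size()); // 2
}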
An instruction for storing to memory.
Definition: Instructions.h:296
Type * getPointerOperandType() const
Definition: Instructions.h:389
Value * getValueOperand()
Definition: Instructions.h:383
Value * getPointerOperand()
Definition: Instructions.h:386
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:35
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const
Return true if the target supports masked load.
LLVM_ABI bool preferAlternateOpcodeVectorization() const
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
LLVM_ABI bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
LLVM_ABI bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
LLVM_ABI unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
LLVM_ABI InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
LLVM_ABI bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
LLVM_ABI unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI unsigned getNumberOfParts(Type *Tp) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
LLVM_ABI InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
LLVM_ABI InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const
OperandValueKind
Additional information about an operand's possible values.
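The cost queries above are the basis of the pass's profitability decisions. A hedged sketch of a simple query, assuming TTI was obtained from TargetIRAnalysis for the function being vectorized (the helper name and the add-only comparison are illustrative, not the pass's actual cost model):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static InstructionCost vectorAddSavings(const TargetTransformInfo &TTI,
                                        Type *ScalarTy, unsigned VF) {
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  const auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost ScalarCost;
  for (unsigned I = 0; I < VF; ++I)
    ScalarCost +=
        TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind);
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  return ScalarCost - VecCost; // positive: the vector form looks cheaper
}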
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:246
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:296
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:270
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
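A small illustration (hypothetical helper) of the Type queries above: widen an integer or integer-vector type to the next power-of-two element width while keeping the element count.

#include <algorithm>
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static Type *widenToPow2Elements(Type *Ty) {
  if (!Ty->isIntOrIntVectorTy())
    return Ty;
  unsigned Bits = Ty->getScalarSizeInBits();
  unsigned NewBits = std::max(8u, unsigned(PowerOf2Ceil(Bits)));
  Type *NewElt = IntegerType::get(Ty->getContext(), NewBits);
  // For vectors this keeps the element count; for scalars it returns NewElt.
  return Ty->getWithNewType(NewElt);
}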
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
op_range operands()
Definition: User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:119
op_iterator op_begin()
Definition: User.h:284
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
iterator_range< value_op_iterator > operand_values()
Definition: User.h:316
The Vector Function Database.
Definition: VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:74
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
User * user_back()
Definition: Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
bool use_empty() const
Definition: Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1101
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:265
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:396
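A minimal sketch (hypothetical helper) of the use-list API above: forward every use of an old value to its replacement and keep the more descriptive name.

#include <cassert>
#include "llvm/IR/Value.h"
using namespace llvm;

static void replaceAndTakeName(Value *OldV, Value *NewV) {
  if (NewV->getName().empty())
    NewV->takeName(OldV);         // transfer OldV's name to NewV
  OldV->replaceAllUsesWith(NewV); // every user now refers to NewV
  assert(OldV->use_empty() && "all uses should have been rewritten");
}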
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
Definition: DerivedTypes.h:695
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:463
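A one-liner (hypothetical helper) showing how a bundle of VF scalars of type ScalarTy is given a fixed-width vector type via VectorType::get:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

// Build the fixed-width vector type used to widen VF scalars of ScalarTy.
static VectorType *makeVectorTy(Type *ScalarTy, unsigned VF) {
  return VectorType::get(ScalarTy, ElementCount::getFixed(VF));
}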
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:205
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:163
void insert_range(Range &&R)
Definition: DenseSet.h:222
size_type size() const
Definition: DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:174
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:76
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:662
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:692
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
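To make the score categories above concrete, an illustrative summary (scalar pseudocode in comments; the exact constants are defined in this file) of the operand shapes they distinguish for two adjacent lanes:

// A is an array, i an index; purely illustrative.
//   ScoreSplatLoads       : both lanes load the same A[i]
//   ScoreConsecutiveLoads : lane0 loads A[i],   lane1 loads A[i+1]
//   ScoreReversedLoads    : lane0 loads A[i+1], lane1 loads A[i]
//   ScoreSameOpcode       : both lanes use the same opcode, e.g. two 'add's
//   ScoreAltOpcodes       : alternating opcodes, e.g. 'add' in lane0, 'sub' in lane1
//   ScoreSplat            : both lanes are the very same instruction
//   ScoreUndef            : one lane is undef, preferable to no match at all
//   ScoreFail             : no usable relationship between the two operands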
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and, if so, returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation with other gathers/buildvectors.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, i.e. the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
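Putting the BoUpSLP entry points above in order, a condensed sketch of a driver (written as if it lived in this file, since BoUpSLP is file-local; the real drivers add reordering limits, scheduling and remark emission around these calls, and SLPCostThreshold is the slp-threshold option declared near the top of this file):

static bool trySLPVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> Roots) {
  R.buildTree(Roots, /*UserIgnoreLst=*/{});   // grow the use-def tree from Roots
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                             // too small to ever pay off
  R.reorderTopToBottom();                     // profitable order, root -> leaves
  R.reorderBottomToTop();                     // profitable order, leaves -> root
  R.transformNodes();                         // target-specific node rewrites
  R.buildExternalUses();                      // scalars still used outside the tree
  R.computeMinimumValueSizes();               // minimum-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (!(Cost < -SLPCostThreshold))            // only vectorize when profitable enough
    return false;
  R.vectorizeTree();                          // emit vector code, replace scalars
  return true;
}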
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:108
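A minimal sketch (hypothetical helper) of the VFDatabase queries above: look up a vector variant of a call for a given fixed vectorization factor.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Function *findVectorVariant(CallInst &CI, unsigned VF) {
  // Describe the shape we want: same signature, widened to VF lanes,
  // without a global predicate parameter.
  VFShape Shape = VFShape::get(CI.getFunctionType(),
                               ElementCount::getFixed(VF),
                               /*HasGlobalPred=*/false);
  return VFDatabase(CI).getVectorizedFunction(Shape);
}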
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1578
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:862
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
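A minimal sketch (hypothetical predicate) of the PatternMatch combinators above: recognize a single-use zero- or sign-extended load that feeds a left shift by a constant.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isShiftedExtendedLoad(Value *V) {
  Value *LoadPtr = nullptr;
  const APInt *ShAmt = nullptr;
  // Matches: (zext|sext (load %p)) << C, where the extend has one use.
  return match(V, m_Shl(m_OneUse(m_ZExtOrSExt(m_Load(m_Value(LoadPtr)))),
                        m_APInt(ShAmt)));
}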
@ GS
Definition: X86.h:213
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
Definition: DenseMapInfo.h:41
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1313
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:860
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1764
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1737
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
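A minimal sketch (illustrative predicate only) of the range helpers in the style this file uses them: check that a bundle of scalars consists of instructions with a single opcode and a single type.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

static bool isHomogeneousBundle(ArrayRef<Value *> VL) {
  if (VL.empty())
    return false;
  auto *I0 = dyn_cast<Instruction>(VL.front());
  if (!I0)
    return false;
  // all_of takes the range directly instead of a begin()/end() pair.
  return all_of(VL, [I0](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && I->getOpcode() == I0->getOpcode() &&
           I->getType() == I0->getType();
  });
}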
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:137
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1023
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7513
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1723
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2250
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:663
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:551
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:2000
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:295
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
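A few concrete values (assumed from the documented semantics) for the integer helpers in this index; the helper below only asserts them.

#include <cassert>
#include "llvm/ADT/bit.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"

static void integerHelperExamples() {
  assert(llvm::PowerOf2Ceil(5) == 8);               // next power of two
  assert(llvm::bit_ceil(5u) == 8u);                 // same, for unsigned types
  assert(llvm::bit_floor(5u) == 4u);                // previous power of two
  assert(llvm::has_single_bit(8u));                 // 8 is a power of two
  assert(llvm::divideCeil(7, 3) == 3);              // integer ceil division
  assert(llvm::alignTo(10, llvm::Align(8)) == 16);  // round up to a multiple of 8
}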
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2147
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1987
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
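For reference, the masks the two helpers above produce for small inputs (assumed from their documented behaviour); the sketch below only prints them.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void printMaskExamples() {
  // createStrideMask(Start=0, Stride=2, VF=4)        -> 0 2 4 6
  for (int Elt : createStrideMask(0, 2, 4))
    errs() << Elt << ' ';
  errs() << '\n';
  // createReplicatedMask(ReplicationFactor=2, VF=3)  -> 0 0 1 1 2 2
  for (int Elt : createReplicatedMask(2, 3))
    errs() << Elt << ' ';
  errs() << '\n';
}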
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1782
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition: Loads.cpp:438
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type, i.e. adding an extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:288
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition: STLExtras.h:1444
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
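A minimal sketch (assuming DataLayout and ScalarEvolution are at hand) of using getPointersDiff to test whether two loads access adjacent elements, the check behind the consecutive-load scoring above.

#include <optional>
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool areConsecutiveLoads(LoadInst *L0, LoadInst *L1,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int64_t> Diff =
      getPointersDiff(L0->getType(), L0->getPointerOperand(),
                      L1->getType(), L1->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  // The distance is returned in elements; 1 means L1 reads the element
  // immediately after L0.
  return Diff && *Diff == 1;
}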
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
Definition: STLExtras.h:1939
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition: Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
Definition: LoopUtils.cpp:1393
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1094
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition: STLExtras.h:1973
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2049
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:443
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition: STLExtras.h:1454
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1980
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:595
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2107
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:280
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through its def-use chain.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:469
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2169
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type.
Definition: DenseMapInfo.h:54
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are cached in this class.
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:217
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:249
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1472
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1481
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.