LLVM 21.0.0git
LoopVectorize.cpp
Go to the documentation of this file.
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
100#include "llvm/IR/Attributes.h"
101#include "llvm/IR/BasicBlock.h"
102#include "llvm/IR/CFG.h"
103#include "llvm/IR/Constant.h"
104#include "llvm/IR/Constants.h"
105#include "llvm/IR/DataLayout.h"
106#include "llvm/IR/DebugInfo.h"
107#include "llvm/IR/DebugLoc.h"
108#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/Dominators.h"
111#include "llvm/IR/Function.h"
112#include "llvm/IR/IRBuilder.h"
113#include "llvm/IR/InstrTypes.h"
114#include "llvm/IR/Instruction.h"
115#include "llvm/IR/Instructions.h"
117#include "llvm/IR/Intrinsics.h"
118#include "llvm/IR/MDBuilder.h"
119#include "llvm/IR/Metadata.h"
120#include "llvm/IR/Module.h"
121#include "llvm/IR/Operator.h"
122#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/Type.h"
125#include "llvm/IR/Use.h"
126#include "llvm/IR/User.h"
127#include "llvm/IR/Value.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
131#include "llvm/Support/Debug.h"
146#include <algorithm>
147#include <cassert>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
170 "llvm.loop.vectorize.followup_vectorized";
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178
180 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
181 cl::desc("Enable vectorization of epilogue loops."));
182
184 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185 cl::desc("When epilogue vectorization is enabled, and a value greater than "
186 "1 is specified, forces the given VF for all applicable epilogue "
187 "loops."));
188
190 "epilogue-vectorization-minimum-VF", cl::Hidden,
191 cl::desc("Only loops with vectorization factor equal to or larger than "
192 "the specified value are considered for epilogue vectorization."));
193
194/// Loops with a known constant trip count below this number are vectorized only
195/// if no scalar iteration overheads are incurred.
197 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
198 cl::desc("Loops with a constant trip count that is smaller than this "
199 "value are vectorized only if no scalar iteration overheads "
200 "are incurred."));
201
203 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
204 cl::desc("The maximum allowed number of runtime memory checks"));
205
206// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207// that predication is preferred, and this lists all options. I.e., the
208// vectorizer will try to fold the tail-loop (epilogue) into the vector body
209// and predicate the instructions accordingly. If tail-folding fails, there are
210// different fallback strategies depending on these values:
212 enum Option {
216 };
217} // namespace PreferPredicateTy
218
220 "prefer-predicate-over-epilogue",
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
233 "predicate-dont-vectorize",
234 "prefers tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
238 "force-tail-folding-style", cl::desc("Force the tail folding style"),
239 cl::init(TailFoldingStyle::None),
241 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
243 TailFoldingStyle::Data, "data",
244 "Create lane mask for data only, using active.lane.mask intrinsic"),
245 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
246 "data-without-lane-mask",
247 "Create lane mask with compare/stepvector"),
248 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
249 "Create lane mask using active.lane.mask intrinsic, and use "
250 "it for both data and control flow"),
251 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252 "data-and-control-without-rt-check",
253 "Similar to data-and-control, but remove the runtime check"),
254 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
255 "Use predicated EVL instructions for tail folding. If EVL "
256 "is unsupported, fallback to data-without-lane-mask.")));
257
259 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
260 cl::desc("Maximize bandwidth when selecting vectorization factor which "
261 "will be determined by the smallest type in loop."));
262
264 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
266
267/// An interleave-group may need masking if it resides in a block that needs
268/// predication, or in order to mask away gaps.
270 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
271 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
272
274 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
275 cl::desc("A flag that overrides the target's number of scalar registers."));
276
278 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of vector registers."));
280
282 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
283 cl::desc("A flag that overrides the target's max interleave factor for "
284 "scalar loops."));
285
287 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
288 cl::desc("A flag that overrides the target's max interleave factor for "
289 "vectorized loops."));
290
292 "force-target-instruction-cost", cl::init(0), cl::Hidden,
293 cl::desc("A flag that overrides the target's expected cost for "
294 "an instruction to a single constant value. Mostly "
295 "useful for getting consistent testing."));
296
298 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
299 cl::desc(
300 "Pretend that scalable vectors are supported, even if the target does "
301 "not support them. This flag should only be used for testing."));
302
304 "small-loop-cost", cl::init(20), cl::Hidden,
305 cl::desc(
306 "The cost of a loop that is considered 'small' by the interleaver."));
307
309 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
310 cl::desc("Enable the use of the block frequency analysis to access PGO "
311 "heuristics minimizing code growth in cold regions and being more "
312 "aggressive in hot regions."));
313
314// Runtime interleave loops for load/store throughput.
316 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
317 cl::desc(
318 "Enable runtime interleaving until load/store ports are saturated"));
319
320/// The number of stores in a loop that are allowed to need predication.
322 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
323 cl::desc("Max number of stores to be predicated behind an if."));
324
326 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
327 cl::desc("Count the induction variable only once when interleaving"));
328
330 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
331 cl::desc("Enable if predication of stores during vectorization."));
332
334 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
335 cl::desc("The maximum interleave count to use when interleaving a scalar "
336 "reduction in a nested loop."));
337
338static cl::opt<bool>
339 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
341 cl::desc("Prefer in-loop vector reductions, "
342 "overriding the targets preference."));
343
345 "force-ordered-reductions", cl::init(false), cl::Hidden,
346 cl::desc("Enable the vectorisation of loops with in-order (strict) "
347 "FP reductions"));
348
350 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
351 cl::desc(
352 "Prefer predicating a reduction operation over an after loop select."));
353
354namespace llvm {
356 "enable-vplan-native-path", cl::Hidden,
357 cl::desc("Enable VPlan-native vectorization path with "
358 "support for outer loop vectorization."));
359
361 VerifyEachVPlan("vplan-verify-each",
362#ifdef EXPENSIVE_CHECKS
363 cl::init(true),
364#else
365 cl::init(false),
366#endif
368 cl::desc("Verfiy VPlans after VPlan transforms."));
369} // namespace llvm
370
371// This flag enables the stress testing of the VPlan H-CFG construction in the
372// VPlan-native vectorization path. It must be used in conjunction with
373// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
374// verification of the H-CFGs built.
376 "vplan-build-stress-test", cl::init(false), cl::Hidden,
377 cl::desc(
378 "Build VPlan for every supported loop nest in the function and bail "
379 "out right after the build (stress test the VPlan H-CFG construction "
380 "in the VPlan-native vectorization path)."));
381
383 "interleave-loops", cl::init(true), cl::Hidden,
384 cl::desc("Enable loop interleaving in Loop vectorization passes"));
386 "vectorize-loops", cl::init(true), cl::Hidden,
387 cl::desc("Run the Loop vectorization passes"));
388
390 "force-widen-divrem-via-safe-divisor", cl::Hidden,
391 cl::desc(
392 "Override cost based safe divisor widening for div/rem instructions"));
393
395 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
397 cl::desc("Try wider VFs if they enable the use of vector variants"));
398
400 "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
401 cl::desc(
402 "Enable vectorization of early exit loops with uncountable exits."));
403
404// Likelihood of bypassing the vectorized loop because assumptions about SCEV
405// variables not overflowing do not hold. See `emitSCEVChecks`.
406static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
407// Likelihood of bypassing the vectorized loop because pointers overlap. See
408// `emitMemRuntimeChecks`.
409static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
410// Likelihood of bypassing the vectorized loop because there are zero trips left
411// after prolog. See `emitIterationCountCheck`.
412static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
413
414/// A helper function that returns true if the given type is irregular. The
415/// type is irregular if its allocated size doesn't equal the store size of an
416/// element of the corresponding vector type.
417static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
418 // Determine if an array of N elements of type Ty is "bitcast compatible"
419 // with a <N x Ty> vector.
420 // This is only true if there is no padding between the array elements.
421 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
422}
423
424/// Returns "best known" trip count for the specified loop \p L as defined by
425/// the following procedure:
426/// 1) Returns exact trip count if it is known.
427/// 2) Returns expected trip count according to profile data if any.
428/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
429/// 4) Returns std::nullopt if all of the above failed.
430static std::optional<unsigned>
432 bool CanUseConstantMax = true) {
433 // Check if exact trip count is known.
434 if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
435 return ExpectedTC;
436
437 // Check if there is an expected trip count available from profile data.
439 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
440 return *EstimatedTC;
441
442 if (!CanUseConstantMax)
443 return std::nullopt;
444
445 // Check if upper bound estimate is known.
446 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
447 return ExpectedTC;
448
449 return std::nullopt;
450}
451
452namespace {
453// Forward declare GeneratedRTChecks.
454class GeneratedRTChecks;
455
456using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
457} // namespace
458
459namespace llvm {
460
462
463/// InnerLoopVectorizer vectorizes loops which contain only one basic
464/// block to a specified vectorization factor (VF).
465/// This class performs the widening of scalars into vectors, or multiple
466/// scalars. This class also implements the following features:
467/// * It inserts an epilogue loop for handling loops that don't have iteration
468/// counts that are known to be a multiple of the vectorization factor.
469/// * It handles the code generation for reduction variables.
470/// * Scalarization (implementation using scalars) of un-vectorizable
471/// instructions.
472/// InnerLoopVectorizer does not perform any vectorization-legality
473/// checks, and relies on the caller to check for the different legality
474/// aspects. The InnerLoopVectorizer relies on the
475/// LoopVectorizationLegality class to provide information about the induction
476/// and reduction variables that were found to a given vectorization factor.
478public:
481 const TargetLibraryInfo *TLI,
485 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
487 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
488 VPlan &Plan)
489 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
490 AC(AC), ORE(ORE), VF(VecWidth),
492 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
494 VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
495 // Query this against the original loop and save it here because the profile
496 // of the original loop header may change as the transformation happens.
499 }
500
501 virtual ~InnerLoopVectorizer() = default;
502
503 /// Create a new empty loop that will contain vectorized instructions later
504 /// on, while the old loop will be used as the scalar remainder. Control flow
505 /// is generated around the vectorized (and scalar epilogue) loops consisting
506 /// of various checks and bypasses. Return the pre-header block of the new
507 /// loop. In the case of epilogue vectorization, this function is overriden to
508 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
509 /// used to look up SCEV expansions for expressions needed during skeleton
510 /// creation.
511 virtual BasicBlock *
512 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
513
514 /// Fix the vectorized code, taking care of header phi's, and more.
516
517 // Return true if any runtime check is added.
519
520 /// A helper function to scalarize a single Instruction in the innermost loop.
521 /// Generates a sequence of scalar instances for each lane between \p MinLane
522 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
523 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
524 /// Instr's operands.
525 void scalarizeInstruction(const Instruction *Instr,
526 VPReplicateRecipe *RepRecipe, const VPLane &Lane,
527 VPTransformState &State);
528
529 /// Fix the non-induction PHIs in \p Plan.
531
532 /// Returns the original loop trip count.
533 Value *getTripCount() const { return TripCount; }
534
535 /// Used to set the trip count after ILV's construction and after the
536 /// preheader block has been executed. Note that this always holds the trip
537 /// count of the original loop for both main loop and epilogue vectorization.
538 void setTripCount(Value *TC) { TripCount = TC; }
539
540 // Retrieve the additional bypass value associated with an original
541 /// induction header phi.
543 return Induction2AdditionalBypassValue.at(OrigPhi);
544 }
545
546 /// Return the additional bypass block which targets the scalar loop by
547 /// skipping the epilogue loop after completing the main loop.
550 "Trying to access AdditionalBypassBlock but it has not been set");
552 }
553
554protected:
556
557 /// Iteratively sink the scalarized operands of a predicated instruction into
558 /// the block that was created for it.
559 void sinkScalarOperands(Instruction *PredInst);
560
561 /// Returns (and creates if needed) the trip count of the widened loop.
563
564 /// Emit a bypass check to see if the vector trip count is zero, including if
565 /// it overflows.
567
568 /// Emit a bypass check to see if all of the SCEV assumptions we've
569 /// had to make are correct. Returns the block containing the checks or
570 /// nullptr if no checks have been added.
572
573 /// Emit bypass checks to check any memory assumptions we may have made.
574 /// Returns the block containing the checks or nullptr if no checks have been
575 /// added.
577
578 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
579 /// vector loop preheader, middle block and scalar preheader.
581
582 /// Create and record the values for induction variables to resume coming from
583 /// the additional bypass block.
584 void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
585 Value *MainVectorTripCount);
586
587 /// Allow subclasses to override and print debug traces before/after vplan
588 /// execution, when trace information is requested.
589 virtual void printDebugTracesAtStart() {}
590 virtual void printDebugTracesAtEnd() {}
591
592 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
593 /// vector preheader and its predecessor, also connecting the new block to the
594 /// scalar preheader.
595 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
596
597 /// The original loop.
599
600 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
601 /// dynamic knowledge to simplify SCEV expressions and converts them to a
602 /// more usable form.
604
605 /// Loop Info.
607
608 /// Dominator Tree.
610
611 /// Target Library Info.
613
614 /// Target Transform Info.
616
617 /// Assumption Cache.
619
620 /// Interface to emit optimization remarks.
622
623 /// The vectorization SIMD factor to use. Each vector will have this many
624 /// vector elements.
626
628
629 /// The vectorization unroll factor to use. Each scalar is vectorized to this
630 /// many different vector instructions.
631 unsigned UF;
632
633 /// The builder that we use
635
636 // --- Vectorization state ---
637
638 /// The vector-loop preheader.
640
641 /// The scalar-loop preheader.
643
644 /// Middle Block between the vector and the scalar.
646
647 /// A list of all bypass blocks. The first block is the entry of the loop.
649
650 /// Store instructions that were predicated.
652
653 /// Trip count of the original loop.
654 Value *TripCount = nullptr;
655
656 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
658
659 /// The legality analysis.
661
662 /// The profitablity analysis.
664
665 // Record whether runtime checks are added.
666 bool AddedSafetyChecks = false;
667
668 /// BFI and PSI are used to check for profile guided size optimizations.
671
672 // Whether this loop should be optimized for size based on profile guided size
673 // optimizatios.
675
676 /// Structure to hold information about generated runtime checks, responsible
677 /// for cleaning the checks, if vectorization turns out unprofitable.
678 GeneratedRTChecks &RTChecks;
679
680 /// Mapping of induction phis to their additional bypass values. They
681 /// need to be added as operands to phi nodes in the scalar loop preheader
682 /// after the epilogue skeleton has been created.
684
685 /// The additional bypass block which conditionally skips over the epilogue
686 /// loop after executing the main loop. Needed to resume inductions and
687 /// reductions during epilogue vectorization.
689
691
692 /// The vector preheader block of \p Plan, used as target for check blocks
693 /// introduced during skeleton creation.
695};
696
697/// Encapsulate information regarding vectorization of a loop and its epilogue.
698/// This information is meant to be updated and used across two stages of
699/// epilogue vectorization.
702 unsigned MainLoopUF = 0;
704 unsigned EpilogueUF = 0;
709 Value *TripCount = nullptr;
712
714 ElementCount EVF, unsigned EUF,
716 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
718 assert(EUF == 1 &&
719 "A high UF for the epilogue loop is likely not beneficial.");
720 }
721};
722
723/// An extension of the inner loop vectorizer that creates a skeleton for a
724/// vectorized loop that has its epilogue (residual) also vectorized.
725/// The idea is to run the vplan on a given loop twice, firstly to setup the
726/// skeleton and vectorize the main loop, and secondly to complete the skeleton
727/// from the first step and vectorize the epilogue. This is achieved by
728/// deriving two concrete strategy classes from this base class and invoking
729/// them in succession from the loop vectorizer planner.
731public:
739 GeneratedRTChecks &Checks, VPlan &Plan)
741 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
742 CM, BFI, PSI, Checks, Plan),
743 EPI(EPI) {}
744
745 // Override this function to handle the more complex control flow around the
746 // three loops.
747 BasicBlock *
748 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
749 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
750 }
751
752 /// The interface for creating a vectorized skeleton using one of two
753 /// different strategies, each corresponding to one execution of the vplan
754 /// as described above.
755 virtual BasicBlock *
756 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
757
758 /// Holds and updates state information required to vectorize the main loop
759 /// and its epilogue in two separate passes. This setup helps us avoid
760 /// regenerating and recomputing runtime safety checks. It also helps us to
761 /// shorten the iteration-count-check path length for the cases where the
762 /// iteration count of the loop is so small that the main vector loop is
763 /// completely skipped.
765};
766
767/// A specialized derived class of inner loop vectorizer that performs
768/// vectorization of *main* loops in the process of vectorizing loops and their
769/// epilogues.
771public:
779 GeneratedRTChecks &Check, VPlan &Plan)
781 EPI, LVL, CM, BFI, PSI, Check, Plan) {}
782 /// Implements the interface for creating a vectorized skeleton using the
783 /// *main loop* strategy (ie the first pass of vplan execution).
784 BasicBlock *
785 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
786
787protected:
788 /// Emits an iteration count bypass check once for the main loop (when \p
789 /// ForEpilogue is false) and once for the epilogue loop (when \p
790 /// ForEpilogue is true).
791 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
792 void printDebugTracesAtStart() override;
793 void printDebugTracesAtEnd() override;
794};
795
796// A specialized derived class of inner loop vectorizer that performs
797// vectorization of *epilogue* loops in the process of vectorizing loops and
798// their epilogues.
800public:
808 GeneratedRTChecks &Checks, VPlan &Plan)
810 EPI, LVL, CM, BFI, PSI, Checks, Plan) {
812 }
813 /// Implements the interface for creating a vectorized skeleton using the
814 /// *epilogue loop* strategy (ie the second pass of vplan execution).
815 BasicBlock *
816 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
817
818protected:
819 /// Emits an iteration count bypass check after the main vector loop has
820 /// finished to see if there are any iterations left to execute by either
821 /// the vector epilogue or the scalar epilogue.
823 BasicBlock *Bypass,
824 BasicBlock *Insert);
825 void printDebugTracesAtStart() override;
826 void printDebugTracesAtEnd() override;
827};
828} // end namespace llvm
829
830/// Look for a meaningful debug location on the instruction or its operands.
832 if (!I)
833 return DebugLoc();
834
835 DebugLoc Empty;
836 if (I->getDebugLoc() != Empty)
837 return I->getDebugLoc();
838
839 for (Use &Op : I->operands()) {
840 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
841 if (OpInst->getDebugLoc() != Empty)
842 return OpInst->getDebugLoc();
843 }
844
845 return I->getDebugLoc();
846}
847
848/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
849/// is passed, the message relates to that particular instruction.
850#ifndef NDEBUG
851static void debugVectorizationMessage(const StringRef Prefix,
852 const StringRef DebugMsg,
853 Instruction *I) {
854 dbgs() << "LV: " << Prefix << DebugMsg;
855 if (I != nullptr)
856 dbgs() << " " << *I;
857 else
858 dbgs() << '.';
859 dbgs() << '\n';
860}
861#endif
862
863/// Create an analysis remark that explains why vectorization failed
864///
865/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
866/// RemarkName is the identifier for the remark. If \p I is passed it is an
867/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
868/// the location of the remark. If \p DL is passed, use it as debug location for
869/// the remark. \return the remark object that can be streamed to.
871createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
872 Instruction *I, DebugLoc DL = {}) {
873 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
874 // If debug location is attached to the instruction, use it. Otherwise if DL
875 // was not provided, use the loop's.
876 if (I && I->getDebugLoc())
877 DL = I->getDebugLoc();
878 else if (!DL)
879 DL = TheLoop->getStartLoc();
880
881 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
882}
883
884namespace llvm {
885
886/// Return a value for Step multiplied by VF.
888 int64_t Step) {
889 assert(Ty->isIntegerTy() && "Expected an integer step");
890 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
891}
892
893/// Return the runtime value for VF.
895 return B.CreateElementCount(Ty, VF);
896}
897
899 const StringRef OREMsg, const StringRef ORETag,
900 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
901 Instruction *I) {
902 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
903 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
904 ORE->emit(
905 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
906 << "loop not vectorized: " << OREMsg);
907}
908
909/// Reports an informative message: print \p Msg for debugging purposes as well
910/// as an optimization remark. Uses either \p I as location of the remark, or
911/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
912/// remark. If \p DL is passed, use it as debug location for the remark.
913static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
915 Loop *TheLoop, Instruction *I = nullptr,
916 DebugLoc DL = {}) {
918 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
919 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
920 I, DL)
921 << Msg);
922}
923
924/// Report successful vectorization of the loop. In case an outer loop is
925/// vectorized, prepend "outer" to the vectorization remark.
927 VectorizationFactor VF, unsigned IC) {
929 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
930 nullptr));
931 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
932 ORE->emit([&]() {
933 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
934 TheLoop->getHeader())
935 << "vectorized " << LoopType << "loop (vectorization width: "
936 << ore::NV("VectorizationFactor", VF.Width)
937 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
938 });
939}
940
941} // end namespace llvm
942
943namespace llvm {
944
945// Loop vectorization cost-model hints how the scalar epilogue loop should be
946// lowered.
948
949 // The default: allowing scalar epilogues.
951
952 // Vectorization with OptForSize: don't allow epilogues.
954
955 // A special case of vectorisation with OptForSize: loops with a very small
956 // trip count are considered for vectorization under OptForSize, thereby
957 // making sure the cost of their loop body is dominant, free of runtime
958 // guards and scalar iteration overheads.
960
961 // Loop hint predicate indicating an epilogue is undesired.
963
964 // Directive indicating we must either tail fold or not vectorize
967
968using InstructionVFPair = std::pair<Instruction *, ElementCount>;
969
970/// LoopVectorizationCostModel - estimates the expected speedups due to
971/// vectorization.
972/// In many cases vectorization is not profitable. This can happen because of
973/// a number of reasons. In this class we mainly attempt to predict the
974/// expected speedup/slowdowns due to the supported instruction set. We use the
975/// TargetTransformInfo to query the different backends for the cost of
976/// different operations.
979
980public:
990 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
991 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
992 Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
994 initializeVScaleForTuning();
995 }
996
997 /// \return An upper bound for the vectorization factors (both fixed and
998 /// scalable). If the factors are 0, vectorization and interleaving should be
999 /// avoided up front.
1000 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1001
1002 /// \return True if runtime checks are required for vectorization, and false
1003 /// otherwise.
1004 bool runtimeChecksRequired();
1005
1006 /// Setup cost-based decisions for user vectorization factor.
1007 /// \return true if the UserVF is a feasible VF to be chosen.
1011 return expectedCost(UserVF).isValid();
1012 }
1013
1014 /// \return The size (in bits) of the smallest and widest types in the code
1015 /// that needs to be vectorized. We ignore values that remain scalar such as
1016 /// 64 bit loop indices.
1017 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1018
1019 /// \return The desired interleave count.
1020 /// If interleave count has been specified by metadata it will be returned.
1021 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1022 /// are the selected vectorization factor and the cost of the selected VF.
1023 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1024
1025 /// Memory access instruction may be vectorized in more than one way.
1026 /// Form of instruction after vectorization depends on cost.
1027 /// This function takes cost-based decisions for Load/Store instructions
1028 /// and collects them in a map. This decisions map is used for building
1029 /// the lists of loop-uniform and loop-scalar instructions.
1030 /// The calculated cost is saved with widening decision in order to
1031 /// avoid redundant calculations.
1033
1034 /// A call may be vectorized in different ways depending on whether we have
1035 /// vectorized variants available and whether the target supports masking.
1036 /// This function analyzes all calls in the function at the supplied VF,
1037 /// makes a decision based on the costs of available options, and stores that
1038 /// decision in a map for use in planning and plan execution.
1040
1041 /// A struct that represents some properties of the register usage
1042 /// of a loop.
1044 /// Holds the number of loop invariant values that are used in the loop.
1045 /// The key is ClassID of target-provided register class.
1047 /// Holds the maximum number of concurrent live intervals in the loop.
1048 /// The key is ClassID of target-provided register class.
1050 };
1051
1052 /// \return Returns information about the register usages of the loop for the
1053 /// given vectorization factors.
1056
1057 /// Collect values we want to ignore in the cost model.
1058 void collectValuesToIgnore();
1059
1060 /// Collect all element types in the loop for which widening is needed.
1062
1063 /// Split reductions into those that happen in the loop, and those that happen
1064 /// outside. In loop reductions are collected into InLoopReductions.
1066
1067 /// Returns true if we should use strict in-order reductions for the given
1068 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1069 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1070 /// of FP operations.
1071 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1072 return !Hints->allowReordering() && RdxDesc.isOrdered();
1073 }
1074
1075 /// \returns The smallest bitwidth each instruction can be represented with.
1076 /// The vector equivalents of these instructions should be truncated to this
1077 /// type.
1079 return MinBWs;
1080 }
1081
1082 /// \returns True if it is more profitable to scalarize instruction \p I for
1083 /// vectorization factor \p VF.
1085 assert(VF.isVector() &&
1086 "Profitable to scalarize relevant only for VF > 1.");
1087 assert(
1088 TheLoop->isInnermost() &&
1089 "cost-model should not be used for outer loops (in VPlan-native path)");
1090
1091 auto Scalars = InstsToScalarize.find(VF);
1092 assert(Scalars != InstsToScalarize.end() &&
1093 "VF not yet analyzed for scalarization profitability");
1094 return Scalars->second.contains(I);
1095 }
1096
1097 /// Returns true if \p I is known to be uniform after vectorization.
1099 assert(
1100 TheLoop->isInnermost() &&
1101 "cost-model should not be used for outer loops (in VPlan-native path)");
1102 // Pseudo probe needs to be duplicated for each unrolled iteration and
1103 // vector lane so that profiled loop trip count can be accurately
1104 // accumulated instead of being under counted.
1105 if (isa<PseudoProbeInst>(I))
1106 return false;
1107
1108 if (VF.isScalar())
1109 return true;
1110
1111 auto UniformsPerVF = Uniforms.find(VF);
1112 assert(UniformsPerVF != Uniforms.end() &&
1113 "VF not yet analyzed for uniformity");
1114 return UniformsPerVF->second.count(I);
1115 }
1116
1117 /// Returns true if \p I is known to be scalar after vectorization.
1119 assert(
1120 TheLoop->isInnermost() &&
1121 "cost-model should not be used for outer loops (in VPlan-native path)");
1122 if (VF.isScalar())
1123 return true;
1124
1125 auto ScalarsPerVF = Scalars.find(VF);
1126 assert(ScalarsPerVF != Scalars.end() &&
1127 "Scalar values are not calculated for VF");
1128 return ScalarsPerVF->second.count(I);
1129 }
1130
1131 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1132 /// for vectorization factor \p VF.
1134 return VF.isVector() && MinBWs.contains(I) &&
1135 !isProfitableToScalarize(I, VF) &&
1137 }
1138
1139 /// Decision that was taken during cost calculation for memory instruction.
1142 CM_Widen, // For consecutive accesses with stride +1.
1143 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1150
1151 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1152 /// instruction \p I and vector width \p VF.
1155 assert(VF.isVector() && "Expected VF >=2");
1156 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1157 }
1158
1159 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1160 /// interleaving group \p Grp and vector width \p VF.
1164 assert(VF.isVector() && "Expected VF >=2");
1165 /// Broadcast this decision to all instructions inside the group.
1166 /// When interleaving, the cost will only be assigned one instruction, the
1167 /// insert position. For other cases, add the appropriate fraction of the
1168 /// total cost to each instruction. This ensures accurate costs are used,
1169 /// even if the insert position instruction is not used.
1170 InstructionCost InsertPosCost = Cost;
1171 InstructionCost OtherMemberCost = 0;
1172 if (W != CM_Interleave)
1173 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1174 ;
1175 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1176 if (auto *I = Grp->getMember(Idx)) {
1177 if (Grp->getInsertPos() == I)
1178 WideningDecisions[std::make_pair(I, VF)] =
1179 std::make_pair(W, InsertPosCost);
1180 else
1181 WideningDecisions[std::make_pair(I, VF)] =
1182 std::make_pair(W, OtherMemberCost);
1183 }
1184 }
1185 }
1186
1187 /// Return the cost model decision for the given instruction \p I and vector
1188 /// width \p VF. Return CM_Unknown if this instruction did not pass
1189 /// through the cost modeling.
1191 assert(VF.isVector() && "Expected VF to be a vector VF");
1192 assert(
1193 TheLoop->isInnermost() &&
1194 "cost-model should not be used for outer loops (in VPlan-native path)");
1195
1196 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1197 auto Itr = WideningDecisions.find(InstOnVF);
1198 if (Itr == WideningDecisions.end())
1199 return CM_Unknown;
1200 return Itr->second.first;
1201 }
1202
1203 /// Return the vectorization cost for the given instruction \p I and vector
1204 /// width \p VF.
1206 assert(VF.isVector() && "Expected VF >=2");
1207 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1208 assert(WideningDecisions.contains(InstOnVF) &&
1209 "The cost is not calculated");
1210 return WideningDecisions[InstOnVF].second;
1211 }
1212
1217 std::optional<unsigned> MaskPos;
1219 };
1220
1222 Function *Variant, Intrinsic::ID IID,
1223 std::optional<unsigned> MaskPos,
1225 assert(!VF.isScalar() && "Expected vector VF");
1226 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1227 MaskPos, Cost};
1228 }
1229
1231 ElementCount VF) const {
1232 assert(!VF.isScalar() && "Expected vector VF");
1233 return CallWideningDecisions.at(std::make_pair(CI, VF));
1234 }
1235
1236 /// Return True if instruction \p I is an optimizable truncate whose operand
1237 /// is an induction variable. Such a truncate will be removed by adding a new
1238 /// induction variable with the destination type.
1240 // If the instruction is not a truncate, return false.
1241 auto *Trunc = dyn_cast<TruncInst>(I);
1242 if (!Trunc)
1243 return false;
1244
1245 // Get the source and destination types of the truncate.
1246 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1247 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1248
1249 // If the truncate is free for the given types, return false. Replacing a
1250 // free truncate with an induction variable would add an induction variable
1251 // update instruction to each iteration of the loop. We exclude from this
1252 // check the primary induction variable since it will need an update
1253 // instruction regardless.
1254 Value *Op = Trunc->getOperand(0);
1255 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1256 return false;
1257
1258 // If the truncated value is not an induction variable, return false.
1259 return Legal->isInductionPhi(Op);
1260 }
1261
1262 /// Collects the instructions to scalarize for each predicated instruction in
1263 /// the loop.
1265
1266 /// Collect Uniform and Scalar values for the given \p VF.
1267 /// The sets depend on CM decision for Load/Store instructions
1268 /// that may be vectorized as interleave, gather-scatter or scalarized.
1269 /// Also make a decision on what to do about call instructions in the loop
1270 /// at that VF -- scalarize, call a known vector routine, or call a
1271 /// vector intrinsic.
1273 // Do the analysis once.
1274 if (VF.isScalar() || Uniforms.contains(VF))
1275 return;
1277 collectLoopUniforms(VF);
1279 collectLoopScalars(VF);
1280 }
1281
1282 /// Returns true if the target machine supports masked store operation
1283 /// for the given \p DataType and kind of access to \p Ptr.
1284 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1285 return Legal->isConsecutivePtr(DataType, Ptr) &&
1286 TTI.isLegalMaskedStore(DataType, Alignment);
1287 }
1288
1289 /// Returns true if the target machine supports masked load operation
1290 /// for the given \p DataType and kind of access to \p Ptr.
1291 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1292 return Legal->isConsecutivePtr(DataType, Ptr) &&
1293 TTI.isLegalMaskedLoad(DataType, Alignment);
1294 }
1295
1296 /// Returns true if the target machine can represent \p V as a masked gather
1297 /// or scatter operation.
1299 bool LI = isa<LoadInst>(V);
1300 bool SI = isa<StoreInst>(V);
1301 if (!LI && !SI)
1302 return false;
1303 auto *Ty = getLoadStoreType(V);
1305 if (VF.isVector())
1306 Ty = VectorType::get(Ty, VF);
1307 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1308 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1309 }
1310
1311 /// Returns true if the target machine supports all of the reduction
1312 /// variables found for the given VF.
1314 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1315 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1316 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1317 }));
1318 }
1319
1320 /// Given costs for both strategies, return true if the scalar predication
1321 /// lowering should be used for div/rem. This incorporates an override
1322 /// option so it is not simply a cost comparison.
1324 InstructionCost SafeDivisorCost) const {
1325 switch (ForceSafeDivisor) {
1326 case cl::BOU_UNSET:
1327 return ScalarCost < SafeDivisorCost;
1328 case cl::BOU_TRUE:
1329 return false;
1330 case cl::BOU_FALSE:
1331 return true;
1332 }
1333 llvm_unreachable("impossible case value");
1334 }
1335
1336 /// Returns true if \p I is an instruction which requires predication and
1337 /// for which our chosen predication strategy is scalarization (i.e. we
1338 /// don't have an alternate strategy such as masking available).
1339 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1341
1342 /// Returns true if \p I is an instruction that needs to be predicated
1343 /// at runtime. The result is independent of the predication mechanism.
1344 /// Superset of instructions that return true for isScalarWithPredication.
1345 bool isPredicatedInst(Instruction *I) const;
1346
1347 /// Return the costs for our two available strategies for lowering a
1348 /// div/rem operation which requires speculating at least one lane.
1349 /// First result is for scalarization (will be invalid for scalable
1350 /// vectors); second is for the safe-divisor strategy.
1351 std::pair<InstructionCost, InstructionCost>
1353 ElementCount VF) const;
1354
1355 /// Returns true if \p I is a memory instruction with consecutive memory
1356 /// access that can be widened.
1358
1359 /// Returns true if \p I is a memory instruction in an interleaved-group
1360 /// of memory accesses that can be vectorized with wide vector loads/stores
1361 /// and shuffles.
1363
1364 /// Check if \p Instr belongs to any interleaved access group.
1366 return InterleaveInfo.isInterleaved(Instr);
1367 }
1368
1369 /// Get the interleaved access group that \p Instr belongs to.
1372 return InterleaveInfo.getInterleaveGroup(Instr);
1373 }
1374
1375 /// Returns true if we're required to use a scalar epilogue for at least
1376 /// the final iteration of the original loop.
1377 bool requiresScalarEpilogue(bool IsVectorizing) const {
1378 if (!isScalarEpilogueAllowed()) {
1379 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1380 return false;
1381 }
1382 // If we might exit from anywhere but the latch and early exit vectorization
1383 // is disabled, we must run the exiting iteration in scalar form.
1386 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1387 "from latch block\n");
1388 return true;
1389 }
1390 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1391 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1392 "interleaved group requires scalar epilogue\n");
1393 return true;
1394 }
1395 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1396 return false;
1397 }
1398
1399 /// Returns true if we're required to use a scalar epilogue for at least
1400 /// the final iteration of the original loop for all VFs in \p Range.
1401 /// A scalar epilogue must either be required for all VFs in \p Range or for
1402 /// none.
1404 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1405 return requiresScalarEpilogue(VF.isVector());
1406 };
1407 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1408 assert(
1409 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1410 "all VFs in range must agree on whether a scalar epilogue is required");
1411 return IsRequired;
1412 }
1413
1414 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1415 /// loop hint annotation.
1417 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1418 }
1419
1420 /// Returns the TailFoldingStyle that is best for the current loop.
1421 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1422 if (!ChosenTailFoldingStyle)
1424 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1425 : ChosenTailFoldingStyle->second;
1426 }
1427
1428 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1429 /// overflow or not.
1430 /// \param IsScalableVF true if scalable vector factors enabled.
1431 /// \param UserIC User specific interleave count.
1432 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1433 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1434 if (!Legal->canFoldTailByMasking()) {
1435 ChosenTailFoldingStyle =
1437 return;
1438 }
1439
1440 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1441 ChosenTailFoldingStyle = std::make_pair(
1442 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1443 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1444 return;
1445 }
1446
1447 // Set styles when forced.
1448 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1449 ForceTailFoldingStyle.getValue());
1451 return;
1452 // Override forced styles if needed.
1453 // FIXME: use actual opcode/data type for analysis here.
1454 // FIXME: Investigate opportunity for fixed vector factor.
1455 // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
1456 // penultimate EVL.
1457 bool EVLIsLegal =
1458 UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1460 if (!EVLIsLegal) {
1461 // If for some reason EVL mode is unsupported, fallback to
1462 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1463 // in a generic way.
1464 ChosenTailFoldingStyle =
1467 LLVM_DEBUG(
1468 dbgs()
1469 << "LV: Preference for VP intrinsics indicated. Will "
1470 "not try to generate VP Intrinsics "
1471 << (UserIC > 1
1472 ? "since interleave count specified is greater than 1.\n"
1473 : "due to non-interleaving reasons.\n"));
1474 }
1475 }
1476
1477 /// Returns true if all loop blocks should be masked to fold tail loop.
1478 bool foldTailByMasking() const {
1479 // TODO: check if it is possible to check for None style independent of
1480 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1482 }
1483
1484 /// Return maximum safe number of elements to be processed per vector
1485 /// iteration, which do not prevent store-load forwarding and are safe with
1486 /// regard to the memory dependencies. Required for EVL-based VPlans to
1487 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1488 /// MaxSafeElements).
1489 /// TODO: need to consider adjusting cost model to use this value as a
1490 /// vectorization factor for EVL-based vectorization.
1491 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1492
1493 /// Returns true if the instructions in this block requires predication
1494 /// for any reason, e.g. because tail folding now requires a predicate
1495 /// or because the block in the original loop was predicated.
1498 }
1499
1500 /// Returns true if VP intrinsics with explicit vector length support should
1501 /// be generated in the tail folded loop.
1502 bool foldTailWithEVL() const {
1504 }
1505
1506 /// Returns true if the Phi is part of an inloop reduction.
1507 bool isInLoopReduction(PHINode *Phi) const {
1508 return InLoopReductions.contains(Phi);
1509 }
1510
1511 /// Returns true if the predicated reduction select should be used to set the
1512 /// incoming value for the reduction phi.
1513 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1514 // Force to use predicated reduction select since the EVL of the
1515 // second-to-last iteration might not be VF*UF.
1516 if (foldTailWithEVL())
1517 return true;
1520 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1521 }
1522
1523 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1524 /// with factor VF. Return the cost of the instruction, including
1525 /// scalarization overhead if it's needed.
1527
1528 /// Estimate cost of a call instruction CI if it were vectorized with factor
1529 /// VF. Return the cost of the instruction, including scalarization overhead
1530 /// if it's needed.
1532
1533 /// Invalidates decisions already taken by the cost model.
1535 WideningDecisions.clear();
1536 CallWideningDecisions.clear();
1537 Uniforms.clear();
1538 Scalars.clear();
1539 }
1540
1541 /// Returns the expected execution cost. The unit of the cost does
1542 /// not matter because we use the 'cost' units to compare different
1543 /// vector widths. The cost that is returned is *not* normalized by
1544 /// the factor width.
1546
1547 bool hasPredStores() const { return NumPredStores > 0; }
1548
1549 /// Returns true if epilogue vectorization is considered profitable, and
1550 /// false otherwise.
1551 /// \p VF is the vectorization factor chosen for the original loop.
1552 /// \p Multiplier is an additional scaling factor applied to VF before
1553 /// comparing to EpilogueVectorizationMinVF.
1555 const unsigned IC) const;
1556
1557 /// Returns the execution time cost of an instruction for a given vector
1558 /// width. Vector width of one means scalar.
1560
1561 /// Return the cost of instructions in an inloop reduction pattern, if I is
1562 /// part of that pattern.
1563 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1564 ElementCount VF,
1565 Type *VectorTy) const;
1566
1567 /// Returns true if \p Op should be considered invariant and if it is
1568 /// trivially hoistable.
1570
1571 /// Return the value of vscale used for tuning the cost model.
1572 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1573
1574private:
1575 unsigned NumPredStores = 0;
1576
1577 /// Used to store the value of vscale used for tuning the cost model. It is
1578 /// initialized during object construction.
1579 std::optional<unsigned> VScaleForTuning;
1580
1581 /// Initializes the value of vscale used for tuning the cost model. If
1582 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1583 /// return the value returned by the corresponding TTI method.
1584 void initializeVScaleForTuning() {
1585 const Function *Fn = TheLoop->getHeader()->getParent();
1586 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1587 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1588 auto Min = Attr.getVScaleRangeMin();
1589 auto Max = Attr.getVScaleRangeMax();
1590 if (Max && Min == Max) {
1591 VScaleForTuning = Max;
1592 return;
1593 }
1594 }
1595
1596 VScaleForTuning = TTI.getVScaleForTuning();
1597 }
1598
1599 /// \return An upper bound for the vectorization factors for both
1600 /// fixed and scalable vectorization, where the minimum-known number of
1601 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1602 /// disabled or unsupported, then the scalable part will be equal to
1603 /// ElementCount::getScalable(0).
1604 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1605 ElementCount UserVF,
1606 bool FoldTailByMasking);
1607
1608 /// \return the maximized element count based on the targets vector
1609 /// registers and the loop trip-count, but limited to a maximum safe VF.
1610 /// This is a helper function of computeFeasibleMaxVF.
1611 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1612 unsigned SmallestType,
1613 unsigned WidestType,
1614 ElementCount MaxSafeVF,
1615 bool FoldTailByMasking);
1616
1617 /// Checks if scalable vectorization is supported and enabled. Caches the
1618 /// result to avoid repeated debug dumps for repeated queries.
1619 bool isScalableVectorizationAllowed();
1620
1621 /// \return the maximum legal scalable VF, based on the safe max number
1622 /// of elements.
1623 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1624
1625 /// Calculate vectorization cost of memory instruction \p I.
1626 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1627
1628 /// The cost computation for scalarized memory instruction.
1629 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1630
1631 /// The cost computation for interleaving group of memory instructions.
1632 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1633
1634 /// The cost computation for Gather/Scatter instruction.
1635 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1636
1637 /// The cost computation for widening instruction \p I with consecutive
1638 /// memory access.
1639 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1640
1641 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1642 /// Load: scalar load + broadcast.
1643 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1644 /// element)
1645 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1646
1647 /// Estimate the overhead of scalarizing an instruction. This is a
1648 /// convenience wrapper for the type-based getScalarizationOverhead API.
1649 InstructionCost getScalarizationOverhead(Instruction *I,
1650 ElementCount VF) const;
1651
1652 /// Returns true if an artificially high cost for emulated masked memrefs
1653 /// should be used.
1654 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1655
1656 /// Map of scalar integer values to the smallest bitwidth they can be legally
1657 /// represented as. The vector equivalents of these values should be truncated
1658 /// to this type.
1660
1661 /// A type representing the costs for instructions if they were to be
1662 /// scalarized rather than vectorized. The entries are Instruction-Cost
1663 /// pairs.
1664 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1665
1666 /// A set containing all BasicBlocks that are known to present after
1667 /// vectorization as a predicated block.
1669 PredicatedBBsAfterVectorization;
1670
1671 /// Records whether it is allowed to have the original scalar loop execute at
1672 /// least once. This may be needed as a fallback loop in case runtime
1673 /// aliasing/dependence checks fail, or to handle the tail/remainder
1674 /// iterations when the trip count is unknown or doesn't divide by the VF,
1675 /// or as a peel-loop to handle gaps in interleave-groups.
1676 /// Under optsize and when the trip count is very small we don't allow any
1677 /// iterations to execute in the scalar loop.
1678 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1679
1680 /// Control finally chosen tail folding style. The first element is used if
1681 /// the IV update may overflow, the second element - if it does not.
1682 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1683 ChosenTailFoldingStyle;
1684
1685 /// true if scalable vectorization is supported and enabled.
1686 std::optional<bool> IsScalableVectorizationAllowed;
1687
1688 /// Maximum safe number of elements to be processed per vector iteration,
1689 /// which do not prevent store-load forwarding and are safe with regard to the
1690 /// memory dependencies. Required for EVL-based vectorization, where this
1691 /// value is used as the upper bound of the safe AVL.
1692 std::optional<unsigned> MaxSafeElements;
1693
1694 /// A map holding scalar costs for different vectorization factors. The
1695 /// presence of a cost for an instruction in the mapping indicates that the
1696 /// instruction will be scalarized when vectorizing with the associated
1697 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1699
1700 /// Holds the instructions known to be uniform after vectorization.
1701 /// The data is collected per VF.
1703
1704 /// Holds the instructions known to be scalar after vectorization.
1705 /// The data is collected per VF.
1707
1708 /// Holds the instructions (address computations) that are forced to be
1709 /// scalarized.
1711
1712 /// PHINodes of the reductions that should be expanded in-loop.
1713 SmallPtrSet<PHINode *, 4> InLoopReductions;
1714
1715 /// A Map of inloop reduction operations and their immediate chain operand.
1716 /// FIXME: This can be removed once reductions can be costed correctly in
1717 /// VPlan. This was added to allow quick lookup of the inloop operations.
1718 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1719
1720 /// Returns the expected difference in cost from scalarizing the expression
1721 /// feeding a predicated instruction \p PredInst. The instructions to
1722 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1723 /// non-negative return value implies the expression will be scalarized.
1724 /// Currently, only single-use chains are considered for scalarization.
1725 InstructionCost computePredInstDiscount(Instruction *PredInst,
1726 ScalarCostsTy &ScalarCosts,
1727 ElementCount VF);
1728
1729 /// Collect the instructions that are uniform after vectorization. An
1730 /// instruction is uniform if we represent it with a single scalar value in
1731 /// the vectorized loop corresponding to each vector iteration. Examples of
1732 /// uniform instructions include pointer operands of consecutive or
1733 /// interleaved memory accesses. Note that although uniformity implies an
1734 /// instruction will be scalar, the reverse is not true. In general, a
1735 /// scalarized instruction will be represented by VF scalar values in the
1736 /// vectorized loop, each corresponding to an iteration of the original
1737 /// scalar loop.
1738 void collectLoopUniforms(ElementCount VF);
1739
1740 /// Collect the instructions that are scalar after vectorization. An
1741 /// instruction is scalar if it is known to be uniform or will be scalarized
1742 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1743 /// to the list if they are used by a load/store instruction that is marked as
1744 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1745 /// VF values in the vectorized loop, each corresponding to an iteration of
1746 /// the original scalar loop.
1747 void collectLoopScalars(ElementCount VF);
1748
1749 /// Keeps cost model vectorization decision and cost for instructions.
1750 /// Right now it is used for memory instructions only.
1752 std::pair<InstWidening, InstructionCost>>;
1753
1754 DecisionList WideningDecisions;
1755
1756 using CallDecisionList =
1757 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1758
1759 CallDecisionList CallWideningDecisions;
1760
1761 /// Returns true if \p V is expected to be vectorized and it needs to be
1762 /// extracted.
1763 bool needsExtract(Value *V, ElementCount VF) const {
1764 Instruction *I = dyn_cast<Instruction>(V);
1765 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1768 return false;
1769
1770 // Assume we can vectorize V (and hence we need extraction) if the
1771 // scalars are not computed yet. This can happen, because it is called
1772 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1773 // the scalars are collected. That should be a safe assumption in most
1774 // cases, because we check if the operands have vectorizable types
1775 // beforehand in LoopVectorizationLegality.
1776 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1777 };
1778
1779 /// Returns a range containing only operands needing to be extracted.
1780 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1781 ElementCount VF) const {
1783 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1784 }
1785
public:
  // NOTE(review): the member declarations documented below are elided in this
  // view; only their doc comments survive. Each comment describes one member
  // of the cost model (analyses and bookkeeping shared across queries).

  /// The loop that we evaluate.

  /// Predicated scalar evolution analysis.

  /// Loop Info analysis.

  /// Vectorization legality.

  /// Vector target information.

  /// Target Library Info.

  /// Demanded bits analysis.

  /// Assumption cache.

  /// Interface to emit optimization remarks.


  /// Loop Vectorize Hint.

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.

  /// Values to ignore in the cost model.

  /// Values to ignore in the cost model when VF > 1.

  /// All element types found in the loop.

  /// The kind of cost that we are calculating
};
1835} // end namespace llvm
1836
1837namespace {
1838/// Helper struct to manage generating runtime checks for vectorization.
1839///
1840/// The runtime checks are created up-front in temporary blocks to allow better
1841/// estimating the cost and un-linked from the existing IR. After deciding to
1842/// vectorize, the checks are moved back. If deciding not to vectorize, the
1843/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  // Analyses used to split, re-link and cost the check blocks.
  DominatorTree *DT;
  LoopInfo *LI;

  // Separate expanders so SCEV- and memory-check expansions can be cleaned up
  // independently in the destructor.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  // Set by create() when the number of required checks exceeds the hard
  // cutoff; getCost() then reports an invalid cost.
  bool CostTooHigh = false;
  const bool AddBranchWeights;

  // Parent loop of the vectorized loop, if any; used to discount hoistable
  // memory checks in getCost().
  Loop *OuterLoop = nullptr;

  /// The kind of cost that we are calculating
  TTI::TargetCostKind CostKind;

public:
  // NOTE(review): one parameter line of this constructor (and the member
  // declarations for LI/TTI/PSE it initializes) are elided in this view.
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    const DataLayout &DL, bool AddBranchWeights,
                    TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
        MemCheckExp(*PSE.getSE(), DL, "scev.check"),
        AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {

    // Hard cutoff to limit compile-time increase in case a very large number of
    // runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    // NOTE(review): the right-hand side of this assignment is elided in this
    // view.
    CostTooHigh =
    if (CostTooHigh)
      return;

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
            // Lazily materialize the runtime VF once and reuse it for every
            // diff check at the same bit width.
            [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        // NOTE(review): the trailing argument line of this call is elided in
        // this view.
        MemRuntimeCheckCond = addRuntimeChecks(
            MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          Preheader->getTerminator()->getIterator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          Preheader->getTerminator()->getIterator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  /// Sum the cost of all instructions in the generated check blocks
  /// (terminators excluded), discounting memory checks that are invariant in
  /// and hence hoistable out of an enclosing outer loop.
  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    // NOTE(review): the declaration of the local `Cost` returned below is
    // elided in this view.
    if (CostTooHigh) {
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        // The block terminator is control flow glue, not part of the check.
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(&I, CostKind);
        LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(&I, CostKind);
        LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
        if (SE->isLoopInvariant(Cond, OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, OuterLoop, /* CanUseConstantMax = */ false))
            BestTripCount = *EstimatedTC;

          BestTripCount = std::max(BestTripCount, 1U);
          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          // NOTE(review): the second argument of this std::max call is elided
          // in this view.
          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),

          if (BestTripCount > 1)
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A null cond means the checks were already emitted (or never created), so
    // the expanded instructions must be kept.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader) {
    if (!SCEVCheckCond)
      return nullptr;

    Value *Cond = SCEVCheckCond;
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    // A constant-false condition means the check trivially passes; keep the
    // existing CFG and emit nothing.
    if (auto *C = dyn_cast<ConstantInt>(Cond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    // Failing the check branches to Bypass (the scalar path).
    BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
    if (AddBranchWeights)
      setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
    ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                               MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);

    BranchInst &BI =
        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
    if (AddBranchWeights) {
      setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
    }
    ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};
2163} // namespace
2164
// NOTE(review): the signature line of this predicate is elided in this view;
// it reports whether the tail-folding style masks data accesses.
  return Style == TailFoldingStyle::Data ||
         Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

// NOTE(review): signature elided; true only for the styles that additionally
// fold the control flow, with or without a runtime check.
  return Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
2175
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
// NOTE(review): the second parameter line (presumably the remark emitter used
// as *ORE below) is elided in this view.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  // NOTE(review): the condition guarding this early return is elided in this
  // view.
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
2217
  // NOTE(review): the signature lines of collectSupportedLoops are elided in
  // this view; the body below recurses over the loop nest, appending
  // vectorization candidates to V.
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  // NOTE(review): the final operand of this condition is elided in this view.
  if (L.isInnermost() || VPlanBuildStressTest ||
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  // Otherwise descend into the children of this loop.
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}
2242
2243//===----------------------------------------------------------------------===//
2244// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2245// LoopVectorizationCostModel and LoopVectorizationPlanner.
2246//===----------------------------------------------------------------------===//
2247
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
// NOTE(review): the first signature line (name plus the builder/index/start
// parameters) and the induction-kind parameter line are elided in this view.
static Value *
                         Value *Step,
                         const BinaryOperator *InductionBinOp) {
  // Coerce Index to the step's type: sext/trunc for integer steps, si-to-fp
  // cast for floating-point steps.
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(Index, StepTy)
                           : B.CreateCast(Instruction::SIToFP, Index, StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    // Fold the identity cases (0 + Y, X + 0) by hand.
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    // Fold the identity cases (1 * Y, X * 1) by hand.
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  // NOTE(review): the case labels of this switch are elided in this view;
  // from the asserts, the first arm handles integer inductions, the single
  // CreatePtrAdd statement handles pointer inductions, and the FMul arm
  // handles floating-point inductions.
  switch (InductionKind) {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step of -1 folds to a plain subtraction.
    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
    return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(Step, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2333
2334std::optional<unsigned> getMaxVScale(const Function &F,
2335 const TargetTransformInfo &TTI) {
2336 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2337 return MaxVScale;
2338
2339 if (F.hasFnAttribute(Attribute::VScaleRange))
2340 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2341
2342 return std::nullopt;
2343}
2344
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
// NOTE(review): the signature lines of this helper (including the cost-model
// parameter used as `Cost` below) are elided in this view.
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  Type *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // For scalable VFs the worst case is the minimum VF scaled by the
      // largest possible vscale; without a vscale bound we must assume
      // overflow is possible.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(*Cost->TheFunction, Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
  }

  // Unknown trip count: conservatively assume the check is needed.
  return false;
}
2376
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
// NOTE(review): the signature and both return statements of this predicate
// are elided in this view; only the override-option comment survives.
  // If an override option has been passed in for interleaved accesses, use it.

}
2387
// NOTE(review): the first signature line of this member function (name and
// the `Instr` parameter used throughout) is elided in this view. The body
// clones one scalar instance of Instr for the given lane of a replicate
// recipe.
                                               VPReplicateRecipe *RepRecipe,
                                               const VPLane &Lane,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
#if !defined(NDEBUG)
    // Verify that VPlan type inference results agree with the type of the
    // generated values.
    assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
           "inferred type and type from generated instructions do not match");
#endif
  }

  // Transfer the recipe's IR flags (e.g. wrapping/fast-math) onto the clone.
  RepRecipe->setFlags(Cloned);

  if (auto DL = Instr->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  // NOTE(review): the condition under which InputLane is reset to the first
  // lane is elided in this view.
  for (const auto &I : enumerate(RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(I.index(), State.get(Operand, InputLane));
  }
  State.addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Lane);

  // If we just cloned a new assumption, add it the assumption cache.
  // NOTE(review): the statement registering the assumption is elided in this
  // view.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))

  // End if-block.
  VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
  bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
  assert(
      (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region.");
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
2445
// NOTE(review): the qualified name line of this member function (taking the
// insert block used below) is elided in this view. It lazily computes and
// caches the trip count of the vector loop.
Value *
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getTripCount();
  IRBuilder<> Builder(InsertBlock->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, Ty, VF, UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  // NOTE(review): the assert preceding this string continuation is elided in
  // this view.
  if (Cost->foldTailByMasking()) {
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
                           "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
2495
  // NOTE(review): the signature line (taking CheckIRBB) and the declaration
  // of PreVectorPH/VectorPHVPB are elided in this view. The body wires a
  // runtime-check IR block into the VPlan between the preheader and the
  // scalar preheader.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  if (PreVectorPH->getNumSuccessors() != 1) {
    assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
    assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
           "Unexpected successor");
    VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
    VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
    PreVectorPH = CheckVPIRBB;
  }
  VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
  PreVectorPH->swapSuccessors();
}
2510
  // NOTE(review): the signature line of this member function (taking the
  // Bypass block used below) is elided in this view. It emits the minimum
  // iteration-count check guarding entry to the vector loop.
  Value *Count = getTripCount();
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  // NOTE(review): the alternative predicate of this ternary is elided in this
  // view.
  auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE

  // If tail is to be folded, vector loop takes care of all iterations.
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    // NOTE(review): the guard before this early return and the initializer of
    // MinProfTC are elided in this view.
      return createStepForVF(Builder, CountTy, VF, UF);

    Value *MinProfTC =
    if (!VF.isScalable())
      return MinProfTC;
        Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
      // NOTE(review): the `else if` condition preceding this continuation is
      // elided in this view.
                 TripCountSCEV, SE.getSCEV(Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  } else if (VF.isScalable() &&
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
    Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
  }

  // Create new preheader for vector loop.
  // NOTE(review): the assignment target of this SplitBlock call is elided in
  // this view.
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  BranchInst &BI =
      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
  LoopBypassBlocks.push_back(TCCheckBlock);

  // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
  introduceCheckBlockInVPlan(TCCheckBlock);
}
2595
  // NOTE(review): the signature line of this member function (taking the
  // Bypass block) is elided in this view. It materializes the SCEV runtime
  // checks via RTChecks and records the resulting bypass block.
  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
  if (!SCEVCheckBlock)
    return nullptr;

  // NOTE(review): one operand line of this assert's condition is elided in
  // this view.
  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");
  assert(!LoopBypassBlocks.empty() &&
         "Should already be a bypass block due to iteration count check");
  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;

  introduceCheckBlockInVPlan(SCEVCheckBlock);
  return SCEVCheckBlock;
}
2614
  // NOTE(review): the signature line of this member function (taking the
  // Bypass block) is elided in this view. It materializes the memory runtime
  // checks via RTChecks and records the resulting bypass block.
  // VPlan-native path does not do any analysis for runtime checks currently.
  // NOTE(review): the condition guarding this early return is elided in this
  // view.
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    // Warn the user that forced vectorization required runtime checks despite
    // the size optimization request.
    // NOTE(review): two argument lines of the remark construction are elided
    // in this view.
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  introduceCheckBlockInVPlan(MemCheckBlock);
  return MemCheckBlock;
}
2651
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
// NOTE(review): the signature line of this static helper is elided in this
// view.
  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
  // Move every non-phi recipe into the new wrapper block, preserving order.
  for (auto &R : make_early_inc_range(*VPBB)) {
    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
    R.moveBefore(*IRVPBB, IRVPBB->end());
  }

  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
2666
// NOTE(review): doxygen extraction — this function (presumably
// InnerLoopVectorizer::createVectorLoopSkeleton) is heavily elided: the
// signature (orig 2667-2668) and the SplitBlock call lines (orig 2670,
// 2674-2675, 2677-2679, 2681) are missing. What remains shows it asserting
// loop structure and creating "middle.block" / "scalar.ph" blocks named with
// the given Prefix.
2669 assert(LoopVectorPreHeader && "Invalid loop structure");
2671 Cost->requiresScalarEpilogue(VF.isVector())) &&
2672 "loops not exiting via the latch without required epilogue?");
2673
2676 LI, nullptr, Twine(Prefix) + "middle.block");
2680 nullptr, Twine(Prefix) + "scalar.ph");
2682}
2683
2684/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2685/// expansion results.
// NOTE(review): doxygen extraction — the first line of the signature (orig
// line 2686) is elided; only the trailing parameter line is visible.
2687 const SCEV2ValueTy &ExpandedSCEVs) {
2688 const SCEV *Step = ID.getStep();
// Constant and unknown steps already carry a Value; no expansion needed.
2689 if (auto *C = dyn_cast<SCEVConstant>(Step))
2690 return C->getValue();
2691 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2692 return U->getValue();
// Everything else must have been expanded earlier and recorded in the map.
2693 auto I = ExpandedSCEVs.find(Step);
2694 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2695 return I->second;
2696}
2697
2698/// Knowing that loop \p L executes a single vector iteration, add instructions
2699/// that will get simplified and thus should not have any cost to \p
2700/// InstsToIgnore.
// NOTE(review): doxygen extraction — the signature (orig 2701-2702) is
// elided; only the trailing parameter line is visible. `IL` is presumably the
// induction-variable list parameter — confirm against the full source.
2703 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
// The latch compare folds away when the loop runs exactly one iteration.
2704 auto *Cmp = L->getLatchCmpInst();
2705 if (Cmp)
2706 InstsToIgnore.insert(Cmp);
2707 for (const auto &KV : IL) {
2708 // Extract the key by hand so that it can be used in the lambda below. Note
2709 // that captured structured bindings are a C++20 extension.
2710 const PHINode *IV = KV.first;
2711
2712 // Get next iteration value of the induction variable.
2713 Instruction *IVInst =
2714 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
// The IV update is dead (simplifiable) if its only users are the IV phi
// itself and the latch compare.
2715 if (all_of(IVInst->users(),
2716 [&](const User *U) { return U == IV || U == Cmp; }))
2717 InstsToIgnore.insert(IVInst);
2718 }
2719}
2720
// NOTE(review): doxygen extraction — the first signature line (orig 2721,
// presumably InnerLoopVectorizer::createInductionAdditionalBypassValues) is
// elided. For each induction variable, computes the resume ("end") value to
// use on the additional bypass edge, based on the main vector loop's trip
// count, and records it in Induction2AdditionalBypassValue.
2722 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2723 assert(MainVectorTripCount && "Must have bypass information");
2724
2725 Instruction *OldInduction = Legal->getPrimaryInduction();
// All end-value computations are inserted at the top of the bypass block.
2726 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2727 getAdditionalBypassBlock()->getFirstInsertionPt());
2728 for (const auto &InductionEntry : Legal->getInductionVars()) {
2729 PHINode *OrigPhi = InductionEntry.first;
2730 const InductionDescriptor &II = InductionEntry.second;
2731 Value *Step = getExpandedStep(II, ExpandedSCEVs);
2732 // For the primary induction the additional bypass end value is known.
2733 // Otherwise it is computed.
2734 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2735 if (OrigPhi != OldInduction) {
2736 auto *BinOp = II.getInductionBinOp();
2737 // Fast-math-flags propagate from the original induction instruction.
2738 if (isa_and_nonnull<FPMathOperator>(BinOp))
2739 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2740
2741 // Compute the end value for the additional bypass.
2742 EndValueFromAdditionalBypass =
2743 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2744 II.getStartValue(), Step, II.getKind(), BinOp);
2745 EndValueFromAdditionalBypass->setName("ind.end");
2746 }
2747
2748 // Store the bypass value here, as it needs to be added as operand to its
2749 // scalar preheader phi node after the epilogue skeleton has been created.
2750 // TODO: Directly add as extra operand to the VPResumePHI recipe.
2751 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2752 "entry for OrigPhi already exits");
2753 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2754 }
2755}
2756
// NOTE(review): doxygen extraction — the first signature line (orig 2757,
// presumably InnerLoopVectorizer::createVectorizedLoopSkeleton) and the call
// statements after each comment paragraph (orig 2796, 2803, 2807, 2812:
// skeleton creation, trip-count check, SCEV checks, memory checks) are
// elided; only the explanatory comments remain visible here.
2758 const SCEV2ValueTy &ExpandedSCEVs) {
2759 /*
2760 In this function we generate a new loop. The new loop will contain
2761 the vectorized instructions while the old loop will continue to run the
2762 scalar remainder.
2763
2764 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2765 / | preheader are expanded here. Eventually all required SCEV
2766 / | expansion should happen here.
2767 / v
2768 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2769 | / |
2770 | / v
2771 || [ ] <-- vector pre header.
2772 |/ |
2773 | v
2774 | [ ] \
2775 | [ ]_| <-- vector loop (created during VPlan execution).
2776 | |
2777 | v
2778 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2779 | | successors created during VPlan execution)
2780 \/ |
2781 /\ v
2782 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2783 | |
2784 (opt) v <-- edge from middle to exit iff epilogue is not required.
2785 | [ ] \
2786 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2787 | | wrapped in VPIRBasicBlock).
2788 \ |
2789 \ v
2790 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2791 ...
2792 */
2793
2794 // Create an empty vector loop, and prepare basic blocks for the runtime
2795 // checks.
2797
2798 // Now, compare the new count to zero. If it is zero skip the vector loop and
2799 // jump to the scalar loop. This check also covers the case where the
2800 // backedge-taken count is uint##_max: adding one to it will overflow leading
2801 // to an incorrect trip count of zero. In this (rare) case we will also jump
2802 // to the scalar loop.
2804
2805 // Generate the code to check any assumptions that we've made for SCEV
2806 // expressions.
2808
2809 // Generate the code that checks in runtime if arrays overlap. We put the
2810 // checks into a separate block to make the more common case of few elements
2811 // faster.
2813
2814 return LoopVectorPreHeader;
2815}
2816
2817namespace {
2818
// DenseMapInfo adaptor used by cse() below: keys Instruction* by structural
// identity (opcode + operands) so identical element-manipulation
// instructions hash and compare equal.
2819struct CSEDenseMapInfo {
// Only these side-effect-free vector/address instructions participate in
// CSE.
2820 static bool canHandle(const Instruction *I) {
2821 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2822 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2823 }
2824
// NOTE(review): bodies of getEmptyKey/getTombstoneKey (orig lines 2826 and
// 2830) are elided in this extraction — presumably the usual
// DenseMapInfo<Instruction *> sentinels; confirm against the full source.
2825 static inline Instruction *getEmptyKey() {
2827 }
2828
2829 static inline Instruction *getTombstoneKey() {
2831 }
2832
// Hash on opcode and all operand values - matches isEqual's structural test.
2833 static unsigned getHashValue(const Instruction *I) {
2834 assert(canHandle(I) && "Unknown instruction!");
2835 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2836 I->value_op_end()));
2837 }
2838
2839 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
// Sentinel keys only compare equal to themselves (pointer identity).
2840 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2841 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2842 return LHS == RHS;
2843 return LHS->isIdenticalTo(RHS);
2844 }
2845};
2846
2847} // end anonymous namespace
2848
2849/// Perform common-subexpression elimination on the induction-variable
/// instructions in \p BB: structurally identical instructions (see
/// CSEDenseMapInfo::canHandle) after the first are RAUW'd and erased.
2850static void cse(BasicBlock *BB) {
2851 // Perform simple cse.
// NOTE(review): the map declaration (orig line 2852) is elided here -
// presumably a SmallDenseMap keyed with CSEDenseMapInfo; confirm against the
// full source.
// Early-increment range: the loop erases the current instruction.
2853 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2854 if (!CSEDenseMapInfo::canHandle(&In))
2855 continue;
2856
2857 // Check if we can replace this instruction with any of the
2858 // visited instructions.
2859 if (Instruction *V = CSEMap.lookup(&In)) {
2860 In.replaceAllUsesWith(V);
2861 In.eraseFromParent();
2862 continue;
2863 }
2864
// First occurrence: remember it as the canonical instance.
2865 CSEMap[&In] = &In;
2866 }
2867}
2868
// NOTE(review): doxygen extraction — the signature (orig 2869-2870,
// presumably LoopVectorizationCostModel::getVectorCallCost) and orig lines
// 2878, 2882, 2887 (the Tys declaration and the TTI call-cost call) and 2890
// (the intrinsic-ID condition) are elided. Computes the cost of a call for
// scalar VF; vector VFs use the pre-computed CallWideningDecisions entry.
2871 ElementCount VF) const {
2872 // We only need to calculate a cost if the VF is scalar; for actual vectors
2873 // we should already have a pre-calculated cost at each VF.
2874 if (!VF.isScalar())
2875 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2876
2877 Type *RetTy = CI->getType();
// A call that is part of a recognized reduction pattern uses that cost.
2879 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2880 return *RedCost;
2881
2883 for (auto &ArgOp : CI->args())
2884 Tys.push_back(ArgOp->getType());
2885
2886 InstructionCost ScalarCallCost =
2888
2889 // If this is an intrinsic we may have a lower cost for it.
2891 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
// Prefer whichever lowering (library call vs. intrinsic) is cheaper.
2892 return std::min(ScalarCallCost, IntrinsicCost);
2893 }
2894 return ScalarCallCost;
2895}
2896
// NOTE(review): doxygen extraction — the signature (orig line 2897,
// presumably a static helper taking (Type *Elt, ElementCount VF)) is elided.
// Widens an int/ptr/fp element type to a vector of VF elements; scalar VF or
// other element kinds are returned unchanged.
2898 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2899 return Elt;
2900 return VectorType::get(Elt, VF);
2901}
2902
// NOTE(review): doxygen extraction — the signature (orig 2903-2904,
// presumably LoopVectorizationCostModel::getVectorIntrinsicCost), the
// intrinsic-ID lookup (orig 2906) and orig 2913-2914 (Arguments/FTy setup)
// are elided. Builds IntrinsicCostAttributes with types widened to VF and
// queries TTI for the vector intrinsic cost.
2905 ElementCount VF) const {
2907 assert(ID && "Expected intrinsic call!");
2908 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
// Fast-math flags on the call affect the intrinsic's cost.
2909 FastMathFlags FMF;
2910 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2911 FMF = FPMO->getFastMathFlags();
2912
// Widen each declared parameter type to its VF-wide counterpart.
2915 SmallVector<Type *> ParamTys;
2916 std::transform(FTy->param_begin(), FTy->param_end(),
2917 std::back_inserter(ParamTys),
2918 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2919
2920 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2921 dyn_cast<IntrinsicInst>(CI));
2922 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2923}
2924
// NOTE(review): doxygen extraction — the signature (orig 2925, presumably
// InnerLoopVectorizer::fixVectorizedLoop) and orig lines 2927, 2931-2932,
// 2941, 2948, 2972 (various conditions/calls, e.g. the SCEV forget and
// profile-weight calls) are elided. Post-execution fixups: wire up widened
// phis, invalidate stale SCEVs, sink scalar operands, CSE the header, and
// update profile weights.
2926 // Fix widened non-induction PHIs by setting up the PHI operands.
2928 fixNonInductionPHIs(State);
2929
2930 // Forget the original basic block.
2933
2934 // After vectorization, the exit blocks of the original loop will have
2935 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2936 // looked through single-entry phis.
2937 SmallVector<BasicBlock *> ExitBlocks;
2938 OrigLoop->getExitBlocks(ExitBlocks);
2939 for (BasicBlock *Exit : ExitBlocks)
2940 for (PHINode &PN : Exit->phis())
2942
2943 // Don't apply optimizations below when no vector region remains, as they all
2944 // require a vector loop at the moment.
2945 if (!State.Plan->getVectorLoopRegion())
2946 return;
2947
2949 sinkScalarOperands(&*PI);
2950
2951 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2952 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
2953 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2954
2955 // Remove redundant induction instructions.
2956 cse(HeaderBB);
2957
2958 // Set/update profile weights for the vector and remainder loops as original
2959 // loop iterations are now distributed among them. Note that original loop
2960 // becomes the scalar remainder loop after vectorization.
2961 //
2962 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
2963 // end up getting slightly roughened result but that should be OK since
2964 // profile is not inherently precise anyway. Note also possible bypass of
2965 // vector code caused by legality checks is ignored, assigning all the weight
2966 // to the vector loop, optimistically.
2967 //
2968 // For scalable vectorization we can't know at compile time how many
2969 // iterations of the loop are handled in one vector iteration, so instead
2970 // assume a pessimistic vscale of '1'.
2971 Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2973 VF.getKnownMinValue() * UF);
2974}
2975
// NOTE(review): doxygen extraction — the signature (orig line 2976,
// presumably InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst))
// is elided; the body below is contiguous. Iteratively sinks scalarized
// operands of a predicated instruction into its predicated block, so they
// only execute when the predicate is true.
2977 // The basic block and loop containing the predicated instruction.
2978 auto *PredBB = PredInst->getParent();
2979 auto *VectorLoop = LI->getLoopFor(PredBB);
2980
2981 // Initialize a worklist with the operands of the predicated instruction.
2982 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
2983
2984 // Holds instructions that we need to analyze again. An instruction may be
2985 // reanalyzed if we don't yet know if we can sink it or not.
2986 SmallVector<Instruction *, 8> InstsToReanalyze;
2987
2988 // Returns true if a given use occurs in the predicated block. Phi nodes use
2989 // their operands in their corresponding predecessor blocks.
2990 auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
2991 auto *I = cast<Instruction>(U.getUser());
2992 BasicBlock *BB = I->getParent();
2993 if (auto *Phi = dyn_cast<PHINode>(I))
2994 BB = Phi->getIncomingBlock(
2995 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
2996 return BB == PredBB;
2997 };
2998
2999 // Iteratively sink the scalarized operands of the predicated instruction
3000 // into the block we created for it. When an instruction is sunk, its
3001 // operands are then added to the worklist. The algorithm ends after one pass
3002 // through the worklist doesn't sink a single instruction.
3003 bool Changed;
3004 do {
3005 // Add the instructions that need to be reanalyzed to the worklist, and
3006 // reset the changed indicator.
3007 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3008 InstsToReanalyze.clear();
3009 Changed = false;
3010
3011 while (!Worklist.empty()) {
3012 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3013
3014 // We can't sink an instruction if it is a phi node, is not in the loop,
3015 // may have side effects or may read from memory.
3016 // TODO: Could do more granular checking to allow sinking
3017 // a load past non-store instructions.
3018 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3019 I->mayHaveSideEffects() || I->mayReadFromMemory())
3020 continue;
3021
3022 // If the instruction is already in PredBB, check if we can sink its
3023 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3024 // sinking the scalar instruction I, hence it appears in PredBB; but it
3025 // may have failed to sink I's operands (recursively), which we try
3026 // (again) here.
3027 if (I->getParent() == PredBB) {
3028 Worklist.insert(I->op_begin(), I->op_end());
3029 continue;
3030 }
3031
3032 // It's legal to sink the instruction if all its uses occur in the
3033 // predicated block. Otherwise, there's nothing to do yet, and we may
3034 // need to reanalyze the instruction.
3035 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3036 InstsToReanalyze.push_back(I);
3037 continue;
3038 }
3039
3040 // Move the instruction to the beginning of the predicated block, and add
3041 // its operands to the worklist.
3042 I->moveBefore(PredBB->getFirstInsertionPt());
3043 Worklist.insert(I->op_begin(), I->op_end());
3044
3045 // The sinking may have enabled other instructions to be sunk, so we will
3046 // need to iterate.
3047 Changed = true;
3048 }
3049 } while (Changed);
3050}
3051
// NOTE(review): doxygen extraction — the signature (orig line 3052,
// presumably InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &)) is
// elided. Fills in the incoming (value, block) pairs of IR phis created for
// VPWidenPHIRecipes, now that all incoming values have been generated.
3053 auto Iter = vp_depth_first_deep(Plan.getEntry());
3054 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3055 for (VPRecipeBase &P : VPBB->phis()) {
3056 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3057 if (!VPPhi)
3058 continue;
3059 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3060 // Make sure the builder has a valid insert point.
3061 Builder.SetInsertPoint(NewPhi);
// Translate each VPlan incoming (value, VPBB) pair to its IR counterpart.
3062 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3063 VPValue *Inc = VPPhi->getIncomingValue(Idx);
3064 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3065 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3066 }
3067 }
3068 }
3069}
3070
// Compute the set of instructions that will remain scalar (per-lane) after
// vectorization at the given VF, seeding from uniforms and scalar-use
// pointers and expanding through GEP chains and induction variables.
// NOTE(review): doxygen extraction — orig lines 3086 and 3090 (presumably
// the Worklist and ScalarPtrs declarations) and orig 3217 (presumably
// InductionDescriptor::IK_PtrInduction in the comparison) are elided.
3071void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3072 // We should not collect Scalars more than once per VF. Right now, this
3073 // function is called from collectUniformsAndScalars(), which already does
3074 // this check. Collecting Scalars for VF=1 does not make any sense.
3075 assert(VF.isVector() && !Scalars.contains(VF) &&
3076 "This function should not be visited twice for the same VF");
3077
3078 // This avoids any chances of creating a REPLICATE recipe during planning
3079 // since that would result in generation of scalarized code during execution,
3080 // which is not supported for scalable vectors.
3081 if (VF.isScalable()) {
3082 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3083 return;
3084 }
3085
3087
3088 // These sets are used to seed the analysis with pointers used by memory
3089 // accesses that will remain scalar.
3091 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3092 auto *Latch = TheLoop->getLoopLatch();
3093
3094 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3095 // The pointer operands of loads and stores will be scalar as long as the
3096 // memory access is not a gather or scatter operation. The value operand of a
3097 // store will remain scalar if the store is scalarized.
3098 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3099 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3100 assert(WideningDecision != CM_Unknown &&
3101 "Widening decision should be ready at this moment");
3102 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3103 if (Ptr == Store->getValueOperand())
3104 return WideningDecision == CM_Scalarize;
3105 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3106 "Ptr is neither a value or pointer operand");
3107 return WideningDecision != CM_GatherScatter;
3108 };
3109
3110 // A helper that returns true if the given value is a getelementptr
3111 // instruction contained in the loop.
3112 auto IsLoopVaryingGEP = [&](Value *V) {
3113 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3114 };
3115
3116 // A helper that evaluates a memory access's use of a pointer. If the use will
3117 // be a scalar use and the pointer is only used by memory accesses, we place
3118 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3119 // PossibleNonScalarPtrs.
3120 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3121 // We only care about bitcast and getelementptr instructions contained in
3122 // the loop.
3123 if (!IsLoopVaryingGEP(Ptr))
3124 return;
3125
3126 // If the pointer has already been identified as scalar (e.g., if it was
3127 // also identified as uniform), there's nothing to do.
3128 auto *I = cast<Instruction>(Ptr);
3129 if (Worklist.count(I))
3130 return;
3131
3132 // If the use of the pointer will be a scalar use, and all users of the
3133 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3134 // place the pointer in PossibleNonScalarPtrs.
3135 if (IsScalarUse(MemAccess, Ptr) &&
3136 all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3137 ScalarPtrs.insert(I);
3138 else
3139 PossibleNonScalarPtrs.insert(I);
3140 };
3141
3142 // We seed the scalars analysis with three classes of instructions: (1)
3143 // instructions marked uniform-after-vectorization and (2) bitcast,
3144 // getelementptr and (pointer) phi instructions used by memory accesses
3145 // requiring a scalar use.
3146 //
3147 // (1) Add to the worklist all instructions that have been identified as
3148 // uniform-after-vectorization.
3149 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3150
3151 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3152 // memory accesses requiring a scalar use. The pointer operands of loads and
3153 // stores will be scalar unless the operation is a gather or scatter.
3154 // The value operand of a store will remain scalar if the store is scalarized.
3155 for (auto *BB : TheLoop->blocks())
3156 for (auto &I : *BB) {
3157 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3158 EvaluatePtrUse(Load, Load->getPointerOperand());
3159 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3160 EvaluatePtrUse(Store, Store->getPointerOperand());
3161 EvaluatePtrUse(Store, Store->getValueOperand());
3162 }
3163 }
// A pointer is scalar only if no use placed it in the non-scalar set.
3164 for (auto *I : ScalarPtrs)
3165 if (!PossibleNonScalarPtrs.count(I)) {
3166 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3167 Worklist.insert(I);
3168 }
3169
3170 // Insert the forced scalars.
3171 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3172 // induction variable when the PHI user is scalarized.
3173 auto ForcedScalar = ForcedScalars.find(VF);
3174 if (ForcedScalar != ForcedScalars.end())
3175 for (auto *I : ForcedScalar->second) {
3176 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3177 Worklist.insert(I);
3178 }
3179
3180 // Expand the worklist by looking through any bitcasts and getelementptr
3181 // instructions we've already identified as scalar. This is similar to the
3182 // expansion step in collectLoopUniforms(); however, here we're only
3183 // expanding to include additional bitcasts and getelementptr instructions.
3184 unsigned Idx = 0;
3185 while (Idx != Worklist.size()) {
3186 Instruction *Dst = Worklist[Idx++];
3187 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3188 continue;
3189 auto *Src = cast<Instruction>(Dst->getOperand(0));
3190 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3191 auto *J = cast<Instruction>(U);
3192 return !TheLoop->contains(J) || Worklist.count(J) ||
3193 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3194 IsScalarUse(J, Src));
3195 })) {
3196 Worklist.insert(Src);
3197 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3198 }
3199 }
3200
3201 // An induction variable will remain scalar if all users of the induction
3202 // variable and induction variable update remain scalar.
3203 for (const auto &Induction : Legal->getInductionVars()) {
3204 auto *Ind = Induction.first;
3205 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3206
3207 // If tail-folding is applied, the primary induction variable will be used
3208 // to feed a vector compare.
3209 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3210 continue;
3211
3212 // Returns true if \p Indvar is a pointer induction that is used directly by
3213 // load/store instruction \p I.
3214 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3215 Instruction *I) {
3216 return Induction.second.getKind() ==
3218 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3219 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3220 };
3221
3222 // Determine if all users of the induction variable are scalar after
3223 // vectorization.
3224 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3225 auto *I = cast<Instruction>(U);
3226 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3227 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3228 });
3229 if (!ScalarInd)
3230 continue;
3231
3232 // If the induction variable update is a fixed-order recurrence, neither the
3233 // induction variable or its update should be marked scalar after
3234 // vectorization.
3235 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3236 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3237 continue;
3238
3239 // Determine if all users of the induction variable update instruction are
3240 // scalar after vectorization.
3241 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3242 auto *I = cast<Instruction>(U);
3243 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3244 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3245 });
3246 if (!ScalarIndUpdate)
3247 continue;
3248
3249 // The induction variable and its update instruction will remain scalar.
3250 Worklist.insert(Ind);
3251 Worklist.insert(IndUpdate);
3252 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3253 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3254 << "\n");
3255 }
3256
3257 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3258}
3259
// NOTE(review): doxygen extraction — the first signature line (orig 3260,
// presumably LoopVectorizationCostModel::isScalarWithPredication) and orig
// 3277 (presumably the pointer-operand lookup feeding `Ptr`) are elided.
// Returns true if a predicated instruction has no non-scalar lowering and
// must be scalarized with predication.
3261 Instruction *I, ElementCount VF) const {
3262 if (!isPredicatedInst(I))
3263 return false;
3264
3265 // Do we have a non-scalar lowering for this predicated
3266 // instruction? No - it is scalar with predication.
3267 switch(I->getOpcode()) {
3268 default:
3269 return true;
3270 case Instruction::Call:
3271 if (VF.isScalar())
3272 return true;
// Calls use the pre-computed widening decision.
3273 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3274 .Kind == CM_Scalarize;
3275 case Instruction::Load:
3276 case Instruction::Store: {
3278 auto *Ty = getLoadStoreType(I);
3279 Type *VTy = Ty;
3280 if (VF.isVector())
3281 VTy = VectorType::get(Ty, VF);
3282 const Align Alignment = getLoadStoreAlignment(I);
// Scalar-with-predication only if neither a masked contiguous access nor a
// masked gather/scatter is legal on the target.
3283 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3284 TTI.isLegalMaskedGather(VTy, Alignment))
3285 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3286 TTI.isLegalMaskedScatter(VTy, Alignment));
3287 }
3288 case Instruction::UDiv:
3289 case Instruction::SDiv:
3290 case Instruction::SRem:
3291 case Instruction::URem: {
3292 // We have the option to use the safe-divisor idiom to avoid predication.
3293 // The cost based decision here will always select safe-divisor for
3294 // scalable vectors as scalarization isn't legal.
3295 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3296 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3297 }
3298 }
3299}
3300
3301// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
// NOTE(review): doxygen extraction — the signature (orig 3302, presumably
// LoopVectorizationCostModel::isPredicatedInst(Instruction *I)) and orig
// lines 3307, 3324, 3329, 3334, 3340 (speculation-safety condition,
// unreachable/assert lines, and the load/store return expressions) are
// elided. Decides whether instruction I must execute under a mask when the
// loop is vectorized.
3303 // If predication is not needed, avoid it.
3304 // TODO: We can use the loop-preheader as context point here and get
3305 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3306 if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3308 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3309 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3310 return false;
3311
3312 // If the instruction was executed conditionally in the original scalar loop,
3313 // predication is needed with a mask whose lanes are all possibly inactive.
3314 if (Legal->blockNeedsPredication(I->getParent()))
3315 return true;
3316
3317 // All that remain are instructions with side-effects originally executed in
3318 // the loop unconditionally, but now execute under a tail-fold mask (only)
3319 // having at least one active lane (the first). If the side-effects of the
3320 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3321 // - it will cause the same side-effects as when masked.
3322 switch(I->getOpcode()) {
3323 default:
3325 "instruction should have been considered by earlier checks");
3326 case Instruction::Call:
3327 // Side-effects of a Call are assumed to be non-invariant, needing a
3328 // (fold-tail) mask.
3330 "should have returned earlier for calls not needing a mask");
3331 return true;
3332 case Instruction::Load:
3333 // If the address is loop invariant no predication is needed.
3335 case Instruction::Store: {
3336 // For stores, we need to prove both speculation safety (which follows from
3337 // the same argument as loads), but also must prove the value being stored
3338 // is correct. The easiest form of the later is to require that all values
3339 // stored are the same.
3341 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3342 }
3343 case Instruction::UDiv:
3344 case Instruction::SDiv:
3345 case Instruction::SRem:
3346 case Instruction::URem:
3347 // If the divisor is loop-invariant no predication is needed.
3348 return !TheLoop->isLoopInvariant(I->getOperand(1));
3349 }
3350}
3351
// Cost comparison for predicated div/rem: returns {cost of scalarizing with
// predication, cost of the safe-divisor (select) idiom}. The caller picks
// the cheaper lowering.
// NOTE(review): doxygen extraction — orig lines 3353 (middle of the
// signature), 3359, 3398 and 3406 (presumably the CmpInst predicate argument
// and the Op2Info adjustment) are elided.
3352std::pair<InstructionCost, InstructionCost>
3354 ElementCount VF) const {
3355 assert(I->getOpcode() == Instruction::UDiv ||
3356 I->getOpcode() == Instruction::SDiv ||
3357 I->getOpcode() == Instruction::SRem ||
3358 I->getOpcode() == Instruction::URem);
3360
3361 // Scalarization isn't legal for scalable vector types
3362 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3363 if (!VF.isScalable()) {
3364 // Get the scalarization cost and scale this amount by the probability of
3365 // executing the predicated block. If the instruction is not predicated,
3366 // we fall through to the next case.
3367 ScalarizationCost = 0;
3368
3369 // These instructions have a non-void type, so account for the phi nodes
3370 // that we will create. This cost is likely to be zero. The phi node
3371 // cost, if any, should be scaled by the block probability because it
3372 // models a copy at the end of each predicated block.
3373 ScalarizationCost += VF.getKnownMinValue() *
3374 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3375
3376 // The cost of the non-predicated instruction.
3377 ScalarizationCost += VF.getKnownMinValue() *
3378 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3379
3380 // The cost of insertelement and extractelement instructions needed for
3381 // scalarization.
3382 ScalarizationCost += getScalarizationOverhead(I, VF);
3383
3384 // Scale the cost by the probability of executing the predicated blocks.
3385 // This assumes the predicated block for each vector lane is equally
3386 // likely.
3387 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3388 }
3389 InstructionCost SafeDivisorCost = 0;
3390
3391 auto *VecTy = toVectorTy(I->getType(), VF);
3392
3393 // The cost of the select guard to ensure all lanes are well defined
3394 // after we speculate above any internal control flow.
3395 SafeDivisorCost +=
3396 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3397 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3399
3400 // Certain instructions can be cheaper to vectorize if they have a constant
3401 // second vector operand. One example of this are shifts on x86.
3402 Value *Op2 = I->getOperand(1);
3403 auto Op2Info = TTI.getOperandInfo(Op2);
3404 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3405 Legal->isInvariant(Op2))
3407
3408 SmallVector<const Value *, 4> Operands(I->operand_values());
3409 SafeDivisorCost += TTI.getArithmeticInstrCost(
3410 I->getOpcode(), VecTy, CostKind,
3411 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3412 Op2Info, Operands, I);
3413 return {ScalarizationCost, SafeDivisorCost};
3414}
3415
// NOTE(review): doxygen extraction — the first signature line (orig 3416,
// presumably LoopVectorizationCostModel::interleavedAccessCanBeWidened) and
// orig lines 3419, 3464, 3467, 3478 (an assert line, masking-requirement
// conditions, and the enabled-masked-interleave assert) are elided.
// Returns true if I's interleave group can be emitted as a (possibly masked)
// wide interleaved access at the given VF.
3417 Instruction *I, ElementCount VF) const {
3418 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3420 "Decision should not be set yet.");
3421 auto *Group = getInterleavedAccessGroup(I);
3422 assert(Group && "Must have a group.");
3423 unsigned InterleaveFactor = Group->getFactor();
3424
3425 // If the instruction's allocated size doesn't equal its type size, it
3426 // requires padding and will be scalarized.
3427 auto &DL = I->getDataLayout();
3428 auto *ScalarTy = getLoadStoreType(I);
3429 if (hasIrregularType(ScalarTy, DL))
3430 return false;
3431
3432 // We currently only know how to emit interleave/deinterleave with
3433 // Factor=2 for scalable vectors. This is purely an implementation
3434 // limit.
3435 if (VF.isScalable() && InterleaveFactor != 2)
3436 return false;
3437
3438 // If the group involves a non-integral pointer, we may not be able to
3439 // losslessly cast all values to a common type.
3440 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3441 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3442 Instruction *Member = Group->getMember(Idx);
3443 if (!Member)
3444 continue;
3445 auto *MemberTy = getLoadStoreType(Member);
3446 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3447 // Don't coerce non-integral pointers to integers or vice versa.
3448 if (MemberNI != ScalarNI)
3449 // TODO: Consider adding special nullptr value case here
3450 return false;
3451 if (MemberNI && ScalarNI &&
3452 ScalarTy->getPointerAddressSpace() !=
3453 MemberTy->getPointerAddressSpace())
3454 return false;
3455 }
3456
3457 // Check if masking is required.
3458 // A Group may need masking for one of two reasons: it resides in a block that
3459 // needs predication, or it was decided to use masking to deal with gaps
3460 // (either a gap at the end of a load-access that may result in a speculative
3461 // load, or any gaps in a store-access).
3462 bool PredicatedAccessRequiresMasking =
3463 blockNeedsPredicationForAnyReason(I->getParent()) &&
3465 bool LoadAccessWithGapsRequiresEpilogMasking =
3466 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3468 bool StoreAccessWithGapsRequiresMasking =
3469 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
// No masking needed at all - the group can always be widened.
3470 if (!PredicatedAccessRequiresMasking &&
3471 !LoadAccessWithGapsRequiresEpilogMasking &&
3472 !StoreAccessWithGapsRequiresMasking)
3473 return true;
3474
3475 // If masked interleaving is required, we expect that the user/target had
3476 // enabled it, because otherwise it either wouldn't have been created or
3477 // it should have been invalidated by the CostModel.
3479 "Masked interleave-groups for predicated accesses are not enabled.");
3480
// Reversed masked groups are not supported.
3481 if (Group->isReverse())
3482 return false;
3483
3484 auto *Ty = getLoadStoreType(I);
3485 const Align Alignment = getLoadStoreAlignment(I);
// Finally, masked widening requires target support for the masked op.
3486 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3487 : TTI.isLegalMaskedStore(Ty, Alignment);
3488}
3489
3491 Instruction *I, ElementCount VF) {
 // Returns true if the load/store I can be widened (turned into a single
 // wide vector access) when vectorizing with factor VF.
 // NOTE(review): the function-name line (original line 3490) is missing from
 // this excerpt — confirm the signature against upstream LoopVectorize.cpp.
3492 // Get and ensure we have a valid memory instruction.
3493 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3494
 // NOTE(review): the definition of Ptr (original line 3495, presumably the
 // load/store pointer operand) is missing from this excerpt — confirm.
3496 auto *ScalarTy = getLoadStoreType(I);
3497
3498 // In order to be widened, the pointer should be consecutive, first of all.
3499 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3500 return false;
3501
3502 // If the instruction is a store located in a predicated block, it will be
3503 // scalarized.
3504 if (isScalarWithPredication(I, VF))
3505 return false;
3506
3507 // If the instruction's allocated size doesn't equal it's type size, it
3508 // requires padding and will be scalarized.
3509 auto &DL = I->getDataLayout();
3510 if (hasIrregularType(ScalarTy, DL))
3511 return false;
3512
3513 return true;
3514}
3515
// Computes the set of instructions that remain uniform (only lane 0 of the
// vector iteration is demanded) when vectorizing with factor VF, and caches
// the result in Uniforms[VF]. Seeds a worklist from exit-branch conditions,
// certain intrinsics, uniform memory ops and their addresses, then expands it
// backwards through operands and finally handles induction-variable cycles.
3516void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3517 // We should not collect Uniforms more than once per VF. Right now,
3518 // this function is called from collectUniformsAndScalars(), which
3519 // already does this check. Collecting Uniforms for VF=1 does not make any
3520 // sense.
3521
3522 assert(VF.isVector() && !Uniforms.contains(VF) &&
3523 "This function should not be visited twice for the same VF");
3524
3525 // Visit the list of Uniforms. If we find no uniform value, we won't
3526 // analyze again. Uniforms.count(VF) will return 1.
3527 Uniforms[VF].clear();
3528
3529 // Now we know that the loop is vectorizable!
3530 // Collect instructions inside the loop that will remain uniform after
3531 // vectorization.
3532
3533 // Global values, params and instructions outside of current loop are out of
3534 // scope.
3535 auto IsOutOfScope = [&](Value *V) -> bool {
3536 Instruction *I = dyn_cast<Instruction>(V);
3537 return (!I || !TheLoop->contains(I));
3538 };
3539
3540 // Worklist containing uniform instructions demanding lane 0.
3541 SetVector<Instruction *> Worklist;
3542
3543 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3544 // that require predication must not be considered uniform after
3545 // vectorization, because that would create an erroneous replicating region
3546 // where only a single instance out of VF should be formed.
3547 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3548 if (IsOutOfScope(I)) {
3549 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3550 << *I << "\n");
3551 return;
3552 }
3553 if (isPredicatedInst(I)) {
3554 LLVM_DEBUG(
3555 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3556 << "\n");
3557 return;
3558 }
3559 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3560 Worklist.insert(I);
3561 };
3562
3563 // Start with the conditional branches exiting the loop. If the branch
3564 // condition is an instruction contained in the loop that is only used by the
3565 // branch, it is uniform. Note conditions from uncountable early exits are not
3566 // uniform.
 // NOTE(review): the declaration of 'Exiting' (original line 3567) is missing
 // from this excerpt — confirm against upstream source.
3568 TheLoop->getExitingBlocks(Exiting);
3569 for (BasicBlock *E : Exiting) {
 // NOTE(review): the guarding condition for this 'continue' (original line
 // 3570, presumably skipping uncountable early exits) is missing here.
3571 continue;
3572 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3573 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3574 AddToWorklistIfAllowed(Cmp);
3575 }
3576
3577 auto PrevVF = VF.divideCoefficientBy(2);
3578 // Return true if all lanes perform the same memory operation, and we can
3579 // thus choose to execute only one.
3580 auto IsUniformMemOpUse = [&](Instruction *I) {
3581 // If the value was already known to not be uniform for the previous
3582 // (smaller VF), it cannot be uniform for the larger VF.
3583 if (PrevVF.isVector()) {
3584 auto Iter = Uniforms.find(PrevVF);
3585 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3586 return false;
3587 }
3588 if (!Legal->isUniformMemOp(*I, VF))
3589 return false;
3590 if (isa<LoadInst>(I))
3591 // Loading the same address always produces the same result - at least
3592 // assuming aliasing and ordering which have already been checked.
3593 return true;
3594 // Storing the same value on every iteration.
3595 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3596 };
3597
 // Returns true if I's widening decision allows treating its address
 // computation as uniform (widened consecutively, reversed, or interleaved).
3598 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3599 InstWidening WideningDecision = getWideningDecision(I, VF);
3600 assert(WideningDecision != CM_Unknown &&
3601 "Widening decision should be ready at this moment");
3602
3603 if (IsUniformMemOpUse(I))
3604 return true;
3605
3606 return (WideningDecision == CM_Widen ||
3607 WideningDecision == CM_Widen_Reverse ||
3608 WideningDecision == CM_Interleave);
3609 };
3610
3611 // Returns true if Ptr is the pointer operand of a memory access instruction
3612 // I, I is known to not require scalarization, and the pointer is not also
3613 // stored.
3614 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3615 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3616 return false;
3617 return getLoadStorePointerOperand(I) == Ptr &&
3618 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3619 };
3620
3621 // Holds a list of values which are known to have at least one uniform use.
3622 // Note that there may be other uses which aren't uniform. A "uniform use"
3623 // here is something which only demands lane 0 of the unrolled iterations;
3624 // it does not imply that all lanes produce the same value (e.g. this is not
3625 // the usual meaning of uniform)
3626 SetVector<Value *> HasUniformUse;
3627
3628 // Scan the loop for instructions which are either a) known to have only
3629 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3630 for (auto *BB : TheLoop->blocks())
3631 for (auto &I : *BB) {
3632 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3633 switch (II->getIntrinsicID()) {
3634 case Intrinsic::sideeffect:
3635 case Intrinsic::experimental_noalias_scope_decl:
3636 case Intrinsic::assume:
3637 case Intrinsic::lifetime_start:
3638 case Intrinsic::lifetime_end:
 // NOTE(review): original line 3639 (likely an additional guard or
 // case label) is missing from this excerpt — confirm upstream.
3640 AddToWorklistIfAllowed(&I);
3641 break;
3642 default:
3643 break;
3644 }
3645 }
3646
3647 // ExtractValue instructions must be uniform, because the operands are
3648 // known to be loop-invariant.
3649 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3650 assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3651 "Expected aggregate value to be loop invariant");
3652 AddToWorklistIfAllowed(EVI);
3653 continue;
3654 }
3655
3656 // If there's no pointer operand, there's nothing to do.
 // NOTE(review): the initialization of 'Ptr' (original line 3657,
 // presumably getLoadStorePointerOperand(&I)) is missing here — confirm.
3658 if (!Ptr)
3659 continue;
3660
3661 if (IsUniformMemOpUse(&I))
3662 AddToWorklistIfAllowed(&I);
3663
3664 if (IsVectorizedMemAccessUse(&I, Ptr))
3665 HasUniformUse.insert(Ptr);
3666 }
3667
3668 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3669 // demanding) users. Since loops are assumed to be in LCSSA form, this
3670 // disallows uses outside the loop as well.
3671 for (auto *V : HasUniformUse) {
3672 if (IsOutOfScope(V))
3673 continue;
3674 auto *I = cast<Instruction>(V);
3675 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3676 auto *UI = cast<Instruction>(U);
3677 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3678 });
3679 if (UsersAreMemAccesses)
3680 AddToWorklistIfAllowed(I);
3681 }
3682
3683 // Expand Worklist in topological order: whenever a new instruction
3684 // is added , its users should be already inside Worklist. It ensures
3685 // a uniform instruction will only be used by uniform instructions.
3686 unsigned Idx = 0;
3687 while (Idx != Worklist.size()) {
3688 Instruction *I = Worklist[Idx++];
3689
3690 for (auto *OV : I->operand_values()) {
3691 // isOutOfScope operands cannot be uniform instructions.
3692 if (IsOutOfScope(OV))
3693 continue;
3694 // First order recurrence Phi's should typically be considered
3695 // non-uniform.
3696 auto *OP = dyn_cast<PHINode>(OV);
 // NOTE(review): the condition guarding this 'continue' (original line
 // 3697, presumably testing OP for a fixed-order recurrence) is missing.
3698 continue;
3699 // If all the users of the operand are uniform, then add the
3700 // operand into the uniform worklist.
3701 auto *OI = cast<Instruction>(OV);
3702 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3703 auto *J = cast<Instruction>(U);
3704 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3705 }))
3706 AddToWorklistIfAllowed(OI);
3707 }
3708 }
3709
3710 // For an instruction to be added into Worklist above, all its users inside
3711 // the loop should also be in Worklist. However, this condition cannot be
3712 // true for phi nodes that form a cyclic dependence. We must process phi
3713 // nodes separately. An induction variable will remain uniform if all users
3714 // of the induction variable and induction variable update remain uniform.
3715 // The code below handles both pointer and non-pointer induction variables.
3716 BasicBlock *Latch = TheLoop->getLoopLatch();
3717 for (const auto &Induction : Legal->getInductionVars()) {
3718 auto *Ind = Induction.first;
3719 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3720
3721 // Determine if all users of the induction variable are uniform after
3722 // vectorization.
3723 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3724 auto *I = cast<Instruction>(U);
3725 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3726 IsVectorizedMemAccessUse(I, Ind);
3727 });
3728 if (!UniformInd)
3729 continue;
3730
3731 // Determine if all users of the induction variable update instruction are
3732 // uniform after vectorization.
3733 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3734 auto *I = cast<Instruction>(U);
3735 return I == Ind || Worklist.count(I) ||
3736 IsVectorizedMemAccessUse(I, IndUpdate);
3737 });
3738 if (!UniformIndUpdate)
3739 continue;
3740
3741 // The induction variable and its update instruction will remain uniform.
3742 AddToWorklistIfAllowed(Ind);
3743 AddToWorklistIfAllowed(IndUpdate);
3744 }
3745
 // Cache the final uniform set for this VF.
3746 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3747}
3748
 // Body of what appears to be runtimeChecksRequired(): returns true (i.e.
 // "cannot proceed under -Os/-Oz") when runtime pointer, SCEV-predicate, or
 // stride==1 checks would be needed while optimizing for size.
 // NOTE(review): the function signature (original line 3749) is missing from
 // this excerpt — confirm against upstream LoopVectorize.cpp.
3750 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3751
 // NOTE(review): the 'if' guarding the runtime-pointer-check failure report
 // (original line 3752) is missing from this excerpt.
3753 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3754 "runtime pointer checks needed. Enable vectorization of this "
3755 "loop with '#pragma clang loop vectorize(enable)' when "
3756 "compiling with -Os/-Oz",
3757 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3758 return true;
3759 }
3760
 // A non-trivially-true SCEV predicate would require an emitted runtime check.
3761 if (!PSE.getPredicate().isAlwaysTrue()) {
3762 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3763 "runtime SCEV checks needed. Enable vectorization of this "
3764 "loop with '#pragma clang loop vectorize(enable)' when "
3765 "compiling with -Os/-Oz",
3766 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3767 return true;
3768 }
3769
3770 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3771 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3772 reportVectorizationFailure("Runtime stride check for small trip count",
3773 "runtime stride == 1 checks needed. Enable vectorization of "
3774 "this loop without such check by compiling with -Os/-Oz",
3775 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3776 return true;
3777 }
3778
 // No runtime checks needed; safe to proceed when optimizing for size.
3779 return false;
3780}
3781
// Decides (once, then caches in IsScalableVectorizationAllowed) whether
// scalable vectorization is feasible for this loop: target support, no
// unsupported reductions, all element types legal for scalable vectors, and
// a usable max-vscale for safe-distance analysis.
3782bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
 // Return the cached answer if this was already computed.
3783 if (IsScalableVectorizationAllowed)
3784 return *IsScalableVectorizationAllowed;
3785
3786 IsScalableVectorizationAllowed = false;
 // NOTE(review): the condition guarding this early return (original line
 // 3787, presumably a target-support check) is missing from this excerpt.
3788 return false;
3789
 // NOTE(review): the 'if' introducing this branch (original line 3790,
 // presumably an explicit user/flag disable) is missing from this excerpt.
3791 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3792 "ScalableVectorizationDisabled", ORE, TheLoop);
3793 return false;
3794 }
3795
3796 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3797
 // Use the widest representable scalable VF as a proxy for "any scalable VF".
3798 auto MaxScalableVF = ElementCount::getScalable(
3799 std::numeric_limits<ElementCount::ScalarTy>::max());
3800
3801 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3802 // FIXME: While for scalable vectors this is currently sufficient, this should
3803 // be replaced by a more detailed mechanism that filters out specific VFs,
3804 // instead of invalidating vectorization for a whole set of VFs based on the
3805 // MaxVF.
3806
3807 // Disable scalable vectorization if the loop contains unsupported reductions.
3808 if (!canVectorizeReductions(MaxScalableVF)) {
3810 "Scalable vectorization not supported for the reduction "
3811 "operations found in this loop.",
3812 "ScalableVFUnfeasible", ORE, TheLoop);
3813 return false;
3814 }
3815
3816 // Disable scalable vectorization if the loop contains any instructions
3817 // with element types not supported for scalable vectors.
3818 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3819 return !Ty->isVoidTy() &&
 // NOTE(review): the second operand of this '&&' (original line
 // 3820, presumably a TTI element-type legality query) is missing.
3821 })) {
3822 reportVectorizationInfo("Scalable vectorization is not supported "
3823 "for all element types found in this loop.",
3824 "ScalableVFUnfeasible", ORE, TheLoop);
3825 return false;
3826 }
3827
 // NOTE(review): the 'if' introducing this branch (original line 3828,
 // presumably a max-vscale availability check) is missing from this excerpt.
3829 reportVectorizationInfo("The target does not provide maximum vscale value "
3830 "for safe distance analysis.",
3831 "ScalableVFUnfeasible", ORE, TheLoop);
3832 return false;
3833 }
3834
 // All checks passed; cache and report success.
3835 IsScalableVectorizationAllowed = true;
3836 return true;
3837}
3838
 // Returns the maximum legal scalable VF, clamped by the maximum safe
 // dependence distance (MaxSafeElements / max vscale); returns scalable-0
 // when scalable vectorization is not allowed at all.
 // NOTE(review): the return-type line (original line 3839) is missing from
 // this excerpt — confirm against upstream LoopVectorize.cpp.
3840LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3841 if (!isScalableVectorizationAllowed())
3842 return ElementCount::getScalable(0);
3843
3844 auto MaxScalableVF = ElementCount::getScalable(
3845 std::numeric_limits<ElementCount::ScalarTy>::max());
 // NOTE(review): the condition guarding this early return (original line
 // 3846) is missing from this excerpt.
3847 return MaxScalableVF;
3848
3849 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3850 // Limit MaxScalableVF by the maximum safe dependence distance.
3851 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3852
 // A zero result means no scalable VF fits within the safe distance.
3853 if (!MaxScalableVF)
 // NOTE(review): the reporting-call line (original line 3854) is missing.
3855 "Max legal vector width too small, scalable vectorization "
3856 "unfeasible.",
3857 "ScalableVFUnfeasible", ORE, TheLoop);
3858
3859 return MaxScalableVF;
3860}
3861
// Computes the maximum feasible fixed and scalable VFs given the trip count,
// an optional user-requested VF, and whether the tail will be folded by
// masking. Honors a safe UserVF directly; otherwise clamps or ignores it and
// falls back to target-maximized VFs.
3862FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3863 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
 // NOTE(review): original line 3864 is missing from this excerpt.
3865 unsigned SmallestType, WidestType;
3866 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3867
3868 // Get the maximum safe dependence distance in bits computed by LAA.
3869 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3870 // the memory accesses that is most restrictive (involved in the smallest
3871 // dependence distance).
3872 unsigned MaxSafeElements =
 // NOTE(review): the initializer expression (original line 3873) is missing
 // from this excerpt — confirm against upstream source.
3874
3875 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3876 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
 // NOTE(review): original line 3877 is missing from this excerpt.
3878 this->MaxSafeElements = MaxSafeElements;
3879
3880 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3881 << ".\n");
3882 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3883 << ".\n");
3884
3885 // First analyze the UserVF, fall back if the UserVF should be ignored.
3886 if (UserVF) {
3887 auto MaxSafeUserVF =
3888 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3889
3890 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3891 // If `VF=vscale x N` is safe, then so is `VF=N`
3892 if (UserVF.isScalable())
3893 return FixedScalableVFPair(
3894 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3895
3896 return UserVF;
3897 }
3898
3899 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3900
3901 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3902 // is better to ignore the hint and let the compiler choose a suitable VF.
3903 if (!UserVF.isScalable()) {
3904 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3905 << " is unsafe, clamping to max safe VF="
3906 << MaxSafeFixedVF << ".\n");
3907 ORE->emit([&]() {
3908 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3910 TheLoop->getHeader())
3911 << "User-specified vectorization factor "
3912 << ore::NV("UserVectorizationFactor", UserVF)
3913 << " is unsafe, clamping to maximum safe vectorization factor "
3914 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3915 });
3916 return MaxSafeFixedVF;
3917 }
3918
 // NOTE(review): the condition introducing this branch (original line 3919,
 // presumably a scalable-vector-support check) is missing from this excerpt.
3920 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3921 << " is ignored because scalable vectors are not "
3922 "available.\n");
3923 ORE->emit([&]() {
3924 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3926 TheLoop->getHeader())
3927 << "User-specified vectorization factor "
3928 << ore::NV("UserVectorizationFactor", UserVF)
3929 << " is ignored because the target does not support scalable "
3930 "vectors. The compiler will pick a more suitable value.";
3931 });
3932 } else {
3933 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3934 << " is unsafe. Ignoring scalable UserVF.\n");
3935 ORE->emit([&]() {
3936 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3938 TheLoop->getHeader())
3939 << "User-specified vectorization factor "
3940 << ore::NV("UserVectorizationFactor", UserVF)
3941 << " is unsafe. Ignoring the hint to let the compiler pick a "
3942 "more suitable value.";
3943 });
3944 }
3945 }
3946
3947 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3948 << " / " << WidestType << " bits.\n");
3949
 // NOTE(review): the declaration of 'Result' (original lines 3950-3951) is
 // missing from this excerpt — presumably a FixedScalableVFPair; confirm.
3952 if (auto MaxVF =
3953 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3954 MaxSafeFixedVF, FoldTailByMasking))
3955 Result.FixedVF = MaxVF;
3956
3957 if (auto MaxVF =
3958 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3959 MaxSafeScalableVF, FoldTailByMasking))
3960 if (MaxVF.isScalable()) {
3961 Result.ScalableVF = MaxVF;
3962 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3963 << "\n");
3964 }
3965
3966 return Result;
3967}
3968
 // Body of the top-level max-VF computation (the signature, original lines
 // 4969-4971 area, is missing from this excerpt — presumably
 // LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned
 // UserIC); confirm upstream). Handles trivial/degenerate trip counts, the
 // scalar-epilogue policy, and whether tail folding by masking is needed.
3972 // TODO: It may be useful to do since it's still likely to be dynamically
3973 // uniform if the target can skip.
 // NOTE(review): the guarding 'if' and reporting call (original line 3974)
 // are missing from this excerpt.
3975 "Not inserting runtime ptr check for divergent target",
3976 "runtime pointer checks needed. Not enabled for divergent target",
3977 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
 // NOTE(review): original line 3978 (likely the failure return) is missing.
3979 }
3980
3981 ScalarEvolution *SE = PSE.getSE();
3982 unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3983 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3984 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3985 if (TC != MaxTC)
3986 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
 // A single-iteration loop gains nothing from vectorization.
3987 if (TC == 1) {
3988 reportVectorizationFailure("Single iteration (non) loop",
3989 "loop trip count is one, irrelevant for vectorization",
3990 "SingleIterationLoop", ORE, TheLoop);
 // NOTE(review): original line 3991 (likely the failure return) is missing.
3992 }
3993
3994 // If BTC matches the widest induction type and is -1 then the trip count
3995 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3996 // to vectorize.
3997 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3998 if (!isa<SCEVCouldNotCompute>(BTC) &&
3999 BTC->getType()->getScalarSizeInBits() >=
 // NOTE(review): original lines 4000-4001 (the comparison's right-hand
 // side and the equality check opener) are missing from this excerpt.
4002 SE->getMinusOne(BTC->getType()))) {
 // NOTE(review): original line 4003 (the reporting call opener) is missing.
4004 "Trip count computation wrapped",
4005 "backedge-taken count is -1, loop trip count wrapped to 0",
4006 "TripCountWrapped", ORE, TheLoop);
 // NOTE(review): original line 4007 (likely the failure return) is missing.
4008 }
4009
 // NOTE(review): several 'case' labels in this switch (original lines 4011,
 // 4013, 4015, 4021, 4023) are missing from this excerpt — the structure
 // below is incomplete; confirm against upstream source.
4010 switch (ScalarEpilogueStatus) {
4012 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4014 [[fallthrough]];
4016 LLVM_DEBUG(
4017 dbgs() << "LV: vector predicate hint/switch found.\n"
4018 << "LV: Not allowing scalar epilogue, creating predicated "
4019 << "vector loop.\n");
4020 break;
4022 // fallthrough as a special case of OptForSize
4024 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4025 LLVM_DEBUG(
4026 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4027 else
4028 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4029 << "count.\n");
4030
4031 // Bail if runtime checks are required, which are not good when optimising
4032 // for size.
 // NOTE(review): original lines 4033-4034 (the runtimeChecksRequired()
 // guard and bail-out) are missing from this excerpt.
4035
4036 break;
4037 }
4038
4039 // The only loops we can vectorize without a scalar epilogue, are loops with
4040 // a bottom-test and a single exiting block. We'd have to handle the fact
4041 // that not every instruction executes on the last iteration. This will
4042 // require a lane mask which varies through the vector loop body. (TODO)
 // NOTE(review): the 'if' opening this block (original line 4043) is missing.
4044 // If there was a tail-folding hint/switch, but we can't fold the tail by
4045 // masking, fallback to a vectorization with a scalar epilogue.
4046 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4047 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4048 "scalar epilogue instead.\n");
4049 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4050 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4051 }
 // NOTE(review): original line 4052 is missing from this excerpt.
4053 }
4054
4055 // Now try the tail folding
4056
4057 // Invalidate interleave groups that require an epilogue if we can't mask
4058 // the interleave-group.
 // NOTE(review): the 'if' opening this block (original line 4059) is missing.
4060 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4061 "No decisions should have been taken at this point");
4062 // Note: There is no need to invalidate any cost modeling decisions here, as
4063 // none were taken so far.
 // NOTE(review): original line 4064 (presumably the invalidation call) is
 // missing from this excerpt.
4065 }
4066
4067 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4068
4069 // Avoid tail folding if the trip count is known to be a multiple of any VF
4070 // we choose.
4071 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4072 MaxFactors.FixedVF.getFixedValue();
4073 if (MaxFactors.ScalableVF) {
4074 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4075 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4076 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4077 *MaxPowerOf2RuntimeVF,
4078 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4079 } else
4080 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4081 }
4082
4083 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4084 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4085 "MaxFixedVF must be a power of 2");
4086 unsigned MaxVFtimesIC =
4087 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4088 ScalarEvolution *SE = PSE.getSE();
4089 // Currently only loops with countable exits are vectorized, but calling
4090 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4091 // uncountable exits whilst also ensuring the symbolic maximum and known
4092 // back-edge taken count remain identical for loops with countable exits.
4093 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4094 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4095 "Invalid loop count");
4096 const SCEV *ExitCount = SE->getAddExpr(
4097 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4098 const SCEV *Rem = SE->getURemExpr(
4099 SE->applyLoopGuards(ExitCount, TheLoop),
4100 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4101 if (Rem->isZero()) {
4102 // Accept MaxFixedVF if we do not have a tail.
4103 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4104 return MaxFactors;
4105 }
4106 }
4107
4108 // If we don't know the precise trip count, or if the trip count that we
4109 // found modulo the vectorization factor is not zero, try to fold the tail
4110 // by masking.
4111 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4112 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4113 if (foldTailByMasking()) {
 // NOTE(review): the condition opening this branch (original line 4114,
 // presumably an EVL tail-folding-style check) is missing from this excerpt.
4115 LLVM_DEBUG(
4116 dbgs()
4117 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4118 "try to generate VP Intrinsics with scalable vector "
4119 "factors only.\n");
4120 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4121 // for now.
4122 // TODO: extend it for fixed vectors, if required.
4123 assert(MaxFactors.ScalableVF.isScalable() &&
4124 "Expected scalable vector factor.");
4125
4126 MaxFactors.FixedVF = ElementCount::getFixed(1);
4127 }
4128 return MaxFactors;
4129 }
4130
4131 // If there was a tail-folding hint/switch, but we can't fold the tail by
4132 // masking, fallback to a vectorization with a scalar epilogue.
4133 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4134 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4135 "scalar epilogue instead.\n");
4136 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4137 return MaxFactors;
4138 }
4139
4140 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4141 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
 // NOTE(review): original line 4142 (likely the failure return) is missing.
4143 }
4144
4145 if (TC == 0) {
 // NOTE(review): original line 4146 (the reporting call opener) is missing.
4147 "unable to calculate the loop count due to complex control flow",
4148 "UnknownLoopCountComplexCFG", ORE, TheLoop);
 // NOTE(review): original line 4149 (likely the failure return) is missing.
4150 }
4151
 // NOTE(review): original line 4152 (the reporting call opener) is missing.
4153 "Cannot optimize for size and vectorize at the same time.",
4154 "cannot optimize for size and vectorize at the same time. "
4155 "Enable vectorization of this loop with '#pragma clang loop "
4156 "vectorize(enable)' when compiling with -Os/-Oz",
4157 "NoTailLoopWithOptForSize", ORE, TheLoop);
 // NOTE(review): original line 4158 (likely the failure return) is missing.
4159}
4160
// Given the maximum safe VF, widest/smallest element types, and trip-count
// bound, picks the largest profitable VF for the target: clamps to register
// width, then to the (possibly reduced) trip count, and optionally maximizes
// bandwidth by trying larger VFs subject to register-pressure limits.
4161ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4162 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4163 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4164 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4165 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4166 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
 // NOTE(review): the fixed-vector alternative of this ternary (original
 // line 4167) is missing from this excerpt.
4168
4169 // Convenience function to return the minimum of two ElementCounts.
4170 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4171 assert((LHS.isScalable() == RHS.isScalable()) &&
4172 "Scalable flags must match");
4173 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4174 };
4175
4176 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4177 // Note that both WidestRegister and WidestType may not be a powers of 2.
4178 auto MaxVectorElementCount = ElementCount::get(
4179 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4180 ComputeScalableMaxVF);
4181 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4182 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4183 << (MaxVectorElementCount * WidestType) << " bits.\n");
4184
 // Nothing fits in a vector register: degrade to scalar (VF=1).
4185 if (!MaxVectorElementCount) {
4186 LLVM_DEBUG(dbgs() << "LV: The target has no "
4187 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4188 << " vector registers.\n");
4189 return ElementCount::getFixed(1);
4190 }
4191
 // Refine the minimum lane count using the function's vscale_range lower
 // bound, if present, so trip-count clamping below is less pessimistic.
4192 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4193 if (MaxVectorElementCount.isScalable() &&
4194 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4195 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4196 auto Min = Attr.getVScaleRangeMin();
4197 WidestRegisterMinEC *= Min;
4198 }
4199
4200 // When a scalar epilogue is required, at least one iteration of the scalar
4201 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4202 // max VF that results in a dead vector loop.
4203 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4204 MaxTripCount -= 1;
4205
4206 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4207 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4208 // If upper bound loop trip count (TC) is known at compile time there is no
4209 // point in choosing VF greater than TC (as done in the loop below). Select
4210 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4211 // scalable, we only fall back on a fixed VF when the TC is less than or
4212 // equal to the known number of lanes.
4213 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4214 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4215 "exceeding the constant trip count: "
4216 << ClampedUpperTripCount << "\n");
4217 return ElementCount::get(
4218 ClampedUpperTripCount,
4219 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4220 }
4221
 // NOTE(review): original line 4222 (presumably a register-kind variable
 // initialization) is missing from this excerpt.
4223 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
 // NOTE(review): original line 4224 (the fixed-vector alternative) is
 // missing from this excerpt.
4225 ElementCount MaxVF = MaxVectorElementCount;
4226 if (MaximizeBandwidth ||
 // NOTE(review): the remaining operands of this condition (original lines
 // 4227-4229, presumably TTI bandwidth-maximization queries) are missing.
4230 auto MaxVectorElementCountMaxBW = ElementCount::get(
4231 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4232 ComputeScalableMaxVF);
4233 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4234
4235 // Collect all viable vectorization factors larger than the default MaxVF
4236 // (i.e. MaxVectorElementCount).
 // NOTE(review): the declaration of 'VFs' (original line 4237) is missing
 // from this excerpt.
4238 for (ElementCount VS = MaxVectorElementCount * 2;
4239 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4240 VFs.push_back(VS);
4241
4242 // For each VF calculate its register usage.
4243 auto RUs = calculateRegisterUsage(VFs);
4244
4245 // Select the largest VF which doesn't require more registers than existing
4246 // ones.
4247 for (int I = RUs.size() - 1; I >= 0; --I) {
4248 const auto &MLU = RUs[I].MaxLocalUsers;
4249 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4250 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4251 })) {
4252 MaxVF = VFs[I];
4253 break;
4254 }
4255 }
 // Never go below the target-mandated minimum VF.
4256 if (ElementCount MinVF =
4257 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4258 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4259 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4260 << ") with target's minimum: " << MinVF << '\n');
4261 MaxVF = MinVF;
4262 }
4263 }
4264
4265 // Invalidate any widening decisions we might have made, in case the loop
4266 // requires prediction (decided later), but we have already made some
4267 // load/store widening decisions.
 // NOTE(review): original line 4268 (presumably the invalidation call) is
 // missing from this excerpt.
4269 }
4270 return MaxVF;
4271}
4272
4273/// This function attempts to return a value that represents the vectorization
4274/// factor at runtime. For fixed-width VFs we know this precisely at compile
4275/// time, but for scalable VFs we calculate it based on an estimate of the
4276/// vscale value.
 /// \param VScale optional estimate of vscale; when present and VF is
 ///        scalable, the known-minimum lane count is multiplied by it.
 /// \returns the estimated number of lanes executed per vector iteration.
 // NOTE(review): the signature line (original line 4277) is missing from this
 // excerpt — confirm the function name/first parameter against upstream.
4278 std::optional<unsigned> VScale) {
4279 unsigned EstimatedVF = VF.getKnownMinValue();
4280 if (VF.isScalable())
4281 if (VScale)
4282 EstimatedVF *= *VScale;
4283 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4284 return EstimatedVF;
4285}
4286
// Compares two candidate vectorization factors and returns true if A is more
// profitable than B: per-lane cost comparison when the trip count is unknown,
// otherwise a whole-loop cost model that accounts for tail folding or the
// scalar remainder. Scalable candidates win ties unless the target prefers
// fixed-width on equal cost.
4287bool LoopVectorizationPlanner::isMoreProfitable(
 // NOTE(review): the parameter line declaring A and B (original line 4288)
 // is missing from this excerpt — confirm against upstream source.
4289 const unsigned MaxTripCount) const {
4290 InstructionCost CostA = A.Cost;
4291 InstructionCost CostB = B.Cost;
4292
4293 // Improve estimate for the vector width if it is scalable.
4294 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4295 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4296 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
4297 if (A.Width.isScalable())
4298 EstimatedWidthA *= *VScale;
4299 if (B.Width.isScalable())
4300 EstimatedWidthB *= *VScale;
4301 }
4302
4303 // Assume vscale may be larger than 1 (or the value being tuned for),
4304 // so that scalable vectorization is slightly favorable over fixed-width
4305 // vectorization.
4306 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4307 A.Width.isScalable() && !B.Width.isScalable();
4308
 // Use <= instead of < when favoring the scalable candidate on a tie.
4309 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4310 const InstructionCost &RHS) {
4311 return PreferScalable ? LHS <= RHS : LHS < RHS;
4312 };
4313
4314 // To avoid the need for FP division:
4315 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4316 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4317 if (!MaxTripCount)
4318 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4319
4320 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4321 InstructionCost VectorCost,
4322 InstructionCost ScalarCost) {
4323 // If the trip count is a known (possibly small) constant, the trip count
4324 // will be rounded up to an integer number of iterations under
4325 // FoldTailByMasking. The total cost in that case will be
4326 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4327 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4328 // some extra overheads, but for the purpose of comparing the costs of
4329 // different VFs we can use this to compare the total loop-body cost
4330 // expected after vectorization.
4331 if (CM.foldTailByMasking())
4332 return VectorCost * divideCeil(MaxTripCount, VF);
4333 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4334 };
4335
4336 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4337 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4338 return CmpFn(RTCostA, RTCostB);
4339}
4340
4341bool LoopVectorizationPlanner::isMoreProfitable(
4342 const VectorizationFactor &A, const VectorizationFactor &B) const {
4343 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4344 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4345}
4346
// Emit optimization remarks naming each recipe (and the VFs at which it
// occurred) whose cost was invalid, i.e. prevented vectorization at those
// factors. Remarks are grouped per recipe across VFs.
// NOTE(review): the function header and a few interior lines (e.g. the
// Numbering map declaration and the TypeSwitch head) are missing from this
// extract — confirm against upstream before relying on it verbatim.
4349  using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4350  SmallVector<RecipeVFPair> InvalidCosts;
  // Walk every recipe of every plan at every candidate VF and record the
  // (recipe, VF) pairs whose cost is invalid.
4351  for (const auto &Plan : VPlans) {
4352    for (ElementCount VF : Plan->vectorFactors()) {
4353      VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4354                            CM, CM.CostKind);
4355      precomputeCosts(*Plan, VF, CostCtx);
4356      auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4357      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4358        for (auto &R : *VPBB) {
4359          if (!R.cost(VF, CostCtx).isValid())
4360            InvalidCosts.emplace_back(&R, VF);
4361        }
4362      }
4363    }
4364  }
4365  if (InvalidCosts.empty())
4366    return;
4367
4368  // Emit a report of VFs with invalid costs in the loop.
4369
4370  // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4372  unsigned I = 0;
4373  for (auto &Pair : InvalidCosts)
4374    if (!Numbering.count(Pair.first))
4375      Numbering[Pair.first] = I++;
4376
4377  // Sort the list, first on recipe(number) then on VF.
4378  sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4379    if (Numbering[A.first] != Numbering[B.first])
4380      return Numbering[A.first] < Numbering[B.first];
4381    const auto &LHS = A.second;
4382    const auto &RHS = B.second;
4383    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4384           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4385  });
4386
4387  // For a list of ordered recipe-VF pairs:
4388  //   [(load, VF1), (load, VF2), (store, VF1)]
4389  // group the recipes together to emit separate remarks for:
4390  //   load  (VF1, VF2)
4391  //   store (VF1)
4392  auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4393  auto Subset = ArrayRef<RecipeVFPair>();
4394  do {
4395    if (Subset.empty())
4396      Subset = Tail.take_front(1);
4397
4398    VPRecipeBase *R = Subset.front().first;
4399
    // Map the recipe to a representative IR opcode used in the remark text.
    // NOTE(review): the head of this TypeSwitch (and some cases) is missing
    // from this extract.
4400    unsigned Opcode =
4403            [](const auto *R) { return Instruction::PHI; })
4404            .Case<VPWidenSelectRecipe>(
4405                [](const auto *R) { return Instruction::Select; })
4406            .Case<VPWidenStoreRecipe>(
4407                [](const auto *R) { return Instruction::Store; })
4408            .Case<VPWidenLoadRecipe>(
4409                [](const auto *R) { return Instruction::Load; })
4410            .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4411                [](const auto *R) { return Instruction::Call; })
4414                [](const auto *R) { return R->getOpcode(); })
4415            .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4416              return R->getStoredValues().empty() ? Instruction::Load
4417                                                  : Instruction::Store;
4418            });
4419
4420    // If the next recipe is different, or if there are no other pairs,
4421    // emit a remark for the collated subset. e.g.
4422    //   [(load, VF1), (load, VF2))]
4423    // to emit:
4424    //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4425    if (Subset == Tail || Tail[Subset.size()].first != R) {
4426      std::string OutString;
4427      raw_string_ostream OS(OutString);
4428      assert(!Subset.empty() && "Unexpected empty range");
4429      OS << "Recipe with invalid costs prevented vectorization at VF=(";
4430      for (const auto &Pair : Subset)
4431        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4432      OS << "):";
      // For calls, name the callee: intrinsic name, scalar callee, or the
      // function live-in carried as the recipe's last operand.
4433      if (Opcode == Instruction::Call) {
4434        StringRef Name = "";
4435        if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4436          Name = Int->getIntrinsicName();
4437        } else {
4438          auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4439          Function *CalledFn =
4440              WidenCall ? WidenCall->getCalledScalarFunction()
4441                        : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4442                                             ->getLiveInIRValue());
4443          Name = CalledFn->getName();
4444        }
4445        OS << " call to " << Name;
4446      } else
4447        OS << " " << Instruction::getOpcodeName(Opcode);
4448      reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4449                              R->getDebugLoc());
4450      Tail = Tail.drop_front(Subset.size());
4451      Subset = {};
4452    } else
4453      // Grow the subset by one element
4454      Subset = Tail.take_front(Subset.size() + 1);
4455  } while (!Tail.empty());
4456}
4457
4458/// Check if any recipe of \p Plan will generate a vector value, which will be
4459/// assigned a vector register.
// NOTE(review): the first line of this function's signature (and the argument
// to vp_depth_first_deep below) is missing from this extract.
4461                                const TargetTransformInfo &TTI) {
4462  assert(VF.isVector() && "Checking a scalar VF?");
4463  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4464  DenseSet<VPRecipeBase *> EphemeralRecipes;
4465  collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4466  // Set of already visited types.
4467  DenseSet<Type *> Visited;
4468  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4470    for (VPRecipeBase &R : *VPBB) {
      // Ephemeral recipes (assume-only uses) never count towards vector
      // register usage.
4471      if (EphemeralRecipes.contains(&R))
4472        continue;
4473      // Continue early if the recipe is considered to not produce a vector
4474      // result. Note that this includes VPInstruction where some opcodes may
4475      // produce a vector, to preserve existing behavior as VPInstructions model
4476      // aspects not directly mapped to existing IR instructions.
4477      switch (R.getVPDefID()) {
4478      case VPDef::VPDerivedIVSC:
4479      case VPDef::VPScalarIVStepsSC:
4480      case VPDef::VPScalarCastSC:
4481      case VPDef::VPReplicateSC:
4482      case VPDef::VPInstructionSC:
4483      case VPDef::VPCanonicalIVPHISC:
4484      case VPDef::VPVectorPointerSC:
4485      case VPDef::VPReverseVectorPointerSC:
4486      case VPDef::VPExpandSCEVSC:
4487      case VPDef::VPEVLBasedIVPHISC:
4488      case VPDef::VPPredInstPHISC:
4489      case VPDef::VPBranchOnMaskSC:
4490        continue;
4491      case VPDef::VPReductionSC:
4492      case VPDef::VPActiveLaneMaskPHISC:
4493      case VPDef::VPWidenCallSC:
4494      case VPDef::VPWidenCanonicalIVSC:
4495      case VPDef::VPWidenCastSC:
4496      case VPDef::VPWidenGEPSC:
4497      case VPDef::VPWidenIntrinsicSC:
4498      case VPDef::VPWidenSC:
4499      case VPDef::VPWidenSelectSC:
4500      case VPDef::VPBlendSC:
4501      case VPDef::VPFirstOrderRecurrencePHISC:
4502      case VPDef::VPWidenPHISC:
4503      case VPDef::VPWidenIntOrFpInductionSC:
4504      case VPDef::VPWidenPointerInductionSC:
4505      case VPDef::VPReductionPHISC:
4506      case VPDef::VPInterleaveSC:
4507      case VPDef::VPWidenLoadEVLSC:
4508      case VPDef::VPWidenLoadSC:
4509      case VPDef::VPWidenStoreEVLSC:
4510      case VPDef::VPWidenStoreSC:
4511        break;
4512      default:
4513        llvm_unreachable("unhandled recipe");
4514      }
4515
      // Returns true if widening ScalarTy to VF lanes would actually occupy
      // vector registers after legalization.
4516      auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4517        Type *VectorTy = toVectorTy(ScalarTy, VF);
4518        unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4519        if (!NumLegalParts)
4520          return false;
4521        if (VF.isScalable()) {
4522          // <vscale x 1 x iN> is assumed to be profitable over iN because
4523          // scalable registers are a distinct register class from scalar
4524          // ones. If we ever find a target which wants to lower scalable
4525          // vectors back to scalars, we'll need to update this code to
4526          // explicitly ask TTI about the register class uses for each part.
4527          return NumLegalParts <= VF.getKnownMinValue();
4528        }
4529        // Two or more parts that share a register - are vectorized.
4530        return NumLegalParts < VF.getKnownMinValue();
4531      };
4532
4533      // If no def nor is a store, e.g., branches, continue - no value to check.
4534      if (R.getNumDefinedValues() == 0 &&
4535          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4536              &R))
4537        continue;
4538      // For multi-def recipes, currently only interleaved loads, suffice to
4539      // check first def only.
4540      // For stores check their stored value; for interleaved stores suffice
4541      // the check first stored value only. In all cases this is the second
4542      // operand.
4543      VPValue *ToCheck =
4544          R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4545      Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
      // Each scalar type only needs to be queried once per plan.
4546      if (!Visited.insert({ScalarTy}).second)
4547        continue;
4548      if (WillWiden(ScalarTy))
4549        return true;
4550    }
4551  }
4552
4553  return false;
4554}
4555
// Debug-build-only selector: picks the most profitable vectorization factor
// among all candidate VPlans, falling back to the scalar factor when no
// vector candidate beats it (unless vectorization was forced by the user).
4556#ifndef NDEBUG
4557VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// NOTE(review): the line initializing ExpectedCost (presumably the expected
// scalar-loop cost) is missing from this extract.
4559  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4560  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4561  assert(any_of(VPlans,
4562                [](std::unique_ptr<VPlan> &P) {
4563                  return P->hasVF(ElementCount::getFixed(1));
4564                }) &&
4565         "Expected Scalar VF to be a candidate");
4566
4567  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4568                                       ExpectedCost);
4569  VectorizationFactor ChosenFactor = ScalarCost;
4570
4571  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4572  if (ForceVectorization &&
4573      (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4574    // Ignore scalar width, because the user explicitly wants vectorization.
4575    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4576    // evaluation.
4577    ChosenFactor.Cost = InstructionCost::getMax();
4578  }
4579
4580  for (auto &P : VPlans) {
4581    for (ElementCount VF : P->vectorFactors()) {
4582      // The cost for scalar VF=1 is already calculated, so ignore it.
4583      if (VF.isScalar())
4584        continue;
4585
      // NOTE(review): the line computing the cost 'C' for this VF is missing
      // from this extract.
4587      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4588
4589      unsigned Width =
4590          getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4591      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4592                        << " costs: " << (Candidate.Cost / Width));
4593      if (VF.isScalable())
4594        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4595                          << CM.getVScaleForTuning().value_or(1) << ")");
4596      LLVM_DEBUG(dbgs() << ".\n");
4597
      // Skip factors that legalize to no actual vector instructions, unless
      // the user forced vectorization.
4598      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4599        LLVM_DEBUG(
4600            dbgs()
4601            << "LV: Not considering vector loop of width " << VF
4602            << " because it will not generate any vector instructions.\n");
4603        continue;
4604      }
4605
4606      if (isMoreProfitable(Candidate, ChosenFactor))
4607        ChosenFactor = Candidate;
4608    }
4609  }
4610
  // NOTE(review): the guard condition for this fallback (presumably the
  // conditional-store / if-conversion check) is on lines missing from this
  // extract.
4613        "There are conditional stores.",
4614        "store that is conditionally executed prevents vectorization",
4615        "ConditionalStore", ORE, OrigLoop);
4616    ChosenFactor = ScalarCost;
4617  }
4618
4619  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4620                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4621             << "LV: Vectorization seems to be not beneficial, "
4622             << "but was forced by a user.\n");
4623  return ChosenFactor;
4624}
4625#endif
4626
4627bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4628 ElementCount VF) const {
4629 // Cross iteration phis such as reductions need special handling and are
4630 // currently unsupported.
4631 if (any_of(OrigLoop->getHeader()->phis(),
4632 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4633 return false;
4634
4635 // Phis with uses outside of the loop require special handling and are
4636 // currently unsupported.
4637 for (const auto &Entry : Legal->getInductionVars()) {
4638 // Look for uses of the value of the induction at the last iteration.
4639 Value *PostInc =
4640 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4641 for (User *U : PostInc->users())
4642 if (!OrigLoop->contains(cast<Instruction>(U)))
4643 return false;
4644 // Look for uses of penultimate value of the induction.
4645 for (User *U : Entry.first->users())
4646 if (!OrigLoop->contains(cast<Instruction>(U)))
4647 return false;
4648 }
4649
4650 // Epilogue vectorization code has not been auditted to ensure it handles
4651 // non-latch exits properly. It may be fine, but it needs auditted and
4652 // tested.
4653 // TODO: Add support for loops with an early exit.
4654 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4655 return false;
4656
4657 return true;
4658}
4659
// Heuristic profitability check for vectorizing the epilogue loop, given the
// main loop's VF and interleave count IC: the estimated runtime width must
// reach a minimum threshold.
// NOTE(review): the first signature line and two interior lines (the target
// opt-out check and the threshold selection) are missing from this extract.
4661    const ElementCount VF, const unsigned IC) const {
4662  // FIXME: We need a much better cost-model to take different parameters such
4663  // as register pressure, code size increase and cost of extra branches into
4664  // account. For now we apply a very crude heuristic and only consider loops
4665  // with vectorization factors larger than a certain value.
4666
4667  // Allow the target to opt out entirely.
4669    return false;
4670
4671  // We also consider epilogue vectorization unprofitable for targets that don't
4672  // consider interleaving beneficial (eg. MVE).
4673  if (TTI.getMaxInterleaveFactor(VF) <= 1)
4674    return false;
4675
4676  // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4677  // VFs when deciding profitability.
4678  // See related "TODO: extend to support scalable VFs." in
4679  // selectEpilogueVectorizationFactor.
4680  unsigned Multiplier = VF.isFixed() ? IC : 1;
4681  unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4684  return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
4685         MinVFThreshold;
4686}
4687
// Choose a vectorization factor for the epilogue loop given the main loop's
// VF and interleave count. Returns a disabled/scalar Result when epilogue
// vectorization is not possible or not profitable.
// NOTE(review): the function header, the initialization of 'Result', and a
// few interior lines are missing from this extract.
4689    const ElementCount MainLoopVF, unsigned IC) {
4692    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4693    return Result;
4694  }
4695
4696  if (!CM.isScalarEpilogueAllowed()) {
4697    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4698                         "epilogue is allowed.\n");
4699    return Result;
4700  }
4701
4702  // Not really a cost consideration, but check for unsupported cases here to
4703  // simplify the logic.
4704  if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4705    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4706                         "is not a supported candidate.\n");
4707    return Result;
4708  }
4709
  // Honor a user-forced epilogue VF if a plan for it exists.
4711    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4713    if (hasPlanWithVF(ForcedEC))
4714      return {ForcedEC, 0, 0};
4715
4716    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4717                         "viable.\n");
4718    return Result;
4719  }
4720
4721  if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4722      OrigLoop->getHeader()->getParent()->hasMinSize()) {
4723    LLVM_DEBUG(
4724        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4725    return Result;
4726  }
4727
4728  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4729    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4730                         "this loop\n");
4731    return Result;
4732  }
4733
4734  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4735  // the main loop handles 8 lanes per iteration. We could still benefit from
4736  // vectorizing the epilogue loop with VF=4.
4737  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4738      getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
4739
4740  ScalarEvolution &SE = *PSE.getSE();
4741  Type *TCType = Legal->getWidestInductionType();
4742  const SCEV *RemainingIterations = nullptr;
4743  unsigned MaxTripCount = 0;
4744  for (auto &NextVF : ProfitableVFs) {
4745    // Skip candidate VFs without a corresponding VPlan.
4746    if (!hasPlanWithVF(NextVF.Width))
4747      continue;
4748
4749    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4750    // vectors) or > the VF of the main loop (fixed vectors).
4751    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4752         ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4753        (NextVF.Width.isScalable() &&
4754         ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4755        (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4756         ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4757      continue;
4758
4759    // If NextVF is greater than the number of remaining iterations, the
4760    // epilogue loop would be dead. Skip such factors.
4761    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4762      // TODO: extend to support scalable VFs.
      // Lazily compute remaining iterations (trip count mod main-loop step)
      // once, on the first fixed-width candidate.
4763      if (!RemainingIterations) {
4765            getPlanFor(NextVF.Width).getTripCount(), SE);
4766        assert(!isa<SCEVCouldNotCompute>(TC) &&
4767               "Trip count SCEV must be computable");
4768        RemainingIterations = SE.getURemExpr(
4769            TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4770        MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4771        if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4772                                SE.getConstant(TCType, MaxTripCount))) {
4773          MaxTripCount =
4774              SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4775        }
4776        LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4777                          << MaxTripCount << "\n");
4778      }
4779      if (SE.isKnownPredicate(
4781              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4782              RemainingIterations))
4783        continue;
4784    }
4785
4786    if (Result.Width.isScalar() ||
4787        isMoreProfitable(NextVF, Result, MaxTripCount))
4788      Result = NextVF;
4789  }
4790
4791  if (Result != VectorizationFactor::Disabled())
4792    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4793                      << Result.Width << "\n");
4794  return Result;
4795}
4796
// Returns {smallest, widest} scalar element bit-widths observed in the loop,
// used to bound the vectorization factor. When only reductions contribute
// element types, the widest type is derived from the recurrence descriptors.
// NOTE(review): the second signature line, the DataLayout ('DL') declaration,
// and part of the reduction min-width computation are missing from this
// extract.
4797std::pair<unsigned, unsigned>
4799  unsigned MinWidth = -1U;
4800  unsigned MaxWidth = 8;
4802  // For in-loop reductions, no element types are added to ElementTypesInLoop
4803  // if there are no loads/stores in the loop. In this case, check through the
4804  // reduction variables to determine the maximum width.
4805  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4806    // Reset MaxWidth so that we can find the smallest type used by recurrences
4807    // in the loop.
4808    MaxWidth = -1U;
4809    for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4810      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4811      // When finding the min width used by the recurrence we need to account
4812      // for casts on the input operands of the recurrence.
4813      MaxWidth = std::min<unsigned>(
4814          MaxWidth, std::min<unsigned>(
4817    }
4818  } else {
4819    for (Type *T : ElementTypesInLoop) {
4820      MinWidth = std::min<unsigned>(
4821          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4822      MaxWidth = std::max<unsigned>(
4823          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4824    }
4825  }
4826  return {MinWidth, MaxWidth};
4827}
4828
// Populate ElementTypesInLoop with the element types relevant for choosing
// vectorization bit-widths: types of loads, stored values, and reduction phi
// recurrences.
// NOTE(review): the function header line is missing from this extract.
4830  ElementTypesInLoop.clear();
4831  // For each block.
4832  for (BasicBlock *BB : TheLoop->blocks()) {
4833    // For each instruction in the loop.
4834    for (Instruction &I : BB->instructionsWithoutDebug()) {
4835      Type *T = I.getType();
4836
4837      // Skip ignored values.
4838      if (ValuesToIgnore.count(&I))
4839        continue;
4840
4841      // Only examine Loads, Stores and PHINodes.
4842      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4843        continue;
4844
4845      // Examine PHI nodes that are reduction variables. Update the type to
4846      // account for the recurrence type.
4847      if (auto *PN = dyn_cast<PHINode>(&I)) {
4848        if (!Legal->isReductionVariable(PN))
4849          continue;
4850        const RecurrenceDescriptor &RdxDesc =
4851            Legal->getReductionVars().find(PN)->second;
        // NOTE(review): the condition guarding this 'continue' (presumably an
        // in-loop-reduction check) is on lines missing from this extract.
4854                RdxDesc.getRecurrenceType(),
4856          continue;
4857        T = RdxDesc.getRecurrenceType();
4858      }
4859
4860      // Examine the stored values.
4861      if (auto *ST = dyn_cast<StoreInst>(&I))
4862        T = ST->getValueOperand()->getType();
4863
4864      assert(T->isSized() &&
4865             "Expected the load/store/recurrence type to be sized");
4866
4867      ElementTypesInLoop.insert(T);
4868    }
4869  }
4870}
4871
// Decide how many times to interleave (unroll) the loop vectorized at width
// VF, balancing ILP exposure against register pressure, trip-count limits,
// and target interleave caps. Returns 1 when interleaving is not beneficial.
// NOTE(review): the second signature line and several interior lines (early-
// exit guard conditions, the register-usage computation 'R', and two 'if'
// heads) are missing from this extract.
4872unsigned
4874                                                  InstructionCost LoopCost) {
4875  // -- The interleave heuristics --
4876  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4877  // There are many micro-architectural considerations that we can't predict
4878  // at this level. For example, frontend pressure (on decode or fetch) due to
4879  // code size, or the number and capabilities of the execution ports.
4880  //
4881  // We use the following heuristics to select the interleave count:
4882  // 1. If the code has reductions, then we interleave to break the cross
4883  // iteration dependency.
4884  // 2. If the loop is really small, then we interleave to reduce the loop
4885  // overhead.
4886  // 3. We don't interleave if we think that we will spill registers to memory
4887  // due to the increased register pressure.
4888
  // NOTE(review): the guard condition for this early return is on a missing
  // line.
4890    return 1;
4891
4892  // Do not interleave if EVL is preferred and no User IC is specified.
4893  if (foldTailWithEVL()) {
4894    LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4895                         "Unroll factor forced to be 1.\n");
4896    return 1;
4897  }
4898
4899  // We used the distance for the interleave count.
  // NOTE(review): the guard condition for this early return is on a missing
  // line.
4901    return 1;
4902
4903  // We don't attempt to perform interleaving for loops with uncountable early
4904  // exits because the VPInstruction::AnyOf code cannot currently handle
4905  // multiple parts.
  // NOTE(review): the guard condition for this early return is on a missing
  // line.
4907    return 1;
4908
4909  auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4910  const bool HasReductions = !Legal->getReductionVars().empty();
4911
4912  // If we did not calculate the cost for VF (because the user selected the VF)
4913  // then we calculate the cost of VF here.
4914  if (LoopCost == 0) {
4915    LoopCost = expectedCost(VF);
4916    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4917
4918    // Loop body is free and there is no need for interleaving.
4919    if (LoopCost == 0)
4920      return 1;
4921  }
4922
  // NOTE(review): the line computing the register usage 'R' is missing from
  // this extract.
4924  // We divide by these constants so assume that we have at least one
4925  // instruction that uses at least one register.
4926  for (auto &Pair : R.MaxLocalUsers) {
4927    Pair.second = std::max(Pair.second, 1U);
4928  }
4929
4930  // We calculate the interleave count using the following formula.
4931  // Subtract the number of loop invariants from the number of available
4932  // registers. These registers are used by all of the interleaved instances.
4933  // Next, divide the remaining registers by the number of registers that is
4934  // required by the loop, in order to estimate how many parallel instances
4935  // fit without causing spills. All of this is rounded down if necessary to be
4936  // a power of two. We want power of two interleave count to simplify any
4937  // addressing operations or alignment considerations.
4938  // We also want power of two interleave counts to ensure that the induction
4939  // variable of the vector loop wraps to zero, when tail is folded by masking;
4940  // this currently happens when OptForSize, in which case IC is set to 1 above.
4941  unsigned IC = UINT_MAX;
4942
4943  for (const auto &Pair : R.MaxLocalUsers) {
4944    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4945    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4946                      << " registers of "
4947                      << TTI.getRegisterClassName(Pair.first)
4948                      << " register class\n");
4949    if (VF.isScalar()) {
4950      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4951        TargetNumRegisters = ForceTargetNumScalarRegs;
4952    } else {
4953      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4954        TargetNumRegisters = ForceTargetNumVectorRegs;
4955    }
4956    unsigned MaxLocalUsers = Pair.second;
4957    unsigned LoopInvariantRegs = 0;
4958    if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
4959      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4960
4961    unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4962                                     MaxLocalUsers);
4963    // Don't count the induction variable as interleaved.
    // NOTE(review): the condition guarding this recomputation is on a missing
    // line.
4965      TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4966                              std::max(1U, (MaxLocalUsers - 1)));
4967    }
4968
4969    IC = std::min(IC, TmpIC);
4970  }
4971
4972  // Clamp the interleave ranges to reasonable counts.
4973  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4974
4975  // Check if the user has overridden the max.
4976  if (VF.isScalar()) {
4977    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4978      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4979  } else {
4980    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4981      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4982  }
4983
4984  unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
4985  unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4986  if (KnownTC > 0) {
4987    // At least one iteration must be scalar when this constraint holds. So the
4988    // maximum available iterations for interleaving is one less.
4989    unsigned AvailableTC =
4990        requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4991
4992    // If trip count is known we select between two prospective ICs, where
4993    // 1) the aggressive IC is capped by the trip count divided by VF
4994    // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4995    // The final IC is selected in a way that the epilogue loop trip count is
4996    // minimized while maximizing the IC itself, so that we either run the
4997    // vector loop at least once if it generates a small epilogue loop, or else
4998    // we run the vector loop at least twice.
4999
5000    unsigned InterleaveCountUB = bit_floor(
5001        std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5002    unsigned InterleaveCountLB = bit_floor(std::max(
5003        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5004    MaxInterleaveCount = InterleaveCountLB;
5005
5006    if (InterleaveCountUB != InterleaveCountLB) {
5007      unsigned TailTripCountUB =
5008          (AvailableTC % (EstimatedVF * InterleaveCountUB));
5009      unsigned TailTripCountLB =
5010          (AvailableTC % (EstimatedVF * InterleaveCountLB));
5011      // If both produce same scalar tail, maximize the IC to do the same work
5012      // in fewer vector loop iterations
5013      if (TailTripCountUB == TailTripCountLB)
5014        MaxInterleaveCount = InterleaveCountUB;
5015    }
5016  } else if (BestKnownTC && *BestKnownTC > 0) {
5017    // At least one iteration must be scalar when this constraint holds. So the
5018    // maximum available iterations for interleaving is one less.
5019    unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5020                               ? (*BestKnownTC) - 1
5021                               : *BestKnownTC;
5022
5023    // If trip count is an estimated compile time constant, limit the
5024    // IC to be capped by the trip count divided by VF * 2, such that the vector
5025    // loop runs at least twice to make interleaving seem profitable when there
5026    // is an epilogue loop present. Since exact Trip count is not known we
5027    // choose to be conservative in our IC estimate.
5028    MaxInterleaveCount = bit_floor(std::max(
5029        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5030  }
5031
5032  assert(MaxInterleaveCount > 0 &&
5033         "Maximum interleave count must be greater than 0");
5034
5035  // Clamp the calculated IC to be between the 1 and the max interleave count
5036  // that the target and trip count allows.
5037  if (IC > MaxInterleaveCount)
5038    IC = MaxInterleaveCount;
5039  else
5040    // Make sure IC is greater than 0.
5041    IC = std::max(1u, IC);
5042
5043  assert(IC > 0 && "Interleave count must be greater than 0.");
5044
5045  // Interleave if we vectorized this loop and there is a reduction that could
5046  // benefit from interleaving.
5047  if (VF.isVector() && HasReductions) {
5048    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5049    return IC;
5050  }
5051
5052  // For any scalar loop that either requires runtime checks or predication we
5053  // are better off leaving this to the unroller. Note that if we've already
5054  // vectorized the loop we will have done the runtime check and so interleaving
5055  // won't require further checks.
5056  bool ScalarInterleavingRequiresPredication =
5057      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5058         return Legal->blockNeedsPredication(BB);
5059       }));
5060  bool ScalarInterleavingRequiresRuntimePointerCheck =
  // NOTE(review): the initializer of the flag above is on a missing line.
5062
5063  // We want to interleave small loops in order to reduce the loop overhead and
5064  // potentially expose ILP opportunities.
5065  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5066                    << "LV: IC is " << IC << '\n'
5067                    << "LV: VF is " << VF << '\n');
5068  const bool AggressivelyInterleaveReductions =
5069      TTI.enableAggressiveInterleaving(HasReductions);
5070  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5071      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5072    // We assume that the cost overhead is 1 and we use the cost model
5073    // to estimate the cost of the loop and interleave until the cost of the
5074    // loop overhead is about 5% of the cost of the loop.
5075    unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5076                                        SmallLoopCost / *LoopCost.getValue()));
5077
5078    // Interleave until store/load ports (estimated by max interleave count) are
5079    // saturated.
5080    unsigned NumStores = Legal->getNumStores();
5081    unsigned NumLoads = Legal->getNumLoads();
5082    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5083    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5084
5085    // There is little point in interleaving for reductions containing selects
5086    // and compares when VF=1 since it may just create more overhead than it's
5087    // worth for loops with small trip counts. This is because we still have to
5088    // do the final reduction after the loop.
5089    bool HasSelectCmpReductions =
5090        HasReductions &&
5091        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5092          const RecurrenceDescriptor &RdxDesc = Reduction.second;
5093          RecurKind RK = RdxDesc.getRecurrenceKind();
5094          return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5095                 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5096        });
5097    if (HasSelectCmpReductions) {
5098      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5099      return 1;
5100    }
5101
5102    // If we have a scalar reduction (vector reductions are already dealt with
5103    // by this point), we can increase the critical path length if the loop
5104    // we're interleaving is inside another loop. For tree-wise reductions
5105    // set the limit to 2, and for ordered reductions it's best to disable
5106    // interleaving entirely.
5107    if (HasReductions && TheLoop->getLoopDepth() > 1) {
5108      bool HasOrderedReductions =
5109          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5110            const RecurrenceDescriptor &RdxDesc = Reduction.second;
5111            return RdxDesc.isOrdered();
5112          });
5113      if (HasOrderedReductions) {
5114        LLVM_DEBUG(
5115            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5116        return 1;
5117      }
5118
5119      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5120      SmallIC = std::min(SmallIC, F);
5121      StoresIC = std::min(StoresIC, F);
5122      LoadsIC = std::min(LoadsIC, F);
5123    }
5124
    // NOTE(review): the head of this 'if' (presumably a load/store-port
    // saturation flag) is on a missing line.
5126        std::max(StoresIC, LoadsIC) > SmallIC) {
5127      LLVM_DEBUG(
5128          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5129      return std::max(StoresIC, LoadsIC);
5130    }
5131
5132    // If there are scalar reductions and TTI has enabled aggressive
5133    // interleaving for reductions, we will interleave to expose ILP.
5134    if (VF.isScalar() && AggressivelyInterleaveReductions) {
5135      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5136      // Interleave no less than SmallIC but not as aggressive as the normal IC
5137      // to satisfy the rare situation when resources are too limited.
5138      return std::max(IC / 2, SmallIC);
5139    }
5140
5141    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5142    return SmallIC;
5143  }
5144
5145  // Interleave if this is a large loop (small loops are already dealt with by
5146  // this point) that could benefit from interleaving.
5147  if (AggressivelyInterleaveReductions) {
5148    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5149    return IC;
5150  }
5151
5152  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5153  return 1;
5154}
5155
// NOTE(review): rendered source view — the function signature (original lines
// 5156-5157; presumably LoopVectorizationCostModel::calculateRegisterUsage,
// confirm against upstream) and several declaration lines are absent from this
// capture (visible as jumps in the embedded line numbers: 5175, 5183, 5186,
// 5190, 5226, 5233-5234, 5263, 5272, 5283, 5319). Code lines below are kept
// byte-identical; only comments are added.
5158 // This function calculates the register usage by measuring the highest number
5159 // of values that are alive at a single location. Obviously, this is a very
5160 // rough estimation. We scan the loop in a topological order in order and
5161 // assign a number to each instruction. We use RPO to ensure that defs are
5162 // met before their users. We assume that each instruction that has in-loop
5163 // users starts an interval. We record every time that an in-loop value is
5164 // used, so we have a list of the first and last occurrences of each
5165 // instruction. Next, we transpose this data structure into a multi map that
5166 // holds the list of intervals that *end* at a specific location. This multi
5167 // map allows us to perform a linear search. We scan the instructions linearly
5168 // and record each time that a new interval starts, by placing it in a set.
5169 // If we find this value in the multi-map then we remove it from the set.
5170 // The max register usage is the maximum size of the set.
5171 // We also search for instructions that are defined outside the loop, but are
5172 // used inside the loop. We need this number separately from the max-interval
5173 // usage number because when we unroll, loop-invariant values do not take
5174 // more register.
5176 DFS.perform(LI);
5177
5178 RegisterUsage RU;
5179
5180 // Each 'key' in the map opens a new interval. The values
5181 // of the map are the index of the 'last seen' usage of the
5182 // instruction that is the key.
5184
5185 // Maps instruction to its index.
5187 // Marks the end of each interval.
5188 IntervalMap EndPoint;
5189 // Saves the list of instruction indices that are used in the loop.
5191 // Saves the list of values that are used in the loop but are defined outside
5192 // the loop (not including non-instruction values such as arguments and
5193 // constants).
5194 SmallSetVector<Instruction *, 8> LoopInvariants;
5195
// First pass (RPO so defs precede uses): number every non-debug instruction
// and record, per in-loop operand, the index of its last use.
5196 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5197 for (Instruction &I : BB->instructionsWithoutDebug()) {
5198 IdxToInstr.push_back(&I);
5199
5200 // Save the end location of each USE.
5201 for (Value *U : I.operands()) {
5202 auto *Instr = dyn_cast<Instruction>(U);
5203
5204 // Ignore non-instruction values such as arguments, constants, etc.
5205 // FIXME: Might need some motivation why these values are ignored. If
5206 // for example an argument is used inside the loop it will increase the
5207 // register pressure (so shouldn't we add it to LoopInvariants).
5208 if (!Instr)
5209 continue;
5210
5211 // If this instruction is outside the loop then record it and continue.
5212 if (!TheLoop->contains(Instr)) {
5213 LoopInvariants.insert(Instr);
5214 continue;
5215 }
5216
5217 // Overwrite previous end points.
5218 EndPoint[Instr] = IdxToInstr.size();
5219 Ends.insert(Instr);
5220 }
5221 }
5222 }
5223
5224 // Saves the list of intervals that end with the index in 'key'.
5225 using InstrList = SmallVector<Instruction *, 2>;
5227
5228 // Transpose the EndPoints to a list of values that end at each index.
5229 for (auto &Interval : EndPoint)
5230 TransposeEnds[Interval.second].push_back(Interval.first);
5231
5232 SmallPtrSet<Instruction *, 8> OpenIntervals;
5235
5236 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5237
// Helper: number of registers TTI reports for <VF x Ty>; returns 0 for types
// that cannot be (legally) vectorized (tokens, invalid element types, or
// element types illegal for scalable vectors).
5238 const auto &TTICapture = TTI;
5239 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5240 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5241 (VF.isScalable() &&
5242 !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5243 return 0;
5244 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5245 };
5246
// Linear scan: at each index close the intervals that end here, then, for
// every candidate VF, tally the per-register-class usage of the intervals
// still open and keep the running maximum.
5247 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5248 Instruction *I = IdxToInstr[Idx];
5249
5250 // Remove all of the instructions that end at this location.
5251 InstrList &List = TransposeEnds[Idx];
5252 for (Instruction *ToRemove : List)
5253 OpenIntervals.erase(ToRemove);
5254
5255 // Ignore instructions that are never used within the loop.
5256 if (!Ends.count(I))
5257 continue;
5258
5259 // Skip ignored values.
5260 if (ValuesToIgnore.count(I))
5261 continue;
5262
5264
5265 // For each VF find the maximum usage of registers.
5266 for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5267 // Count the number of registers used, per register class, given all open
5268 // intervals.
5269 // Note that elements in this SmallMapVector will be default constructed
5270 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5271 // there is no previous entry for ClassID.
5273
5274 if (VFs[J].isScalar()) {
5275 for (auto *Inst : OpenIntervals) {
5276 unsigned ClassID =
5277 TTI.getRegisterClassForType(false, Inst->getType());
5278 // FIXME: The target might use more than one register for the type
5279 // even in the scalar case.
5280 RegUsage[ClassID] += 1;
5281 }
5282 } else {
5284 for (auto *Inst : OpenIntervals) {
5285 // Skip ignored values for VF > 1.
5286 if (VecValuesToIgnore.count(Inst))
5287 continue;
5288 if (isScalarAfterVectorization(Inst, VFs[J])) {
5289 unsigned ClassID =
5290 TTI.getRegisterClassForType(false, Inst->getType());
5291 // FIXME: The target might use more than one register for the type
5292 // even in the scalar case.
5293 RegUsage[ClassID] += 1;
5294 } else {
5295 unsigned ClassID =
5296 TTI.getRegisterClassForType(true, Inst->getType());
5297 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5298 }
5299 }
5300 }
5301
5302 for (const auto &Pair : RegUsage) {
5303 auto &Entry = MaxUsages[J][Pair.first];
5304 Entry = std::max(Entry, Pair.second);
5305 }
5306 }
5307
5308 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5309 << OpenIntervals.size() << '\n');
5310
5311 // Add the current instruction to the list of open intervals.
5312 OpenIntervals.insert(I);
5313 }
5314
// Second pass, per VF: cost the loop-invariant (defined-outside) values.
// An invariant counts as scalar if all its in-loop users stay scalar.
5315 for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5316 // Note that elements in this SmallMapVector will be default constructed
5317 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5318 // there is no previous entry for ClassID.
5320
5321 for (auto *Inst : LoopInvariants) {
5322 // FIXME: The target might use more than one register for the type
5323 // even in the scalar case.
5324 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5325 auto *I = cast<Instruction>(U);
5326 return TheLoop != LI->getLoopFor(I->getParent()) ||
5327 isScalarAfterVectorization(I, VFs[Idx]);
5328 });
5329
5330 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5331 unsigned ClassID =
5332 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5333 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5334 }
5335
5336 LLVM_DEBUG({
5337 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5338 dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5339 << " item\n";
5340 for (const auto &pair : MaxUsages[Idx]) {
5341 dbgs() << "LV(REG): RegisterClass: "
5342 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5343 << " registers\n";
5344 }
5345 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5346 << " item\n";
5347 for (const auto &pair : Invariant) {
5348 dbgs() << "LV(REG): RegisterClass: "
5349 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5350 << " registers\n";
5351 }
5352 });
5353
5354 RU.LoopInvariantRegs = Invariant;
5355 RU.MaxLocalUsers = MaxUsages[Idx];
5356 RUs[Idx] = RU;
5357 }
5358
5359 return RUs;
5360}
5361
// Returns true when the cost model should price this scalar, emulated masked
// memory op artificially high (the caller sets a huge cost — see the TODO
// below): always for loads, and for stores once the predicated-store count
// exceeds NumberOfStoresToPredicate.
// NOTE(review): rendered view — original line 5372 (the first half of an
// assert whose string continuation is visible below) is missing from this
// capture.
5362 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5363 ElementCount VF) {
5364 // TODO: Cost model for emulated masked load/store is completely
5365 // broken. This hack guides the cost model to use an artificially
5366 // high enough value to practically disable vectorization with such
5367 // operations, except where previously deployed legality hack allowed
5368 // using very low cost values. This is to avoid regressions coming simply
5369 // from moving "masked load/store" check from legality to cost model.
5370 // Masked Load/Gather emulation was previously never allowed.
5371 // Limited number of Masked Store/Scatter emulation was allowed.
5373 "Expecting a scalar emulated instruction");
5374 return isa<LoadInst>(I) ||
5375 (isa<StoreInst>(I) &&
5376 NumPredStores > NumberOfStoresToPredicate);
5377}
5378
// NOTE(review): rendered view — the function signature (original line 5379;
// presumably LoopVectorizationCostModel::collectInstsToScalarize(ElementCount
// VF), confirm upstream) and original line 5398 (the condition guarding the
// first `continue`, likely a block-predication check) are missing from this
// capture. Code lines below are kept byte-identical; only comments are added.
5380 // If we aren't vectorizing the loop, or if we've already collected the
5381 // instructions to scalarize, there's nothing to do. Collection may already
5382 // have occurred if we have a user-selected VF and are now computing the
5383 // expected cost for interleaving.
5384 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5385 return;
5386
5387 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5388 // not profitable to scalarize any instructions, the presence of VF in the
5389 // map will indicate that we've analyzed it already.
5390 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5391
5392 PredicatedBBsAfterVectorization[VF].clear();
5393
5394 // Find all the instructions that are scalar with predication in the loop and
5395 // determine if it would be better to not if-convert the blocks they are in.
5396 // If so, we also record the instructions to scalarize.
5397 for (BasicBlock *BB : TheLoop->blocks()) {
5399 continue;
5400 for (Instruction &I : *BB)
5401 if (isScalarWithPredication(&I, VF)) {
5402 ScalarCostsTy ScalarCosts;
5403 // Do not apply discount logic for:
5404 // 1. Scalars after vectorization, as there will only be a single copy
5405 // of the instruction.
5406 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5407 // 3. Emulated masked memrefs, if a hacked cost is needed.
// A non-negative discount means scalarizing the chain is no worse than
// keeping it vectorized, so commit the collected per-instruction costs.
5408 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5409 !useEmulatedMaskMemRefHack(&I, VF) &&
5410 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5411 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5412 // Check if we decided to scalarize a call. If so, update the widening
5413 // decision of the call to CM_Scalarize with the computed scalar cost.
5414 for (const auto &[I, _] : ScalarCosts) {
5415 auto *CI = dyn_cast<CallInst>(I);
5416 if (!CI || !CallWideningDecisions.contains({CI, VF}))
5417 continue;
5418 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5419 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5420 }
5421 }
5422 // Remember that BB will remain after vectorization.
5423 PredicatedBBsAfterVectorization[VF].insert(BB);
5424 for (auto *Pred : predecessors(BB)) {
5425 if (Pred->getSingleSuccessor() == BB)
5426 PredicatedBBsAfterVectorization[VF].insert(Pred);
5427 }
5428 }
5429 }
5430}
5431
// Computes, for the single-use chain feeding PredInst, how much cheaper it is
// to keep the chain scalar (predicated) than to vectorize it; fills
// ScalarCosts with the per-instruction scalar costs. A non-negative result
// means scalarization wins (see caller).
// NOTE(review): rendered view — original lines 5444 (the Worklist
// declaration), 5454 (the tail of the single-use-chain condition) and 5501
// (the scalar-cost call that initializes ScalarCost) are missing from this
// capture. Code lines below are kept byte-identical; only comments are added.
5432 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5433 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5434 assert(!isUniformAfterVectorization(PredInst, VF) &&
5435 "Instruction marked uniform-after-vectorization will be predicated");
5436
5437 // Initialize the discount to zero, meaning that the scalar version and the
5438 // vector version cost the same.
5439 InstructionCost Discount = 0;
5440
5441 // Holds instructions to analyze. The instructions we visit are mapped in
5442 // ScalarCosts. Those instructions are the ones that would be scalarized if
5443 // we find that the scalar version costs less.
5445
5446 // Returns true if the given instruction can be scalarized.
5447 auto CanBeScalarized = [&](Instruction *I) -> bool {
5448 // We only attempt to scalarize instructions forming a single-use chain
5449 // from the original predicated block that would otherwise be vectorized.
5450 // Although not strictly necessary, we give up on instructions we know will
5451 // already be scalar to avoid traversing chains that are unlikely to be
5452 // beneficial.
5453 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5455 return false;
5456
5457 // If the instruction is scalar with predication, it will be analyzed
5458 // separately. We ignore it within the context of PredInst.
5459 if (isScalarWithPredication(I, VF))
5460 return false;
5461
5462 // If any of the instruction's operands are uniform after vectorization,
5463 // the instruction cannot be scalarized. This prevents, for example, a
5464 // masked load from being scalarized.
5465 //
5466 // We assume we will only emit a value for lane zero of an instruction
5467 // marked uniform after vectorization, rather than VF identical values.
5468 // Thus, if we scalarize an instruction that uses a uniform, we would
5469 // create uses of values corresponding to the lanes we aren't emitting code
5470 // for. This behavior can be changed by allowing getScalarValue to clone
5471 // the lane zero values for uniforms rather than asserting.
5472 for (Use &U : I->operands())
5473 if (auto *J = dyn_cast<Instruction>(U.get()))
5474 if (isUniformAfterVectorization(J, VF))
5475 return false;
5476
5477 // Otherwise, we can scalarize the instruction.
5478 return true;
5479 };
5480
5481 // Compute the expected cost discount from scalarizing the entire expression
5482 // feeding the predicated instruction. We currently only consider expressions
5483 // that are single-use instruction chains.
5484 Worklist.push_back(PredInst);
5485 while (!Worklist.empty()) {
5486 Instruction *I = Worklist.pop_back_val();
5487
5488 // If we've already analyzed the instruction, there's nothing to do.
5489 if (ScalarCosts.contains(I))
5490 continue;
5491
5492 // Compute the cost of the vector instruction. Note that this cost already
5493 // includes the scalarization overhead of the predicated instruction.
5494 InstructionCost VectorCost = getInstructionCost(I, VF);
5495
5496 // Compute the cost of the scalarized instruction. This cost is the cost of
5497 // the instruction as if it wasn't if-converted and instead remained in the
5498 // predicated block. We will scale this cost by block probability after
5499 // computing the scalarization overhead.
5500 InstructionCost ScalarCost =
5502
5503 // Compute the scalarization overhead of needed insertelement instructions
5504 // and phi nodes.
5505 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5506 ScalarCost += TTI.getScalarizationOverhead(
5507 cast<VectorType>(toVectorTy(I->getType(), VF)),
5508 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5509 /*Extract*/ false, CostKind);
5510 ScalarCost +=
5511 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5512 }
5513
5514 // Compute the scalarization overhead of needed extractelement
5515 // instructions. For each of the instruction's operands, if the operand can
5516 // be scalarized, add it to the worklist; otherwise, account for the
5517 // overhead.
5518 for (Use &U : I->operands())
5519 if (auto *J = dyn_cast<Instruction>(U.get())) {
5520 assert(VectorType::isValidElementType(J->getType()) &&
5521 "Instruction has non-scalar type");
5522 if (CanBeScalarized(J))
5523 Worklist.push_back(J);
5524 else if (needsExtract(J, VF)) {
5525 ScalarCost += TTI.getScalarizationOverhead(
5526 cast<VectorType>(toVectorTy(J->getType(), VF)),
5527 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5528 /*Extract*/ true, CostKind);
5529 }
5530 }
5531
5532 // Scale the total scalar cost by block probability.
5533 ScalarCost /= getReciprocalPredBlockProb();
5534
5535 // Compute the discount. A non-negative discount means the vector version
5536 // of the instruction costs more, and scalarizing would be beneficial.
5537 Discount += VectorCost - ScalarCost;
5538 ScalarCosts[I] = ScalarCost;
5539 }
5540
5541 return Discount;
5542}
5543
// NOTE(review): rendered view — the function signature (original lines
// 5544-5545; presumably LoopVectorizationCostModel::expectedCost(ElementCount
// VF), confirm upstream) and original lines 5551 (trip-count lookup), 5553
// (the call collecting values to ignore for this VF), 5567 (the
// per-instruction cost call initializing C) and 5571 (the forced-cost
// override) are missing from this capture. Code lines below are kept
// byte-identical; only comments are added.
5546
5547 // If the vector loop gets executed exactly once with the given VF, ignore the
5548 // costs of comparison and induction instructions, as they'll get simplified
5549 // away.
5550 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5552 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5554 ValuesToIgnoreForVF);
5555
5556 // For each block.
5557 for (BasicBlock *BB : TheLoop->blocks()) {
5558 InstructionCost BlockCost;
5559
5560 // For each instruction in the old loop.
5561 for (Instruction &I : BB->instructionsWithoutDebug()) {
5562 // Skip ignored values.
5563 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5564 (VF.isVector() && VecValuesToIgnore.count(&I)))
5565 continue;
5566
5568
5569 // Check if we should override the cost.
5570 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5572
5573 BlockCost += C;
5574 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5575 << VF << " For instruction: " << I << '\n');
5576 }
5577
5578 // If we are vectorizing a predicated block, it will have been
5579 // if-converted. This means that the block's instructions (aside from
5580 // stores and instructions that may divide by zero) will now be
5581 // unconditionally executed. For the scalar case, we may not always execute
5582 // the predicated block, if it is an if-else block. Thus, scale the block's
5583 // cost by the probability of executing it. blockNeedsPredication from
5584 // Legal is used so as to not include all blocks in tail folded loops.
5585 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5586 BlockCost /= getReciprocalPredBlockProb();
5587
5588 Cost += BlockCost;
5589 }
5590
5591 return Cost;
5592}
5593
5594 /// Gets Address Access SCEV after verifying that the access pattern
5595 /// is loop invariant except the induction variable dependence.
5596 ///
5597 /// This SCEV can be sent to the Target in order to estimate the address
5598 /// calculation cost.
// NOTE(review): rendered view — the helper's return type / name line
// (original line 5599, presumably `static const SCEV *getAddressAccessSCEV(`)
// and two parameter lines (5601-5602, presumably the Legality and PSE
// parameters used below) are missing from this capture.
5600 Value *Ptr,
5603 const Loop *TheLoop) {
5604
// Non-GEP pointers are not analyzed; returning null tells the caller no
// address SCEV is available.
5605 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5606 if (!Gep)
5607 return nullptr;
5608
5609 // We are looking for a gep with all loop invariant indices except for one
5610 // which should be an induction variable.
5611 auto *SE = PSE.getSE();
5612 unsigned NumOperands = Gep->getNumOperands();
5613 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5614 Value *Opd = Gep->getOperand(Idx);
5615 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5616 !Legal->isInductionVariable(Opd))
5617 return nullptr;
5618 }
5619
5620 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5621 return PSE.getSCEV(Ptr);
5622}
5623
// Cost of executing this load/store as VF scalar operations (address
// computation + per-lane memory op + insert/extract overhead), with an extra
// predication charge and the emulated-mask hack when applicable.
// NOTE(review): rendered view — original lines 5624 (return type), 5630
// (scalable-VF early return), 5636 (the Ptr operand lookup), 5646 (the
// InstructionCost initialization) and 5664/5668-5669 (parts of the predicated
// i1-extract costing) are missing from this capture. Code lines below are
// kept byte-identical; only comments are added.
5625 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5626 ElementCount VF) {
5627 assert(VF.isVector() &&
5628 "Scalarization cost of instruction implies vectorization.");
5629 if (VF.isScalable())
5631
5632 Type *ValTy = getLoadStoreType(I);
5633 auto *SE = PSE.getSE();
5634
5635 unsigned AS = getLoadStoreAddressSpace(I);
5637 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5638 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5639 // that it is being called from this specific place.
5640
5641 // Figure out whether the access is strided and get the stride value
5642 // if it's known in compile time
5643 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5644
5645 // Get the cost of the scalar memory instruction and address computation.
5647 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5648
5649 // Don't pass *I here, since it is scalar but will actually be part of a
5650 // vectorized loop where the user of it is a vectorized instruction.
5651 const Align Alignment = getLoadStoreAlignment(I);
5652 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5653 ValTy->getScalarType(),
5654 Alignment, AS, CostKind);
5655
5656 // Get the overhead of the extractelement and insertelement instructions
5657 // we might create due to scalarization.
5658 Cost += getScalarizationOverhead(I, VF);
5659
5660 // If we have a predicated load/store, it will need extra i1 extracts and
5661 // conditional branches, but may not be executed for each vector lane. Scale
5662 // the cost by the probability of executing the predicated block.
5663 if (isPredicatedInst(I)) {
5665
5666 // Add the cost of an i1 extract and a branch
5667 auto *VecI1Ty =
5670 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5671 /*Insert=*/false, /*Extract=*/true, CostKind);
5672 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5673
5674 if (useEmulatedMaskMemRefHack(I, VF))
5675 // Artificially setting to a high enough value to practically disable
5676 // vectorization with such operations.
5677 Cost = 3000000;
5678 }
5679
5680 return Cost;
5681}
5682
// Cost of a wide (consecutive, stride +1/-1) vector load/store: masked or
// plain memory-op cost, plus a reverse-shuffle charge for stride -1.
// NOTE(review): rendered view — original lines 5683 (return type), 5688 (the
// Ptr operand lookup), 5695 (the InstructionCost initialization) and 5707
// (the reverse-shuffle cost call) are missing from this capture.
5684 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5685 ElementCount VF) {
5686 Type *ValTy = getLoadStoreType(I);
5687 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5689 unsigned AS = getLoadStoreAddressSpace(I);
5690 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5691
5692 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5693 "Stride should be 1 or -1 for consecutive memory access");
5694 const Align Alignment = getLoadStoreAlignment(I);
5696 if (Legal->isMaskRequired(I)) {
5697 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5698 CostKind);
5699 } else {
5700 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5701 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5702 CostKind, OpInfo, I);
5703 }
5704
5705 bool Reverse = ConsecutiveStride < 0;
5706 if (Reverse)
5708 CostKind, 0);
5709 return Cost;
5710}
5711
// Cost of a uniform (loop-invariant address) memory op: scalar load +
// broadcast, or scalar store + (unless the stored value is invariant) a
// last-lane extract.
// NOTE(review): rendered view — original lines 5712 (return type) and 5725
// (the broadcast-shuffle cost call for the load case) are missing from this
// capture.
5713 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5714 ElementCount VF) {
5715 assert(Legal->isUniformMemOp(*I, VF));
5716
5717 Type *ValTy = getLoadStoreType(I);
5718 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5719 const Align Alignment = getLoadStoreAlignment(I);
5720 unsigned AS = getLoadStoreAddressSpace(I);
5721 if (isa<LoadInst>(I)) {
5722 return TTI.getAddressComputationCost(ValTy) +
5723 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5724 CostKind) +
5726 CostKind);
5727 }
5728 StoreInst *SI = cast<StoreInst>(I);
5729
// An invariant stored value needs no per-iteration extract; otherwise only
// the final lane's value is stored, hence the extract at index VF-1.
5730 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5731 return TTI.getAddressComputationCost(ValTy) +
5732 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5733 CostKind) +
5734 (IsLoopInvariantStoreValue
5735 ? 0
5736 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5737 CostKind, VF.getKnownMinValue() - 1));
5738}
5739
// Cost of executing this load/store as a gather/scatter: vector address
// computation plus the target's gather/scatter op cost.
// NOTE(review): rendered view — original lines 5740 (return type) and 5746
// (the Ptr operand lookup) are missing from this capture.
5741 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5742 ElementCount VF) {
5743 Type *ValTy = getLoadStoreType(I);
5744 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5745 const Align Alignment = getLoadStoreAlignment(I);
5747
5748 return TTI.getAddressComputationCost(VectorTy) +
5749 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5750 Legal->isMaskRequired(I), Alignment,
5751 CostKind, I);
5752}
5753
// Cost of the whole interleave group this instruction belongs to: one wide
// interleaved memory op over factor*VF elements, plus reverse shuffles per
// member if the group is reversed.
// NOTE(review): rendered view — original lines 5754 (return type), 5769 (the
// Indices vector declaration), 5778 (the getInterleavedMemoryOpCost call
// initializing Cost), 5785 (an assert) and 5788 (the reverse-shuffle cost
// call) are missing from this capture.
5755 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5756 ElementCount VF) {
5757 const auto *Group = getInterleavedAccessGroup(I);
5758 assert(Group && "Fail to get an interleaved access group.");
5759
5760 Instruction *InsertPos = Group->getInsertPos();
5761 Type *ValTy = getLoadStoreType(InsertPos);
5762 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5763 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5764
5765 unsigned InterleaveFactor = Group->getFactor();
5766 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5767
5768 // Holds the indices of existing members in the interleaved group.
5770 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5771 if (Group->getMember(IF))
5772 Indices.push_back(IF);
5773
5774 // Calculate the cost of the whole interleaved group.
// Gaps must be masked when the group needs a scalar epilogue we can't have,
// or when storing with missing members.
5775 bool UseMaskForGaps =
5776 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5777 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5779 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5780 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5781 UseMaskForGaps);
5782
5783 if (Group->isReverse()) {
5784 // TODO: Add support for reversed masked interleaved access.
5786 "Reverse masked interleaved access not supported.");
5787 Cost += Group->getNumMembers() *
5789 CostKind, 0);
5790 }
5791 return Cost;
5792}
5793
// NOTE(review): rendered view — the name line of this member function
// (original line 5795; presumably
// LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, ...,
// confirm upstream) and a number of call/declaration lines are missing from
// this capture (embedded-number jumps at 5845-5846, 5850, 5877, 5881, 5894,
// 5899, 5901, 5912, 5918, 5925, 5937, 5939-5940, 5942, 5946, 5954, 5965).
// Code lines below are kept byte-identical; only comments are added.
5794 std::optional<InstructionCost>
5796 ElementCount VF,
5797 Type *Ty) const {
5798 using namespace llvm::PatternMatch;
5799 // Early exit for no inloop reductions
5800 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5801 return std::nullopt;
5802 auto *VectorTy = cast<VectorType>(Ty);
5803
5804 // We are looking for a pattern of, and finding the minimal acceptable cost:
5805 // reduce(mul(ext(A), ext(B))) or
5806 // reduce(mul(A, B)) or
5807 // reduce(ext(A)) or
5808 // reduce(A).
5809 // The basic idea is that we walk down the tree to do that, finding the root
5810 // reduction instruction in InLoopReductionImmediateChains. From there we find
5811 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5812 // of the components. If the reduction cost is lower then we return it for the
5813 // reduction instruction and 0 for the other instructions in the pattern. If
5814 // it is not we return an invalid cost specifying the orignal cost method
5815 // should be used.
5816 Instruction *RetI = I;
5817 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5818 if (!RetI->hasOneUser())
5819 return std::nullopt;
5820 RetI = RetI->user_back();
5821 }
5822
5823 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5824 RetI->user_back()->getOpcode() == Instruction::Add) {
5825 RetI = RetI->user_back();
5826 }
5827
5828 // Test if the found instruction is a reduction, and if not return an invalid
5829 // cost specifying the parent to use the original cost modelling.
5830 if (!InLoopReductionImmediateChains.count(RetI))
5831 return std::nullopt;
5832
5833 // Find the reduction this chain is a part of and calculate the basic cost of
5834 // the reduction on its own.
5835 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5836 Instruction *ReductionPhi = LastChain;
// Walk the immediate-chain map backwards until the reduction's PHI is found.
5837 while (!isa<PHINode>(ReductionPhi))
5838 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5839
5840 const RecurrenceDescriptor &RdxDesc =
5841 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5842
5843 InstructionCost BaseCost;
5844 RecurKind RK = RdxDesc.getRecurrenceKind();
5847 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5848 RdxDesc.getFastMathFlags(), CostKind);
5849 } else {
5851 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5852 }
5853
5854 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5855 // normal fmul instruction to the cost of the fadd reduction.
5856 if (RK == RecurKind::FMulAdd)
5857 BaseCost +=
5858 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5859
5860 // If we're using ordered reductions then we can just return the base cost
5861 // here, since getArithmeticReductionCost calculates the full ordered
5862 // reduction cost when FP reassociation is not allowed.
5863 if (useOrderedReductions(RdxDesc))
5864 return BaseCost;
5865
5866 // Get the operand that was not the reduction chain and match it to one of the
5867 // patterns, returning the better cost if it is found.
5868 Instruction *RedOp = RetI->getOperand(1) == LastChain
5869 ? dyn_cast<Instruction>(RetI->getOperand(0))
5870 : dyn_cast<Instruction>(RetI->getOperand(1))
5871
5872 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5873
5874 Instruction *Op0, *Op1;
5875 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5876 match(RedOp,
5878 match(Op0, m_ZExtOrSExt(m_Value())) &&
5879 Op0->getOpcode() == Op1->getOpcode() &&
5880 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5882 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5883
5884 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5885 // Note that the extend opcodes need to all match, or if A==B they will have
5886 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5887 // which is equally fine.
5888 bool IsUnsigned = isa<ZExtInst>(Op0);
5889 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5890 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5891
5892 InstructionCost ExtCost =
5893 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5895 InstructionCost MulCost =
5896 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5897 InstructionCost Ext2Cost =
5898 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5900
5902 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5903
5904 if (RedCost.isValid() &&
5905 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5906 return I == RetI ? RedCost : 0;
5907 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5908 !TheLoop->isLoopInvariant(RedOp)) {
5909 // Matched reduce(ext(A))
5910 bool IsUnsigned = isa<ZExtInst>(RedOp);
5911 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5913 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5914 RdxDesc.getFastMathFlags(), CostKind);
5915
5916 InstructionCost ExtCost =
5917 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5919 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5920 return I == RetI ? RedCost : 0;
5921 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5922 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5923 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5924 Op0->getOpcode() == Op1->getOpcode() &&
5926 bool IsUnsigned = isa<ZExtInst>(Op0);
5927 Type *Op0Ty = Op0->getOperand(0)->getType();
5928 Type *Op1Ty = Op1->getOperand(0)->getType();
5929 Type *LargestOpTy =
5930 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5931 : Op0Ty;
5932 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5933
5934 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5935 // different sizes. We take the largest type as the ext to reduce, and add
5936 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5938 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5941 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5943 InstructionCost MulCost =
5944 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5945
5947 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5948 InstructionCost ExtraExtCost = 0;
5949 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5950 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5951 ExtraExtCost = TTI.getCastInstrCost(
5952 ExtraExtOp->getOpcode(), ExtType,
5953 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5955 }
5956
5957 if (RedCost.isValid() &&
5958 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5959 return I == RetI ? RedCost : 0;
5960 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5961 // Matched reduce.add(mul())
5962 InstructionCost MulCost =
5963 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5964
5966 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5967
5968 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5969 return I == RetI ? RedCost : 0;
5970 }
5971 }
5972
// Only the pattern root reports the base cost; other queries get nullopt so
// the generic cost model handles them.
5973 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5974}
5975
// Dispatcher: for scalar VF compute the plain scalar memory-op cost here; for
// vector VFs return the widening cost decided earlier.
// NOTE(review): rendered view — original line 5976 (the return type line) is
// missing from this capture.
5977 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5978 ElementCount VF) {
5979 // Calculate scalar cost only. Vectorization cost should be ready at this
5980 // moment.
5981 if (VF.isScalar()) {
5982 Type *ValTy = getLoadStoreType(I);
5983 const Align Alignment = getLoadStoreAlignment(I);
5984 unsigned AS = getLoadStoreAddressSpace(I);
5985
5986 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5987 return TTI.getAddressComputationCost(ValTy) +
5988 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5989 OpInfo, I);
5990 }
5991 return getWideningCost(I, VF);
5992}
5993
// Insert/extract overhead incurred when scalarizing I at the given vector VF:
// inserts to rebuild the result vector plus extracts for operands, minus the
// cases the target handles efficiently.
// NOTE(review): rendered view — original lines 5994 (return type), 6001
// (scalable-VF invalid-cost return), 6006 (the Cost initialization), 6010
// (the `Cost += TTI.getScalarizationOverhead(` call head), 6029 (the Tys
// vector declaration) and 6032 (the operands-overhead call head) are missing
// from this capture.
5995 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5996 ElementCount VF) const {
5997
5998 // There is no mechanism yet to create a scalable scalarization loop,
5999 // so this is currently Invalid.
6000 if (VF.isScalable())
6002
6003 if (VF.isScalar())
6004 return 0;
6005
6007 Type *RetTy = toVectorTy(I->getType(), VF);
6008 if (!RetTy->isVoidTy() &&
6009 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6011 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6012 /*Insert*/ true,
6013 /*Extract*/ false, CostKind);
6014
6015 // Some targets keep addresses scalar.
6016 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6017 return Cost;
6018
6019 // Some targets support efficient element stores.
6020 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6021 return Cost;
6022
6023 // Collect operands to consider.
// For calls only the arguments (not the callee operand) need extraction.
6024 CallInst *CI = dyn_cast<CallInst>(I);
6025 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6026
6027 // Skip operands that do not require extraction/scalarization and do not incur
6028 // any overhead.
6030 for (auto *V : filterExtractingOperands(Ops, VF))
6031 Tys.push_back(maybeVectorizeType(V->getType(), VF));
6033 filterExtractingOperands(Ops, VF), Tys, CostKind);
6034}
6035
// Body of the cost-based widening-decision routine: for every memory
// instruction in the loop it compares the candidate strategies (widen,
// widen-reversed, interleave, gather/scatter, scalarize) at factor `VF` and
// records the cheapest via setWideningDecision(). A second pass then forces
// address computations (and loads feeding them) to stay scalar.
// NOTE(review): the function signature line (doc line 6036) and several
// hyperlinked condition/declaration lines were dropped by the doc
// extraction — confirm against the upstream source.
6037 if (VF.isScalar())
6038 return;
6039 NumPredStores = 0;
6040 for (BasicBlock *BB : TheLoop->blocks()) {
6041 // For each instruction in the old loop.
6042 for (Instruction &I : *BB) {
// Non-memory instructions (no pointer operand) are skipped here.
6044 if (!Ptr)
6045 continue;
6046
6047 // TODO: We should generate better code and update the cost model for
6048 // predicated uniform stores. Today they are treated as any other
6049 // predicated store (see added test cases in
6050 // invariant-store-vectorization.ll).
6051 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6052 NumPredStores++;
6053
// Uniform memory ops: decide between scalarization and gather/scatter.
6054 if (Legal->isUniformMemOp(I, VF)) {
6055 auto IsLegalToScalarize = [&]() {
6056 if (!VF.isScalable())
6057 // Scalarization of fixed length vectors "just works".
6058 return true;
6059
6060 // We have dedicated lowering for unpredicated uniform loads and
6061 // stores. Note that even with tail folding we know that at least
6062 // one lane is active (i.e. generalized predication is not possible
6063 // here), and the logic below depends on this fact.
6064 if (!foldTailByMasking())
6065 return true;
6066
6067 // For scalable vectors, a uniform memop load is always
6068 // uniform-by-parts and we know how to scalarize that.
6069 if (isa<LoadInst>(I))
6070 return true;
6071
6072 // A uniform store isn't neccessarily uniform-by-part
6073 // and we can't assume scalarization.
6074 auto &SI = cast<StoreInst>(I);
6075 return TheLoop->isLoopInvariant(SI.getValueOperand());
6076 };
6077
6078 const InstructionCost GatherScatterCost =
6080 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6081
6082 // Load: Scalar load + broadcast
6083 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6084 // FIXME: This cost is a significant under-estimate for tail folded
6085 // memory ops.
6086 const InstructionCost ScalarizationCost =
6087 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6089
6090 // Choose better solution for the current VF, Note that Invalid
6091 // costs compare as maximumal large. If both are invalid, we get
6092 // scalable invalid which signals a failure and a vectorization abort.
6093 if (GatherScatterCost < ScalarizationCost)
6094 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6095 else
6096 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6097 continue;
6098 }
6099
6100 // We assume that widening is the best solution when possible.
6101 if (memoryInstructionCanBeWidened(&I, VF)) {
6102 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6103 int ConsecutiveStride = Legal->isConsecutivePtr(
6105 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6106 "Expected consecutive stride.");
// Stride +1 widens forward; stride -1 widens with a reversed access.
6107 InstWidening Decision =
6108 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6109 setWideningDecision(&I, VF, Decision, Cost);
6110 continue;
6111 }
6112
6113 // Choose between Interleaving, Gather/Scatter or Scalarization.
6115 unsigned NumAccesses = 1;
6116 if (isAccessInterleaved(&I)) {
6117 const auto *Group = getInterleavedAccessGroup(&I);
6118 assert(Group && "Fail to get an interleaved access group.");
6119
6120 // Make one decision for the whole group.
6121 if (getWideningDecision(&I, VF) != CM_Unknown)
6122 continue;
6123
// The group cost is compared against per-member costs scaled by the
// number of members, so the comparison is apples-to-apples.
6124 NumAccesses = Group->getNumMembers();
6126 InterleaveCost = getInterleaveGroupCost(&I, VF);
6127 }
6128
6129 InstructionCost GatherScatterCost =
6131 ? getGatherScatterCost(&I, VF) * NumAccesses
6133
6134 InstructionCost ScalarizationCost =
6135 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6136
6137 // Choose better solution for the current VF,
6138 // write down this decision and use it during vectorization.
6140 InstWidening Decision;
// Ties between interleave and gather/scatter prefer interleaving
// (<=); scalarization only wins when strictly cheapest.
6141 if (InterleaveCost <= GatherScatterCost &&
6142 InterleaveCost < ScalarizationCost) {
6143 Decision = CM_Interleave;
6144 Cost = InterleaveCost;
6145 } else if (GatherScatterCost < ScalarizationCost) {
6146 Decision = CM_GatherScatter;
6147 Cost = GatherScatterCost;
6148 } else {
6149 Decision = CM_Scalarize;
6150 Cost = ScalarizationCost;
6151 }
6152 // If the instructions belongs to an interleave group, the whole group
6153 // receives the same decision. The whole group receives the cost, but
6154 // the cost will actually be assigned to one instruction.
6155 if (const auto *Group = getInterleavedAccessGroup(&I))
6156 setWideningDecision(Group, VF, Decision, Cost);
6157 else
6158 setWideningDecision(&I, VF, Decision, Cost);
6159 }
6160 }
6161
6162 // Make sure that any load of address and any other address computation
6163 // remains scalar unless there is gather/scatter support. This avoids
6164 // inevitable extracts into address registers, and also has the benefit of
6165 // activating LSR more, since that pass can't optimize vectorized
6166 // addresses.
6168 return;
6169
6170 // Start with all scalar pointer uses.
6172 for (BasicBlock *BB : TheLoop->blocks())
6173 for (Instruction &I : *BB) {
6174 Instruction *PtrDef =
6175 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6176 if (PtrDef && TheLoop->contains(PtrDef) &&
6178 AddrDefs.insert(PtrDef);
6179 }
6180
6181 // Add all instructions used to generate the addresses.
// Transitive closure over same-block, non-PHI operands; `second` of
// insert() guards against re-visiting already-collected defs.
6183 append_range(Worklist, AddrDefs);
6184 while (!Worklist.empty()) {
6185 Instruction *I = Worklist.pop_back_val();
6186 for (auto &Op : I->operands())
6187 if (auto *InstOp = dyn_cast<Instruction>(Op))
6188 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6189 AddrDefs.insert(InstOp).second)
6190 Worklist.push_back(InstOp);
6191 }
6192
6193 for (auto *I : AddrDefs) {
6194 if (isa<LoadInst>(I)) {
6195 // Setting the desired widening decision should ideally be handled in
6196 // by cost functions, but since this involves the task of finding out
6197 // if the loaded register is involved in an address computation, it is
6198 // instead changed here when we know this is the case.
6199 InstWidening Decision = getWideningDecision(I, VF);
6200 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6201 // Scalarize a widened load of address.
// Cost is the scalar memory cost replicated across all lanes.
6203 I, VF, CM_Scalarize,
6204 (VF.getKnownMinValue() *
6205 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6206 else if (const auto *Group = getInterleavedAccessGroup(I)) {
6207 // Scalarize an interleave group of address loads.
6208 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6209 if (Instruction *Member = Group->getMember(I))
6211 Member, VF, CM_Scalarize,
6212 (VF.getKnownMinValue() *
6213 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6214 }
6215 }
6216 } else
6217 // Make sure I gets scalarized and a cost estimate without
6218 // scalarization overhead.
6219 ForcedScalars[VF].insert(I);
6220 }
6221}
6222
// Body of the call widening-decision routine: for every CallInst in the
// loop it costs three strategies — scalarize (VF scalar calls plus
// pack/unpack overhead), call a vector-function-ABI variant, or use a
// vector intrinsic — and records the cheapest via
// setCallWideningDecision().
// NOTE(review): the function signature line (doc line 6223) and several
// hyperlinked lines (switch case labels, some declarations such as
// ScalarCost/VectorCost/IntrinsicCost and IID) were dropped by the doc
// extraction — confirm against the upstream source.
6224 assert(!VF.isScalar() &&
6225 "Trying to set a vectorization decision for a scalar VF");
6226
6227 auto ForcedScalar = ForcedScalars.find(VF);
6228 for (BasicBlock *BB : TheLoop->blocks()) {
6229 // For each instruction in the old loop.
6230 for (Instruction &I : *BB) {
6231 CallInst *CI = dyn_cast<CallInst>(&I);
6232
// Only calls are decided here; everything else is handled elsewhere.
6233 if (!CI)
6234 continue;
6235
6239 Function *ScalarFunc = CI->getCalledFunction();
6240 Type *ScalarRetTy = CI->getType();
6241 SmallVector<Type *, 4> Tys, ScalarTys;
6242 for (auto &ArgOp : CI->args())
6243 ScalarTys.push_back(ArgOp->getType());
6244
6245 // Estimate cost of scalarized vector call. The source operands are
6246 // assumed to be vectors, so we need to extract individual elements from
6247 // there, execute VF scalar calls, and then gather the result into the
6248 // vector return value.
6249 InstructionCost ScalarCallCost =
6250 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6251
6252 // Compute costs of unpacking argument values for the scalar calls and
6253 // packing the return values to a vector.
6254 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6255
6256 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6257 // Honor ForcedScalars and UniformAfterVectorization decisions.
6258 // TODO: For calls, it might still be more profitable to widen. Use
6259 // VPlan-based cost model to compare different options.
6260 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6261 ForcedScalar->second.contains(CI)) ||
6262 isUniformAfterVectorization(CI, VF))) {
6263 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6264 Intrinsic::not_intrinsic, std::nullopt,
6265 ScalarCost);
6266 continue;
6267 }
6268
6269 bool MaskRequired = Legal->isMaskRequired(CI);
6270 // Compute corresponding vector type for return value and arguments.
6271 Type *RetTy = toVectorTy(ScalarRetTy, VF);
6272 for (Type *ScalarTy : ScalarTys)
6273 Tys.push_back(toVectorTy(ScalarTy, VF));
6274
6275 // An in-loop reduction using an fmuladd intrinsic is a special case;
6276 // we don't want the normal cost for that intrinsic.
6278 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6281 std::nullopt, *RedCost);
6282 continue;
6283 }
6284
6285 // Find the cost of vectorizing the call, if we can find a suitable
6286 // vector variant of the function.
6287 bool UsesMask = false;
6288 VFInfo FuncInfo;
6289 Function *VecFunc = nullptr;
6290 // Search through any available variants for one we can use at this VF.
6291 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6292 // Must match requested VF.
6293 if (Info.Shape.VF != VF)
6294 continue;
6295
6296 // Must take a mask argument if one is required
6297 if (MaskRequired && !Info.isMasked())
6298 continue;
6299
6300 // Check that all parameter kinds are supported
// NOTE(review): the case labels of this switch (vector, uniform,
// linear, mask parameter kinds) were elided by the extraction.
6301 bool ParamsOk = true;
6302 for (VFParameter Param : Info.Shape.Parameters) {
6303 switch (Param.ParamKind) {
6305 break;
6307 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6308 // Make sure the scalar parameter in the loop is invariant.
6309 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6310 TheLoop))
6311 ParamsOk = false;
6312 break;
6313 }
6315 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6316 // Find the stride for the scalar parameter in this loop and see if
6317 // it matches the stride for the variant.
6318 // TODO: do we need to figure out the cost of an extract to get the
6319 // first lane? Or do we hope that it will be folded away?
6320 ScalarEvolution *SE = PSE.getSE();
6321 const auto *SAR =
6322 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6323
6324 if (!SAR || SAR->getLoop() != TheLoop) {
6325 ParamsOk = false;
6326 break;
6327 }
6328
// The variant's declared linear step must match the actual
// constant stride of the argument's add-recurrence.
6329 const SCEVConstant *Step =
6330 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6331
6332 if (!Step ||
6333 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6334 ParamsOk = false;
6335
6336 break;
6337 }
6339 UsesMask = true;
6340 break;
6341 default:
6342 ParamsOk = false;
6343 break;
6344 }
6345 }
6346
6347 if (!ParamsOk)
6348 continue;
6349
6350 // Found a suitable candidate, stop here.
6351 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6352 FuncInfo = Info;
6353 break;
6354 }
6355
6356 // Add in the cost of synthesizing a mask if one wasn't required.
// Modeled as a broadcast of an all-true mask to feed the masked variant.
6357 InstructionCost MaskCost = 0;
6358 if (VecFunc && UsesMask && !MaskRequired)
6359 MaskCost = TTI.getShuffleCost(
6362 VecFunc->getFunctionType()->getContext()),
6363 VF),
6364 {}, CostKind);
6365
6366 if (TLI && VecFunc && !CI->isNoBuiltin())
6367 VectorCost =
6368 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6369
6370 // Find the cost of an intrinsic; some targets may have instructions that
6371 // perform the operation without needing an actual call.
6373 if (IID != Intrinsic::not_intrinsic)
6374 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6375
// Default to scalarization, then let vector-call and intrinsic costs
// win ties (<=) in that order.
6376 InstructionCost Cost = ScalarCost;
6377 InstWidening Decision = CM_Scalarize;
6378
6379 if (VectorCost <= Cost) {
6380 Cost = VectorCost;
6381 Decision = CM_VectorCall;
6382 }
6383
6384 if (IntrinsicCost <= Cost) {
6385 Cost = IntrinsicCost;
6386 Decision = CM_IntrinsicCall;
6387 }
6388
6389 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6391 }
6392 }
6393}
6394
// Returns true if `Op` should be treated as invariant for costing: it must
// be invariant per Legality, and additionally either be a non-instruction,
// live outside the loop, or be an unpredicated, non-header-PHI instruction
// whose operands are all themselves (recursively) considered invariant.
// NOTE(review): the function signature line (doc line 6395) was dropped by
// the doc extraction and is not part of this span.
6396 if (!Legal->isInvariant(Op))
6397 return false;
6398 // Consider Op invariant, if it or its operands aren't predicated
6399 // instruction in the loop. In that case, it is not trivially hoistable.
6400 auto *OpI = dyn_cast<Instruction>(Op);
6401 return !OpI || !TheLoop->contains(OpI) ||
6402 (!isPredicatedInst(OpI) &&
6403 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6404 all_of(OpI->operands(),
6405 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6406}
6407
// Body of the per-instruction cost query: returns the estimated cost of
// vectorizing instruction `I` at factor `VF`, dispatching on opcode.
// Instructions that stay uniform/scalar are costed at VF=1; memory ops
// defer to the recorded widening decision; reductions, selects, compares,
// casts, phis, branches and switches each have dedicated modeling below.
// NOTE(review): the function signature lines (doc lines 6408-6409) and a
// number of hyperlinked lines (declarations, case labels, some return
// statements) were dropped by the doc extraction — confirm against the
// upstream source.
6410 ElementCount VF) {
6411 // If we know that this instruction will remain uniform, check the cost of
6412 // the scalar version.
6414 VF = ElementCount::getFixed(1);
6415
// Previously-computed scalarization decision wins outright.
6416 if (VF.isVector() && isProfitableToScalarize(I, VF))
6417 return InstsToScalarize[VF][I];
6418
6419 // Forced scalars do not have any scalarization overhead.
6420 auto ForcedScalar = ForcedScalars.find(VF);
6421 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6422 auto InstSet = ForcedScalar->second;
6423 if (InstSet.count(I))
6425 VF.getKnownMinValue();
6426 }
6427
// If the instruction was proven to fit in fewer bits, cost it at the
// minimal bitwidth instead of its declared type.
6428 Type *RetTy = I->getType();
6430 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6431 auto *SE = PSE.getSE();
6432
// Assert-only helper: true when after vectorization only one copy of I
// is generated (neither I nor any user is scalarized at this VF).
6433 auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6434 ElementCount VF) -> bool {
6435 if (VF.isScalar())
6436 return true;
6437
6438 auto Scalarized = InstsToScalarize.find(VF);
6439 assert(Scalarized != InstsToScalarize.end() &&
6440 "VF not yet analyzed for scalarization profitability");
6441 return !Scalarized->second.count(I) &&
6442 llvm::all_of(I->users(), [&](User *U) {
6443 auto *UI = cast<Instruction>(U);
6444 return !Scalarized->second.count(UI);
6445 });
6446 };
6447 (void)HasSingleCopyAfterVectorization;
6448
6449 Type *VectorTy;
6450 if (isScalarAfterVectorization(I, VF)) {
6451 // With the exception of GEPs and PHIs, after scalarization there should
6452 // only be one copy of the instruction generated in the loop. This is
6453 // because the VF is either 1, or any instructions that need scalarizing
6454 // have already been dealt with by the time we get here. As a result,
6455 // it means we don't have to multiply the instruction cost by VF.
6456 assert(I->getOpcode() == Instruction::GetElementPtr ||
6457 I->getOpcode() == Instruction::PHI ||
6458 (I->getOpcode() == Instruction::BitCast &&
6459 I->getType()->isPointerTy()) ||
6460 HasSingleCopyAfterVectorization(I, VF));
6461 VectorTy = RetTy;
6462 } else
6463 VectorTy = toVectorTy(RetTy, VF);
6464
// A vector type the target cannot split into legal parts is unusable.
6465 if (VF.isVector() && VectorTy->isVectorTy() &&
6466 !TTI.getNumberOfParts(VectorTy))
6468
6469 // TODO: We need to estimate the cost of intrinsic calls.
6470 switch (I->getOpcode()) {
6471 case Instruction::GetElementPtr:
6472 // We mark this instruction as zero-cost because the cost of GEPs in
6473 // vectorized code depends on whether the corresponding memory instruction
6474 // is scalarized or not. Therefore, we handle GEPs with the memory
6475 // instruction cost.
6476 return 0;
6477 case Instruction::Br: {
6478 // In cases of scalarized and predicated instructions, there will be VF
6479 // predicated blocks in the vectorized loop. Each branch around these
6480 // blocks requires also an extract of its vector compare i1 element.
6481 // Note that the conditional branch from the loop latch will be replaced by
6482 // a single branch controlling the loop, so there is no extra overhead from
6483 // scalarization.
6484 bool ScalarPredicatedBB = false;
6485 BranchInst *BI = cast<BranchInst>(I);
6486 if (VF.isVector() && BI->isConditional() &&
6487 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6488 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6489 BI->getParent() != TheLoop->getLoopLatch())
6490 ScalarPredicatedBB = true;
6491
6492 if (ScalarPredicatedBB) {
6493 // Not possible to scalarize scalable vector with predicated instructions.
6494 if (VF.isScalable())
6496 // Return cost for branches around scalarized and predicated blocks.
// Cost = extracting each i1 lane + one scalar branch per lane.
6497 auto *VecI1Ty =
6498 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6499 return (
6501 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6502 /*Insert*/ false, /*Extract*/ true, CostKind) +
6503 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6504 }
6505
6506 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6507 // The back-edge branch will remain, as will all scalar branches.
6508 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6509
6510 // This branch will be eliminated by if-conversion.
6511 return 0;
6512 // Note: We currently assume zero cost for an unconditional branch inside
6513 // a predicated block since it will become a fall-through, although we
6514 // may decide in the future to call TTI for all branches.
6515 }
6516 case Instruction::Switch: {
6517 if (VF.isScalar())
6518 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
// A vectorized switch is modeled as one vector compare per case.
6519 auto *Switch = cast<SwitchInst>(I);
6520 return Switch->getNumCases() *
6522 Instruction::ICmp,
6523 toVectorTy(Switch->getCondition()->getType(), VF),
6524 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6526 }
6527 case Instruction::PHI: {
6528 auto *Phi = cast<PHINode>(I);
6529
6530 // First-order recurrences are replaced by vector shuffles inside the loop.
6531 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6532 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6533 // penultimate value of the recurrence.
6534 // TODO: Consider vscale_range info.
6535 if (VF.isScalable() && VF.getKnownMinValue() == 1)
// Splice-style mask: lanes [VF-1, VF, ..., 2*VF-2].
6538 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6540 cast<VectorType>(VectorTy), Mask, CostKind,
6541 VF.getKnownMinValue() - 1);
6542 }
6543
6544 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6545 // converted into select instructions. We require N - 1 selects per phi
6546 // node, where N is the number of incoming values.
6547 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6548 Type *ResultTy = Phi->getType();
6549
6550 // All instructions in an Any-of reduction chain are narrowed to bool.
6551 // Check if that is the case for this phi node.
6552 auto *HeaderUser = cast_if_present<PHINode>(
6553 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6554 auto *Phi = dyn_cast<PHINode>(U);
6555 if (Phi && Phi->getParent() == TheLoop->getHeader())
6556 return Phi;
6557 return nullptr;
6558 }));
6559 if (HeaderUser) {
6560 auto &ReductionVars = Legal->getReductionVars();
6561 auto Iter = ReductionVars.find(HeaderUser);
6562 if (Iter != ReductionVars.end() &&
6564 Iter->second.getRecurrenceKind()))
6565 ResultTy = Type::getInt1Ty(Phi->getContext());
6566 }
6567 return (Phi->getNumIncomingValues() - 1) *
6569 Instruction::Select, toVectorTy(ResultTy, VF),
6570 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6572 }
6573
6574 // When tail folding with EVL, if the phi is part of an out of loop
6575 // reduction then it will be transformed into a wide vp_merge.
6576 if (VF.isVector() && foldTailWithEVL() &&
6579 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6580 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6581 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6582 }
6583
6584 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6585 }
6586 case Instruction::UDiv:
6587 case Instruction::SDiv:
6588 case Instruction::URem:
6589 case Instruction::SRem:
// Predicated div/rem: pick the cheaper of scalarizing with
// predication vs. masking the divisor to a safe value.
6590 if (VF.isVector() && isPredicatedInst(I)) {
6591 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6592 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6593 ScalarCost : SafeDivisorCost;
6594 }
6595 // We've proven all lanes safe to speculate, fall through.
6596 [[fallthrough]];
6597 case Instruction::Add:
6598 case Instruction::Sub: {
// Histogram updates get special intrinsic-based costing.
6599 auto Info = Legal->getHistogramInfo(I);
6600 if (Info && VF.isVector()) {
6601 const HistogramInfo *HGram = Info.value();
6602 // Assume that a non-constant update value (or a constant != 1) requires
6603 // a multiply, and add that into the cost.
6605 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6606 if (!RHS || RHS->getZExtValue() != 1)
6607 MulCost =
6608 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6609
6610 // Find the cost of the histogram operation itself.
6611 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6612 Type *ScalarTy = I->getType();
6613 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6614 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6615 Type::getVoidTy(I->getContext()),
6616 {PtrTy, ScalarTy, MaskTy});
6617
6618 // Add the costs together with the add/sub operation.
6619 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6620 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6621 }
6622 [[fallthrough]];
6623 }
6624 case Instruction::FAdd:
6625 case Instruction::FSub:
6626 case Instruction::Mul:
6627 case Instruction::FMul:
6628 case Instruction::FDiv:
6629 case Instruction::FRem:
6630 case Instruction::Shl:
6631 case Instruction::LShr:
6632 case Instruction::AShr:
6633 case Instruction::And:
6634 case Instruction::Or:
6635 case Instruction::Xor: {
6636 // If we're speculating on the stride being 1, the multiplication may
6637 // fold away. We can generalize this for all operations using the notion
6638 // of neutral elements. (TODO)
6639 if (I->getOpcode() == Instruction::Mul &&
6640 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6641 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6642 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6643 PSE.getSCEV(I->getOperand(1))->isOne())))
6644 return 0;
6645
6646 // Detect reduction patterns
6647 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6648 return *RedCost;
6649
6650 // Certain instructions can be cheaper to vectorize if they have a constant
6651 // second vector operand. One example of this are shifts on x86.
6652 Value *Op2 = I->getOperand(1);
6653 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6654 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6655 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6656 }
6657 auto Op2Info = TTI.getOperandInfo(Op2);
6658 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6661
6662 SmallVector<const Value *, 4> Operands(I->operand_values());
6664 I->getOpcode(), VectorTy, CostKind,
6665 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6666 Op2Info, Operands, I, TLI);
6667 }
6668 case Instruction::FNeg: {
6670 I->getOpcode(), VectorTy, CostKind,
6671 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6672 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6673 I->getOperand(0), I);
6674 }
6675 case Instruction::Select: {
6676 SelectInst *SI = cast<SelectInst>(I);
6677 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6678 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6679
6680 const Value *Op0, *Op1;
6681 using namespace llvm::PatternMatch;
6682 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6683 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6684 // select x, y, false --> x & y
6685 // select x, true, y --> x | y
6686 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6687 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6688 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6689 Op1->getType()->getScalarSizeInBits() == 1);
6690
6693 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6694 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6695 }
6696
6697 Type *CondTy = SI->getCondition()->getType();
6698 if (!ScalarCond)
6699 CondTy = VectorType::get(CondTy, VF);
6700
// If the condition comes from a compare, pass the predicate for a
// more precise select cost; otherwise an unknown predicate is used.
6702 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6703 Pred = Cmp->getPredicate();
6704 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6705 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6706 {TTI::OK_AnyValue, TTI::OP_None}, I);
6707 }
6708 case Instruction::ICmp:
6709 case Instruction::FCmp: {
6710 Type *ValTy = I->getOperand(0)->getType();
6711
6713 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6714 (void)Op0AsInstruction;
6715 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6716 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6717 "if both the operand and the compare are marked for "
6718 "truncation, they must have the same bitwidth");
6719 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6720 }
6721
6722 VectorTy = toVectorTy(ValTy, VF);
6723 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6724 cast<CmpInst>(I)->getPredicate(), CostKind,
6725 {TTI::OK_AnyValue, TTI::OP_None},
6726 {TTI::OK_AnyValue, TTI::OP_None}, I);
6727 }
6728 case Instruction::Store:
6729 case Instruction::Load: {
6730 ElementCount Width = VF;
6731 if (Width.isVector()) {
6732 InstWidening Decision = getWideningDecision(I, Width);
6733 assert(Decision != CM_Unknown &&
6734 "CM decision should be taken at this point");
6737 if (Decision == CM_Scalarize)
6738 Width = ElementCount::getFixed(1);
6739 }
6740 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6741 return getMemoryInstructionCost(I, VF);
6742 }
6743 case Instruction::BitCast:
// Pointer bitcasts are free; they disappear in codegen.
6744 if (I->getType()->isPointerTy())
6745 return 0;
6746 [[fallthrough]];
6747 case Instruction::ZExt:
6748 case Instruction::SExt:
6749 case Instruction::FPToUI:
6750 case Instruction::FPToSI:
6751 case Instruction::FPExt:
6752 case Instruction::PtrToInt:
6753 case Instruction::IntToPtr:
6754 case Instruction::SIToFP:
6755 case Instruction::UIToFP:
6756 case Instruction::Trunc:
6757 case Instruction::FPTrunc: {
6758 // Computes the CastContextHint from a Load/Store instruction.
6759 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6760 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6761 "Expected a load or a store!");
6762
6763 if (VF.isScalar() || !TheLoop->contains(I))
6765
// NOTE(review): the case labels of this switch over widening
// decisions were elided by the extraction.
6766 switch (getWideningDecision(I, VF)) {
6778 llvm_unreachable("Instr did not go through cost modelling?");
6781 llvm_unreachable_internal("Instr has invalid widening decision");
6782 }
6783
6784 llvm_unreachable("Unhandled case!");
6785 };
6786
6787 unsigned Opcode = I->getOpcode();
6789 // For Trunc, the context is the only user, which must be a StoreInst.
6790 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6791 if (I->hasOneUse())
6792 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6793 CCH = ComputeCCH(Store);
6794 }
6795 // For Z/Sext, the context is the operand, which must be a LoadInst.
6796 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6797 Opcode == Instruction::FPExt) {
6798 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6799 CCH = ComputeCCH(Load);
6800 }
6801
6802 // We optimize the truncation of induction variables having constant
6803 // integer steps. The cost of these truncations is the same as the scalar
6804 // operation.
6805 if (isOptimizableIVTruncate(I, VF)) {
6806 auto *Trunc = cast<TruncInst>(I);
6807 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6808 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6809 }
6810
6811 // Detect reduction patterns
6812 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6813 return *RedCost;
6814
6815 Type *SrcScalarTy = I->getOperand(0)->getType();
6816 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6817 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6818 SrcScalarTy =
6819 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6820 Type *SrcVecTy =
6821 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6822
6824 // If the result type is <= the source type, there will be no extend
6825 // after truncating the users to the minimal required bitwidth.
6826 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6827 (I->getOpcode() == Instruction::ZExt ||
6828 I->getOpcode() == Instruction::SExt))
6829 return 0;
6830 }
6831
6832 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6833 }
6834 case Instruction::Call:
6835 return getVectorCallCost(cast<CallInst>(I), VF);
6836 case Instruction::ExtractValue:
6838 case Instruction::Alloca:
6839 // We cannot easily widen alloca to a scalable alloca, as
6840 // the result would need to be a vector of pointers.
6841 if (VF.isScalable())
6843 [[fallthrough]];
6844 default:
6845 // This opcode is unknown. Assume that it is the same as 'mul'.
6846 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6847 } // end of switch.
6848}
6849
6851 // Ignore ephemeral values.
6853
6854 SmallVector<Value *, 4> DeadInterleavePointerOps;
6856
6857 // If a scalar epilogue is required, users outside the loop won't use
6858 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6859 // that is the case.
6860 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6861 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6862 return RequiresScalarEpilogue &&
6863 !TheLoop->contains(cast<Instruction>(U)->getParent());
6864 };
6865
6867 DFS.perform(LI);
6868 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6869 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6870 for (Instruction &I : reverse(*BB)) {
6871 // Find all stores to invariant variables. Since they are going to sink
6872 // outside the loop we do not need calculate cost for them.
6873 StoreInst *SI;
6874 if ((SI = dyn_cast<StoreInst>(&I)) &&
6875 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6876 ValuesToIgnore.insert(&I);
6877 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6878 SI->getValueOperand());
6879 }
6880
6881 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6882 continue;
6883
6884 // Add instructions that would be trivially dead and are only used by
6885 // values already ignored to DeadOps to seed worklist.
6887 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6888 return VecValuesToIgnore.contains(U) ||
6889 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6890 }))
6891 DeadOps.push_back(&I);
6892
6893 // For interleave groups, we only create a pointer for the start of the
6894 // interleave group. Queue up addresses of group members except the insert
6895 // position for further processing.
6896 if (isAccessInterleaved(&I)) {
6897 auto *Group = getInterleavedAccessGroup(&I);
6898 if (Group->getInsertPos() == &I)
6899 continue;
6900 Value *PointerOp = getLoadStorePointerOperand(&I);
6901 DeadInterleavePointerOps.push_back(PointerOp);
6902 }
6903
6904 // Queue branches for analysis. They are dead, if their successors only
6905 // contain dead instructions.
6906 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6907 if (Br->isConditional())
6908 DeadOps.push_back(&I);
6909 }
6910 }
6911
6912 // Mark ops feeding interleave group members as free, if they are only used
6913 // by other dead computations.
6914 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6915 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6916 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6917 Instruction *UI = cast<Instruction>(U);
6918 return !VecValuesToIgnore.contains(U) &&
6919 (!isAccessInterleaved(UI) ||
6920 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6921 }))
6922 continue;
6923 VecValuesToIgnore.insert(Op);
6924 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6925 }
6926
6927 for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6928 for (Value *Op : ArrayRef(Ops).drop_back())
6929 DeadOps.push_back(Op);
6930 }
6931 // Mark ops that would be trivially dead and are only used by ignored
6932 // instructions as free.
6933 BasicBlock *Header = TheLoop->getHeader();
6934
6935 // Returns true if the block contains only dead instructions. Such blocks will
6936 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6937 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6938 auto IsEmptyBlock = [this](BasicBlock *BB) {
6939 return all_of(*BB, [this](Instruction &I) {
6940 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6941 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6942 });
6943 };
6944 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6945 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6946
6947 // Check if the branch should be considered dead.
6948 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6949 BasicBlock *ThenBB = Br->getSuccessor(0);
6950 BasicBlock *ElseBB = Br->getSuccessor(1);
6951 // Don't considers branches leaving the loop for simplification.
6952 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6953 continue;
6954 bool ThenEmpty = IsEmptyBlock(ThenBB);
6955 bool ElseEmpty = IsEmptyBlock(ElseBB);
6956 if ((ThenEmpty && ElseEmpty) ||
6957 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6958 ElseBB->phis().empty()) ||
6959 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6960 ThenBB->phis().empty())) {
6961 VecValuesToIgnore.insert(Br);
6962 DeadOps.push_back(Br->getCondition());
6963 }
6964 continue;
6965 }
6966
6967 // Skip any op that shouldn't be considered dead.
6968 if (!Op || !TheLoop->contains(Op) ||
6969 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6971 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6972 return !VecValuesToIgnore.contains(U) &&
6973 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6974 }))
6975 continue;
6976
6977 if (!TheLoop->contains(Op->getParent()))
6978 continue;
6979
6980 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6981 // which applies for both scalar and vector versions. Otherwise it is only
6982 // dead in vector versions, so only add it to VecValuesToIgnore.
6983 if (all_of(Op->users(),
6984 [this](User *U) { return ValuesToIgnore.contains(U); }))
6985 ValuesToIgnore.insert(Op);
6986
6987 VecValuesToIgnore.insert(Op);
6988 DeadOps.append(Op->op_begin(), Op->op_end());
6989 }
6990
6991 // Ignore type-promoting instructions we identified during reduction
6992 // detection.
6993 for (const auto &Reduction : Legal->getReductionVars()) {
6994 const RecurrenceDescriptor &RedDes = Reduction.second;
6995 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6996 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6997 }
6998 // Ignore type-casting instructions we identified during induction
6999 // detection.
7000 for (const auto &Induction : Legal->getInductionVars()) {
7001 const InductionDescriptor &IndDes = Induction.second;
7002 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7003 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7004 }
7005}
7006
  // Decide, for each reduction phi found by legality analysis, whether the
  // reduction should be performed "in-loop" (accumulated serially inside the
  // vector loop rather than via a horizontal reduction after it), and record
  // the chain of reduction ops for cost modelling.
  // NOTE(review): the enclosing function's signature line was elided by the
  // extraction; presumably LoopVectorizationCostModel::collectInLoopReductions().
  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
    unsigned Opcode = RdxDesc.getOpcode();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Opcode, Phi->getType(),
        // NOTE(review): the continuation line of this call was elided by the
        // extraction — restore from upstream before compiling.
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();

    if (InLoop) {
      InLoopReductions.insert(Phi);
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      // Each chain op maps to its predecessor in the chain; the first op maps
      // to the phi itself.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}
7044
// This function will select a scalable VF if the target supports scalable
// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
// NOTE(review): the function signature and the definition of `RegSize`
// (presumably a TTI register-bit-width query) were elided by the extraction —
// restore from upstream before compiling.
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();

  // VF = (known minimum register size in bits) / (widest element type in
  // bits); the result is scalable iff the register size is scalable.
  unsigned N = RegSize.getKnownMinValue() / WidestType;
  return ElementCount::get(N, RegSize.isScalable());
}
7066
  // VPlan-native path (outer-loop vectorization): pick a VF and build VPlans.
  // NOTE(review): the function signature was elided by the extraction;
  // presumably LoopVectorizationPlanner::planInVPlanNativePath(ElementCount
  // UserVF) returning a VectorizationFactor.
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               // NOTE(review): the condition continuation and the start of the
               // failure-reporting call were elided by the extraction.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
          "Scalable vectorization requested but not supported by the target",
          "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          "ScalableVFUnfeasible", ORE, OrigLoop);
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    // NOTE(review): the assert head for the power-of-two check was elided by
    // the extraction; only its message string survives below.
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
}
7118
/// Build all candidate VPlans for the innermost loop \p OrigLoop and collect
/// the per-VF cost-model state needed to later pick the best VF.
/// \p UserVF / \p UserIC are explicit user requests (zero when unset).
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  // NOTE(review): one or more cost-model pre-computation calls were elided
  // here by the extraction — restore from upstream.

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      // NOTE(review): the condition continuation (presumably a masked-
      // interleave capability check) was elided by the extraction.
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    // Invalidating interleave groups also requires invalidating all decisions
    // based on them, which includes widening decisions and uniform and scalar
    // values.
  }

  // NOTE(review): the statement guarded by this condition was elided by the
  // extraction.
  if (CM.foldTailByMasking())

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
      // User-requested VF exceeds the maximal safe VF: report it and fall
      // through to automatic VF selection below. (The reporting call's first
      // line was elided by the extraction.)
          "UserVF ignored because it may be larger than the maximal safe VF",
          "InvalidUserVF", ORE, OrigLoop);
    } else {
      // NOTE(review): the assert head for the power-of-two check was elided.
          "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(UserVF, UserVF);
        return;
      }
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates: all powers of two up to the
  // maximal safe fixed VF, then all powers of two up to the maximal safe
  // scalable VF.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    // NOTE(review): the per-VF collection calls inside this loop were elided
    // by the extraction.

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
  }

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

}
7194
7196 ElementCount VF) const {
7197 if (ForceTargetInstructionCost.getNumOccurrences())
7198 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7199 return CM.getInstructionCost(UI, VF);
7200}
7201
7202bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7203 return CM.ValuesToIgnore.contains(UI) ||
7204 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7205 SkipCostComputation.contains(UI);
7206}
7207
/// Pre-compute, via the legacy cost model, the costs of instructions whose
/// VPlan-based costing is not yet accurate (induction phis/increments and
/// optimizable IV truncates, exit conditions, in-loop reduction patterns,
/// non-latch branches, and forced/profitable-to-scalarize instructions), and
/// record them in CostCtx.SkipCostComputation so the VPlan cost walk does not
/// count them again.
/// NOTE(review): the return-type line and the initialization of `Cost`
/// (presumably `InstructionCost Cost = 0;`) were elided by the extraction.
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
    SmallVector<Instruction *> IVInsts = {IVInc};
    // Transitively collect single-use, in-loop operands feeding the increment;
    // they are costed together with the induction.
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Op);
        if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(OpI);
      }
    }
    IVInsts.push_back(IV);
    // Truncates of the IV that can be folded into the induction recipe are
    // pre-computed as well.
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(U);
      if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
        continue;
      IVInsts.push_back(CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
    if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
      // NOTE(review): the first line of this guarded call was elided by the
      // extraction; only its trailing argument survives below.
          CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(IVInst);
    }
  }

  /// Compute the cost of all exiting conditions of the loop using the legacy
  /// cost model. This is to match the legacy behavior, which adds the cost of
  /// all exit conditions. Note that this over-estimates the cost, as there will
  /// be a single condition to control the vector loop.
  // NOTE(review): the declaration of `Exiting` (presumably a
  // SmallVector<BasicBlock *>) was elided by the extraction.
  CM.TheLoop->getExitingBlocks(Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
    if (!Term)
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
      ExitInstrs.insert(CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(CondI) ||
        !CostCtx.SkipCostComputation.insert(CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    // Queue operands whose in-loop users are all exit-condition instructions;
    // they feed only the exit test and are costed here too.
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Op);
      if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
                   !ExitInstrs.contains(cast<Instruction>(U));
          }))
        continue;
      ExitInstrs.insert(OpI);
    }
  }

  // The legacy cost model has special logic to compute the cost of in-loop
  // reductions, which may be smaller than the sum of all instructions involved
  // in the reduction.
  // TODO: Switch to costing based on VPlan once the logic has been ported.
  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
    if (ForceTargetInstructionCost.getNumOccurrences())
      continue;

    if (!CM.isInLoopReduction(RedPhi))
      continue;

    const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
    SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
                                                 ChainOps.end());
    auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
      return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
    };
    // Also include the operands of instructions in the chain, as the cost-model
    // may mark extends as free.
    //
    // For ARM, some of the instruction can folded into the reducion
    // instruction. So we need to mark all folded instructions free.
    // For example: We can fold reduce(mul(ext(A), ext(B))) into one
    // instruction.
    for (auto *ChainOp : ChainOps) {
      for (Value *Op : ChainOp->operands()) {
        if (auto *I = dyn_cast<Instruction>(Op)) {
          ChainOpsAndOperands.insert(I);
          if (I->getOpcode() == Instruction::Mul) {
            auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
            auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
            // Only matching same-kind extend pairs (zext/zext or sext/sext)
            // are foldable into a mul-reduce.
            if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
                Ext0->getOpcode() == Ext1->getOpcode()) {
              ChainOpsAndOperands.insert(Ext0);
              ChainOpsAndOperands.insert(Ext1);
            }
          }
        }
      }
    }

    // Pre-compute the cost for I, if it has a reduction pattern cost.
    for (Instruction *I : ChainOpsAndOperands) {
      auto ReductionCost =
          CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
      if (!ReductionCost)
        continue;

      assert(!CostCtx.SkipCostComputation.contains(I) &&
             "reduction op visited multiple times");
      CostCtx.SkipCostComputation.insert(I);
      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
                        << ":\n in-loop reduction " << *I << "\n");
      Cost += *ReductionCost;
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(BB->getTerminator());
    // The latch terminator (backedge) is marked as skipped but its cost is
    // intentionally not added.
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}
7405
/// Compute the total cost of \p Plan at \p VF: the legacy pre-computed costs
/// for instructions not yet accurately modeled by VPlan, plus the VPlan-based
/// cost walk over the plan's recipes.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
  // Debug-only reporting of the estimated per-lane cost, using the tuning
  // vscale to estimate the runtime width of scalable VFs.
  unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}
7427
#ifndef NDEBUG
/// Return true if the original loop \ TheLoop contains any instructions that do
/// not have corresponding recipes in \p Plan and are not marked to be ignored
/// in \p CostCtx. This means the VPlan contains simplification that the legacy
/// cost-model did not account for.
// NOTE(review): the first line of this signature (presumably
// `static bool planContainsAdditionalSimplifications(VPlan &Plan,`) was
// elided by the extraction.
                                                VPCostContext &CostCtx,
                                                Loop *TheLoop) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      // Interleave recipes cover all members of their group, not just the
      // recipe's own underlying instruction.
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // The VPlan-based cost model is more accurate for partial reduction and
      // comparing against the legacy cost isn't desirable.
      if (isa<VPPartialReductionRecipe>(&R))
        return true;
      if (Instruction *UI = GetInstructionForCost(&R))
        SeenInstrs.insert(UI);
    }
  }

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Header phis are modeled separately; never treat them as missing.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
#endif
7480
  // Pick the most profitable (plan, VF) pair across all built VPlans, using
  // the VPlan-based cost model, and (in asserts builds) cross-check the
  // decision against the legacy cost model.
  // NOTE(review): the function signature (presumably
  // `VectorizationFactor LoopVectorizationPlanner::computeBestVF() {`) and the
  // early-return value guarded by the empty check were elided by the
  // extraction.
  if (VPlans.empty())
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  // Debug print of the cost kind in use.
  // NOTE(review): several condition lines of this nested ternary were elided
  // by the extraction; only some branches survive below.
  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                        ? "Reciprocal Throughput\n"
                        ? "Instruction Latency\n"
                    : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                        ? "Code Size and Latency\n"
                        : "Unknown\n"));

  // NOTE(review): the declaration of ScalarVF (presumably the fixed VF 1) was
  // elided by the extraction.
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    for (ElementCount VF : P->vectorFactors()) {
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }

      InstructionCost Cost = cost(*P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
      if (isMoreProfitable(CurrentFactor, BestFactor))
        BestFactor = CurrentFactor;

      // If profitable add it to ProfitableVF list.
      if (isMoreProfitable(CurrentFactor, ScalarFactor))
        ProfitableVFs.push_back(CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost-model.
  // This is now only used to verify the decisions by the new VPlan-based
  // cost-model and will be retired once the VPlan-based cost-model is
  // stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause a
  // different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // NOTE(review): the planContainsAdditionalSimplifications(...) call heads
  // inside this assertion were elided by the extraction; only their trailing
  // argument lines survive below.
  assert((BestFactor.Width == LegacyVF.Width ||
              CostCtx, OrigLoop) ||
              CostCtx, OrigLoop)) &&
         " VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}
7568
  // Append "llvm.loop.unroll.runtime.disable" to loop \p L's !llvm.loop
  // metadata, unless unroll-disable metadata was already found.
  // NOTE(review): the enclosing function's signature and the declaration of
  // `MDs` (presumably a SmallVector<Metadata *>) were elided by the
  // extraction.
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        // NOTE(review): IsUnrollMetadata is overwritten on every MDNode
        // operand, so only the last one decides; presumably intended to be
        // sticky (|=) — verify against upstream.
        IsUnrollMetadata =
            S && S->getString().starts_with("llvm.loop.unroll.disable");
      }
      // All existing operands are carried over into the new LoopID.
      MDs.push_back(LoopID->getOperand(I));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
7602
// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
// fix the reduction's scalar PHI node by adding the incoming value from the
// main vector loop.
// NOTE(review): the first line of this signature (presumably
// `static void fixReductionScalarResumeWhenVectorizingEpilog(`) was elided by
// the extraction.
    VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
    BasicBlock *BypassBlock) {
  // Only ComputeReductionResult VPInstructions need fixing; ignore the rest.
  auto *EpiRedResult = dyn_cast<VPInstruction>(R);
  if (!EpiRedResult ||
      EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
    return;

  auto *EpiRedHeaderPhi =
      cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
  const RecurrenceDescriptor &RdxDesc =
      EpiRedHeaderPhi->getRecurrenceDescriptor();
  Value *MainResumeValue =
      EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
  // NOTE(review): the recurrence-kind `if`/`else if` heads below were elided
  // by the extraction; only their trailing argument lines survive.
      RdxDesc.getRecurrenceKind())) {
    // AnyOf reductions wrap the start value in `icmp ne %resume, %start`;
    // unwrap to recover the main loop's resume phi.
    auto *Cmp = cast<ICmpInst>(MainResumeValue);
    assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
           "AnyOf expected to start with ICMP_NE");
    assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
           "AnyOf expected to start by comparing main resume value to original "
           "start value");
    MainResumeValue = Cmp->getOperand(0);
      RdxDesc.getRecurrenceKind())) {
    using namespace llvm::PatternMatch;
    // The resume value is expected to be a select between the sentinel value
    // and the original resume value; unwrap to the original resume value.
    Value *Cmp, *OrigResumeV;
    bool IsExpectedPattern =
        match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
                                        m_Specific(RdxDesc.getSentinelValue()),
                                        m_Value(OrigResumeV))) &&
        match(Cmp,
        // NOTE(review): the Cmp pattern continuation was elided by the
        // extraction.
    assert(IsExpectedPattern && "Unexpected reduction resume pattern");
    (void)IsExpectedPattern;
    MainResumeValue = OrigResumeV;
  }
  PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);

  // When fixing reductions in the epilogue loop we should already have
  // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
  // over the incoming values correctly.
  using namespace VPlanPatternMatch;
  auto IsResumePhi = [](VPUser *U) {
    return match(
        U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
  };
  assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
         "ResumePhi must have a single user");
  auto *EpiResumePhiVPI =
      cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
  auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
  // Copy the bypass-block incoming value from the main loop's resume phi into
  // the epilogue's resume phi.
  EpiResumePhi->setIncomingValueForBlock(
      BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}
7662
7664 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7665 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7666 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7667 assert(BestVPlan.hasVF(BestVF) &&
7668 "Trying to execute plan with unsupported VF");
7669 assert(BestVPlan.hasUF(BestUF) &&
7670 "Trying to execute plan with unsupported UF");
7671 assert(
7672 ((VectorizingEpilogue && ExpandedSCEVs) ||
7673 (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7674 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7675
7676 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7677 // cost model is complete for better cost estimates.
7679 OrigLoop->getHeader()->getContext());
7680 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7682
7683 // Perform the actual loop transformation.
7684 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7685 &BestVPlan, OrigLoop->getParentLoop(),
7686 Legal->getWidestInductionType());
7687
7688#ifdef EXPENSIVE_CHECKS
7689 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7690#endif
7691
7692 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7693 // making any changes to the CFG.
7694 if (!BestVPlan.getEntry()->empty())
7695 BestVPlan.getEntry()->execute(&State);
7696
7697 if (!ILV.getTripCount())
7698 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7699 else
7700 assert(VectorizingEpilogue && "should only re-use the existing trip "
7701 "count during epilogue vectorization");
7702
7703 // 1. Set up the skeleton for vectorization, including vector pre-header and
7704 // middle block. The vector loop is created during VPlan execution.
7705 VPBasicBlock *VectorPH =
7706 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7708 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7709 if (VectorizingEpilogue)
7711
7712 // Only use noalias metadata when using memory checks guaranteeing no overlap
7713 // across all iterations.
7714 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7715 std::unique_ptr<LoopVersioning> LVer = nullptr;
7716 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7718
7719 // We currently don't use LoopVersioning for the actual loop cloning but we
7720 // still use it to add the noalias metadata.
7721 // TODO: Find a better way to re-use LoopVersioning functionality to add
7722 // metadata.
7723 LVer = std::make_unique<LoopVersioning>(
7724 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7725 PSE.getSE());
7726 State.LVer = &*LVer;
7728 }
7729
7731
7732 //===------------------------------------------------===//
7733 //
7734 // Notice: any optimization or new instruction that go
7735 // into the code below should also be implemented in
7736 // the cost-model.
7737 //
7738 //===------------------------------------------------===//
7739
7740 // 2. Copy and widen instructions from the old loop into the new loop.
7741 BestVPlan.prepareToExecute(
7742 ILV.getTripCount(),
7744 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7745
7746 BestVPlan.execute(&State);
7747
7748 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7749 // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7750 // values from the additional bypass block.
7751 if (VectorizingEpilogue) {
7753 "Epilogue vectorisation not yet supported with early exits");
7754 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7755 for (VPRecipeBase &R : *MiddleVPBB) {
7757 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7758 }
7759 BasicBlock *PH = OrigLoop->getLoopPreheader();
7760 for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7761 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7763 Inc->setIncomingValueForBlock(BypassBlock, V);
7764 }
7765 }
7766
7767 // 2.6. Maintain Loop Hints
7768 // Keep all loop hints from the original loop on the vector loop (we'll
7769 // replace the vectorizer-specific hints below).
7770 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7771 MDNode *OrigLoopID = OrigLoop->getLoopID();
7772
7773 std::optional<MDNode *> VectorizedLoopID =
7776
7777 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7778 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7779 if (VectorizedLoopID) {
7780 L->setLoopID(*VectorizedLoopID);
7781 } else {
7782 // Keep all loop hints from the original loop on the vector loop (we'll
7783 // replace the vectorizer-specific hints below).
7784 if (MDNode *LID = OrigLoop->getLoopID())
7785 L->setLoopID(LID);
7786
7787 LoopVectorizeHints Hints(L, true, *ORE);
7788 Hints.setAlreadyVectorized();
7789 }
7791 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7792 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7794 }
7795
7796 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7797 // predication, updating analyses.
7798 ILV.fixVectorizedLoop(State);
7799
7801
7802 // 4. Adjust branch weight of the branch in the middle block.
7803 if (BestVPlan.getVectorLoopRegion()) {
7804 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7805 auto *MiddleTerm =
7806 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7807 if (MiddleTerm->isConditional() &&
7808 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7809 // Assume that `Count % VectorTripCount` is equally distributed.
7810 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7811 assert(TripCount > 0 && "trip count should not be zero");
7812 const uint32_t Weights[] = {1, TripCount - 1};
7813 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7814 }
7815 }
7816
7817 return State.ExpandedSCEVs;
7818}
7819
7820//===--------------------------------------------------------------------===//
7821// EpilogueVectorizerMainLoop
7822//===--------------------------------------------------------------------===//
7823
7824/// This function is partially responsible for generating the control flow
7825/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
// First (main-loop) pass of epilogue vectorization: the comments below
// describe, in order, the epilogue min-iteration check, SCEV assumption
// checks, runtime memory-overlap checks, the main-loop iteration count check
// and induction creation; the function returns the new vector preheader.
// NOTE(review): the statement lines implementing each commented step
// (original lines 7828, 7832-7834, 7838, 7843, 7851-7852, 7855) are missing
// from this listing -- verify against upstream LoopVectorize.cpp.
7827                                     const SCEV2ValueTy &ExpandedSCEVs) {
7829
7830  // Generate the code to check the minimum iteration count of the vector
7831  // epilogue (see below).
7835
7836  // Generate the code to check any assumptions that we've made for SCEV
7837  // expressions.
7839
7840  // Generate the code that checks at runtime if arrays overlap. We put the
7841  // checks into a separate block to make the more common case of few elements
7842  // faster.
7844
7845  // Generate the iteration count check for the main loop, *after* the check
7846  // for the epilogue loop, so that the path-length is shorter for the case
7847  // that goes directly through the vector epilogue. The longer-path length for
7848  // the main loop is compensated for, by the gain from vectorizing the larger
7849  // trip count. Note: the branch will get updated later on when we vectorize
7850  // the epilogue.
7853
7854  // Generate the induction variable.
7856
7857  return LoopVectorPreHeader;
7858}
7859
// Debug-only trace emitted when the first (main-loop) pass of epilogue
// vectorization starts; prints the chosen main and epilogue VF/UF from EPI.
// NOTE(review): the enclosing function signature (original line 7860,
// presumably EpilogueVectorizerMainLoop::printDebugTracesAtStart) is missing
// from this listing.
7861  LLVM_DEBUG({
7862    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7863           << "Main Loop VF:" << EPI.MainLoopVF
7864           << ", Main Loop UF:" << EPI.MainLoopUF
7865           << ", Epilogue Loop VF:" << EPI.EpilogueVF
7866           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7867  });
7868}
7869
// Debug-only trace emitted when the main-loop pass finishes: dumps the whole
// intermediate function (after the main vector loop, before the epilogue).
// NOTE(review): the signature and LLVM_DEBUG opener (original lines
// 7870-7871) are missing from this listing.
7872    dbgs() << "intermediate fn:\n"
7873           << *OrigLoop->getHeader()->getParent() << "\n";
7874  });
7875}
7876
// Emits a "min.iters.check": branch to Bypass when the trip count is too
// small for VF * UF of either the main loop (ForEpilogue == false) or the
// epilogue loop. Returns the (renamed) trip-count check block.
// NOTE(review): the function-name line (original 7878), the predicate
// selection after requiresScalarEpilogue (7893-7894, presumably
// ICMP_ULE/ICMP_ULT) and the hasBranchWeightMD guard (7922) are missing from
// this listing -- verify against upstream LoopVectorize.cpp.
7877BasicBlock *
7879                                                    bool ForEpilogue) {
7880  assert(Bypass && "Expected valid bypass basic block.");
7881  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7882  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7883  Value *Count = getTripCount();
7884  // Reuse existing vector loop preheader for TC checks.
7885  // Note that new preheader block is generated for vector loop.
7886  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7887  IRBuilder<> Builder(TCCheckBlock->getTerminator());
7888
7889  // Generate code to check if the loop's trip count is less than VF * UF of the
7890  // main vector loop.
7891  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7892                                                    : VF.isVector())
7895
7896  Value *CheckMinIters = Builder.CreateICmp(
7897      P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7898      "min.iters.check");
7899
7900  if (!ForEpilogue)
7901    TCCheckBlock->setName("vector.main.loop.iter.check");
7902
7903  // Create new preheader for vector loop.
7904  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7905                                   DT, LI, nullptr, "vector.ph");
7906
7907  if (ForEpilogue) {
7908    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7909                                 DT->getNode(Bypass)->getIDom()) &&
7910           "TC check is expected to dominate Bypass");
7911
7912    LoopBypassBlocks.push_back(TCCheckBlock);
7913
7914    // Save the trip count so we don't have to regenerate it in the
7915    // vec.epilog.iter.check. This is safe to do because the trip count
7916    // generated here dominates the vector epilog iter check.
7917    EPI.TripCount = Count;
7918  }
7919
7920  BranchInst &BI =
7921      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
// NOTE(review): line 7922 (likely the hasBranchWeightMD condition guarding
// the weight assignment below) is missing here.
7923    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7924  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7925
7926  introduceCheckBlockInVPlan(TCCheckBlock);
7927  return TCCheckBlock;
7928}
7929
7930//===--------------------------------------------------------------------===//
7931// EpilogueVectorizerEpilogueLoop
7932//===--------------------------------------------------------------------===//
7933
7934/// This function is partially responsible for generating the control flow
7935/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
// Second (epilogue) pass: builds the vec.epilog skeleton, re-wires the check
// blocks created by the first pass to bypass into the scalar preheader, moves
// phis out of vec.epilog.iter.check, and returns the epilogue preheader.
// NOTE(review): this listing is missing the function-name line (7937) and
// several statement lines (7945, 7947, 7953, 7955, 7958, 7962, 7965,
// 7968-7969, 7973, 7975-7976, 8010) -- verify against upstream
// LoopVectorize.cpp before relying on the exact control flow.
7936BasicBlock *
7938    const SCEV2ValueTy &ExpandedSCEVs) {
7939  createVectorLoopSkeleton("vec.epilog.");
7940
7941  // Now, compare the remaining count and if there aren't enough iterations to
7942  // execute the vectorized epilogue skip to the scalar part.
7943  LoopVectorPreHeader->setName("vec.epilog.ph");
7944  BasicBlock *VecEpilogueIterationCountCheck =
7946                        nullptr, "vec.epilog.iter.check", true);
7948                                            VecEpilogueIterationCountCheck);
7949  AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7950
7951  // Adjust the control flow taking the state info from the main loop
7952  // vectorization into account.
7954         "expected this to be saved from the previous pass.");
7956      VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7957
7959      VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7960
7961  if (EPI.SCEVSafetyCheck)
7963        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7964  if (EPI.MemSafetyCheck)
7966        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7967
7970  // Keep track of bypass blocks, as they feed start values to the induction and
7971  // reduction phis in the scalar loop preheader.
7972  if (EPI.SCEVSafetyCheck)
7974  if (EPI.MemSafetyCheck)
7977
7978  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7979  // reductions which merge control-flow from the latch block and the middle
7980  // block. Update the incoming values here and move the Phi into the preheader.
7981  SmallVector<PHINode *, 4> PhisInBlock;
7982  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7983    PhisInBlock.push_back(&Phi);
7984
7985  for (PHINode *Phi : PhisInBlock) {
7986    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7987    Phi->replaceIncomingBlockWith(
7988        VecEpilogueIterationCountCheck->getSinglePredecessor(),
7989        VecEpilogueIterationCountCheck);
7990
7991    // If the phi doesn't have an incoming value from the
7992    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7993    // value and also those from other check blocks. This is needed for
7994    // reduction phis only.
7995    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7996          return EPI.EpilogueIterationCountCheck == IncB;
7997        }))
7998      continue;
7999    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8000    if (EPI.SCEVSafetyCheck)
8001      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8002    if (EPI.MemSafetyCheck)
8003      Phi->removeIncomingValue(EPI.MemSafetyCheck);
8004  }
8005
8006  // Generate bypass values from the additional bypass block. Note that when the
8007  // vectorized epilogue is skipped due to iteration count check, then the
8008  // resume value for the induction variable comes from the trip count of the
8009  // main vector loop, passed as the second argument.
8011  return LoopVectorPreHeader;
8012}
8013
// Emits the "min.epilog.iters.check": compares the iterations remaining after
// the main vector loop (TC - VectorTripCount) against the epilogue step and
// branches to Bypass when the epilogue is not worth entering. Also re-hooks
// the epilogue VPlan's entry to the Insert block, and returns Insert.
// NOTE(review): this listing is missing the function-name line (8015), the
// first assert operand (8018), the ICmp predicate / step operands
// (8031-8032, 8036-8037), the branch-weight guard opening the `if` closed at
// 8054 (8042), the EpilogueLoopStep initializer (8045) and line 8066 --
// verify against upstream LoopVectorize.cpp.
8014BasicBlock *
8016    BasicBlock *Bypass, BasicBlock *Insert) {
8017
8019         "Expected trip count to have been saved in the first pass.");
8020  assert(
8021      (!isa<Instruction>(EPI.TripCount) ||
8022       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8023      "saved trip count does not dominate insertion point.");
8024  Value *TC = EPI.TripCount;
8025  IRBuilder<> Builder(Insert->getTerminator());
8026  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8027
8028  // Generate code to check if the loop's trip count is less than VF * UF of the
8029  // vector epilogue loop.
8030  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8033
8034  Value *CheckMinIters =
8035      Builder.CreateICmp(P, Count,
8038                         "min.epilog.iters.check");
8039
8040  BranchInst &BI =
8041      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8043    unsigned MainLoopStep = UF * VF.getKnownMinValue();
8044    unsigned EpilogueLoopStep =
8046    // We assume the remaining `Count` is equally distributed in
8047    // [0, MainLoopStep)
8048    // So the probability for `Count < EpilogueLoopStep` should be
8049    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8050    unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8051    const uint32_t Weights[] = {EstimatedSkipCount,
8052                                MainLoopStep - EstimatedSkipCount};
8053    setBranchWeights(BI, Weights, /*IsExpected=*/false);
8054  }
8055  ReplaceInstWithInst(Insert->getTerminator(), &BI);
8056  LoopBypassBlocks.push_back(Insert);
8057
8058  // A new entry block has been created for the epilogue VPlan. Hook it in, as
8059  // otherwise we would try to modify the entry to the main vector loop.
8060  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8061  VPBasicBlock *OldEntry = Plan.getEntry();
8062  VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8063  Plan.setEntry(NewEntry);
8064  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8065
8067  return Insert;
8068}
8069
// Debug-only trace emitted when the second (epilogue) pass starts; prints the
// epilogue VF/UF from EPI.
// NOTE(review): the enclosing function signature (original line 8070,
// presumably EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart) is
// missing from this listing.
8071  LLVM_DEBUG({
8072    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8073           << "Epilogue Loop VF:" << EPI.EpilogueVF
8074           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8075  });
8076}
8077
// Debug-only trace emitted when the epilogue pass finishes: dumps the final
// vectorized function.
// NOTE(review): the signature and LLVM_DEBUG opener (original lines
// 8078-8079) are missing from this listing.
8080    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8081  });
8082}
8083
// Lazily maps a range of IR operands to their VPValues, adding live-ins for
// values not yet modelled in the plan (via getVPValueOrAddLiveIn).
// NOTE(review): the function-name line (original 8085, presumably
// VPRecipeBuilder::mapToVPValues(User::op_range Operands)) is missing from
// this listing.
8084iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8086  std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8087    return getVPValueOrAddLiveIn(Op);
8088  };
8089  return map_range(Operands, Fn);
8090}
8091
// Builds edge masks for ALL successors of a switch terminator at once:
// per-case equality compares are ORed per destination, ANDed with the block-in
// mask, and the default destination gets the negation of all case masks.
// NOTE(review): the function-name line (original 8092, presumably
// VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI)) and the declaration
// of Dst2Compares (8105) are missing from this listing.
8093  BasicBlock *Src = SI->getParent();
8094  assert(!OrigLoop->isLoopExiting(Src) &&
8095         all_of(successors(Src),
8096                [this](BasicBlock *Succ) {
8097                  return OrigLoop->getHeader() != Succ;
8098                }) &&
8099         "unsupported switch either exiting loop or continuing to header");
8100  // Create masks where the terminator in Src is a switch. We create mask for
8101  // all edges at the same time. This is more efficient, as we can create and
8102  // collect compares for all cases once.
8103  VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8104  BasicBlock *DefaultDst = SI->getDefaultDest();
8106  for (auto &C : SI->cases()) {
8107    BasicBlock *Dst = C.getCaseSuccessor();
8108    assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8109    // Cases whose destination is the same as default are redundant and can be
8110    // ignored - they will get there anyhow.
8111    if (Dst == DefaultDst)
8112      continue;
8113    auto &Compares = Dst2Compares[Dst];
8114    VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8115    Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8116  }
8117
8118  // We need to handle 2 separate cases below for all entries in Dst2Compares,
8119  // which excludes destinations matching the default destination.
8120  VPValue *SrcMask = getBlockInMask(Src);
8121  VPValue *DefaultMask = nullptr;
8122  for (const auto &[Dst, Conds] : Dst2Compares) {
8123    // 1. Dst is not the default destination. Dst is reached if any of the cases
8124    // with destination == Dst are taken. Join the conditions for each case
8125    // whose destination == Dst using an OR.
8126    VPValue *Mask = Conds[0];
8127    for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8128      Mask = Builder.createOr(Mask, V);
8129    if (SrcMask)
8130      Mask = Builder.createLogicalAnd(SrcMask, Mask);
8131    EdgeMaskCache[{Src, Dst}] = Mask;
8132
8133    // 2. Create the mask for the default destination, which is reached if none
8134    // of the cases with destination != default destination are taken. Join the
8135    // conditions for each case where the destination is != Dst using an OR and
8136    // negate it.
8137    DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8138  }
8139
8140  if (DefaultMask) {
8141    DefaultMask = Builder.createNot(DefaultMask);
8142    if (SrcMask)
8143      DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8144  }
8145  EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8146}
8147
// Creates (and caches) the mask controlling the Src->Dst edge: the branch
// condition (negated for the false successor) logically ANDed with Src's
// block-in mask. nullptr models an all-true mask.
// NOTE(review): the function-name line (original 8148) and the body of the
// switch-terminator branch (8158, presumably a createSwitchEdgeMasks(SI)
// call) are missing from this listing.
8149  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8150
8151  // Look for cached value.
8152  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8153  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8154  if (ECEntryIt != EdgeMaskCache.end())
8155    return ECEntryIt->second;
8156
8157  if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8159    assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8160    return EdgeMaskCache[Edge];
8161  }
8162
8163  VPValue *SrcMask = getBlockInMask(Src);
8164
8165  // The terminator has to be a branch inst!
8166  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8167  assert(BI && "Unexpected terminator found");
8168  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8169    return EdgeMaskCache[Edge] = SrcMask;
8170
8171  // If source is an exiting block, we know the exit edge is dynamically dead
8172  // in the vector loop, and thus we don't need to restrict the mask. Avoid
8173  // adding uses of an otherwise potentially dead instruction unless we are
8174  // vectorizing a loop with uncountable exits. In that case, we always
8175  // materialize the mask.
8176  if (OrigLoop->isLoopExiting(Src) &&
8177      Src != Legal->getUncountableEarlyExitingBlock())
8178    return EdgeMaskCache[Edge] = SrcMask;
8179
8180  VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8181  assert(EdgeMask && "No Edge Mask found for condition");
8182
8183  if (BI->getSuccessor(0) != Dst)
8184    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8185
8186  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8187    // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8188    // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8189    // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8190    EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8191  }
8192
8193  return EdgeMaskCache[Edge] = EdgeMask;
8194}
8195
// Read-only lookup of an already-created edge mask; asserts (rather than
// creates) when the edge has not been processed by createEdgeMask yet.
// NOTE(review): the function-name line (original 8196) is missing from this
// listing.
8197  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8198
8199  // Look for cached value.
8200  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8201  EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8202  assert(ECEntryIt != EdgeMaskCache.end() &&
8203         "looking up mask for edge which has not been created");
8204  return ECEntryIt->second;
8205}
8206
// Creates the loop-header block mask. Without tail folding the mask is
// nullptr (all-true); with tail folding it is `WideCanonicalIV <= BTC`,
// inserted as the first non-phi recipes of the header VPBB.
// NOTE(review): the function-name line (original 8207) and the line obtaining
// BTC (8229, presumably the plan's backedge-taken count) are missing from
// this listing.
8208  BasicBlock *Header = OrigLoop->getHeader();
8210  // When not folding the tail, use nullptr to model all-true mask.
8211  if (!CM.foldTailByMasking()) {
8212    BlockMaskCache[Header] = nullptr;
8213    return;
8214  }
8215
8216  // Introduce the early-exit compare IV <= BTC to form header block mask.
8217  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8218  // constructing the desired canonical IV in the header block as its first
8219  // non-phi instructions.
8220
8221  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8222  auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8223  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8224  HeaderVPBB->insert(IV, NewInsertionPoint);
8225
8226  VPBuilder::InsertPointGuard Guard(Builder);
8227  Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8228  VPValue *BlockMask = nullptr;
8230  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8231  BlockMaskCache[Header] = BlockMask;
8232}
8233
// Read-only lookup of a block's cached block-in mask; asserts that the mask
// was previously computed (by createHeaderMask/createBlockInMask).
// NOTE(review): the function-name line (original 8234) is missing from this
// listing.
8235  // Return the cached value.
8236  BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8237  assert(BCEntryIt != BlockMaskCache.end() &&
8238         "Trying to access mask for block without one.");
8239  return BCEntryIt->second;
8240}
8241
// Computes a non-header block's in-mask as the OR of its incoming edge masks;
// a nullptr edge mask (all-true predecessor) short-circuits the result to
// all-true.
// NOTE(review): the function-name line (original 8242) and the predecessor
// range expression of the for-loop (8253) are missing from this listing.
8243  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8244  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8245  assert(OrigLoop->getHeader() != BB &&
8246         "Loop header must have cached block mask");
8247
8248  // All-one mask is modelled as no-mask following the convention for masked
8249  // load/store/gather/scatter. Initialize BlockMask to no-mask.
8250  VPValue *BlockMask = nullptr;
8251  // This is the block mask. We OR all unique incoming edges.
8252  for (auto *Predecessor :
8254    VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8255    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8256      BlockMaskCache[BB] = EdgeMask;
8257      return;
8258    }
8259
8260    if (!BlockMask) { // BlockMask has its initialized nullptr value.
8261      BlockMask = EdgeMask;
8262      continue;
8263    }
8264
8265    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8266  }
8267
8268  BlockMaskCache[BB] = BlockMask;
8269}
8270
// Builds a VPWidenLoad/StoreRecipe for a load/store that the cost model
// decided to widen, creating a (possibly reverse) vector pointer recipe for
// consecutive accesses and attaching the block-in mask when required.
// NOTE(review): this listing is missing the return-type line (original 8271)
// and several interior lines (8278, 8280, 8282, 8287, 8290, 8299, 8301,
// 8303, 8316-8317, 8323) including the definitions of `Decision` and
// `Reverse` -- verify against upstream LoopVectorize.cpp.
8272VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8273                                  VFRange &Range) {
8274  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8275         "Must be called with either a load or store");
8276
8277  auto WillWiden = [&](ElementCount VF) -> bool {
8279        CM.getWideningDecision(I, VF);
8281           "CM decision should be taken at this point.");
8283      return true;
8284    if (CM.isScalarAfterVectorization(I, VF) ||
8285        CM.isProfitableToScalarize(I, VF))
8286      return false;
8288  };
8289
8291    return nullptr;
8292
8293  VPValue *Mask = nullptr;
8294  if (Legal->isMaskRequired(I))
8295    Mask = getBlockInMask(I->getParent());
8296
8297  // Determine if the pointer operand of the access is either consecutive or
8298  // reverse consecutive.
8300      CM.getWideningDecision(I, Range.Start);
8302  bool Consecutive =
8304
8305  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8306  if (Consecutive) {
8307    auto *GEP = dyn_cast<GetElementPtrInst>(
8308        Ptr->getUnderlyingValue()->stripPointerCasts());
8309    VPSingleDefRecipe *VectorPtr;
8310    if (Reverse) {
8311      // When folding the tail, we may compute an address that we don't in the
8312      // original scalar loop and it may not be inbounds. Drop Inbounds in that
8313      // case.
8314      GEPNoWrapFlags Flags =
8315          (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8318      VectorPtr = new VPReverseVectorPointerRecipe(
8319          Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8320    } else {
8321      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8322                                            GEP ? GEP->getNoWrapFlags()
8324                                            I->getDebugLoc());
8325    }
8326    Builder.insert(VectorPtr);
8327    Ptr = VectorPtr;
8328  }
8329  if (LoadInst *Load = dyn_cast<LoadInst>(I))
8330    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8331                                 I->getDebugLoc());
8332
8333  StoreInst *Store = cast<StoreInst>(I);
8334  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8335                                Reverse, I->getDebugLoc());
8336}
8337
8338/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8339/// insert a recipe to expand the step for the induction recipe.
// NOTE(review): this listing is missing the signature lines (original
// 8340-8341, introducing Phi and PhiOrTrunc) and the Step initializer's
// right-hand side (8350) -- verify against upstream LoopVectorize.cpp.
8342    VPValue *Start, const InductionDescriptor &IndDesc,
8343    VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8344  assert(IndDesc.getStartValue() ==
8345         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8346  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8347         "step must be loop invariant");
8348
8349  VPValue *Step =
8351  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8352    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8353                                             IndDesc, TruncI,
8354                                             TruncI->getDebugLoc());
8355  }
8356  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8357  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8358                                           IndDesc, Phi->getDebugLoc());
8359}
8360
// Tries to model a header phi as an int/fp or pointer induction recipe;
// returns nullptr when the phi is neither kind of induction.
// NOTE(review): this listing is missing the parameter line (original 8362)
// and the recipe construction lines in the pointer-induction branch (8374,
// 8376) -- verify against upstream LoopVectorize.cpp.
8361VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8363
8364  // Check if this is an integer or fp induction. If so, build the recipe that
8365  // produces its scalar and vector values.
8366  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8367    return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8368                                       *PSE.getSE(), *OrigLoop);
8369
8370  // Check if this is pointer induction. If so, build the recipe for it.
8371  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8372    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8373                                                           *PSE.getSE());
8375        Phi, Operands[0], Step, *II,
8377            [&](ElementCount VF) {
8378              return CM.isScalarAfterVectorization(Phi, VF);
8379            },
8380            Range),
8381        Phi->getDebugLoc());
8382  }
8383  return nullptr;
8384}
8385
// Recognizes `trunc` of an induction phi and, when the cost model deems the
// truncation optimizable across the VF range, widens it directly as an
// induction recipe instead of a separate cast.
// NOTE(review): this listing is missing the parameter line (original 8387),
// the getDecisionAndClampRange call opener (8402) and the line binding `II`
// (8406) -- verify against upstream LoopVectorize.cpp.
8386VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8388  // Optimize the special case where the source is a constant integer
8389  // induction variable. Notice that we can only optimize the 'trunc' case
8390  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8391  // (c) other casts depend on pointer size.
8392
8393  // Determine whether \p K is a truncation based on an induction variable that
8394  // can be optimized.
8395  auto IsOptimizableIVTruncate =
8396      [&](Instruction *K) -> std::function<bool(ElementCount)> {
8397    return [=](ElementCount VF) -> bool {
8398      return CM.isOptimizableIVTruncate(K, VF);
8399    };
8400  };
8401
8403          IsOptimizableIVTruncate(I), Range)) {
8404
8405    auto *Phi = cast<PHINode>(I->getOperand(0));
8407    VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8408    return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8409                                       *OrigLoop);
8410  }
8411  return nullptr;
8412}
8413
// Converts a non-header phi into a VPBlendRecipe: each incoming value is
// paired with its edge mask; a null edge mask (all-true, only legal for the
// first incoming) terminates the operand list early.
// NOTE(review): this listing is missing the Operands parameter line (original
// 8415) and the first operand of the assert closed at 8432 (8431) -- verify
// against upstream LoopVectorize.cpp.
8414VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8416  unsigned NumIncoming = Phi->getNumIncomingValues();
8417
8418  // We know that all PHIs in non-header blocks are converted into selects, so
8419  // we don't have to worry about the insertion order and we can just use the
8420  // builder. At this point we generate the predication tree. There may be
8421  // duplications since this is a simple recursive scan, but future
8422  // optimizations will clean it up.
8423  SmallVector<VPValue *, 2> OperandsWithMask;
8424
8425  for (unsigned In = 0; In < NumIncoming; In++) {
8426    OperandsWithMask.push_back(Operands[In]);
8427    VPValue *EdgeMask =
8428        getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8429    if (!EdgeMask) {
8430      assert(In == 0 && "Both null and non-null edge masks found");
8432             "Distinct incoming values with one having a full mask");
8433      break;
8434    }
8435    OperandsWithMask.push_back(EdgeMask);
8436  }
8437  return new VPBlendRecipe(Phi, OperandsWithMask);
8438}
8439
// Widens a call either as a vector intrinsic or as a call to a vector library
// variant (per the cost model's call-widening decision), inserting the block
// mask (or an all-true mask) at the variant's expected mask position.
// Returns nullptr when the call must be scalarized/predicated instead.
// NOTE(review): this listing is missing the Operands parameter line (original
// 8441), the IsPredicated initializer opener (8443), the line computing `ID`
// (8452), and several lines inside the decision lambdas (8463, 8466, 8493,
// 8495, 8518) -- verify against upstream LoopVectorize.cpp.
8440VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8442                                                   VFRange &Range) {
8444      [this, CI](ElementCount VF) {
8445        return CM.isScalarWithPredication(CI, VF);
8446      },
8447      Range);
8448
8449  if (IsPredicated)
8450    return nullptr;
8451
8453  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8454             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8455             ID == Intrinsic::pseudoprobe ||
8456             ID == Intrinsic::experimental_noalias_scope_decl))
8457    return nullptr;
8458
8459  SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8460
8461  // Is it beneficial to perform intrinsic call compared to lib call?
8462  bool ShouldUseVectorIntrinsic =
8464          [&](ElementCount VF) -> bool {
8465            return CM.getCallWideningDecision(CI, VF).Kind ==
8467          },
8468          Range);
8469  if (ShouldUseVectorIntrinsic)
8470    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8471                                      CI->getDebugLoc());
8472
8473  Function *Variant = nullptr;
8474  std::optional<unsigned> MaskPos;
8475  // Is it better to call a vectorized version of the function than to scalarize
8476  // the call?
8477  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8478      [&](ElementCount VF) -> bool {
8479        // The following case may be scalarized depending on the VF.
8480        // The flag shows whether we can use a usual Call for vectorized
8481        // version of the instruction.
8482
8483        // If we've found a variant at a previous VF, then stop looking. A
8484        // vectorized variant of a function expects input in a certain shape
8485        // -- basically the number of input registers, the number of lanes
8486        // per register, and whether there's a mask required.
8487        // We store a pointer to the variant in the VPWidenCallRecipe, so
8488        // once we have an appropriate variant it's only valid for that VF.
8489        // This will force a different vplan to be generated for each VF that
8490        // finds a valid variant.
8491        if (Variant)
8492          return false;
8494            CM.getCallWideningDecision(CI, VF);
8496          Variant = Decision.Variant;
8497          MaskPos = Decision.MaskPos;
8498          return true;
8499        }
8500
8501        return false;
8502      },
8503      Range);
8504  if (ShouldUseVectorCall) {
8505    if (MaskPos.has_value()) {
8506      // We have 2 cases that would require a mask:
8507      //   1) The block needs to be predicated, either due to a conditional
8508      //      in the scalar loop or use of an active lane mask with
8509      //      tail-folding, and we use the appropriate mask for the block.
8510      //   2) No mask is required for the block, but the only available
8511      //      vector variant at this VF requires a mask, so we synthesize an
8512      //      all-true mask.
8513      VPValue *Mask = nullptr;
8514      if (Legal->isMaskRequired(CI))
8515        Mask = getBlockInMask(CI->getParent());
8516      else
8517        Mask = Plan.getOrAddLiveIn(
8519
8520      Ops.insert(Ops.begin() + *MaskPos, Mask);
8521    }
8522
8523    Ops.push_back(Operands.back());
8524    return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8525  }
8526
8527  return nullptr;
8528}
8529
// Decides, across the VF range, whether an instruction should be widened:
// true unless it would be scalar after vectorization, profitably scalarized,
// or scalarized with predication.
// NOTE(review): the return statement opener (original line 8540, presumably
// the negated getDecisionAndClampRange(WillScalarize, ...) call) is missing
// from this listing.
8530bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8531  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8532         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8533  // Instruction should be widened, unless it is scalar after vectorization,
8534  // scalarization is profitable or it is predicated.
8535  auto WillScalarize = [this, I](ElementCount VF) -> bool {
8536    return CM.isScalarAfterVectorization(I, VF) ||
8537           CM.isProfitableToScalarize(I, VF) ||
8538           CM.isScalarWithPredication(I, VF);
8539  };
8541                                                             Range);
8542}
8543
// Builds a VPWidenRecipe for a widenable opcode. Predicated div/rem first get
// their divisor replaced by `select(mask, rhs, 1)` to avoid division by a
// poison/zero lane; binops get live-in operands constant-folded via SCEV to
// match the legacy cost model.
// NOTE(review): this listing is missing the Operands parameter line (original
// 8545) and the local operand-vector declarations (8557 `Ops`, 8586 `NewOps`)
// -- verify against upstream LoopVectorize.cpp.
8544VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8546                                           VPBasicBlock *VPBB) {
8547  switch (I->getOpcode()) {
8548  default:
8549    return nullptr;
8550  case Instruction::SDiv:
8551  case Instruction::UDiv:
8552  case Instruction::SRem:
8553  case Instruction::URem: {
8554    // If not provably safe, use a select to form a safe divisor before widening the
8555    // div/rem operation itself.  Otherwise fall through to general handling below.
8556    if (CM.isPredicatedInst(I)) {
8558      VPValue *Mask = getBlockInMask(I->getParent());
8559      VPValue *One =
8560          Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8561      auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8562      Ops[1] = SafeRHS;
8563      return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8564    }
8565    [[fallthrough]];
8566  }
8567  case Instruction::Add:
8568  case Instruction::And:
8569  case Instruction::AShr:
8570  case Instruction::FAdd:
8571  case Instruction::FCmp:
8572  case Instruction::FDiv:
8573  case Instruction::FMul:
8574  case Instruction::FNeg:
8575  case Instruction::FRem:
8576  case Instruction::FSub:
8577  case Instruction::ICmp:
8578  case Instruction::LShr:
8579  case Instruction::Mul:
8580  case Instruction::Or:
8581  case Instruction::Select:
8582  case Instruction::Shl:
8583  case Instruction::Sub:
8584  case Instruction::Xor:
8585  case Instruction::Freeze:
8587    if (Instruction::isBinaryOp(I->getOpcode())) {
8588      // The legacy cost model uses SCEV to check if some of the operands are
8589      // constants. To match the legacy cost model's behavior, use SCEV to try
8590      // to replace operands with constants.
8591      ScalarEvolution &SE = *PSE.getSE();
8592      auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8593        if (!Op->isLiveIn())
8594          return Op;
8595        Value *V = Op->getUnderlyingValue();
8596        if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8597          return Op;
8598        auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8599        if (!C)
8600          return Op;
8601        return Plan.getOrAddLiveIn(C->getValue());
8602      };
8603      // For Mul, the legacy cost model checks both operands.
8604      if (I->getOpcode() == Instruction::Mul)
8605        NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8606      // For other binops, the legacy cost model only checks the second operand.
8607      NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8608    }
8609    return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8610  };
8611}
8612
// Builds a VPHistogramRecipe for a histogram update (bucket address,
// increment value, and the store's block mask when predicated). Only Add/Sub
// updates are supported.
// NOTE(review): this listing is missing the return-type line (original 8613),
// the Operands parameter line (8615) and the HGramOps declaration (8621) --
// verify against upstream LoopVectorize.cpp.
8614VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8616  // FIXME: Support other operations.
8617  unsigned Opcode = HI->Update->getOpcode();
8618  assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8619         "Histogram update operation must be an Add or Sub");
8620
8622  // Bucket address.
8623  HGramOps.push_back(Operands[1]);
8624  // Increment value.
8625  HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8626
8627  // In case of predicated execution (due to tail-folding, or conditional
8628  // execution, or both), pass the relevant mask.
8629  if (Legal->isMaskRequired(HI->Store))
8630    HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8631
8632  return new VPHistogramRecipe(Opcode,
8633                               make_range(HGramOps.begin(), HGramOps.end()),
8634                               HI->Store->getDebugLoc());
8635}
8636
// Completes deferred header-phi recipes: adds each phi's backedge operand by
// looking up the recipe that produces the incoming value from the original
// loop latch.
// NOTE(review): the function-name line (original 8637, presumably
// VPRecipeBuilder::fixHeaderPhis()) is missing from this listing.
8638  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8639  for (VPHeaderPHIRecipe *R : PhisToFix) {
8640    auto *PN = cast<PHINode>(R->getUnderlyingValue());
8641    VPRecipeBase *IncR =
8642        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8643    R->addOperand(IncR->getVPSingleValue());
8644  }
8645}
8646
// Build a VPReplicateRecipe for instruction I: the instruction is replicated
// per lane, either uniformly (one copy for the first lane) or per-lane, and
// masked when predication is required. NOTE(review): the opening of the
// signature and the getDecisionAndClampRange call computing IsUniform are
// elided in this extract.
8649                                VFRange &Range) {
8651      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8652      Range);
8653
8654  bool IsPredicated = CM.isPredicatedInst(I);
8655
8656  // Even if the instruction is not marked as uniform, there are certain
8657  // intrinsic calls that can be effectively treated as such, so we check for
8658  // them here. Conservatively, we only do this for scalable vectors, since
8659  // for fixed-width VFs we can always fall back on full scalarization.
8660  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8661    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8662    case Intrinsic::assume:
8663    case Intrinsic::lifetime_start:
8664    case Intrinsic::lifetime_end:
8665      // For scalable vectors if one of the operands is variant then we still
8666      // want to mark as uniform, which will generate one instruction for just
8667      // the first lane of the vector. We can't scalarize the call in the same
8668      // way as for fixed-width vectors because we don't know how many lanes
8669      // there are.
8670      //
8671      // The reasons for doing it this way for scalable vectors are:
8672      //  1. For the assume intrinsic generating the instruction for the first
8673      //     lane is still be better than not generating any at all. For
8674      //     example, the input may be a splat across all lanes.
8675      //  2. For the lifetime start/end intrinsics the pointer operand only
8676      //     does anything useful when the input comes from a stack object,
8677      //     which suggests it should always be uniform. For non-stack objects
8678      //     the effect is to poison the object, which still allows us to
8679      //     remove the call.
8680      IsUniform = true;
8681      break;
8682    default:
8683      break;
8684    }
8685  }
8686  VPValue *BlockInMask = nullptr;
8687  if (!IsPredicated) {
8688    // Finalize the recipe for Instr, first if it is not predicated.
8689    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8690  } else {
8691    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8692    // Instructions marked for predication are replicated and a mask operand is
8693    // added initially. Masked replicate recipes will later be placed under an
8694    // if-then construct to prevent side-effects. Generate recipes to compute
8695    // the block mask for this region.
8696    BlockInMask = getBlockInMask(I->getParent());
8697  }
8698
8699  // Note that there is some custom logic to mark some intrinsics as uniform
8700  // manually above for scalable vectors, which this assert needs to account for
8701  // as well.
8702  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8703          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8704         "Should not predicate a uniform recipe");
// The mask (possibly null) is passed as the recipe's last operand.
8705  auto *Recipe = new VPReplicateRecipe(
8706      I, make_range(Operands.begin(), Operands.end()), IsUniform, BlockInMask);
8707  return Recipe;
8708}
8709
/// Find all possible partial reductions in the loop and track all of those that
/// are valid so recipes can be formed later.
// NOTE(review): the function signature line and the declaration of
// PartialReductionChains are elided in this extract. Valid chains are
// recorded in ScaledReductionMap keyed by the reduction instruction.
8713  // Find all possible partial reductions.
8715      PartialReductionChains;
8716  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8717    getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
8718                        PartialReductionChains);
8719  }
8720
8721  // A partial reduction is invalid if any of its extends are used by
8722  // something that isn't another partial reduction. This is because the
8723  // extends are intended to be lowered along with the reduction itself.
8724
8725  // Build up a set of partial reduction bin ops for efficient use checking.
8726  SmallSet<User *, 4> PartialReductionBinOps;
8727  for (const auto &[PartialRdx, _] : PartialReductionChains)
8728    PartialReductionBinOps.insert(PartialRdx.BinOp);
8729
// True iff every user of Extend is one of the collected partial-reduction
// binops, i.e. the extend can be folded into the reduction lowering.
8730  auto ExtendIsOnlyUsedByPartialReductions =
8731      [&PartialReductionBinOps](Instruction *Extend) {
8732        return all_of(Extend->users(), [&](const User *U) {
8733          return PartialReductionBinOps.contains(U);
8734        });
8735      };
8736
8737  // Check if each use of a chain's two extends is a partial reduction
8738  // and only add those that don't have non-partial reduction users.
8739  for (auto Pair : PartialReductionChains) {
8740    PartialReductionChain Chain = Pair.first;
8741    if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8742        ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8743      ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
8744  }
8745}
8746
// Try to recognize a scaled (partial) reduction rooted at \p RdxExitInstr:
// an update of the form add/sub(PHI, BinOp(ext(A), ext(B))) whose result
// type is wider than the extended source type. On success, appends a
// (chain, scale-factor) pair to \p Chains and returns true.
// NOTE(review): lines classifying the extend kinds (OpAExtend/OpBExtend) and
// the opening of the getDecisionAndClampRange call are elided in this
// extract.
bool VPRecipeBuilder::getScaledReductions(
    Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
    SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {

8751  if (!CM.TheLoop->contains(RdxExitInstr))
8752    return false;
8753
8754  // TODO: Allow scaling reductions when predicating. The select at
8755  // the end of the loop chooses between the phi value and most recent
8756  // reduction result, both of which have different VFs to the active lane
8757  // mask when scaling.
8758  if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent()))
8759    return false;
8760
8761  auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8762  if (!Update)
8763    return false;
8764
// Canonicalize so PhiOp is the reduction phi and Op is the other operand.
8765  Value *Op = Update->getOperand(0);
8766  Value *PhiOp = Update->getOperand(1);
8767  if (Op == PHI)
8768    std::swap(Op, PhiOp);
8769
8770  // Try and get a scaled reduction from the first non-phi operand.
8771  // If one is found, we use the discovered reduction instruction in
8772  // place of the accumulator for costing.
8773  if (auto *OpInst = dyn_cast<Instruction>(Op)) {
8774    if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8775      PHI = Chains.rbegin()->first.Reduction;
8776
8777      Op = Update->getOperand(0);
8778      PhiOp = Update->getOperand(1);
8779      if (Op == PHI)
8780        std::swap(Op, PhiOp);
8781    }
8782  }
8783  if (PhiOp != PHI)
8784    return false;
8785
// The single-use requirement ensures the binop can be absorbed into the
// partial-reduction lowering without duplicating work.
8786  auto *BinOp = dyn_cast<BinaryOperator>(Op);
8787  if (!BinOp || !BinOp->hasOneUse())
8788    return false;
8789
8790  using namespace llvm::PatternMatch;
8791  Value *A, *B;
8792  if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8793      !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8794    return false;
8795
8796  Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8797  Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8798
8803
8804  PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
8805
// Scale factor = ratio of accumulator width to the extended source width,
// e.g. i32 accumulator over i8 sources gives 4.
8806  unsigned TargetScaleFactor =
8807      PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8808          A->getType()->getPrimitiveSizeInBits());
8809
// Accept only if the target reports a valid partial-reduction cost across
// the clamped VF range.
8811          [&](ElementCount VF) {
8813                Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8814                VF, OpAExtend, OpBExtend,
8815                std::make_optional(BinOp->getOpcode()));
8816            return Cost.isValid();
8817          },
8818          Range)) {
8819    Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
8820    return true;
8821  }
8822
8823  return false;
8824}
8825
// Dispatch entry point: try to build a widening recipe for \p Instr, trying
// specialized recipes (phis, truncates, calls, histograms, memory ops,
// partial reductions, GEPs, selects, casts) before the generic tryToWiden.
// Returns nullptr if the instruction must instead be replicated.
// NOTE(review): the opening of the signature is elided in this extract.
8829                                           VFRange &Range, VPBasicBlock *VPBB) {
8830  // First, check for specific widening recipes that deal with inductions, Phi
8831  // nodes, calls and memory operations.
8832  VPRecipeBase *Recipe;
8833  if (auto *Phi = dyn_cast<PHINode>(Instr)) {
// Non-header phis are merge points of predicated blocks -> blend recipe.
8834    if (Phi->getParent() != OrigLoop->getHeader())
8835      return tryToBlend(Phi, Operands);
8836
8837    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8838      return Recipe;
8839
8840    VPHeaderPHIRecipe *PhiRecipe = nullptr;
8841    assert((Legal->isReductionVariable(Phi) ||
8842            Legal->isFixedOrderRecurrence(Phi)) &&
8843           "can only widen reductions and fixed-order recurrences here");
8844    VPValue *StartV = Operands[0];
8845    if (Legal->isReductionVariable(Phi)) {
8846      const RecurrenceDescriptor &RdxDesc =
8847          Legal->getReductionVars().find(Phi)->second;
8848      assert(RdxDesc.getRecurrenceStartValue() ==
8849             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));

8851      // If the PHI is used by a partial reduction, set the scale factor.
8852      unsigned ScaleFactor =
8853          getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8854      PhiRecipe = new VPReductionPHIRecipe(
8855          Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8856          CM.useOrderedReductions(RdxDesc), ScaleFactor);
8857    } else {
8858      // TODO: Currently fixed-order recurrences are modeled as chains of
8859      // first-order recurrences. If there are no users of the intermediate
8860      // recurrences in the chain, the fixed order recurrence should be modeled
8861      // directly, enabling more efficient codegen.
8862      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8863    }
8864
// Backedge value is added later by fixHeaderPhis, once its recipe exists.
8865    PhisToFix.push_back(PhiRecipe);
8866    return PhiRecipe;
8867  }
8868
8869  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8870                                    cast<TruncInst>(Instr), Operands, Range)))
8871    return Recipe;
8872
8873  // All widen recipes below deal only with VF > 1.
8875          [&](ElementCount VF) { return VF.isScalar(); }, Range))
8876    return nullptr;
8877
8878  if (auto *CI = dyn_cast<CallInst>(Instr))
8879    return tryToWidenCall(CI, Operands, Range);
8880
8881  if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8882    if (auto HistInfo = Legal->getHistogramInfo(SI))
8883      return tryToWidenHistogram(*HistInfo, Operands);
8884
8885  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8886    return tryToWidenMemory(Instr, Operands, Range);
8887
// NOTE(review): the return following this check (presumably creating the
// partial-reduction recipe) is elided in this extract.
8888  if (getScalingForReduction(Instr))

8891  if (!shouldWiden(Instr, Range))
8892    return nullptr;
8893
8894  if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8895    return new VPWidenGEPRecipe(GEP,
8896                                make_range(Operands.begin(), Operands.end()));
8897
8898  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8899    return new VPWidenSelectRecipe(
8900        *SI, make_range(Operands.begin(), Operands.end()));
8901  }
8902
8903  if (auto *CI = dyn_cast<CastInst>(Instr)) {
8904    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8905                                 *CI);
8906  }
8907
8908  return tryToWiden(Instr, Operands, VPBB);
8909}
8910
// Build a VPPartialReductionRecipe for a binary reduction update. The two
// operands are canonicalized so the accumulator (the value defined by a
// reduction phi or another partial reduction) ends up in the accumulator
// position. NOTE(review): the signature line, the Accumulator declaration
// and the final constructor argument line are elided in this extract.
8914  assert(Operands.size() == 2 &&
8915         "Unexpected number of operands for partial reduction");
8916
8917  VPValue *BinOp = Operands[0];
8919  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8920  if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8921      isa<VPPartialReductionRecipe>(BinOpRecipe))
8922    std::swap(BinOp, Accumulator);

8924  return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp,
8926}
8927
// Build VPlans covering [MinVF, MaxVF]: repeatedly build a plan for a
// sub-range starting at VF; each built plan clamps SubRange.End to the last
// VF it covers, which becomes the start of the next iteration.
// NOTE(review): the calls applying minimal-bitwidth truncation and the EVL
// legality check are partially elided in this extract.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
8930  assert(OrigLoop->isInnermost() && "Inner loop expected.");
8931
// MaxVF * 2 is the exclusive upper bound of the range: VFs are powers of
// two, so isKnownLT(VF, MaxVF * 2) includes MaxVF itself.
8932  auto MaxVFTimes2 = MaxVF * 2;
8933  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8934    VFRange SubRange = {VF, MaxVFTimes2};
8935    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8936      // Now optimize the initial VPlan.
8937      if (!Plan->hasVF(ElementCount::getFixed(1)))
8939            *Plan, CM.getMinimalBitwidths());
8941      // TODO: try to put it close to addActiveLaneMask().
8942      // Discard the plan if it is not EVL-compatible
8943      if (CM.foldTailWithEVL() &&
8945              *Plan, CM.getMaxSafeElements()))
8946        break;
8947      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8948      VPlans.push_back(std::move(Plan));
8949    }
8950    VF = SubRange.End;
8951  }
8952}
8953
// Add the necessary canonical IV and branch recipes required to control the
// loop.
// \p IdxTy   type of the canonical induction variable.
// \p HasNUW  whether the IV increment is known not to wrap (adds NUW flag).
// \p DL      debug location for the created recipes.
// NOTE(review): the line naming the latch terminator opcode (presumably
// BranchOnCount) is elided in this extract.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                                  DebugLoc DL) {
8958  Value *StartIdx = ConstantInt::get(IdxTy, 0);
8959  auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8960
8961  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8962  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8963  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8964  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8965  Header->insert(CanonicalIVPHI, Header->begin());
8966
8967  VPBuilder Builder(TopRegion->getExitingBasicBlock());
8968  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8969  auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8970      Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false},  DL,
8971      "index.next");
// Wire the increment back into the phi as its backedge value.
8972  CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8973
8974  // Add the BranchOnCount VPInstruction to the latch.
8976                       {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8977}
8978
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction.
// NOTE(review): the function's opening line and the line fetching the
// induction descriptor (used as ID below) are elided in this extract.
    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8985  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8986  // Truncated wide inductions resume from the last lane of their vector value
8987  // in the last vector iteration which is handled elsewhere.
8988  if (WideIntOrFp && WideIntOrFp->getTruncInst())
8989    return nullptr;
8990
8991  VPValue *Start = WideIV->getStartValue();
8992  VPValue *Step = WideIV->getStepValue();
// Canonical inductions end exactly at the vector trip count; otherwise the
// end value must be derived from start/step.
8994  VPValue *EndValue = VectorTC;
8995  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8996    EndValue = VectorPHBuilder.createDerivedIV(
8997        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
8998        Start, VectorTC, Step);
8999  }
9000
9001  // EndValue is derived from the vector trip count (which has the same type as
9002  // the widest induction) and thus may be wider than the induction here.
9003  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
9004  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
9005    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
9006                                                ScalarTypeOfWideIV,
9007                                                WideIV->getDebugLoc());
9008  }
9009
// ResumePhi selects EndValue when arriving from the middle block and Start
// when the vector loop is bypassed.
9010  auto *ResumePhiRecipe =
9011      ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
9012                                   WideIV->getDebugLoc(), "bc.resume.val");
9013  return ResumePhiRecipe;
9014}
9015
/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header. End values for inductions are added to
/// \p IVEndValues.
// NOTE(review): the line calling addResumePhiRecipeForInduction and the
// ResumePhi opcode line near the end are elided in this extract.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
                                DenseMap<VPValue *, VPValue *> &IVEndValues) {
9022  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9023  auto *ScalarPH = Plan.getScalarPreheader();
9024  auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9025  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9026  VPBuilder VectorPHBuilder(
9027      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
9028  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9029  VPBuilder ScalarPHBuilder(ScalarPH);
9030  VPValue *OneVPV = Plan.getOrAddLiveIn(
9031      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9032  for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9033    auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9034    auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
// Phis are grouped at the top of the block; stop at the first non-phi.
9035    if (!ScalarPhiI)
9036      break;
9037
9038    // TODO: Extract final value from induction recipe initially, optimize to
9039    // pre-computed end value together in optimizeInductionExitUsers.
9040    auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9041    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9043              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9044              &Plan.getVectorTripCount())) {
9045        assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
9046               "Expected a ResumePhi");
// Operand 0 of the ResumePhi is the induction's end value.
9047        IVEndValues[WideIVR] = ResumePhi->getOperand(0);
9048        ScalarPhiIRI->addOperand(ResumePhi);
9049        continue;
9050      }
9051      // TODO: Also handle truncated inductions here. Computing end-values
9052      // separately should be done as VPlan-to-VPlan optimization, after
9053      // legalizing all resume values to use the last lane from the loop.
9054      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9055             "should only skip truncated wide inductions");
9056      continue;
9057    }
9058
9059    // The backedge value provides the value to resume coming out of a loop,
9060    // which for FORs is a vector whose last element needs to be extracted. The
9061    // start value provides the value if the loop is bypassed.
9062    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9063    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9064    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9065           "Cannot handle loops with uncountable early exits");
9066    if (IsFOR)
9067      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9068          VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9069          "vector.recur.extract");
9070    StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9071    auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9073        {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9074    ScalarPhiIRI->addOperand(ResumePhiR);
9075  }
9076}
9077
// Collect VPIRInstructions for phis in the exit blocks that are modeled
// in VPlan and add the exiting VPValue as operand.
// NOTE(review): the opening of the signature (return type and first
// parameters, including OrigLoop and Builder) is elided in this extract.
                                        VPlan &Plan) {
9083  SetVector<VPIRInstruction *> ExitUsersToFix;
9084  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9085    for (VPRecipeBase &R : *ExitVPBB) {
9086      auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9087      if (!ExitIRI)
9088        continue;
9089      auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
// Phis are grouped at the top of the block; stop at the first non-phi.
9090      if (!ExitPhi)
9091        break;
9092      if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) {
9093        assert(ExitIRI->getNumOperands() ==
9094                   ExitVPBB->getPredecessors().size() &&
9095               "early-exit must update exit values on construction");
9096        continue;
9097      }
9098      BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9099      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9100      VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9101      ExitIRI->addOperand(V);
// Live-ins need no extract; only loop-defined values require fixing.
9102      if (V->isLiveIn())
9103        continue;
9104      assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
9105             "Only recipes defined inside a region should need fixing.");
9106      ExitUsersToFix.insert(ExitIRI);
9107    }
9108  }
9109  return ExitUsersToFix;
9110}
9111
// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated.
// NOTE(review): the line naming the function and its first parameter is
// elided in this extract.
static void
                     const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9117  if (ExitUsersToFix.empty())
9118    return;
9119
// Extracts are inserted in the middle block, after any existing phis.
9120  auto *MiddleVPBB = Plan.getMiddleBlock();
9121  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9122
9123  // Introduce extract for exiting values and update the VPIRInstructions
9124  // modeling the corresponding LCSSA phis.
9125  for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9126    assert(ExitIRI->getNumOperands() == 1 &&
9127           ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
9128           "exit values from early exits must be fixed when branch to "
9129           "early-exit is added");
9130    ExitIRI->extractLastLaneOfOperand(B);
9131  }
9132}
9133
/// Handle users in the exit block for first order reductions in the original
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
// NOTE(review): the function's signature line is elided in this extract.
    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9140  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9141  auto *ScalarPHVPBB = Plan.getScalarPreheader();
9142  auto *MiddleVPBB = Plan.getMiddleBlock();
9143  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9144  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
// Offset 2 from the end selects the penultimate lane via ExtractFromEnd.
9145  VPValue *TwoVPV = Plan.getOrAddLiveIn(
9146      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9147
9148  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9149    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9150    if (!FOR)
9151      continue;
9152
9153    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9154           "Cannot handle loops with uncountable early exits");
9155
9156    // This is the second phase of vectorizing first-order recurrences, creating
9157    // extract for users outside the loop. An overview of the transformation is
9158    // described below. Suppose we have the following loop with some use after
9159    // the loop of the last a[i-1],
9160    //
9161    //   for (int i = 0; i < n; ++i) {
9162    //     t = a[i - 1];
9163    //     b[i] = a[i] - t;
9164    //   }
9165    //   use t;
9166    //
9167    // There is a first-order recurrence on "a". For this loop, the shorthand
9168    // scalar IR looks like:
9169    //
9170    //   scalar.ph:
9171    //     s.init = a[-1]
9172    //     br scalar.body
9173    //
9174    //   scalar.body:
9175    //     i = phi [0, scalar.ph], [i+1, scalar.body]
9176    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9177    //     s2 = a[i]
9178    //     b[i] = s2 - s1
9179    //     br cond, scalar.body, exit.block
9180    //
9181    //   exit.block:
9182    //     use = lcssa.phi [s1, scalar.body]
9183    //
9184    // In this example, s1 is a recurrence because it's value depends on the
9185    // previous iteration. In the first phase of vectorization, we created a
9186    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9187    // for users in the scalar preheader and exit block.
9188    //
9189    //   vector.ph:
9190    //     v_init = vector(..., ..., ..., a[-1])
9191    //     br vector.body
9192    //
9193    //   vector.body
9194    //     i = phi [0, vector.ph], [i+4, vector.body]
9195    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9196    //     v2 = a[i, i+1, i+2, i+3]
9197    //     b[i] = v2 - v1
9198    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9199    //     b[i, i+1, i+2, i+3] = v2 - v1
9200    //     br cond, vector.body, middle.block
9201    //
9202    //   middle.block:
9203    //     vector.recur.extract.for.phi = v2(2)
9204    //     vector.recur.extract = v2(3)
9205    //     br cond, scalar.ph, exit.block
9206    //
9207    //   scalar.ph:
9208    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9209    //                             [s.init, otherwise]
9210    //     br scalar.body
9211    //
9212    //   scalar.body:
9213    //     i = phi [0, scalar.ph], [i+1, scalar.body]
9214    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9215    //     s2 = a[i]
9216    //     b[i] = s2 - s1
9217    //     br cond, scalar.body, exit.block
9218    //
9219    //   exit.block:
9220    //     lo = lcssa.phi [s1, scalar.body],
9221    //                    [vector.recur.extract.for.phi, middle.block]
9222    //
9223    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9224    // Extract the penultimate value of the recurrence and use it as operand for
9225    // the VPIRInstruction modeling the phi.
9226    for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9227      if (ExitIRI->getOperand(0) != FOR)
9228        continue;
9229      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9230          VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9231          "vector.recur.extract.for.phi");
9232      ExitIRI->setOperand(0, PenultimateElement);
// Remove the entry so the generic last-lane fixup does not also run on it.
9233      ExitUsersToFix.remove(ExitIRI);
9234    }
9235  }
9236}
9237
9239LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9240
9242
9243 // ---------------------------------------------------------------------------
9244 // Build initial VPlan: Scan the body of the loop in a topological order to
9245 // visit each basic block after having visited its predecessor basic blocks.
9246 // ---------------------------------------------------------------------------
9247
9248 // Create initial VPlan skeleton, having a basic block for the pre-header
9249 // which contains SCEV expansions that need to happen before the CFG is
9250 // modified; a basic block for the vector pre-header, followed by a region for
9251 // the vector loop, followed by the middle basic block. The skeleton vector
9252 // loop region contains a header and latch basic blocks.
9253
9254 bool RequiresScalarEpilogueCheck =
9256 [this](ElementCount VF) {
9257 return !CM.requiresScalarEpilogue(VF.isVector());
9258 },
9259 Range);
9261 PSE, RequiresScalarEpilogueCheck,
9262 CM.foldTailByMasking(), OrigLoop);
9263
9264 // Don't use getDecisionAndClampRange here, because we don't know the UF
9265 // so this function is better to be conservative, rather than to split
9266 // it up into different VPlans.
9267 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9268 bool IVUpdateMayOverflow = false;
9269 for (ElementCount VF : Range)
9270 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9271
9273 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9274 // Use NUW for the induction increment if we proved that it won't overflow in
9275 // the vector loop or when not folding the tail. In the later case, we know
9276 // that the canonical induction increment will not overflow as the vector trip
9277 // count is >= increment and a multiple of the increment.
9278 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9279 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9280
9281 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9282 Builder);
9283
9284 // ---------------------------------------------------------------------------
9285 // Pre-construction: record ingredients whose recipes we'll need to further
9286 // process after constructing the initial VPlan.
9287 // ---------------------------------------------------------------------------
9288
9289 // For each interleave group which is relevant for this (possibly trimmed)
9290 // Range, add it to the set of groups to be later applied to the VPlan and add
9291 // placeholders for its members' Recipes which we'll be replacing with a
9292 // single VPInterleaveRecipe.
9294 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9295 bool Result = (VF.isVector() && // Query is illegal for VF == 1
9296 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9298 // For scalable vectors, the only interleave factor currently supported
9299 // is 2 since we require the (de)interleave2 intrinsics instead of
9300 // shufflevectors.
9301 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9302 "Unsupported interleave factor for scalable vectors");
9303 return Result;
9304 };
9305 if (!getDecisionAndClampRange(ApplyIG, Range))
9306 continue;
9307 InterleaveGroups.insert(IG);
9308 }
9309
9310 // ---------------------------------------------------------------------------
9311 // Construct recipes for the instructions in the loop
9312 // ---------------------------------------------------------------------------
9313
9314 // Scan the body of the loop in a topological order to visit each basic block
9315 // after having visited its predecessor basic blocks.
9316 LoopBlocksDFS DFS(OrigLoop);
9317 DFS.perform(LI);
9318
9319 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9320 VPBasicBlock *VPBB = HeaderVPBB;
9321 BasicBlock *HeaderBB = OrigLoop->getHeader();
9322 bool NeedsMasks =
9323 CM.foldTailByMasking() ||
9324 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9325 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9326 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9327 });
9328
9329 RecipeBuilder.collectScaledReductions(Range);
9330
9331 auto *MiddleVPBB = Plan->getMiddleBlock();
9332 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9333 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9334 // Relevant instructions from basic block BB will be grouped into VPRecipe
9335 // ingredients and fill a new VPBasicBlock.
9336 if (VPBB != HeaderVPBB)
9337 VPBB->setName(BB->getName());
9338 Builder.setInsertPoint(VPBB);
9339
9340 if (VPBB == HeaderVPBB)
9341 RecipeBuilder.createHeaderMask();
9342 else if (NeedsMasks)
9343 RecipeBuilder.createBlockInMask(BB);
9344
9345 // Introduce each ingredient into VPlan.
9346 // TODO: Model and preserve debug intrinsics in VPlan.
9347 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9348 Instruction *Instr = &I;
9350 auto *Phi = dyn_cast<PHINode>(Instr);
9351 if (Phi && Phi->getParent() == HeaderBB) {
9352 Operands.push_back(Plan->getOrAddLiveIn(
9353 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9354 } else {
9355 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9356 Operands = {OpRange.begin(), OpRange.end()};
9357 }
9358
9359 // The stores with invariant address inside the loop will be deleted, and
9360 // in the exit block, a uniform store recipe will be created for the final
9361 // invariant store of the reduction.
9362 StoreInst *SI;
9363 if ((SI = dyn_cast<StoreInst>(&I)) &&
9364 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9365 // Only create recipe for the final invariant store of the reduction.
9366 if (!Legal->isInvariantStoreOfReduction(SI))
9367 continue;
9368 auto *Recipe = new VPReplicateRecipe(
9369 SI, make_range(Operands.begin(), Operands.end()),
9370 true /* IsUniform */);
9371 Recipe->insertBefore(*MiddleVPBB, MBIP);
9372 continue;
9373 }
9374
9375 VPRecipeBase *Recipe =
9376 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9377 if (!Recipe)
9378 Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
9379
9380 RecipeBuilder.setRecipe(Instr, Recipe);
9381 if (isa<VPHeaderPHIRecipe>(Recipe)) {
9382 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9383 // the following cases, VPHeaderPHIRecipes may be created after non-phi
9384 // recipes and need to be moved to the phi section of HeaderVPBB:
9385 // * tail-folding (non-phi recipes computing the header mask are
9386 // introduced earlier than regular header phi recipes, and should appear
9387 // after them)
9388 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9389
9390 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9391 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9392 "unexpected recipe needs moving");
9393 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9394 } else
9395 VPBB->appendRecipe(Recipe);
9396 }
9397
9398 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9399 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9400 }
9401
9402 // After here, VPBB should not be used.
9403 VPBB = nullptr;
9404
9405 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9406 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9407 "entry block must be set to a VPRegionBlock having a non-empty entry "
9408 "VPBasicBlock");
9409 RecipeBuilder.fixHeaderPhis();
9410
9411 // Update wide induction increments to use the same step as the corresponding
9412 // wide induction. This enables detecting induction increments directly in
9413 // VPlan and removes redundant splats.
9414 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9415 auto *IVInc = cast<Instruction>(
9416 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9417 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9418 continue;
9419 VPWidenInductionRecipe *WideIV =
9420 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9421 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9422 R->setOperand(1, WideIV->getStepValue());
9423 }
9424
9425 if (auto *UncountableExitingBlock =
9428 *PSE.getSE(), OrigLoop, UncountableExitingBlock,
9429 RecipeBuilder);
9430 }
9432 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9433 SetVector<VPIRInstruction *> ExitUsersToFix =
9434 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9435 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9436 addUsersInExitBlocks(*Plan, ExitUsersToFix);
9437
9438 // ---------------------------------------------------------------------------
9439 // Transform initial VPlan: Apply previously taken decisions, in order, to
9440 // bring the VPlan to its final state.
9441 // ---------------------------------------------------------------------------
9442
9443 // Adjust the recipes for any inloop reductions.
9444 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9445
9446 // Interleave memory: for each Interleave Group we marked earlier as relevant
9447 // for this VPlan, replace the Recipes widening its memory instructions with a
9448 // single VPInterleaveRecipe at its insertion point.
9450 InterleaveGroups, RecipeBuilder,
9452
9453 for (ElementCount VF : Range)
9454 Plan->addVF(VF);
9455 Plan->setName("Initial VPlan");
9456
9457 // Replace VPValues for known constant strides guaranteed by predicate scalar
9458 // evolution.
9459 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9460 auto *R = cast<VPRecipeBase>(&U);
9461 return R->getParent()->getParent() ||
9462 R->getParent() ==
9463 Plan->getVectorLoopRegion()->getSinglePredecessor();
9464 };
9465 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9466 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9467 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9468 // Only handle constant strides for now.
9469 if (!ScevStride)
9470 continue;
9471
9472 auto *CI = Plan->getOrAddLiveIn(
9473 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9474 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9475 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9476
9477 // The versioned value may not be used in the loop directly but through a
9478 // sext/zext. Add new live-ins in those cases.
9479 for (Value *U : StrideV->users()) {
9480 if (!isa<SExtInst, ZExtInst>(U))
9481 continue;
9482 VPValue *StrideVPV = Plan->getLiveIn(U);
9483 if (!StrideVPV)
9484 continue;
9485 unsigned BW = U->getType()->getScalarSizeInBits();
9486 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9487 : ScevStride->getAPInt().zext(BW);
9488 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9489 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9490 }
9491 }
9492
9493 auto BlockNeedsPredication = [this](BasicBlock *BB) {
9494 return Legal->blockNeedsPredication(BB);
9495 };
9497 BlockNeedsPredication);
9498
9499 // Sink users of fixed-order recurrence past the recipe defining the previous
9500 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9502 *Plan, Builder))
9503 return nullptr;
9504
9505 if (useActiveLaneMask(Style)) {
9506 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9507 // TailFoldingStyle is visible there.
9508 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9509 bool WithoutRuntimeCheck =
9511 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9512 WithoutRuntimeCheck);
9513 }
9515
9516 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9517 return Plan;
9518}
9519
// Build a VPlan for the VPlan-native (outer-loop) path: import the loop nest's
// hierarchical CFG as-is, add the requested VFs, widen inductions, add the
// canonical IV, and record header-phi recipes for scalar resume-phi creation.
// NOTE(review): this is a doxygen extraction; the callee name on elided line
// 9539 (the call taking Plan, the induction-descriptor lambda, SE and TLI) is
// not visible here — confirm against upstream LoopVectorize.cpp.
9520VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9521 // Outer loop handling: They may require CFG and instruction level
9522 // transformations before even evaluating whether vectorization is profitable.
9523 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9524 // the vectorization pipeline.
9525 assert(!OrigLoop->isInnermost());
9526 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9527
9528 // Create new empty VPlan
9529 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9530 true, false, OrigLoop);
9531
9532 // Build hierarchical CFG
9533 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9534 HCFGBuilder.buildHierarchicalCFG();
9535
9536 for (ElementCount VF : Range)
9537 Plan->addVF(VF);
9538
9540 Plan,
9541 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9542 *PSE.getSE(), *TLI);
9543
9544 // Tail folding is not supported for outer loops, so the induction increment
9545 // is guaranteed to not wrap.
9546 bool HasNUW = true;
9547 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9548 DebugLoc());
9549
9550 // Collect mapping of IR header phis to header phi recipes, to be used in
9551 // addScalarResumePhis.
9552 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9553 Builder);
9554 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// The canonical IV phi has no underlying IR instruction, so it is skipped;
// every other header phi recipe is keyed by its original PHINode.
9555 if (isa<VPCanonicalIVPHIRecipe>(&R))
9556 continue;
9557 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9558 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9559 }
9561 // TODO: IVEndValues are not used yet in the native path, to optimize exit
9562 // values.
9563 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9564
9565 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9566 return Plan;
9567}
9568
9569// Adjust the recipes for reductions. For in-loop reductions the chain of
9570// instructions leading from the loop exit instr to the phi need to be converted
9571// to reductions, with one operand being vector and the other being the scalar
9572// reduction chain. For other reductions, a select is introduced between the phi
9573// and users outside the vector region when folding the tail.
9574//
9575// A ComputeReductionResult recipe is added to the middle block, also for
9576// in-loop reductions which compute their result in-loop, because generating
9577// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9578//
9579// Adjust AnyOf reductions; replace the reduction phi for the selected value
9580// with a boolean reduction phi node to check if the condition is true in any
9581// iteration. The final value is selected by the final ComputeReductionResult.
// Rewrites reduction recipes after initial VPlan construction. Phase 1 (first
// loop over header phis): for in-loop reductions, walks the use-chain from the
// reduction phi and replaces each "link" with a VPReductionRecipe. Phase 2
// (second loop): for every reduction phi, adds tail-folding selects, optional
// truncate/extend for narrowed reductions, a ComputeReductionResult in the
// middle block, and special handling for AnyOf/FindLastIV recurrences.
// NOTE(review): doxygen extraction — several hyperlinked lines (e.g. the
// Worklist/ToDelete declarations, assert conditions at 9598-9599/9639/9671,
// the getBlockInMask guard at 9701) are elided; do not edit this body without
// the full upstream source.
9582void LoopVectorizationPlanner::adjustRecipesForReductions(
9583 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9584 using namespace VPlanPatternMatch;
9585 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9586 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9587 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9589
9590 for (VPRecipeBase &R : Header->phis()) {
// Only in-loop reduction phis are rewritten here; ordered reductions are
// still handled when MinVF is scalar.
9591 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9592 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9593 continue;
9594
9595 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9596 RecurKind Kind = RdxDesc.getRecurrenceKind();
9597 assert(
9600 "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9601
9602 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9604 Worklist.insert(PhiR);
// Worklist grows while iterating: this is a breadth-first traversal of the
// in-region users of the reduction chain.
9605 for (unsigned I = 0; I != Worklist.size(); ++I) {
9606 VPSingleDefRecipe *Cur = Worklist[I];
9607 for (VPUser *U : Cur->users()) {
9608 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9609 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9610 assert((UserRecipe->getParent() == MiddleVPBB ||
9611 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9612 "U must be either in the loop region, the middle block or the "
9613 "scalar preheader.");
9614 continue;
9615 }
9616 Worklist.insert(UserRecipe);
9617 }
9618 }
9619
9620 // Visit operation "Links" along the reduction chain top-down starting from
9621 // the phi until LoopExitValue. We keep track of the previous item
9622 // (PreviousLink) to tell which of the two operands of a Link will remain
9623 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9624 // the select instructions. Blend recipes of in-loop reduction phi's will
9625 // get folded to their non-phi operand, as the reduction recipe handles the
9626 // condition directly.
9627 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9628 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9629 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9630
9631 // Index of the first operand which holds a non-mask vector operand.
9632 unsigned IndexOfFirstOperand;
9633 // Recognize a call to the llvm.fmuladd intrinsic.
9634 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9635 VPValue *VecOp;
9636 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9637 if (IsFMulAdd) {
9638 assert(
9640 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9641 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9642 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9643 CurrentLink->getOperand(2) == PreviousLink &&
9644 "expected a call where the previous link is the added operand");
9645
9646 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9647 // need to create an fmul recipe (multiplying the first two operands of
9648 // the fmuladd together) to use as the vector operand for the fadd
9649 // reduction.
9650 VPInstruction *FMulRecipe = new VPInstruction(
9651 Instruction::FMul,
9652 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9653 CurrentLinkI->getFastMathFlags());
9654 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9655 VecOp = FMulRecipe;
9656 } else {
// Blends feeding an in-loop reduction are folded to their non-phi
// incoming value; the reduction recipe applies the mask itself.
9657 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9658 if (PhiR->isInLoop() && Blend) {
9659 assert(Blend->getNumIncomingValues() == 2 &&
9660 "Blend must have 2 incoming values");
9661 if (Blend->getIncomingValue(0) == PhiR)
9662 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9663 else {
9664 assert(Blend->getIncomingValue(1) == PhiR &&
9665 "PhiR must be an operand of the blend");
9666 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9667 }
9668 continue;
9669 }
9670
9672 if (isa<VPWidenRecipe>(CurrentLink)) {
9673 assert(isa<CmpInst>(CurrentLinkI) &&
9674 "need to have the compare of the select");
9675 continue;
9676 }
9677 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9678 "must be a select recipe");
9679 IndexOfFirstOperand = 1;
9680 } else {
9681 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9682 "Expected to replace a VPWidenSC");
9683 IndexOfFirstOperand = 0;
9684 }
9685 // Note that for non-commutable operands (cmp-selects), the semantics of
9686 // the cmp-select are captured in the recurrence kind.
9687 unsigned VecOpId =
9688 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9689 ? IndexOfFirstOperand + 1
9690 : IndexOfFirstOperand;
9691 VecOp = CurrentLink->getOperand(VecOpId);
9692 assert(VecOp != PreviousLink &&
9693 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9694 (VecOpId - IndexOfFirstOperand)) ==
9695 PreviousLink &&
9696 "PreviousLink must be the operand other than VecOp");
9697 }
9698
9699 BasicBlock *BB = CurrentLinkI->getParent();
9700 VPValue *CondOp = nullptr;
9702 CondOp = RecipeBuilder.getBlockInMask(BB);
9703
9704 auto *RedRecipe = new VPReductionRecipe(
9705 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9706 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9707 // Append the recipe to the end of the VPBasicBlock because we need to
9708 // ensure that it comes after all of it's inputs, including CondOp.
9709 // Delete CurrentLink as it will be invalid if its operand is replaced
9710 // with a reduction defined at the bottom of the block in the next link.
9711 LinkVPBB->appendRecipe(RedRecipe);
9712 CurrentLink->replaceAllUsesWith(RedRecipe);
9713 ToDelete.push_back(CurrentLink);
9714 PreviousLink = RedRecipe;
9715 }
9716 }
9717 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9718 Builder.setInsertPoint(&*LatchVPBB->begin());
9719 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9720 for (VPRecipeBase &R :
9721 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9722 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9723 if (!PhiR)
9724 continue;
9725
9726 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9727 // If tail is folded by masking, introduce selects between the phi
9728 // and the users outside the vector region of each reduction, at the
9729 // beginning of the dedicated latch block.
9730 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9731 auto *NewExitingVPV = PhiR->getBackedgeValue();
9732 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9733 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9734 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9735 "reduction recipe must be defined before latch");
9736 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
// Only FP reductions carry fast-math flags onto the new select.
9737 std::optional<FastMathFlags> FMFs =
9738 PhiTy->isFloatingPointTy()
9739 ? std::make_optional(RdxDesc.getFastMathFlags())
9740 : std::nullopt;
9741 NewExitingVPV =
9742 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9743 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9744 return isa<VPInstruction>(&U) &&
9745 cast<VPInstruction>(&U)->getOpcode() ==
9747 });
9749 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9750 PhiR->setOperand(1, NewExitingVPV);
9751 }
9752
9753 // If the vector reduction can be performed in a smaller type, we truncate
9754 // then extend the loop exit value to enable InstCombine to evaluate the
9755 // entire expression in the smaller type.
9756 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9757 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9759 RdxDesc.getRecurrenceKind())) {
9760 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9761 Type *RdxTy = RdxDesc.getRecurrenceType();
9762 auto *Trunc =
9763 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9764 auto *Extnd =
9765 RdxDesc.isSigned()
9766 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9767 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9768
9769 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9770 Extnd->insertAfter(Trunc);
9771 if (PhiR->getOperand(1) == NewExitingVPV)
9772 PhiR->setOperand(1, Extnd->getVPSingleValue());
9773 NewExitingVPV = Extnd;
9774 }
9775
9776 // We want code in the middle block to appear to execute on the location of
9777 // the scalar loop's latch terminator because: (a) it is all compiler
9778 // generated, (b) these instructions are always executed after evaluating
9779 // the latch conditional branch, and (c) other passes may add new
9780 // predecessors which terminate on this line. This is the easiest way to
9781 // ensure we don't accidentally cause an extra step back into the loop while
9782 // debugging.
9783 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9784
9785 // TODO: At the moment ComputeReductionResult also drives creation of the
9786 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9787 // even for in-loop reductions, until the reduction resume value handling is
9788 // also modeled in VPlan.
9789 auto *FinalReductionResult = new VPInstruction(
9790 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9791 // Update all users outside the vector region.
9792 OrigExitingVPV->replaceUsesWithIf(
9793 FinalReductionResult, [](VPUser &User, unsigned) {
9794 auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9795 return Parent && !Parent->getParent();
9796 });
9797 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9798
9799 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9800 // with a boolean reduction phi node to check if the condition is true in
9801 // any iteration. The final value is selected by the final
9802 // ComputeReductionResult.
9804 RdxDesc.getRecurrenceKind())) {
9805 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9806 return isa<VPWidenSelectRecipe>(U) ||
9807 (isa<VPReplicateRecipe>(U) &&
9808 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9809 Instruction::Select);
9810 }));
9811 VPValue *Cmp = Select->getOperand(0);
9812 // If the compare is checking the reduction PHI node, adjust it to check
9813 // the start value.
9814 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9815 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9816 if (CmpR->getOperand(I) == PhiR)
9817 CmpR->setOperand(I, PhiR->getStartValue());
9818 }
9819 VPBuilder::InsertPointGuard Guard(Builder);
9820 Builder.setInsertPoint(Select);
9821
9822 // If the true value of the select is the reduction phi, the new value is
9823 // selected if the negated condition is true in any iteration.
9824 if (Select->getOperand(1) == PhiR)
9825 Cmp = Builder.createNot(Cmp);
9826 VPValue *Or = Builder.createOr(PhiR, Cmp);
9827 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9828 // Delete Select now that it has invalid types.
9829 ToDelete.push_back(Select);
9830
9831 // Convert the reduction phi to operate on bools.
9832 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9833 OrigLoop->getHeader()->getContext())));
9834 continue;
9835 }
9836
9838 RdxDesc.getRecurrenceKind())) {
9839 // Adjust the start value for FindLastIV recurrences to use the sentinel
9840 // value after generating the ResumePhi recipe, which uses the original
9841 // start value.
9842 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9843 }
9844 }
// Deletion is deferred to here so that uses replaced above stay valid
// during the traversal.
9845 for (VPRecipeBase *R : ToDelete)
9846 R->eraseFromParent();
9847
9849}
9850
// Body of VPDerivedIVRecipe::execute (the signature on elided line 9851 is
// not visible in this extraction): computes the derived IV value for lane 0
// as Start + Index * Step via emitTransformedIndex, propagating fast-math
// flags from the original FP induction binop when present.
9852 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9853
9854 // Fast-math-flags propagate from the original induction instruction.
9856 if (FPBinOp)
9857 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9858
// Both step and index are scalar (lane-0) values.
9859 Value *Step = State.get(getStepValue(), VPLane(0));
9860 Value *Index = State.get(getOperand(1), VPLane(0));
9861 Value *DerivedIV = emitTransformedIndex(
9862 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9863 cast_if_present<BinaryOperator>(FPBinOp));
9864 DerivedIV->setName(Name);
9865 // If index is the vector trip count, the concrete value will only be set in
9866 // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9867 // TODO: Remove the special case for the vector trip count once it is computed
9868 // in VPlan and can be used during VPlan simplification.
9869 assert((DerivedIV != Index ||
9870 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9871 "IV didn't need transforming?");
9872 State.set(this, DerivedIV, VPLane(0));
9873}
9874
// Body of VPReplicateRecipe::execute (signature on elided lines 9875-9876):
// scalarizes the underlying instruction UI. Four cases, in order: a single
// requested lane (optionally packed into a vector), a uniform recipe (lane 0
// only), a store to a uniform address (last lane only), or all VF lanes.
9877 if (State.Lane) { // Generate a single instance.
9878 assert((State.VF.isScalar() || !isUniform()) &&
9879 "uniform recipe shouldn't be predicated");
9880 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9881 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9882 // Insert scalar instance packing it into a vector.
9883 if (State.VF.isVector() && shouldPack()) {
9884 // If we're constructing lane 0, initialize to start from poison.
9885 if (State.Lane->isFirstLane()) {
9886 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9888 VectorType::get(UI->getType(), State.VF));
9889 State.set(this, Poison);
9890 }
9891 State.packScalarIntoVectorValue(this, *State.Lane);
9892 }
9893 return;
9894 }
9895
9896 if (IsUniform) {
9897 // Uniform within VL means we need to generate lane 0.
9898 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
9899 return;
9900 }
9901
9902 // A store of a loop varying value to a uniform address only needs the last
9903 // copy of the store.
9904 if (isa<StoreInst>(UI) &&
9906 auto Lane = VPLane::getLastLaneForVF(State.VF);
9907 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9908 return;
9909 }
9910
9911 // Generate scalar instances for all VF lanes.
9912 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9913 const unsigned EndLane = State.VF.getKnownMinValue();
9914 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9915 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9916}
9917
9918// Determine how to lower the scalar epilogue, which depends on 1) optimising
9919// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9920// predication, and 4) a TTI hook that analyses whether the loop is suitable
9921// for predication.
// NOTE(review): the function signature (elided lines 9922-9925) and several
// return statements of this decision ladder are missing from this extraction;
// only the comments and condition heads survive. Confirm the returned
// ScalarEpilogueLowering values against upstream LoopVectorize.cpp.
9926 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9927 // don't look at hints or options, and don't request a scalar epilogue.
9928 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9929 // LoopAccessInfo (due to code dependency and not being able to reliably get
9930 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9931 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9932 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9933 // back to the old way and vectorize with versioning when forced. See D81345.)
9934 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9938
9939 // 2) If set, obey the directives
9940 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9948 };
9949 }
9950
9951 // 3) If set, obey the hints
9952 switch (Hints.getPredicate()) {
9957 };
9958
9959 // 4) if the TTI hook indicates this is profitable, request predication.
9960 TailFoldingInfo TFI(TLI, &LVL, IAI);
9963
9965}
9966
9967// Process the loop in the VPlan-native vectorization path. This path builds
9968// VPlan upfront in the vectorization pipeline, which allows to apply
9969// VPlan-to-VPlan transformations from the very beginning without modifying the
9970// input LLVM IR.
// NOTE(review): the function name and most parameters (elided lines
// 9971-9976) are missing here; the visible tail of the signature is below.
// Returns false when the outer-loop trip count cannot be computed or no
// vector code would be produced; true after a successful vectorization.
9977 LoopVectorizationRequirements &Requirements) {
9978
9979 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9980 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9981 return false;
9982 }
9983 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9984 Function *F = L->getHeader()->getParent();
9985 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9986
9988 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9989
9990 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9991 &Hints, IAI);
9992 // Use the planner for outer loop vectorization.
9993 // TODO: CM is not used at this point inside the planner. Turn CM into an
9994 // optional argument if we don't need it in the future.
9995 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9996 ORE);
9997
9998 // Get user vectorization factor.
9999 ElementCount UserVF = Hints.getWidth();
10000
10002
10003 // Plan how to best vectorize, return the best VF and its cost.
10004 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10005
10006 // If we are stress testing VPlan builds, do not attempt to generate vector
10007 // code. Masked vector code generation support will follow soon.
10008 // Also, do not attempt to vectorize if no vector code will be produced.
10010 return false;
10011
10012 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10013
// Scope block: Checks and LB only live for the duration of executePlan.
10014 {
10015 bool AddBranchWeights =
10016 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10017 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10018 AddBranchWeights, CM.CostKind);
10019 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10020 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10021 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10022 << L->getHeader()->getParent()->getName() << "\"\n");
10023 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10024 }
10025
10026 reportVectorization(ORE, L, VF, 1);
10027
10028 // Mark the loop as already vectorized to avoid vectorizing again.
10029 Hints.setAlreadyVectorized();
10030 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10031 return true;
10032}
10033
10034// Emit a remark if there are stores to floats that required a floating point
10035// extension. If the vectorized loop was generated with floating point there
10036// will be a performance penalty from the conversion overhead and the change in
10037// the vector width.
// NOTE(review): the signature (elided lines 10038-10039) and the Worklist /
// Visited / EmittedRemark declarations (elided 10051-10052) are missing in
// this extraction; from the body, Worklist holds Instruction* seeds (the
// float stores) and is expanded with their operand instructions.
10040 for (BasicBlock *BB : L->getBlocks()) {
10041 for (Instruction &Inst : *BB) {
10042 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10043 if (S->getValueOperand()->getType()->isFloatTy())
10044 Worklist.push_back(S);
10045 }
10046 }
10047 }
10048
10049 // Traverse the floating point stores upwards searching, for floating point
10050 // conversions.
10053 while (!Worklist.empty()) {
10054 auto *I = Worklist.pop_back_val();
// Stay within the loop and visit each instruction at most once.
10055 if (!L->contains(I))
10056 continue;
10057 if (!Visited.insert(I).second)
10058 continue;
10059
10060 // Emit a remark if the floating point store required a floating
10061 // point conversion.
10062 // TODO: More work could be done to identify the root cause such as a
10063 // constant or a function return type and point the user to it.
10064 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10065 ORE->emit([&]() {
10066 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10067 I->getDebugLoc(), L->getHeader())
10068 << "floating point conversion changes vector width. "
10069 << "Mixed floating point precision requires an up/down "
10070 << "cast that will negatively impact performance.";
10071 });
10072
10073 for (Use &Op : I->operands())
10074 if (auto *OpI = dyn_cast<Instruction>(Op))
10075 Worklist.push_back(OpI);
10076 }
10077}
10078
// Decide whether the generated runtime checks are worth it: derive a minimum
// profitable trip count from the check, scalar and vector costs, store it in
// VF.MinProfitableTripCount (set on elided line 10153), and return false when
// a known/estimated trip count is below it.
10079static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10080 VectorizationFactor &VF, Loop *L,
10083 std::optional<unsigned> VScale) {
10084 InstructionCost CheckCost = Checks.getCost();
10085 if (!CheckCost.isValid())
10086 return false;
10087
10088 // When interleaving only scalar and vector cost will be equal, which in turn
10089 // would lead to a divide by 0. Fall back to hard threshold.
10090 if (VF.Width.isScalar()) {
10091 if (CheckCost > VectorizeMemoryCheckThreshold) {
10092 LLVM_DEBUG(
10093 dbgs()
10094 << "LV: Interleaving only is not profitable due to runtime checks\n");
10095 return false;
10096 }
10097 return true;
10098 }
10099
10100 // The scalar cost should only be 0 when vectorizing with a user specified
 // VF/IC. In those cases, runtime checks should always be generated.
10101 uint64_t ScalarC = *VF.ScalarCost.getValue();
10102 if (ScalarC == 0)
10103 return true;
10104
10105 // First, compute the minimum iteration count required so that the vector
10106 // loop outperforms the scalar loop.
10107 // The total cost of the scalar loop is
10108 // ScalarC * TC
10109 // where
10110 // * TC is the actual trip count of the loop.
10111 // * ScalarC is the cost of a single scalar iteration.
10112 //
10113 // The total cost of the vector loop is
10114 // RtC + VecC * (TC / VF) + EpiC
10115 // where
10116 // * RtC is the cost of the generated runtime checks
10117 // * VecC is the cost of a single vector iteration.
10118 // * TC is the actual trip count of the loop
10119 // * VF is the vectorization factor
10120 // * EpiCost is the cost of the generated epilogue, including the cost
10121 // of the remaining scalar operations.
10122 //
10123 // Vectorization is profitable once the total vector cost is less than the
10124 // total scalar cost:
10125 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10126 //
10127 // Now we can compute the minimum required trip count TC as
10128 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10129 //
10130 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10131 // the computations are performed on doubles, not integers and the result
10132 // is rounded up, hence we get an upper estimate of the TC.
10133 unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
10134 uint64_t RtC = *CheckCost.getValue();
10135 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10136 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10137
10138 // Second, compute a minimum iteration count so that the cost of the
10139 // runtime checks is only a fraction of the total scalar loop cost. This
10140 // adds a loop-dependent bound on the overhead incurred if the runtime
10141 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10142 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10143 // cost, compute
10144 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
// X is fixed at 10 here, i.e. the checks may cost at most 1/10 of the
// scalar loop.
10145 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10146
10147 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10148 // epilogue is allowed, choose the next closest multiple of VF. This should
10149 // partly compensate for ignoring the epilogue cost.
10150 uint64_t MinTC = std::max(MinTC1, MinTC2);
10151 if (SEL == CM_ScalarEpilogueAllowed)
10152 MinTC = alignTo(MinTC, IntVF);
10154
10155 LLVM_DEBUG(
10156 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10157 << VF.MinProfitableTripCount << "\n");
10158
10159 // Skip vectorization if the expected trip count is less than the minimum
10160 // required trip count.
10161 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10164 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10165 "trip count < minimum profitable VF ("
10166 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10167 << ")\n");
10168
10169 return false;
10170 }
10171 }
10172 return true;
10173}
10174
// LoopVectorizePass constructor initializer list (the signature on elided
// line 10175 and the second operand of each || — elided lines 10177/10179 —
// are missing from this extraction). Both flags combine the pass options with
// a second condition; confirm against upstream LoopVectorize.cpp.
10176 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10178 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10180
10181/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10182/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10183/// don't have a corresponding wide induction in \p EpiPlan.
10184static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10185 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10186 // will need their resume-values computed in the main vector loop. Others
10187 // can be removed from the main VPlan.
10188 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10189 for (VPRecipeBase &R :
10191 if (isa<VPCanonicalIVPHIRecipe>(&R))
10192 continue;
10193 EpiWidenedPhis.insert(
10194 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10195 }
// Walk the scalar-header VPIRInstructions; phis come first, so stop at the
// first non-phi.
10197 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10198 auto *VPIRInst = cast<VPIRInstruction>(&R);
10199 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10200 if (!IRI)
10201 break;
10202 if (EpiWidenedPhis.contains(IRI))
10203 continue;
10204 // There is no corresponding wide induction in the epilogue plan that would
10205 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10206 // together with the corresponding ResumePhi. The resume values for the
10207 // scalar loop will be created during execution of EpiPlan.
10208 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10209 VPIRInst->eraseFromParent();
10210 ResumePhi->eraseFromParent();
10211 }
10213
10214 using namespace VPlanPatternMatch;
10215 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10216 VPValue *VectorTC = &MainPlan.getVectorTripCount();
10217 // If there is a suitable resume value for the canonical induction in the
10218 // scalar (which will become vector) epilogue loop we are done. Otherwise
10219 // create it below.
10220 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10221 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10222 m_Specific(VectorTC), m_SpecificInt(0)));
10223 }))
10224 return;
10225 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10226 ScalarPHBuilder.createNaryOp(
10228 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10229 "vec.epilog.resume.val");
10230}
10231
10232/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10233/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
// NOTE(review): source line 10235 (the line carrying the function name and the
// leading parameters, presumably "preparePlanForEpilogueVectorLoop(VPlan &Plan,
// Loop *L,") was dropped by the extraction — confirm against upstream.
10234static void
10236 const SCEV2ValueTy &ExpandedSCEVs,
10237 const EpilogueLoopVectorizationInfo &EPI) {
10238 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10239 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10240 Header->setName("vec.epilog.vector.body");
10241
10242 // Re-use the trip count and steps expanded for the main loop, as
10243 // skeleton creation needs it as a value that dominates both the scalar
10244 // and vector epilogue loops
10245 // TODO: This is a workaround needed for epilogue vectorization and it
10246 // should be removed once induction resume value creation is done
10247 // directly in VPlan.
// Replace each VPExpandSCEVRecipe in the plan entry with the IR value that was
// already expanded while executing the main-loop plan, then drop the recipe.
10248 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10249 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10250 if (!ExpandR)
10251 continue;
10252 auto *ExpandedVal =
10253 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10254 ExpandR->replaceAllUsesWith(ExpandedVal);
// Keep the plan's trip count pointing at a live value if it was the recipe.
10255 if (Plan.getTripCount() == ExpandR)
10256 Plan.resetTripCount(ExpandedVal);
10257 ExpandR->eraseFromParent();
10258 }
10259
10260 // Ensure that the start values for all header phi recipes are updated before
10261 // vectorizing the epilogue loop.
10262 for (VPRecipeBase &R : Header->phis()) {
10263 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10264 // When vectorizing the epilogue loop, the canonical induction start
10265 // value needs to be changed from zero to the value after the main
10266 // vector loop. Find the resume value created during execution of the main
10267 // VPlan.
10268 // FIXME: Improve modeling for canonical IV start values in the epilogue
10269 // loop.
// MainMiddle is the single predecessor of the scalar preheader that is not one
// of the skeleton's check blocks, i.e. the middle block of the main vector
// loop (the predicate below excludes every known check block).
10270 BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10271 predecessors(L->getLoopPreheader()),
10272 [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10273 if (BB != EPI.MainLoopIterationCountCheck &&
10274 BB != EPI.EpilogueIterationCountCheck &&
10275 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10276 return BB;
10277 return nullptr;
10278 });
10279 using namespace llvm::PatternMatch;
10280 Type *IdxTy = IV->getScalarType();
// The resume phi for the canonical IV is the unique preheader phi of the IV's
// type whose incoming value from the middle block is the main vector trip
// count and whose incoming value from the iteration-count check is zero.
10281 PHINode *EPResumeVal = find_singleton<PHINode>(
10282 L->getLoopPreheader()->phis(),
10283 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10284 if (P.getType() == IdxTy &&
10285 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10286 match(
10287 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10288 m_SpecificInt(0)))
10289 return &P;
10290 return nullptr;
10291 });
10292 assert(EPResumeVal && "must have a resume value for the canonical IV");
10293 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10294 assert(all_of(IV->users(),
10295 [](const VPUser *U) {
10296 return isa<VPScalarIVStepsRecipe>(U) ||
10297 isa<VPScalarCastRecipe>(U) ||
10298 isa<VPDerivedIVRecipe>(U) ||
10299 cast<VPInstruction>(U)->getOpcode() ==
10300 Instruction::Add;
10301 }) &&
10302 "the canonical IV should only be used by its increment or "
10303 "ScalarIVSteps when resetting the start value");
// Operand 0 of the canonical IV phi recipe is its start value.
10304 IV->setOperand(0, VPV);
10305 continue;
10306 }
10307
10308 Value *ResumeV = nullptr;
10309 // TODO: Move setting of resume values to prepareToExecute.
10310 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
// The resume value for a reduction comes from the scalar-loop phi's incoming
// value along the (epilogue) loop preheader edge.
10311 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10312 ->getIncomingValueForBlock(L->getLoopPreheader());
10313 const RecurrenceDescriptor &RdxDesc =
10314 ReductionPhi->getRecurrenceDescriptor();
10315 RecurKind RK = RdxDesc.getRecurrenceKind();
// NOTE(review): a guarding conditional (source line 10316) was dropped by the
// extraction here; the block below is presumably guarded by an
// AnyOf-recurrence-kind check on RK — confirm against upstream.
10317 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10318 // start value; compare the final value from the main vector loop
10319 // to the start value.
10320 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
10321 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
10322 ResumeV =
10323 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
// NOTE(review): source line 10324 (presumably an "else if" guard testing for a
// FindLastIV recurrence kind) was dropped by the extraction — confirm.
10325 // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10326 // to the resume value. The resume value is adjusted to the sentinel
10327 // value when the final value from the main vector loop equals the start
10328 // value. This ensures correctness when the start value might not be
10329 // less than the minimum value of a monotonically increasing induction
10330 // variable.
10331 BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
10332 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
10333 Value *Cmp =
10334 Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10335 ResumeV =
10336 Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10337 }
10338 } else {
10339 // Retrieve the induction resume values for wide inductions from
10340 // their original phi nodes in the scalar loop.
10341 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10342 // Hook up to the PHINode generated by a ResumePhi recipe of main
10343 // loop VPlan, which feeds the scalar loop.
10344 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10345 }
10346 assert(ResumeV && "Must have a resume value");
// Register the IR resume value as a live-in and make it the recipe's start.
10347 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10348 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10349 }
10350}
10351
// LoopVectorizePass::processLoop — drives legality, profitability and code
// generation for one innermost loop: checks hints/attributes, builds the
// legality analysis, plans VFs/ICs, emits remarks, and finally vectorizes,
// interleaves, or (optionally) epilogue-vectorizes the loop. Returns true if
// the loop was changed.
// NOTE(review): the opening signature line (source line 10352, presumably
// "bool LoopVectorizePass::processLoop(Loop *L) {") and a number of interior
// lines were dropped by the extraction; dropped spots are marked below.
10353 assert((EnableVPlanNativePath || L->isInnermost()) &&
10354 "VPlan-native path is not enabled. Only process inner loops.");
10355
10356 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10357 << L->getHeader()->getParent()->getName() << "' from "
10358 << L->getLocStr() << "\n");
10359
10360 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10361
// NOTE(review): source lines 10365/10367 (the force-kind conditions of this
// debug print) were dropped by the extraction.
10362 LLVM_DEBUG(
10363 dbgs() << "LV: Loop hints:"
10364 << " force="
10366 ? "disabled"
10368 ? "enabled"
10369 : "?"))
10370 << " width=" << Hints.getWidth()
10371 << " interleave=" << Hints.getInterleave() << "\n");
10372
10373 // Function containing loop
10374 Function *F = L->getHeader()->getParent();
10375
10376 // Looking at the diagnostic output is the only way to determine if a loop
10377 // was vectorized (other than looking at the IR or machine code), so it
10378 // is important to generate an optimization remark for each loop. Most of
10379 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10380 // generated as OptimizationRemark and OptimizationRemarkMissed are
10381 // less verbose reporting vectorized loops and unvectorized loops that may
10382 // benefit from vectorization, respectively.
10383
10384 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10385 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10386 return false;
10387 }
10388
10389 PredicatedScalarEvolution PSE(*SE, *L);
10390
10391 // Check if it is legal to vectorize the loop.
10392 LoopVectorizationRequirements Requirements;
10393 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10394 &Requirements, &Hints, DB, AC, BFI, PSI);
// NOTE(review): source line 10395 (presumably the "if (!LVL.canVectorize(...))"
// guard for this bail-out) was dropped by the extraction.
10396 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10397 Hints.emitRemarkWithHints();
10398 return false;
10399 }
10400
// NOTE(review): source line 10401 (the guard for this bail-out, presumably an
// uncountable-early-exit / feature-flag check) was dropped by the extraction.
10402 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10403 "early exit is not enabled",
10404 "UncountableEarlyExitLoopsDisabled", ORE, L);
10405 return false;
10406 }
10407
10408 if (LVL.hasStructVectorCall()) {
10409 reportVectorizationFailure("Auto-vectorization of calls that return struct "
10410 "types is not yet supported",
10411 "StructCallVectorizationUnsupported", ORE, L);
10412 return false;
10413 }
10414
10415 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10416 // here. They may require CFG and instruction level transformations before
10417 // even evaluating whether vectorization is profitable. Since we cannot modify
10418 // the incoming IR, we need to build VPlan upfront in the vectorization
10419 // pipeline.
10420 if (!L->isInnermost())
10421 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10422 ORE, BFI, PSI, Hints, Requirements);
10423
10424 assert(L->isInnermost() && "Inner loop expected.");
10425
10426 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10427 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10428
10429 // If an override option has been passed in for interleaved accesses, use it.
// NOTE(review): source line 10430 (the option-occurrence guard) was dropped.
10431 UseInterleaved = EnableInterleavedMemAccesses;
10432
10433 // Analyze interleaved memory accesses.
10434 if (UseInterleaved)
// NOTE(review): source line 10435 (the IAI.analyzeInterleaving(...) call body
// of this if) was dropped by the extraction.
10436
10437 if (LVL.hasUncountableEarlyExit()) {
10438 BasicBlock *LoopLatch = L->getLoopLatch();
10439 if (IAI.requiresScalarEpilogue() ||
// NOTE(review): source line 10440 (presumably an any_of over exiting blocks)
// was dropped by the extraction.
10441 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10442 reportVectorizationFailure("Auto-vectorization of early exit loops "
10443 "requiring a scalar epilogue is unsupported",
10444 "UncountableEarlyExitUnsupported", ORE, L);
10445 return false;
10446 }
10447 }
10448
10449 // Check the function attributes and profiles to find out if this function
10450 // should be optimized for size.
// NOTE(review): source line 10451 (the "ScalarEpilogueLowering SEL =" left-hand
// side of this initialization) was dropped by the extraction.
10452 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10453
10454 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10455 // count by optimizing for size, to minimize overheads.
10456 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10457 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10458 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10459 << "This loop is worth vectorizing only if no scalar "
10460 << "iteration overheads are incurred.");
// NOTE(review): source line 10461 (the "forced" condition of this if/else) was
// dropped by the extraction.
10462 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10463 else {
10464 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10465 LLVM_DEBUG(dbgs() << "\n");
10466 // Predicate tail-folded loops are efficient even when the loop
10467 // iteration count is low. However, setting the epilogue policy to
10468 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10469 // with runtime checks. It's more effective to let
10470 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10471 // for the loop.
// NOTE(review): source lines 10472-10473 (presumably the SEL assignment for
// the tail-folding policy) were dropped by the extraction.
10474 } else {
10475 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10476 "small to consider vectorizing.\n");
// NOTE(review): source line 10477 (the reportVectorizationFailure( call head)
// was dropped by the extraction. Typo fixed below: "minial" -> "minimal".
10478 "The trip count is below the minimal threshold value.",
10479 "loop trip count is too low, avoiding vectorization",
10480 "LowTripCount", ORE, L);
10481 Hints.emitRemarkWithHints();
10482 return false;
10483 }
10484 }
10485 }
10486
10487 // Check the function attributes to see if implicit floats or vectors are
10488 // allowed.
10489 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
// NOTE(review): source line 10490 (the reportVectorizationFailure( call head)
// was dropped by the extraction.
10491 "Can't vectorize when the NoImplicitFloat attribute is used",
10492 "loop not vectorized due to NoImplicitFloat attribute",
10493 "NoImplicitFloat", ORE, L);
10494 Hints.emitRemarkWithHints();
10495 return false;
10496 }
10497
10498 // Check if the target supports potentially unsafe FP vectorization.
10499 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10500 // for the target we're vectorizing for, to make sure none of the
10501 // additional fp-math flags can help.
10502 if (Hints.isPotentiallyUnsafe() &&
// NOTE(review): source lines 10503-10504 (the rest of this condition and the
// reportVectorizationFailure( call head) were dropped by the extraction.
10505 "Potentially unsafe FP op prevents vectorization",
10506 "loop not vectorized due to unsafe FP support.",
10507 "UnsafeFP", ORE, L);
10508 Hints.emitRemarkWithHints();
10509 return false;
10510 }
10511
10512 bool AllowOrderedReductions;
10513 // If the flag is set, use that instead and override the TTI behaviour.
// NOTE(review): source line 10514 (the flag-occurrence check guarding this
// assignment) was dropped by the extraction.
10515 AllowOrderedReductions = ForceOrderedReductions;
10516 else
10517 AllowOrderedReductions = TTI->enableOrderedReductions();
10518 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10519 ORE->emit([&]() {
10520 auto *ExactFPMathInst = Requirements.getExactFPInst();
10521 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10522 ExactFPMathInst->getDebugLoc(),
10523 ExactFPMathInst->getParent())
10524 << "loop not vectorized: cannot prove it is safe to reorder "
10525 "floating-point operations";
10526 });
10527 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10528 "reorder floating-point operations\n");
10529 Hints.emitRemarkWithHints();
10530 return false;
10531 }
10532
10533 // Use the cost model.
10534 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10535 F, &Hints, IAI);
10536 // Use the planner for vectorization.
10537 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10538 ORE);
10539
10540 // Get user vectorization factor and interleave count.
10541 ElementCount UserVF = Hints.getWidth();
10542 unsigned UserIC = Hints.getInterleave();
10543
10544 // Plan how to best vectorize.
10545 LVP.plan(UserVF, UserIC);
// NOTE(review): source line 10546 (the "VectorizationFactor VF = ..."
// selection) was dropped by the extraction.
10547 unsigned IC = 1;
10548
// NOTE(review): source lines 10549-10551 were dropped by the extraction.
10551
10552 bool AddBranchWeights =
10553 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10554 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10555 AddBranchWeights, CM.CostKind);
10556 if (LVP.hasPlanWithVF(VF.Width)) {
10557 // Select the interleave count.
10558 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10559
10560 unsigned SelectedIC = std::max(IC, UserIC);
10561 // Optimistically generate runtime checks if they are needed. Drop them if
10562 // they turn out to not be profitable.
10563 if (VF.Width.isVector() || SelectedIC > 1)
10564 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10565
10566 // Check if it is profitable to vectorize with runtime checks.
10567 bool ForceVectorization =
// NOTE(review): source line 10568 (the right-hand side of ForceVectorization)
// was dropped by the extraction.
10569 if (!ForceVectorization &&
10570 !areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
10571 CM.getVScaleForTuning())) {
10572 ORE->emit([&]() {
// NOTE(review): source line 10573 (the remark constructor head) was dropped.
10574 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10575 L->getHeader())
10576 << "loop not vectorized: cannot prove it is safe to reorder "
10577 "memory operations";
10578 });
10579 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10580 Hints.emitRemarkWithHints();
10581 return false;
10582 }
10583 }
10584
10585 // Identify the diagnostic messages that should be produced.
10586 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10587 bool VectorizeLoop = true, InterleaveLoop = true;
10588 if (VF.Width.isScalar()) {
10589 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10590 VecDiagMsg = std::make_pair(
10591 "VectorizationNotBeneficial",
10592 "the cost-model indicates that vectorization is not beneficial");
10593 VectorizeLoop = false;
10594 }
10595
10596 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10597 // Tell the user interleaving was avoided up-front, despite being explicitly
10598 // requested.
10599 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10600 "interleaving should be avoided up front\n");
10601 IntDiagMsg = std::make_pair(
10602 "InterleavingAvoided",
10603 "Ignoring UserIC, because interleaving was avoided up front");
10604 InterleaveLoop = false;
10605 } else if (IC == 1 && UserIC <= 1) {
10606 // Tell the user interleaving is not beneficial.
10607 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10608 IntDiagMsg = std::make_pair(
10609 "InterleavingNotBeneficial",
10610 "the cost-model indicates that interleaving is not beneficial");
10611 InterleaveLoop = false;
10612 if (UserIC == 1) {
10613 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10614 IntDiagMsg.second +=
10615 " and is explicitly disabled or interleave count is set to 1";
10616 }
10617 } else if (IC > 1 && UserIC == 1) {
10618 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10619 LLVM_DEBUG(
10620 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10621 IntDiagMsg = std::make_pair(
10622 "InterleavingBeneficialButDisabled",
10623 "the cost-model indicates that interleaving is beneficial "
10624 "but is explicitly disabled or interleave count is set to 1");
10625 InterleaveLoop = false;
10626 }
10627
10628 // If there is a histogram in the loop, do not just interleave without
10629 // vectorizing. The order of operations will be incorrect without the
10630 // histogram intrinsics, which are only used for recipes with VF > 1.
10631 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10632 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10633 << "to histogram operations.\n");
10634 IntDiagMsg = std::make_pair(
10635 "HistogramPreventsScalarInterleaving",
10636 "Unable to interleave without vectorization due to constraints on "
10637 "the order of histogram operations");
10638 InterleaveLoop = false;
10639 }
10640
10641 // Override IC if user provided an interleave count.
10642 IC = UserIC > 0 ? UserIC : IC;
10643
10644 // Emit diagnostic messages, if any.
10645 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10646 if (!VectorizeLoop && !InterleaveLoop) {
10647 // Do not vectorize or interleave the loop.
10648 ORE->emit([&]() {
10649 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10650 L->getStartLoc(), L->getHeader())
10651 << VecDiagMsg.second;
10652 });
10653 ORE->emit([&]() {
10654 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10655 L->getStartLoc(), L->getHeader())
10656 << IntDiagMsg.second;
10657 });
10658 return false;
10659 }
10660
10661 if (!VectorizeLoop && InterleaveLoop) {
10662 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10663 ORE->emit([&]() {
10664 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10665 L->getStartLoc(), L->getHeader())
10666 << VecDiagMsg.second;
10667 });
10668 } else if (VectorizeLoop && !InterleaveLoop) {
10669 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10670 << ") in " << L->getLocStr() << '\n');
10671 ORE->emit([&]() {
10672 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10673 L->getStartLoc(), L->getHeader())
10674 << IntDiagMsg.second;
10675 });
10676 } else if (VectorizeLoop && InterleaveLoop) {
10677 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10678 << ") in " << L->getLocStr() << '\n');
10679 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10680 }
10681
10682 bool DisableRuntimeUnroll = false;
10683 MDNode *OrigLoopID = L->getLoopID();
10684 {
10685 using namespace ore;
10686 if (!VectorizeLoop) {
10687 assert(IC > 1 && "interleave count should not be 1 or 0");
10688 // If we decided that it is not legal to vectorize the loop, then
10689 // interleave it.
10690 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10691 InnerLoopVectorizer Unroller(
10692 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10693 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10694
10695 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10696
10697 ORE->emit([&]() {
10698 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10699 L->getHeader())
10700 << "interleaved loop (interleaved count: "
10701 << NV("InterleaveCount", IC) << ")";
10702 });
10703 } else {
10704 // If we decided that it is *legal* to vectorize the loop, then do it.
10705
10706 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10707 // Consider vectorizing the epilogue too if it's profitable.
10708 VectorizationFactor EpilogueVF =
// NOTE(review): source line 10709 (the epilogue-VF selection call that
// initializes EpilogueVF) was dropped by the extraction.
10710 if (EpilogueVF.Width.isVector()) {
10711 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10712
10713 // The first pass vectorizes the main loop and creates a scalar epilogue
10714 // to be vectorized by executing the plan (potentially with a different
10715 // factor) again shortly afterwards.
10716 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10717 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10718 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10719 BestEpiPlan);
10720 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10721 EPI, &LVL, &CM, BFI, PSI, Checks,
10722 *BestMainPlan);
10723 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10724 *BestMainPlan, MainILV, DT, false);
10725 ++LoopsVectorized;
10726
10727 // Second pass vectorizes the epilogue and adjusts the control flow
10728 // edges from the first pass.
10729 EPI.MainLoopVF = EPI.EpilogueVF;
10730 EPI.MainLoopUF = EPI.EpilogueUF;
10731 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10732 ORE, EPI, &LVL, &CM, BFI, PSI,
10733 Checks, BestEpiPlan);
10734 EpilogILV.setTripCount(MainILV.getTripCount());
10735 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10736
10737 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10738 DT, true, &ExpandedSCEVs);
10739 ++LoopsEpilogueVectorized;
10740
10741 if (!MainILV.areSafetyChecksAdded())
10742 DisableRuntimeUnroll = true;
10743 } else {
10744 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10745 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10746 PSI, Checks, BestPlan);
10747 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10748 ++LoopsVectorized;
10749
10750 // Add metadata to disable runtime unrolling a scalar loop when there
10751 // are no runtime checks about strides and memory. A scalar loop that is
10752 // rarely used is not worth unrolling.
10753 if (!LB.areSafetyChecksAdded())
10754 DisableRuntimeUnroll = true;
10755 }
10756 // Report the vectorization decision.
10757 reportVectorization(ORE, L, VF, IC);
10758 }
10759
// NOTE(review): source lines 10760-10761 were dropped by the extraction.
10762 }
10763
10764 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10765 "DT not preserved correctly");
10766
10767 std::optional<MDNode *> RemainderLoopID =
// NOTE(review): source lines 10768-10769 (the makeFollowupLoopID-style call
// initializing RemainderLoopID from OrigLoopID) were dropped by the extraction.
10770 if (RemainderLoopID) {
10771 L->setLoopID(*RemainderLoopID);
10772 } else {
10773 if (DisableRuntimeUnroll)
// NOTE(review): source lines 10774-10775 (the addRuntimeUnrollDisableMetaData
// call) were dropped by the extraction.
10775
10776 // Mark the loop as already vectorized to avoid vectorizing again.
10777 Hints.setAlreadyVectorized();
10778 }
10779
10780 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10781 return true;
10782}
10783
// LoopVectorizePass::runImpl — simplifies all loops, collects supported
// innermost loops into a worklist, and runs processLoop on each; returns a
// LoopVectorizeResult recording whether anything (and whether the CFG) changed.
// NOTE(review): the function signature (source line 10784) and the bail-out
// condition (source lines 10793-10794) were dropped by the extraction —
// confirm against upstream.
10785
10786 // Don't attempt if
10787 // 1. the target claims to have no vector registers, and
10788 // 2. interleaving won't help ILP.
10789 //
10790 // The second condition is necessary because, even if the target has no
10791 // vector registers, loop vectorization may still enable scalar
10792 // interleaving.
10795 return LoopVectorizeResult(false, false);
10796
10797 bool Changed = false, CFGChanged = false;
10798
10799 // The vectorizer requires loops to be in simplified form.
10800 // Since simplification may add new inner loops, it has to run before the
10801 // legality and profitability checks. This means running the loop vectorizer
10802 // will simplify all loops, regardless of whether anything ends up being
10803 // vectorized.
10804 for (const auto &L : *LI)
10805 Changed |= CFGChanged |=
10806 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10807
10808 // Build up a worklist of inner-loops to vectorize. This is necessary as
10809 // the act of vectorizing or partially unrolling a loop creates new loops
10810 // and can invalidate iterators across the loops.
10811 SmallVector<Loop *, 8> Worklist;
10812
10813 for (Loop *L : *LI)
10814 collectSupportedLoops(*L, LI, ORE, Worklist);
10815
10816 LoopsAnalyzed += Worklist.size();
10817
10818 // Now walk the identified inner loops.
10819 while (!Worklist.empty()) {
10820 Loop *L = Worklist.pop_back_val();
10821
10822 // For the inner loops we actually process, form LCSSA to simplify the
10823 // transform.
10824 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10825
10826 Changed |= CFGChanged |= processLoop(L);
10827
// Transforming any loop invalidates cached loop-access info; drop it (and,
// in asserts builds, re-verify SCEV when requested).
10828 if (Changed) {
10829 LAIs->clear();
10830
10831#ifndef NDEBUG
10832 if (VerifySCEV)
10833 SE->verify();
10834#endif
10835 }
10836 }
10837
10838 // Process each loop nest in the function.
10839 return LoopVectorizeResult(Changed, CFGChanged);
10840}
10841
// LoopVectorizePass::run — new-pass-manager entry point: fetches the analyses
// runImpl needs, runs it, and reports which analyses are preserved.
// NOTE(review): the function signature (source lines 10842-10843) and several
// analysis-getter lines (e.g. 10849-10856, 10862, 10866, 10870, 10874-10876,
// 10883-10886) were dropped by the extraction — confirm against upstream.
10844 LI = &AM.getResult<LoopAnalysis>(F);
10845 // There are no loops in the function. Return before computing other
10846 // expensive analyses.
10847 if (LI->empty())
10848 return PreservedAnalyses::all();
10857
10858 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
// PSI may be null: only the cached module-level profile summary is consulted.
10859 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10860 BFI = nullptr;
10861 if (PSI && PSI->hasProfileSummary())
10863 LoopVectorizeResult Result = runImpl(F);
10864 if (!Result.MadeAnyChange)
10865 return PreservedAnalyses::all();
10867
10868 if (isAssignmentTrackingEnabled(*F.getParent())) {
10869 for (auto &BB : F)
10871 }
10872
10873 PA.preserve<LoopAnalysis>();
10877
10878 if (Result.MadeCFGChange) {
10879 // Making CFG changes likely means a loop got vectorized. Indicate that
10880 // extra simplification passes should be run.
10881 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10882 // be run if runtime checks have been added.
10885 } else {
10887 }
10888 return PA;
10889}
10890
// LoopVectorizePass::printPipeline — prints this pass's textual pipeline form,
// delegating to PassInfoMixin for the pass name and then appending the two
// boolean options in "<...>" syntax.
// NOTE(review): the first signature line (source line 10891, presumably
// "void LoopVectorizePass::printPipeline(") was dropped by the extraction.
10892 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10893 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10894 OS, MapClassName2PassName);
10895
10896 OS << '<';
10897 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10898 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10899 OS << '>';
10900}
@ Poison
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
std::string Name
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void addRuntimeUnrollDisableMetaData(Loop *L)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
const char LLVMLoopVectorizeFollowupAll[]
static SetVector< VPIRInstruction * > collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, SetVector< VPIRInstruction * > &ExitUsersToFix)
Handle users in the exit block for first order reductions in the original exit block.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static unsigned getEstimatedRuntimeVF(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the vectorization factor at runtime.
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop)
Return true if the original loop \p TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static Type * maybeVectorizeType(Type *Elt, ElementCount VF)
static std::optional< unsigned > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static void fixReductionScalarResumeWhenVectorizingEpilog(VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, BasicBlock *BypassBlock)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, const EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static void addUsersInExitBlocks(VPlan &Plan, const SetVector< VPIRInstruction * > &ExitUsersToFix)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(false), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define OP(OPC)
Definition: Instruction.h:45
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:464
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:530
InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:381
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:481
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:511
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:240
BinaryOps getOpcode() const
Definition: InstrTypes.h:370
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1881
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:338
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:105
param_iterator param_begin() const
Definition: DerivedTypes.h:130
param_iterator param_end() const
Definition: DerivedTypes.h:131
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:719
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:716
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags inBounds()
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2274
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:889
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2270
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:164
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1387
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1370
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1447
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
virtual BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
virtual BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
LoopVectorizationCostModel * Cost
The profitablity analysis.
BasicBlock * AdditionalBypassBlock
The additional bypass block which conditionally skips over the epilogue loop after executing the main...
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
DenseMap< PHINode *, Value * > Induction2AdditionalBypassValue
Mapping of induction phis to their additional bypass values.
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
Create and record the values for induction variables to resume coming from the additional bypass bloc...
VPBlockBase * VectorPHVPB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
LoopVectorizationLegality * Legal
The legality analysis.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
Value * getInductionAdditionalBypassValue(PHINode *OrigPhi) const
induction header phi.
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
unsigned UF
The vectorization unroll factor to use.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:511
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:68
bool isBinaryOp() const
Definition: Instruction.h:315
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
Definition: Instruction.h:312
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:310
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:488
uint32_t getFactor() const
Definition: VectorUtils.h:504
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:558
InstTy * getInsertPos() const
Definition: VectorUtils.h:574
uint32_t getNumMembers() const
Definition: VectorUtils.h:506
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:630
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:675
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:686
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:667
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:650
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:680
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Type * getPointerOperandType() const
Definition: Instructions.h:258
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getUniqueLatchExitBlock() const
Return the unique exit block for the latch, or null if there are multiple different exit blocks or th...
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1266
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasStructVectorCall() const
Returns true if there is at least one function call in the loop which returns a struct type and needs...
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns a list of all known histogram operations in the loop.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block, if there is exactly one.
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition: VPlan.cpp:1606
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition: VPlan.cpp:1591
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1572
void printPlans(raw_ostream &O)
Definition: VPlan.cpp:1620
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1073
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool contains(const KeyT &Key) const
Definition: MapVector.h:163
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:692
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:77
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getSentinelValue() const
Returns the sentinel value for FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getEpilogueVectorizationMinVF() const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp=std::nullopt) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:252
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:234
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:280
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
op_iterator op_end()
Definition: User.h:282
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3200
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3275
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3227
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:482
iterator end()
Definition: VPlan.h:3237
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3235
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3288
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:210
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3266
bool empty() const
Definition: VPlan.h:3246
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2158
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:78
VPRegionBlock * getParent()
Definition: VPlan.h:170
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:180
void setName(const Twine &newName)
Definition: VPlan.h:163
size_t getNumSuccessors() const
Definition: VPlan.h:216
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:309
VPlan * getPlan()
Definition: VPlan.cpp:155
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:212
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:160
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:206
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:195
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlanUtils.h:88
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlanUtils.h:204
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:142
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition: VPlanUtils.h:169
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
void insert(VPRecipeBase *R)
Insert R at the current insertion point.
VPScalarCastRecipe * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2897
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:2928
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:394
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:3130
VPValue * getStartValue() const
Definition: VPlan.h:3129
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1692
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1740
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1729
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition: VPlan.h:1442
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3342
A recipe to wrap an original IR instruction not to be modified during execution, except for PHIs.
Definition: VPlan.h:1036
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:845
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:863
@ ComputeReductionResult
Definition: VPlan.h:869
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2225
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Definition: VPlanHelpers.h:116
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlanHelpers.h:157
static VPLane getFirstLane()
Definition: VPlanHelpers.h:141
A recipe for forming partial reductions.
Definition: VPlan.h:2111
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:366
VPBasicBlock * getParent()
Definition: VPlan.h:391
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:460
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
VPValue * getVPValueOrAddLiveIn(Value *V)
void createHeaderMask()
Create the mask for the vector loop header block.
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPReplicateRecipe * handleReplication(Instruction *I, ArrayRef< VPValue * > Operands, VFRange &Range)
Build a VPReplicateRecipe for I using Operands.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:759
A recipe for handling reduction phis.
Definition: VPlan.h:2045
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2104
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2096
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2320
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3377
const VPBlockBase * getEntry() const
Definition: VPlan.h:3413
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3445
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2441
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2485
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
A recipe to compute the pointers for widened memory accesses of IndexTy in reverse order.
Definition: VPlan.h:1569
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:493
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:563
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:40
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:206
operand_range operands()
Definition: VPlanValue.h:263
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:248
unsigned getNumOperands() const
Definition: VPlanValue.h:242
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:243
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:237
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:125
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1438
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1442
user_range users()
Definition: VPlanValue.h:138
A recipe to compute the pointers for widened memory accesses of IndexTy.
Definition: VPlan.h:1622
A recipe for widening Call instructions using library calls.
Definition: VPlan.h:1386
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3038
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1194
A recipe for handling GEP instructions.
Definition: VPlan.h:1520
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:1754
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:1782
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:1788
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1807
A recipe for widening vector intrinsics.
Definition: VPlan.h:1294
A common base class for widening memory operations.
Definition: VPlan.h:2614
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1967
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:2007
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:2004
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1096
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3476
void prepareToExecute(Value *TripCount, Value *VectorTripCount, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:938
VPBasicBlock * getEntry()
Definition: VPlan.h:3589
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3654
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3660
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:3657
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3633
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3647
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3677
unsigned getUF() const
Definition: VPlan.h:3685
static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which cont...
Definition: VPlan.cpp:859
bool hasVF(ElementCount VF)
Definition: VPlan.h:3670
bool hasUF(unsigned UF) const
Definition: VPlan.h:3683
auto getExitBlocks()
Return an iterator range over the VPIRBasicBlock wrapping the exit blocks of the VPlan,...
Definition: VPlanCFG.h:310
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1070
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1064
const VPBasicBlock * getMiddleBlock() const
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:3608
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3640
void setEntry(VPBasicBlock *VPBB)
Definition: VPlan.h:3559
VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
Definition: VPlan.cpp:1270
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3703
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:3616
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:974
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3737
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:3621
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1210
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1094
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
int getNumOccurrences() const
Definition: CommandLine.h:399
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
Definition: Path.cpp:235
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlanUtils.h:41
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:26
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:65
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1954
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:850
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
auto pred_end(const MachineBasicBlock *BB)
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7301
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:215
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:227
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:54
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:74
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:573
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2299
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
cl::opt< bool > VerifyEachVPlan("vplan-verify-each", cl::init(false), cl::Hidden, cl::desc("Verify VPlans after VPlan transforms."))
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1761
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto pred_begin(const MachineBasicBlock *BB)
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:2012
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlanHelpers.h:57
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
LoopVectorizeResult runImpl(Function &F)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker analysis to determine if extra passes should be run after loop vectorization.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlanHelpers.h:62
ElementCount End
Definition: VPlanHelpers.h:67
Struct to hold various analysis needed for cost computations.
Definition: VPlanHelpers.h:356
LoopVectorizationCostModel & CM
Definition: VPlanHelpers.h:361
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlanHelpers.h:362
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2013
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlanHelpers.h:304
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlanHelpers.h:312
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlanHelpers.h:196
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlanHelpers.h:349
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlanHelpers.h:352
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:395
Value * get(VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition: VPlan.cpp:251
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlanHelpers.h:345
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:354
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
Definition: VPlanHelpers.h:210
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlanHelpers.h:329
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlanHelpers.h:335
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlanHelpers.h:332
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
Definition: VPlanHelpers.h:205
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:373
void set(VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
Definition: VPlanHelpers.h:239
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2694
A recipe for widening select instructions.
Definition: VPlan.h:1483
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2772
static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder)
Update Plan to account for the uncountable early exit block in UncountableExitingBlock by.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
Explicitly unroll Plan by UF.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static bool tryAddExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.