LLVM 21.0.0git
LoopVectorize.cpp
Go to the documentation of this file.
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
100#include "llvm/IR/Attributes.h"
101#include "llvm/IR/BasicBlock.h"
102#include "llvm/IR/CFG.h"
103#include "llvm/IR/Constant.h"
104#include "llvm/IR/Constants.h"
105#include "llvm/IR/DataLayout.h"
106#include "llvm/IR/DebugInfo.h"
107#include "llvm/IR/DebugLoc.h"
108#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/Dominators.h"
111#include "llvm/IR/Function.h"
112#include "llvm/IR/IRBuilder.h"
113#include "llvm/IR/InstrTypes.h"
114#include "llvm/IR/Instruction.h"
115#include "llvm/IR/Instructions.h"
117#include "llvm/IR/Intrinsics.h"
118#include "llvm/IR/MDBuilder.h"
119#include "llvm/IR/Metadata.h"
120#include "llvm/IR/Module.h"
121#include "llvm/IR/Operator.h"
122#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/Type.h"
125#include "llvm/IR/Use.h"
126#include "llvm/IR/User.h"
127#include "llvm/IR/Value.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
131#include "llvm/Support/Debug.h"
146#include <algorithm>
147#include <cassert>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
170 "llvm.loop.vectorize.followup_vectorized";
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178
180 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
181 cl::desc("Enable vectorization of epilogue loops."));
182
184 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185 cl::desc("When epilogue vectorization is enabled, and a value greater than "
186 "1 is specified, forces the given VF for all applicable epilogue "
187 "loops."));
188
190 "epilogue-vectorization-minimum-VF", cl::Hidden,
191 cl::desc("Only loops with vectorization factor equal to or larger than "
192 "the specified value are considered for epilogue vectorization."));
193
194/// Loops with a known constant trip count below this number are vectorized only
195/// if no scalar iteration overheads are incurred.
197 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
198 cl::desc("Loops with a constant trip count that is smaller than this "
199 "value are vectorized only if no scalar iteration overheads "
200 "are incurred."));
201
203 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
204 cl::desc("The maximum allowed number of runtime memory checks"));
205
206// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207// that predication is preferred, and this lists all options. I.e., the
208// vectorizer will try to fold the tail-loop (epilogue) into the vector body
209// and predicate the instructions accordingly. If tail-folding fails, there are
210// different fallback strategies depending on these values:
212 enum Option {
216 };
217} // namespace PreferPredicateTy
218
220 "prefer-predicate-over-epilogue",
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
233 "predicate-dont-vectorize",
234 "prefers tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
238 "force-tail-folding-style", cl::desc("Force the tail folding style"),
239 cl::init(TailFoldingStyle::None),
241 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
243 TailFoldingStyle::Data, "data",
244 "Create lane mask for data only, using active.lane.mask intrinsic"),
245 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
246 "data-without-lane-mask",
247 "Create lane mask with compare/stepvector"),
248 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
249 "Create lane mask using active.lane.mask intrinsic, and use "
250 "it for both data and control flow"),
251 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252 "data-and-control-without-rt-check",
253 "Similar to data-and-control, but remove the runtime check"),
254 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
255 "Use predicated EVL instructions for tail folding. If EVL "
256 "is unsupported, fallback to data-without-lane-mask.")));
257
259 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
260 cl::desc("Maximize bandwidth when selecting vectorization factor which "
261 "will be determined by the smallest type in loop."));
262
264 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
266
267/// An interleave-group may need masking if it resides in a block that needs
268/// predication, or in order to mask away gaps.
270 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
271 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
272
274 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
275 cl::desc("A flag that overrides the target's number of scalar registers."));
276
278 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of vector registers."));
280
282 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
283 cl::desc("A flag that overrides the target's max interleave factor for "
284 "scalar loops."));
285
287 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
288 cl::desc("A flag that overrides the target's max interleave factor for "
289 "vectorized loops."));
290
292 "force-target-instruction-cost", cl::init(0), cl::Hidden,
293 cl::desc("A flag that overrides the target's expected cost for "
294 "an instruction to a single constant value. Mostly "
295 "useful for getting consistent testing."));
296
298 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
299 cl::desc(
300 "Pretend that scalable vectors are supported, even if the target does "
301 "not support them. This flag should only be used for testing."));
302
304 "small-loop-cost", cl::init(20), cl::Hidden,
305 cl::desc(
306 "The cost of a loop that is considered 'small' by the interleaver."));
307
309 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
310 cl::desc("Enable the use of the block frequency analysis to access PGO "
311 "heuristics minimizing code growth in cold regions and being more "
312 "aggressive in hot regions."));
313
314// Runtime interleave loops for load/store throughput.
316 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
317 cl::desc(
318 "Enable runtime interleaving until load/store ports are saturated"));
319
320/// The number of stores in a loop that are allowed to need predication.
322 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
323 cl::desc("Max number of stores to be predicated behind an if."));
324
326 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
327 cl::desc("Count the induction variable only once when interleaving"));
328
330 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
331 cl::desc("Enable if predication of stores during vectorization."));
332
334 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
335 cl::desc("The maximum interleave count to use when interleaving a scalar "
336 "reduction in a nested loop."));
337
338static cl::opt<bool>
339 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
341 cl::desc("Prefer in-loop vector reductions, "
342 "overriding the targets preference."));
343
345 "force-ordered-reductions", cl::init(false), cl::Hidden,
346 cl::desc("Enable the vectorisation of loops with in-order (strict) "
347 "FP reductions"));
348
350 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
351 cl::desc(
352 "Prefer predicating a reduction operation over an after loop select."));
353
354namespace llvm {
356 "enable-vplan-native-path", cl::Hidden,
357 cl::desc("Enable VPlan-native vectorization path with "
358 "support for outer loop vectorization."));
359
361 VerifyEachVPlan("vplan-verify-each",
362#ifdef EXPENSIVE_CHECKS
363 cl::init(true),
364#else
365 cl::init(false),
366#endif
368 cl::desc("Verfiy VPlans after VPlan transforms."));
369} // namespace llvm
370
371// This flag enables the stress testing of the VPlan H-CFG construction in the
372// VPlan-native vectorization path. It must be used in conjunction with
373// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
374// verification of the H-CFGs built.
376 "vplan-build-stress-test", cl::init(false), cl::Hidden,
377 cl::desc(
378 "Build VPlan for every supported loop nest in the function and bail "
379 "out right after the build (stress test the VPlan H-CFG construction "
380 "in the VPlan-native vectorization path)."));
381
383 "interleave-loops", cl::init(true), cl::Hidden,
384 cl::desc("Enable loop interleaving in Loop vectorization passes"));
386 "vectorize-loops", cl::init(true), cl::Hidden,
387 cl::desc("Run the Loop vectorization passes"));
388
390 "force-widen-divrem-via-safe-divisor", cl::Hidden,
391 cl::desc(
392 "Override cost based safe divisor widening for div/rem instructions"));
393
395 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
397 cl::desc("Try wider VFs if they enable the use of vector variants"));
398
400 "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
401 cl::desc(
402 "Enable vectorization of early exit loops with uncountable exits."));
403
404// Likelihood of bypassing the vectorized loop because assumptions about SCEV
405// variables not overflowing do not hold. See `emitSCEVChecks`.
406static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
407// Likelihood of bypassing the vectorized loop because pointers overlap. See
408// `emitMemRuntimeChecks`.
409static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
410// Likelihood of bypassing the vectorized loop because there are zero trips left
411// after prolog. See `emitIterationCountCheck`.
412static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
413
414/// A helper function that returns true if the given type is irregular. The
415/// type is irregular if its allocated size doesn't equal the store size of an
416/// element of the corresponding vector type.
417static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
418 // Determine if an array of N elements of type Ty is "bitcast compatible"
419 // with a <N x Ty> vector.
420 // This is only true if there is no padding between the array elements.
421 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
422}
423
424/// Returns "best known" trip count for the specified loop \p L as defined by
425/// the following procedure:
426/// 1) Returns exact trip count if it is known.
427/// 2) Returns expected trip count according to profile data if any.
428/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
429/// 4) Returns std::nullopt if all of the above failed.
430static std::optional<unsigned>
432 bool CanUseConstantMax = true) {
433 // Check if exact trip count is known.
434 if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
435 return ExpectedTC;
436
437 // Check if there is an expected trip count available from profile data.
439 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
440 return *EstimatedTC;
441
442 if (!CanUseConstantMax)
443 return std::nullopt;
444
445 // Check if upper bound estimate is known.
446 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
447 return ExpectedTC;
448
449 return std::nullopt;
450}
451
452namespace {
453// Forward declare GeneratedRTChecks.
454class GeneratedRTChecks;
455
456using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
457} // namespace
458
459namespace llvm {
460
462
463/// InnerLoopVectorizer vectorizes loops which contain only one basic
464/// block to a specified vectorization factor (VF).
465/// This class performs the widening of scalars into vectors, or multiple
466/// scalars. This class also implements the following features:
467/// * It inserts an epilogue loop for handling loops that don't have iteration
468/// counts that are known to be a multiple of the vectorization factor.
469/// * It handles the code generation for reduction variables.
470/// * Scalarization (implementation using scalars) of un-vectorizable
471/// instructions.
472/// InnerLoopVectorizer does not perform any vectorization-legality
473/// checks, and relies on the caller to check for the different legality
474/// aspects. The InnerLoopVectorizer relies on the
475/// LoopVectorizationLegality class to provide information about the induction
476/// and reduction variables that were found to a given vectorization factor.
478public:
481 const TargetLibraryInfo *TLI,
485 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
487 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
488 VPlan &Plan)
489 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
490 AC(AC), ORE(ORE), VF(VecWidth),
492 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
494 VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
495 // Query this against the original loop and save it here because the profile
496 // of the original loop header may change as the transformation happens.
499 }
500
501 virtual ~InnerLoopVectorizer() = default;
502
503 /// Create a new empty loop that will contain vectorized instructions later
504 /// on, while the old loop will be used as the scalar remainder. Control flow
505 /// is generated around the vectorized (and scalar epilogue) loops consisting
506 /// of various checks and bypasses. Return the pre-header block of the new
507/// loop. In the case of epilogue vectorization, this function is overridden to
508 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
509 /// used to look up SCEV expansions for expressions needed during skeleton
510 /// creation.
511 virtual BasicBlock *
512 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
513
514 /// Fix the vectorized code, taking care of header phi's, and more.
516
517 // Return true if any runtime check is added.
519
520 /// A helper function to scalarize a single Instruction in the innermost loop.
521 /// Generates a sequence of scalar instances for each lane between \p MinLane
522 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
523 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
524 /// Instr's operands.
525 void scalarizeInstruction(const Instruction *Instr,
526 VPReplicateRecipe *RepRecipe, const VPLane &Lane,
527 VPTransformState &State);
528
529 /// Fix the non-induction PHIs in \p Plan.
531
532 /// Returns the original loop trip count.
533 Value *getTripCount() const { return TripCount; }
534
535 /// Used to set the trip count after ILV's construction and after the
536 /// preheader block has been executed. Note that this always holds the trip
537 /// count of the original loop for both main loop and epilogue vectorization.
538 void setTripCount(Value *TC) { TripCount = TC; }
539
540 /// Retrieve the additional bypass value associated with an original
541 /// induction header phi.
543 return Induction2AdditionalBypassValue.at(OrigPhi);
544 }
545
546 /// Return the additional bypass block which targets the scalar loop by
547 /// skipping the epilogue loop after completing the main loop.
550 "Trying to access AdditionalBypassBlock but it has not been set");
552 }
553
554protected:
556
557 /// Iteratively sink the scalarized operands of a predicated instruction into
558 /// the block that was created for it.
559 void sinkScalarOperands(Instruction *PredInst);
560
561 /// Returns (and creates if needed) the trip count of the widened loop.
563
564 /// Emit a bypass check to see if the vector trip count is zero, including if
565 /// it overflows.
567
568 /// Emit a bypass check to see if all of the SCEV assumptions we've
569 /// had to make are correct. Returns the block containing the checks or
570 /// nullptr if no checks have been added.
572
573 /// Emit bypass checks to check any memory assumptions we may have made.
574 /// Returns the block containing the checks or nullptr if no checks have been
575 /// added.
577
578 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
579 /// vector loop preheader, middle block and scalar preheader.
581
582 /// Create and record the values for induction variables to resume coming from
583 /// the additional bypass block.
584 void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
585 Value *MainVectorTripCount);
586
587 /// Allow subclasses to override and print debug traces before/after vplan
588 /// execution, when trace information is requested.
589 virtual void printDebugTracesAtStart() {}
590 virtual void printDebugTracesAtEnd() {}
591
592 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
593 /// vector preheader and its predecessor, also connecting the new block to the
594 /// scalar preheader.
595 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
596
597 /// The original loop.
599
600 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
601 /// dynamic knowledge to simplify SCEV expressions and converts them to a
602 /// more usable form.
604
605 /// Loop Info.
607
608 /// Dominator Tree.
610
611 /// Target Library Info.
613
614 /// Target Transform Info.
616
617 /// Assumption Cache.
619
620 /// Interface to emit optimization remarks.
622
623 /// The vectorization SIMD factor to use. Each vector will have this many
624 /// vector elements.
626
628
629 /// The vectorization unroll factor to use. Each scalar is vectorized to this
630 /// many different vector instructions.
631 unsigned UF;
632
633 /// The builder that we use
635
636 // --- Vectorization state ---
637
638 /// The vector-loop preheader.
640
641 /// The scalar-loop preheader.
643
644 /// Middle Block between the vector and the scalar.
646
647 /// A list of all bypass blocks. The first block is the entry of the loop.
649
650 /// Store instructions that were predicated.
652
653 /// Trip count of the original loop.
654 Value *TripCount = nullptr;
655
656 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
658
659 /// The legality analysis.
661
662 /// The profitability analysis.
664
665 // Record whether runtime checks are added.
666 bool AddedSafetyChecks = false;
667
668 /// BFI and PSI are used to check for profile guided size optimizations.
671
672 // Whether this loop should be optimized for size based on profile guided size
673 // optimizations.
675
676 /// Structure to hold information about generated runtime checks, responsible
677 /// for cleaning the checks, if vectorization turns out unprofitable.
678 GeneratedRTChecks &RTChecks;
679
680 /// Mapping of induction phis to their additional bypass values. They
681 /// need to be added as operands to phi nodes in the scalar loop preheader
682 /// after the epilogue skeleton has been created.
684
685 /// The additional bypass block which conditionally skips over the epilogue
686 /// loop after executing the main loop. Needed to resume inductions and
687 /// reductions during epilogue vectorization.
689
691
692 /// The vector preheader block of \p Plan, used as target for check blocks
693 /// introduced during skeleton creation.
695};
696
697/// Encapsulate information regarding vectorization of a loop and its epilogue.
698/// This information is meant to be updated and used across two stages of
699/// epilogue vectorization.
702 unsigned MainLoopUF = 0;
704 unsigned EpilogueUF = 0;
709 Value *TripCount = nullptr;
712
714 ElementCount EVF, unsigned EUF,
716 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
718 assert(EUF == 1 &&
719 "A high UF for the epilogue loop is likely not beneficial.");
720 }
721};
722
723/// An extension of the inner loop vectorizer that creates a skeleton for a
724/// vectorized loop that has its epilogue (residual) also vectorized.
725/// The idea is to run the vplan on a given loop twice, firstly to setup the
726/// skeleton and vectorize the main loop, and secondly to complete the skeleton
727/// from the first step and vectorize the epilogue. This is achieved by
728/// deriving two concrete strategy classes from this base class and invoking
729/// them in succession from the loop vectorizer planner.
731public:
739 GeneratedRTChecks &Checks, VPlan &Plan)
741 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
742 CM, BFI, PSI, Checks, Plan),
743 EPI(EPI) {}
744
745 // Override this function to handle the more complex control flow around the
746 // three loops.
747 BasicBlock *
748 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
749 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
750 }
751
752 /// The interface for creating a vectorized skeleton using one of two
753 /// different strategies, each corresponding to one execution of the vplan
754 /// as described above.
755 virtual BasicBlock *
756 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
757
758 /// Holds and updates state information required to vectorize the main loop
759 /// and its epilogue in two separate passes. This setup helps us avoid
760 /// regenerating and recomputing runtime safety checks. It also helps us to
761 /// shorten the iteration-count-check path length for the cases where the
762 /// iteration count of the loop is so small that the main vector loop is
763 /// completely skipped.
765};
766
767/// A specialized derived class of inner loop vectorizer that performs
768/// vectorization of *main* loops in the process of vectorizing loops and their
769/// epilogues.
771public:
779 GeneratedRTChecks &Check, VPlan &Plan)
781 EPI, LVL, CM, BFI, PSI, Check, Plan) {}
782 /// Implements the interface for creating a vectorized skeleton using the
783 /// *main loop* strategy (ie the first pass of vplan execution).
784 BasicBlock *
785 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
786
787protected:
788 /// Emits an iteration count bypass check once for the main loop (when \p
789 /// ForEpilogue is false) and once for the epilogue loop (when \p
790 /// ForEpilogue is true).
791 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
792 void printDebugTracesAtStart() override;
793 void printDebugTracesAtEnd() override;
794};
795
796// A specialized derived class of inner loop vectorizer that performs
797// vectorization of *epilogue* loops in the process of vectorizing loops and
798// their epilogues.
800public:
808 GeneratedRTChecks &Checks, VPlan &Plan)
810 EPI, LVL, CM, BFI, PSI, Checks, Plan) {
812 }
813 /// Implements the interface for creating a vectorized skeleton using the
814 /// *epilogue loop* strategy (ie the second pass of vplan execution).
815 BasicBlock *
816 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
817
818protected:
819 /// Emits an iteration count bypass check after the main vector loop has
820 /// finished to see if there are any iterations left to execute by either
821 /// the vector epilogue or the scalar epilogue.
823 BasicBlock *Bypass,
824 BasicBlock *Insert);
825 void printDebugTracesAtStart() override;
826 void printDebugTracesAtEnd() override;
827};
828} // end namespace llvm
829
830/// Look for a meaningful debug location on the instruction or its operands.
832 if (!I)
833 return DebugLoc();
834
835 DebugLoc Empty;
836 if (I->getDebugLoc() != Empty)
837 return I->getDebugLoc();
838
839 for (Use &Op : I->operands()) {
840 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
841 if (OpInst->getDebugLoc() != Empty)
842 return OpInst->getDebugLoc();
843 }
844
845 return I->getDebugLoc();
846}
847
848/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
849/// is passed, the message relates to that particular instruction.
850#ifndef NDEBUG
851static void debugVectorizationMessage(const StringRef Prefix,
852 const StringRef DebugMsg,
853 Instruction *I) {
854 dbgs() << "LV: " << Prefix << DebugMsg;
855 if (I != nullptr)
856 dbgs() << " " << *I;
857 else
858 dbgs() << '.';
859 dbgs() << '\n';
860}
861#endif
862
863/// Create an analysis remark that explains why vectorization failed
864///
865/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
866/// RemarkName is the identifier for the remark. If \p I is passed it is an
867/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
868/// the location of the remark. If \p DL is passed, use it as debug location for
869/// the remark. \return the remark object that can be streamed to.
871createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
872 Instruction *I, DebugLoc DL = {}) {
873 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
874 // If debug location is attached to the instruction, use it. Otherwise if DL
875 // was not provided, use the loop's.
876 if (I && I->getDebugLoc())
877 DL = I->getDebugLoc();
878 else if (!DL)
879 DL = TheLoop->getStartLoc();
880
881 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
882}
883
884namespace llvm {
885
886/// Return a value for Step multiplied by VF.
888 int64_t Step) {
889 assert(Ty->isIntegerTy() && "Expected an integer step");
890 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
891}
892
893/// Return the runtime value for VF.
895 return B.CreateElementCount(Ty, VF);
896}
897
899 const StringRef OREMsg, const StringRef ORETag,
900 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
901 Instruction *I) {
902 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
903 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
904 ORE->emit(
905 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
906 << "loop not vectorized: " << OREMsg);
907}
908
909/// Reports an informative message: print \p Msg for debugging purposes as well
910/// as an optimization remark. Uses either \p I as location of the remark, or
911/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
912/// remark. If \p DL is passed, use it as debug location for the remark.
913static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
915 Loop *TheLoop, Instruction *I = nullptr,
916 DebugLoc DL = {}) {
918 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
919 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
920 I, DL)
921 << Msg);
922}
923
924/// Report successful vectorization of the loop. In case an outer loop is
925/// vectorized, prepend "outer" to the vectorization remark.
927 VectorizationFactor VF, unsigned IC) {
929 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
930 nullptr));
931 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
932 ORE->emit([&]() {
933 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
934 TheLoop->getHeader())
935 << "vectorized " << LoopType << "loop (vectorization width: "
936 << ore::NV("VectorizationFactor", VF.Width)
937 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
938 });
939}
940
941} // end namespace llvm
942
943namespace llvm {
944
945// Loop vectorization cost-model hints how the scalar epilogue loop should be
946// lowered.
948
949 // The default: allowing scalar epilogues.
951
952 // Vectorization with OptForSize: don't allow epilogues.
954
955 // A special case of vectorisation with OptForSize: loops with a very small
956 // trip count are considered for vectorization under OptForSize, thereby
957 // making sure the cost of their loop body is dominant, free of runtime
958 // guards and scalar iteration overheads.
960
961 // Loop hint predicate indicating an epilogue is undesired.
963
964 // Directive indicating we must either tail fold or not vectorize
967
968using InstructionVFPair = std::pair<Instruction *, ElementCount>;
969
970/// LoopVectorizationCostModel - estimates the expected speedups due to
971/// vectorization.
972/// In many cases vectorization is not profitable. This can happen because of
973/// a number of reasons. In this class we mainly attempt to predict the
974/// expected speedup/slowdowns due to the supported instruction set. We use the
975/// TargetTransformInfo to query the different backends for the cost of
976/// different operations.
979
980public:
990 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
991 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
992 Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
994 initializeVScaleForTuning();
995 }
996
997 /// \return An upper bound for the vectorization factors (both fixed and
998 /// scalable). If the factors are 0, vectorization and interleaving should be
999 /// avoided up front.
1000 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1001
1002 /// \return True if runtime checks are required for vectorization, and false
1003 /// otherwise.
1004 bool runtimeChecksRequired();
1005
1006 /// Setup cost-based decisions for user vectorization factor.
1007 /// \return true if the UserVF is a feasible VF to be chosen.
1011 return expectedCost(UserVF).isValid();
1012 }
1013
1014 /// \return The size (in bits) of the smallest and widest types in the code
1015 /// that needs to be vectorized. We ignore values that remain scalar such as
1016 /// 64 bit loop indices.
1017 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1018
1019 /// \return The desired interleave count.
1020 /// If interleave count has been specified by metadata it will be returned.
1021 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1022 /// are the selected vectorization factor and the cost of the selected VF.
1023 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1024
1025 /// Memory access instruction may be vectorized in more than one way.
1026 /// Form of instruction after vectorization depends on cost.
1027 /// This function takes cost-based decisions for Load/Store instructions
1028 /// and collects them in a map. This decisions map is used for building
1029 /// the lists of loop-uniform and loop-scalar instructions.
1030 /// The calculated cost is saved with widening decision in order to
1031 /// avoid redundant calculations.
1033
1034 /// A call may be vectorized in different ways depending on whether we have
1035 /// vectorized variants available and whether the target supports masking.
1036 /// This function analyzes all calls in the function at the supplied VF,
1037 /// makes a decision based on the costs of available options, and stores that
1038 /// decision in a map for use in planning and plan execution.
1040
1041 /// A struct that represents some properties of the register usage
1042 /// of a loop.
1044 /// Holds the number of loop invariant values that are used in the loop.
1045 /// The key is ClassID of target-provided register class.
1047 /// Holds the maximum number of concurrent live intervals in the loop.
1048 /// The key is ClassID of target-provided register class.
1050 };
1051
1052 /// \return Returns information about the register usages of the loop for the
1053 /// given vectorization factors.
1056
1057 /// Collect values we want to ignore in the cost model.
1058 void collectValuesToIgnore();
1059
1060 /// Collect all element types in the loop for which widening is needed.
1062
1063 /// Split reductions into those that happen in the loop, and those that happen
1064 /// outside. In loop reductions are collected into InLoopReductions.
1066
1067 /// Returns true if we should use strict in-order reductions for the given
1068 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1069 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1070 /// of FP operations.
1071 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1072 return !Hints->allowReordering() && RdxDesc.isOrdered();
1073 }
1074
1075 /// \returns The smallest bitwidth each instruction can be represented with.
1076 /// The vector equivalents of these instructions should be truncated to this
1077 /// type.
1079 return MinBWs;
1080 }
1081
1082 /// \returns True if it is more profitable to scalarize instruction \p I for
1083 /// vectorization factor \p VF.
1085 assert(VF.isVector() &&
1086 "Profitable to scalarize relevant only for VF > 1.");
1087 assert(
1088 TheLoop->isInnermost() &&
1089 "cost-model should not be used for outer loops (in VPlan-native path)");
1090
1091 auto Scalars = InstsToScalarize.find(VF);
1092 assert(Scalars != InstsToScalarize.end() &&
1093 "VF not yet analyzed for scalarization profitability");
1094 return Scalars->second.contains(I);
1095 }
1096
1097 /// Returns true if \p I is known to be uniform after vectorization.
1099 assert(
1100 TheLoop->isInnermost() &&
1101 "cost-model should not be used for outer loops (in VPlan-native path)");
1102 // Pseudo probe needs to be duplicated for each unrolled iteration and
1103 // vector lane so that profiled loop trip count can be accurately
1104 // accumulated instead of being under counted.
1105 if (isa<PseudoProbeInst>(I))
1106 return false;
1107
1108 if (VF.isScalar())
1109 return true;
1110
1111 auto UniformsPerVF = Uniforms.find(VF);
1112 assert(UniformsPerVF != Uniforms.end() &&
1113 "VF not yet analyzed for uniformity");
1114 return UniformsPerVF->second.count(I);
1115 }
1116
1117 /// Returns true if \p I is known to be scalar after vectorization.
1119 assert(
1120 TheLoop->isInnermost() &&
1121 "cost-model should not be used for outer loops (in VPlan-native path)");
1122 if (VF.isScalar())
1123 return true;
1124
1125 auto ScalarsPerVF = Scalars.find(VF);
1126 assert(ScalarsPerVF != Scalars.end() &&
1127 "Scalar values are not calculated for VF");
1128 return ScalarsPerVF->second.count(I);
1129 }
1130
1131 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1132 /// for vectorization factor \p VF.
1134 return VF.isVector() && MinBWs.contains(I) &&
1135 !isProfitableToScalarize(I, VF) &&
1137 }
1138
1139 /// Decision that was taken during cost calculation for memory instruction.
1142 CM_Widen, // For consecutive accesses with stride +1.
1143 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1150
1151 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1152 /// instruction \p I and vector width \p VF.
1155 assert(VF.isVector() && "Expected VF >=2");
1156 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1157 }
1158
1159 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1160 /// interleaving group \p Grp and vector width \p VF.
1164 assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1166 /// When interleaving, the cost will only be assigned one instruction, the
1167 /// insert position. For other cases, add the appropriate fraction of the
1168 /// total cost to each instruction. This ensures accurate costs are used,
1169 /// even if the insert position instruction is not used.
1170 InstructionCost InsertPosCost = Cost;
1171 InstructionCost OtherMemberCost = 0;
1172 if (W != CM_Interleave)
1173 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1174 ;
1175 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1176 if (auto *I = Grp->getMember(Idx)) {
1177 if (Grp->getInsertPos() == I)
1178 WideningDecisions[std::make_pair(I, VF)] =
1179 std::make_pair(W, InsertPosCost);
1180 else
1181 WideningDecisions[std::make_pair(I, VF)] =
1182 std::make_pair(W, OtherMemberCost);
1183 }
1184 }
1185 }
1186
1187 /// Return the cost model decision for the given instruction \p I and vector
1188 /// width \p VF. Return CM_Unknown if this instruction did not pass
1189 /// through the cost modeling.
1191 assert(VF.isVector() && "Expected VF to be a vector VF");
1192 assert(
1193 TheLoop->isInnermost() &&
1194 "cost-model should not be used for outer loops (in VPlan-native path)");
1195
1196 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1197 auto Itr = WideningDecisions.find(InstOnVF);
1198 if (Itr == WideningDecisions.end())
1199 return CM_Unknown;
1200 return Itr->second.first;
1201 }
1202
1203 /// Return the vectorization cost for the given instruction \p I and vector
1204 /// width \p VF.
1206 assert(VF.isVector() && "Expected VF >=2");
1207 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1208 assert(WideningDecisions.contains(InstOnVF) &&
1209 "The cost is not calculated");
1210 return WideningDecisions[InstOnVF].second;
1211 }
1212
1217 std::optional<unsigned> MaskPos;
1219 };
1220
1222 Function *Variant, Intrinsic::ID IID,
1223 std::optional<unsigned> MaskPos,
1225 assert(!VF.isScalar() && "Expected vector VF");
1226 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1227 MaskPos, Cost};
1228 }
1229
1231 ElementCount VF) const {
1232 assert(!VF.isScalar() && "Expected vector VF");
1233 return CallWideningDecisions.at(std::make_pair(CI, VF));
1234 }
1235
1236 /// Return True if instruction \p I is an optimizable truncate whose operand
1237 /// is an induction variable. Such a truncate will be removed by adding a new
1238 /// induction variable with the destination type.
1240 // If the instruction is not a truncate, return false.
1241 auto *Trunc = dyn_cast<TruncInst>(I);
1242 if (!Trunc)
1243 return false;
1244
1245 // Get the source and destination types of the truncate.
1246 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1247 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1248
1249 // If the truncate is free for the given types, return false. Replacing a
1250 // free truncate with an induction variable would add an induction variable
1251 // update instruction to each iteration of the loop. We exclude from this
1252 // check the primary induction variable since it will need an update
1253 // instruction regardless.
1254 Value *Op = Trunc->getOperand(0);
1255 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1256 return false;
1257
1258 // If the truncated value is not an induction variable, return false.
1259 return Legal->isInductionPhi(Op);
1260 }
1261
1262 /// Collects the instructions to scalarize for each predicated instruction in
1263 /// the loop.
1265
1266 /// Collect Uniform and Scalar values for the given \p VF.
1267 /// The sets depend on CM decision for Load/Store instructions
1268 /// that may be vectorized as interleave, gather-scatter or scalarized.
1269 /// Also make a decision on what to do about call instructions in the loop
1270 /// at that VF -- scalarize, call a known vector routine, or call a
1271 /// vector intrinsic.
1273 // Do the analysis once.
1274 if (VF.isScalar() || Uniforms.contains(VF))
1275 return;
1277 collectLoopUniforms(VF);
1279 collectLoopScalars(VF);
1280 }
1281
1282 /// Returns true if the target machine supports masked store operation
1283 /// for the given \p DataType and kind of access to \p Ptr.
1284 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1285 return Legal->isConsecutivePtr(DataType, Ptr) &&
1286 TTI.isLegalMaskedStore(DataType, Alignment);
1287 }
1288
1289 /// Returns true if the target machine supports masked load operation
1290 /// for the given \p DataType and kind of access to \p Ptr.
1291 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1292 return Legal->isConsecutivePtr(DataType, Ptr) &&
1293 TTI.isLegalMaskedLoad(DataType, Alignment);
1294 }
1295
1296 /// Returns true if the target machine can represent \p V as a masked gather
1297 /// or scatter operation.
1299 bool LI = isa<LoadInst>(V);
1300 bool SI = isa<StoreInst>(V);
1301 if (!LI && !SI)
1302 return false;
1303 auto *Ty = getLoadStoreType(V);
1305 if (VF.isVector())
1306 Ty = VectorType::get(Ty, VF);
1307 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1308 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1309 }
1310
1311 /// Returns true if the target machine supports all of the reduction
1312 /// variables found for the given VF.
1314 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1315 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1316 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1317 }));
1318 }
1319
1320 /// Given costs for both strategies, return true if the scalar predication
1321 /// lowering should be used for div/rem. This incorporates an override
1322 /// option so it is not simply a cost comparison.
1324 InstructionCost SafeDivisorCost) const {
1325 switch (ForceSafeDivisor) {
1326 case cl::BOU_UNSET:
1327 return ScalarCost < SafeDivisorCost;
1328 case cl::BOU_TRUE:
1329 return false;
1330 case cl::BOU_FALSE:
1331 return true;
1332 }
1333 llvm_unreachable("impossible case value");
1334 }
1335
1336 /// Returns true if \p I is an instruction which requires predication and
1337 /// for which our chosen predication strategy is scalarization (i.e. we
1338 /// don't have an alternate strategy such as masking available).
1339 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1341
1342 /// Returns true if \p I is an instruction that needs to be predicated
1343 /// at runtime. The result is independent of the predication mechanism.
1344 /// Superset of instructions that return true for isScalarWithPredication.
1345 bool isPredicatedInst(Instruction *I) const;
1346
1347 /// Return the costs for our two available strategies for lowering a
1348 /// div/rem operation which requires speculating at least one lane.
1349 /// First result is for scalarization (will be invalid for scalable
1350 /// vectors); second is for the safe-divisor strategy.
1351 std::pair<InstructionCost, InstructionCost>
1353 ElementCount VF) const;
1354
1355 /// Returns true if \p I is a memory instruction with consecutive memory
1356 /// access that can be widened.
1358
1359 /// Returns true if \p I is a memory instruction in an interleaved-group
1360 /// of memory accesses that can be vectorized with wide vector loads/stores
1361 /// and shuffles.
1363
1364 /// Check if \p Instr belongs to any interleaved access group.
1366 return InterleaveInfo.isInterleaved(Instr);
1367 }
1368
1369 /// Get the interleaved access group that \p Instr belongs to.
1372 return InterleaveInfo.getInterleaveGroup(Instr);
1373 }
1374
1375 /// Returns true if we're required to use a scalar epilogue for at least
1376 /// the final iteration of the original loop.
1377 bool requiresScalarEpilogue(bool IsVectorizing) const {
1378 if (!isScalarEpilogueAllowed()) {
1379 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1380 return false;
1381 }
1382 // If we might exit from anywhere but the latch and early exit vectorization
1383 // is disabled, we must run the exiting iteration in scalar form.
1386 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1387 "from latch block\n");
1388 return true;
1389 }
1390 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1391 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1392 "interleaved group requires scalar epilogue\n");
1393 return true;
1394 }
1395 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1396 return false;
1397 }
1398
1399 /// Returns true if we're required to use a scalar epilogue for at least
1400 /// the final iteration of the original loop for all VFs in \p Range.
1401 /// A scalar epilogue must either be required for all VFs in \p Range or for
1402 /// none.
1404 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1405 return requiresScalarEpilogue(VF.isVector());
1406 };
1407 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1408 assert(
1409 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1410 "all VFs in range must agree on whether a scalar epilogue is required");
1411 return IsRequired;
1412 }
1413
1414 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1415 /// loop hint annotation.
1417 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1418 }
1419
1420 /// Returns the TailFoldingStyle that is best for the current loop.
1421 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1422 if (!ChosenTailFoldingStyle)
1424 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1425 : ChosenTailFoldingStyle->second;
1426 }
1427
1428 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1429 /// overflow or not.
1430 /// \param IsScalableVF true if scalable vector factors enabled.
1431 /// \param UserIC User specific interleave count.
1432 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1433 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1434 if (!Legal->canFoldTailByMasking()) {
1435 ChosenTailFoldingStyle =
1437 return;
1438 }
1439
1440 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1441 ChosenTailFoldingStyle = std::make_pair(
1442 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1443 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1444 return;
1445 }
1446
1447 // Set styles when forced.
1448 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1449 ForceTailFoldingStyle.getValue());
1451 return;
1452 // Override forced styles if needed.
1453 // FIXME: use actual opcode/data type for analysis here.
1454 // FIXME: Investigate opportunity for fixed vector factor.
1455 // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
1456 // penultimate EVL.
1457 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1458 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1461 if (!EVLIsLegal) {
1462 // If for some reason EVL mode is unsupported, fallback to
1463 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1464 // in a generic way.
1465 ChosenTailFoldingStyle =
1468 LLVM_DEBUG(
1469 dbgs()
1470 << "LV: Preference for VP intrinsics indicated. Will "
1471 "not try to generate VP Intrinsics "
1472 << (UserIC > 1
1473 ? "since interleave count specified is greater than 1.\n"
1474 : "due to non-interleaving reasons.\n"));
1475 }
1476 }
1477
1478 /// Returns true if all loop blocks should be masked to fold tail loop.
1479 bool foldTailByMasking() const {
1480 // TODO: check if it is possible to check for None style independent of
1481 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1483 }
1484
1485 /// Return maximum safe number of elements to be processed per vector
1486 /// iteration, which do not prevent store-load forwarding and are safe with
1487 /// regard to the memory dependencies. Required for EVL-based VPlans to
1488 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1489 /// MaxSafeElements).
1490 /// TODO: need to consider adjusting cost model to use this value as a
1491 /// vectorization factor for EVL-based vectorization.
1492 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1493
1494 /// Returns true if the instructions in this block requires predication
1495 /// for any reason, e.g. because tail folding now requires a predicate
1496 /// or because the block in the original loop was predicated.
1499 }
1500
1501 /// Returns true if VP intrinsics with explicit vector length support should
1502 /// be generated in the tail folded loop.
1503 bool foldTailWithEVL() const {
1505 }
1506
1507 /// Returns true if the Phi is part of an inloop reduction.
1508 bool isInLoopReduction(PHINode *Phi) const {
1509 return InLoopReductions.contains(Phi);
1510 }
1511
1512 /// Returns true if the predicated reduction select should be used to set the
1513 /// incoming value for the reduction phi.
1514 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1515 // Force to use predicated reduction select since the EVL of the
1516 // second-to-last iteration might not be VF*UF.
1517 if (foldTailWithEVL())
1518 return true;
1521 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1522 }
1523
1524 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1525 /// with factor VF. Return the cost of the instruction, including
1526 /// scalarization overhead if it's needed.
1528
1529 /// Estimate cost of a call instruction CI if it were vectorized with factor
1530 /// VF. Return the cost of the instruction, including scalarization overhead
1531 /// if it's needed.
1533
1534 /// Invalidates decisions already taken by the cost model.
1536 WideningDecisions.clear();
1537 CallWideningDecisions.clear();
1538 Uniforms.clear();
1539 Scalars.clear();
1540 }
1541
1542 /// Returns the expected execution cost. The unit of the cost does
1543 /// not matter because we use the 'cost' units to compare different
1544 /// vector widths. The cost that is returned is *not* normalized by
1545 /// the factor width.
1547
1548 bool hasPredStores() const { return NumPredStores > 0; }
1549
1550 /// Returns true if epilogue vectorization is considered profitable, and
1551 /// false otherwise.
1552 /// \p VF is the vectorization factor chosen for the original loop.
 /// \p Multiplier is an additional scaling factor applied to VF before
1554 /// comparing to EpilogueVectorizationMinVF.
1556 const unsigned IC) const;
1557
1558 /// Returns the execution time cost of an instruction for a given vector
1559 /// width. Vector width of one means scalar.
1561
1562 /// Return the cost of instructions in an inloop reduction pattern, if I is
1563 /// part of that pattern.
1564 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1565 ElementCount VF,
1566 Type *VectorTy) const;
1567
1568 /// Returns true if \p Op should be considered invariant and if it is
1569 /// trivially hoistable.
1571
1572 /// Return the value of vscale used for tuning the cost model.
1573 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1574
1575private:
1576 unsigned NumPredStores = 0;
1577
1578 /// Used to store the value of vscale used for tuning the cost model. It is
1579 /// initialized during object construction.
1580 std::optional<unsigned> VScaleForTuning;
1581
1582 /// Initializes the value of vscale used for tuning the cost model. If
1583 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1584 /// return the value returned by the corresponding TTI method.
1585 void initializeVScaleForTuning() {
1586 const Function *Fn = TheLoop->getHeader()->getParent();
1587 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1588 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1589 auto Min = Attr.getVScaleRangeMin();
1590 auto Max = Attr.getVScaleRangeMax();
1591 if (Max && Min == Max) {
1592 VScaleForTuning = Max;
1593 return;
1594 }
1595 }
1596
1597 VScaleForTuning = TTI.getVScaleForTuning();
1598 }
1599
1600 /// \return An upper bound for the vectorization factors for both
1601 /// fixed and scalable vectorization, where the minimum-known number of
1602 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1603 /// disabled or unsupported, then the scalable part will be equal to
1604 /// ElementCount::getScalable(0).
1605 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1606 ElementCount UserVF,
1607 bool FoldTailByMasking);
1608
1609 /// \return the maximized element count based on the targets vector
1610 /// registers and the loop trip-count, but limited to a maximum safe VF.
1611 /// This is a helper function of computeFeasibleMaxVF.
1612 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1613 unsigned SmallestType,
1614 unsigned WidestType,
1615 ElementCount MaxSafeVF,
1616 bool FoldTailByMasking);
1617
1618 /// Checks if scalable vectorization is supported and enabled. Caches the
1619 /// result to avoid repeated debug dumps for repeated queries.
1620 bool isScalableVectorizationAllowed();
1621
1622 /// \return the maximum legal scalable VF, based on the safe max number
1623 /// of elements.
1624 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1625
1626 /// Calculate vectorization cost of memory instruction \p I.
1627 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1628
1629 /// The cost computation for scalarized memory instruction.
1630 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1631
1632 /// The cost computation for interleaving group of memory instructions.
1633 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1634
1635 /// The cost computation for Gather/Scatter instruction.
1636 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1637
1638 /// The cost computation for widening instruction \p I with consecutive
1639 /// memory access.
1640 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1641
1642 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1643 /// Load: scalar load + broadcast.
1644 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1645 /// element)
1646 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1647
1648 /// Estimate the overhead of scalarizing an instruction. This is a
1649 /// convenience wrapper for the type-based getScalarizationOverhead API.
1650 InstructionCost getScalarizationOverhead(Instruction *I,
1651 ElementCount VF) const;
1652
1653 /// Returns true if an artificially high cost for emulated masked memrefs
1654 /// should be used.
1655 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1656
1657 /// Map of scalar integer values to the smallest bitwidth they can be legally
1658 /// represented as. The vector equivalents of these values should be truncated
1659 /// to this type.
1661
1662 /// A type representing the costs for instructions if they were to be
1663 /// scalarized rather than vectorized. The entries are Instruction-Cost
1664 /// pairs.
1665 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1666
 /// A set containing all BasicBlocks that are known to be present after
1668 /// vectorization as a predicated block.
1670 PredicatedBBsAfterVectorization;
1671
1672 /// Records whether it is allowed to have the original scalar loop execute at
1673 /// least once. This may be needed as a fallback loop in case runtime
1674 /// aliasing/dependence checks fail, or to handle the tail/remainder
1675 /// iterations when the trip count is unknown or doesn't divide by the VF,
1676 /// or as a peel-loop to handle gaps in interleave-groups.
1677 /// Under optsize and when the trip count is very small we don't allow any
1678 /// iterations to execute in the scalar loop.
1679 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1680
1681 /// Control finally chosen tail folding style. The first element is used if
1682 /// the IV update may overflow, the second element - if it does not.
1683 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1684 ChosenTailFoldingStyle;
1685
1686 /// true if scalable vectorization is supported and enabled.
1687 std::optional<bool> IsScalableVectorizationAllowed;
1688
1689 /// Maximum safe number of elements to be processed per vector iteration,
1690 /// which do not prevent store-load forwarding and are safe with regard to the
 /// memory dependencies. Required for EVL-based vectorization, where this
1692 /// value is used as the upper bound of the safe AVL.
1693 std::optional<unsigned> MaxSafeElements;
1694
1695 /// A map holding scalar costs for different vectorization factors. The
1696 /// presence of a cost for an instruction in the mapping indicates that the
1697 /// instruction will be scalarized when vectorizing with the associated
1698 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1700
1701 /// Holds the instructions known to be uniform after vectorization.
1702 /// The data is collected per VF.
1704
1705 /// Holds the instructions known to be scalar after vectorization.
1706 /// The data is collected per VF.
1708
1709 /// Holds the instructions (address computations) that are forced to be
1710 /// scalarized.
1712
1713 /// PHINodes of the reductions that should be expanded in-loop.
1714 SmallPtrSet<PHINode *, 4> InLoopReductions;
1715
1716 /// A Map of inloop reduction operations and their immediate chain operand.
1717 /// FIXME: This can be removed once reductions can be costed correctly in
1718 /// VPlan. This was added to allow quick lookup of the inloop operations.
1719 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1720
1721 /// Returns the expected difference in cost from scalarizing the expression
1722 /// feeding a predicated instruction \p PredInst. The instructions to
1723 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1724 /// non-negative return value implies the expression will be scalarized.
1725 /// Currently, only single-use chains are considered for scalarization.
1726 InstructionCost computePredInstDiscount(Instruction *PredInst,
1727 ScalarCostsTy &ScalarCosts,
1728 ElementCount VF);
1729
1730 /// Collect the instructions that are uniform after vectorization. An
1731 /// instruction is uniform if we represent it with a single scalar value in
1732 /// the vectorized loop corresponding to each vector iteration. Examples of
1733 /// uniform instructions include pointer operands of consecutive or
1734 /// interleaved memory accesses. Note that although uniformity implies an
1735 /// instruction will be scalar, the reverse is not true. In general, a
1736 /// scalarized instruction will be represented by VF scalar values in the
1737 /// vectorized loop, each corresponding to an iteration of the original
1738 /// scalar loop.
1739 void collectLoopUniforms(ElementCount VF);
1740
1741 /// Collect the instructions that are scalar after vectorization. An
1742 /// instruction is scalar if it is known to be uniform or will be scalarized
1743 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1744 /// to the list if they are used by a load/store instruction that is marked as
1745 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1746 /// VF values in the vectorized loop, each corresponding to an iteration of
1747 /// the original scalar loop.
1748 void collectLoopScalars(ElementCount VF);
1749
1750 /// Keeps cost model vectorization decision and cost for instructions.
1751 /// Right now it is used for memory instructions only.
1753 std::pair<InstWidening, InstructionCost>>;
1754
1755 DecisionList WideningDecisions;
1756
1757 using CallDecisionList =
1758 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1759
1760 CallDecisionList CallWideningDecisions;
1761
1762 /// Returns true if \p V is expected to be vectorized and it needs to be
1763 /// extracted.
1764 bool needsExtract(Value *V, ElementCount VF) const {
1765 Instruction *I = dyn_cast<Instruction>(V);
1766 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1769 return false;
1770
1771 // Assume we can vectorize V (and hence we need extraction) if the
1772 // scalars are not computed yet. This can happen, because it is called
1773 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1774 // the scalars are collected. That should be a safe assumption in most
1775 // cases, because we check if the operands have vectorizable types
1776 // beforehand in LoopVectorizationLegality.
1777 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1778 };
1779
1780 /// Returns a range containing only operands needing to be extracted.
1781 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1782 ElementCount VF) const {
1784 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1785 }
1786
1787public:
1788 /// The loop that we evaluate.
1790
1791 /// Predicated scalar evolution analysis.
1793
1794 /// Loop Info analysis.
1796
1797 /// Vectorization legality.
1799
1800 /// Vector target information.
1802
1803 /// Target Library Info.
1805
1806 /// Demanded bits analysis.
1808
1809 /// Assumption cache.
1811
1812 /// Interface to emit optimization remarks.
1814
1816
1817 /// Loop Vectorize Hint.
1819
1820 /// The interleave access information contains groups of interleaved accesses
1821 /// with the same stride and close to each other.
1823
1824 /// Values to ignore in the cost model.
1826
1827 /// Values to ignore in the cost model when VF > 1.
1829
1830 /// All element types found in the loop.
1832
1833 /// The kind of cost that we are calculating
1835};
1836} // end namespace llvm
1837
1838namespace {
1839/// Helper struct to manage generating runtime checks for vectorization.
1840///
1841/// The runtime checks are created up-front in temporary blocks to allow better
1842/// estimating the cost and un-linked from the existing IR. After deciding to
1843/// vectorize, the checks are moved back. If deciding not to vectorize, the
1844/// temporary blocks are completely removed.
1845class GeneratedRTChecks {
1846 /// Basic block which contains the generated SCEV checks, if any.
1847 BasicBlock *SCEVCheckBlock = nullptr;
1848
1849 /// The value representing the result of the generated SCEV checks. If it is
1850 /// nullptr, either no SCEV checks have been generated or they have been used.
1851 Value *SCEVCheckCond = nullptr;
1852
1853 /// Basic block which contains the generated memory runtime checks, if any.
1854 BasicBlock *MemCheckBlock = nullptr;
1855
1856 /// The value representing the result of the generated memory runtime checks.
1857 /// If it is nullptr, either no memory runtime checks have been generated or
1858 /// they have been used.
1859 Value *MemRuntimeCheckCond = nullptr;
1860
1861 DominatorTree *DT;
1862 LoopInfo *LI;
1864
1865 SCEVExpander SCEVExp;
1866 SCEVExpander MemCheckExp;
1867
1868 bool CostTooHigh = false;
1869 const bool AddBranchWeights;
1870
1871 Loop *OuterLoop = nullptr;
1872
1874
1875 /// The kind of cost that we are calculating
1876 TTI::TargetCostKind CostKind;
1877
1878public:
1879 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1881 const DataLayout &DL, bool AddBranchWeights,
1882 TTI::TargetCostKind CostKind)
1883 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1884 MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1885 AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1886
1887 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1888 /// accurately estimate the cost of the runtime checks. The blocks are
1889 /// un-linked from the IR and are added back during vector code generation. If
1890 /// there is no vector code generation, the check blocks are removed
1891 /// completely.
1892 void create(Loop *L, const LoopAccessInfo &LAI,
1893 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1894
1895 // Hard cutoff to limit compile-time increase in case a very large number of
1896 // runtime checks needs to be generated.
1897 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1898 // profile info.
1899 CostTooHigh =
1901 if (CostTooHigh)
1902 return;
1903
1904 BasicBlock *LoopHeader = L->getHeader();
1905 BasicBlock *Preheader = L->getLoopPreheader();
1906
1907 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1908 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1909 // may be used by SCEVExpander. The blocks will be un-linked from their
1910 // predecessors and removed from LI & DT at the end of the function.
1911 if (!UnionPred.isAlwaysTrue()) {
1912 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1913 nullptr, "vector.scevcheck");
1914
1915 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1916 &UnionPred, SCEVCheckBlock->getTerminator());
1917 }
1918
1919 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1920 if (RtPtrChecking.Need) {
1921 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1922 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1923 "vector.memcheck");
1924
1925 auto DiffChecks = RtPtrChecking.getDiffChecks();
1926 if (DiffChecks) {
1927 Value *RuntimeVF = nullptr;
1928 MemRuntimeCheckCond = addDiffRuntimeChecks(
1929 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1930 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1931 if (!RuntimeVF)
1932 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1933 return RuntimeVF;
1934 },
1935 IC);
1936 } else {
1937 MemRuntimeCheckCond = addRuntimeChecks(
1938 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1940 }
1941 assert(MemRuntimeCheckCond &&
1942 "no RT checks generated although RtPtrChecking "
1943 "claimed checks are required");
1944 }
1945
1946 if (!MemCheckBlock && !SCEVCheckBlock)
1947 return;
1948
1949 // Unhook the temporary block with the checks, update various places
1950 // accordingly.
1951 if (SCEVCheckBlock)
1952 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1953 if (MemCheckBlock)
1954 MemCheckBlock->replaceAllUsesWith(Preheader);
1955
1956 if (SCEVCheckBlock) {
1957 SCEVCheckBlock->getTerminator()->moveBefore(
1958 Preheader->getTerminator()->getIterator());
1959 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1960 Preheader->getTerminator()->eraseFromParent();
1961 }
1962 if (MemCheckBlock) {
1963 MemCheckBlock->getTerminator()->moveBefore(
1964 Preheader->getTerminator()->getIterator());
1965 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1966 Preheader->getTerminator()->eraseFromParent();
1967 }
1968
1969 DT->changeImmediateDominator(LoopHeader, Preheader);
1970 if (MemCheckBlock) {
1971 DT->eraseNode(MemCheckBlock);
1972 LI->removeBlock(MemCheckBlock);
1973 }
1974 if (SCEVCheckBlock) {
1975 DT->eraseNode(SCEVCheckBlock);
1976 LI->removeBlock(SCEVCheckBlock);
1977 }
1978
1979 // Outer loop is used as part of the later cost calculations.
1980 OuterLoop = L->getParentLoop();
1981 }
1982
1983 InstructionCost getCost() {
1984 if (SCEVCheckBlock || MemCheckBlock)
1985 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1986
1987 if (CostTooHigh) {
1989 Cost.setInvalid();
1990 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1991 return Cost;
1992 }
1993
1994 InstructionCost RTCheckCost = 0;
1995 if (SCEVCheckBlock)
1996 for (Instruction &I : *SCEVCheckBlock) {
1997 if (SCEVCheckBlock->getTerminator() == &I)
1998 continue;
1999 InstructionCost C = TTI->getInstructionCost(&I, CostKind);
2000 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2001 RTCheckCost += C;
2002 }
2003 if (MemCheckBlock) {
2004 InstructionCost MemCheckCost = 0;
2005 for (Instruction &I : *MemCheckBlock) {
2006 if (MemCheckBlock->getTerminator() == &I)
2007 continue;
2008 InstructionCost C = TTI->getInstructionCost(&I, CostKind);
2009 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2010 MemCheckCost += C;
2011 }
2012
2013 // If the runtime memory checks are being created inside an outer loop
2014 // we should find out if these checks are outer loop invariant. If so,
2015 // the checks will likely be hoisted out and so the effective cost will
2016 // reduce according to the outer loop trip count.
2017 if (OuterLoop) {
2018 ScalarEvolution *SE = MemCheckExp.getSE();
2019 // TODO: If profitable, we could refine this further by analysing every
2020 // individual memory check, since there could be a mixture of loop
2021 // variant and invariant checks that mean the final condition is
2022 // variant.
2023 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2024 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2025 // It seems reasonable to assume that we can reduce the effective
2026 // cost of the checks even when we know nothing about the trip
2027 // count. Assume that the outer loop executes at least twice.
2028 unsigned BestTripCount = 2;
2029
2030 // Get the best known TC estimate.
2031 if (auto EstimatedTC = getSmallBestKnownTC(
2032 PSE, OuterLoop, /* CanUseConstantMax = */ false))
2033 BestTripCount = *EstimatedTC;
2034
2035 BestTripCount = std::max(BestTripCount, 1U);
2036 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2037
2038 // Let's ensure the cost is always at least 1.
2039 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2041
2042 if (BestTripCount > 1)
2044 << "We expect runtime memory checks to be hoisted "
2045 << "out of the outer loop. Cost reduced from "
2046 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2047
2048 MemCheckCost = NewMemCheckCost;
2049 }
2050 }
2051
2052 RTCheckCost += MemCheckCost;
2053 }
2054
2055 if (SCEVCheckBlock || MemCheckBlock)
2056 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2057 << "\n");
2058
2059 return RTCheckCost;
2060 }
2061
  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A null condition means the corresponding emit*Checks() already consumed
    // the check, so the expander's results are genuinely used and must not be
    // rolled back by the cleaner.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      // Walk the block in reverse so users are erased before the values they
      // use; forget each value in SE before deleting it.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    // Run the cleaners only after the extra compares are gone; they remove the
    // expander-inserted instructions when the results were never used.
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    // A non-null condition means the check was never emitted, i.e. its block
    // was never re-linked into the function's CFG — drop it entirely.
    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }
2092
  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  /// \returns the re-linked SCEVCheckBlock, or nullptr when no SCEV checks
  /// were generated, they have already been emitted, or the check condition
  /// folds to constant false (checks always pass).
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader) {
    if (!SCEVCheckCond)
      return nullptr;

    Value *Cond = SCEVCheckCond;
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    // Constant-false condition: the checks trivially pass, so leave the block
    // unlinked; the destructor will delete it.
    if (auto *C = dyn_cast<ConstantInt>(Cond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    // NOTE(review): the unconditional branch appended just above appears to be
    // the terminator erased here (create() left the block ending in an
    // unreachable placeholder) — confirm this create/erase pair is intentional.
    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    // Route the vector preheader's single predecessor through the check block.
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    // Keep the dominator tree in sync with the rewired CFG.
    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    // Install the real terminator: bypass to the scalar loop when the checks
    // fail, otherwise fall through to the vector preheader.
    BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
    if (AddBranchWeights)
      setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
    ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
    return SCEVCheckBlock;
  }
2129
  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  /// \returns the re-linked MemCheckBlock, or nullptr when no memory runtime
  /// checks were generated or they have already been emitted.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    // Route the vector preheader's single predecessor through the check block.
    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    // Keep the dominator tree in sync with the rewired CFG.
    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);

    // Install the real terminator: bypass to the scalar loop when the checks
    // fail, otherwise fall through to the vector preheader. Inherit the debug
    // location of the branch this block was spliced into.
    BranchInst &BI =
        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
    if (AddBranchWeights) {
      setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
    }
    ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
2163};
2164} // namespace
2165
2167 return Style == TailFoldingStyle::Data ||
2168 Style == TailFoldingStyle::DataAndControlFlow ||
2169 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2170}
2171
2173 return Style == TailFoldingStyle::DataAndControlFlow ||
2174 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2175}
2176
2177// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2178// vectorization. The loop needs to be annotated with #pragma omp simd
2179// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2180// vector length information is not provided, vectorization is not considered
2181// explicit. Interleave hints are not allowed either. These limitations will be
2182// relaxed in the future.
2183// Please, note that we are currently forced to abuse the pragma 'clang
2184// vectorize' semantics. This pragma provides *auto-vectorization hints*
2185// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2186// provides *explicit vectorization hints* (LV can bypass legal checks and
2187// assume that vectorization is legal). However, both hints are implemented
2188// using the same metadata (llvm.loop.vectorize, processed by
2189// LoopVectorizeHints). This will be fixed in the future when the native IR
2190// representation for pragma 'omp simd' is introduced.
2191static bool isExplicitVecOuterLoop(Loop *OuterLp,
2193 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2194 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2195
2196 // Only outer loops with an explicit vectorization hint are supported.
2197 // Unannotated outer loops are ignored.
2199 return false;
2200
2201 Function *Fn = OuterLp->getHeader()->getParent();
2202 if (!Hints.allowVectorization(Fn, OuterLp,
2203 true /*VectorizeOnlyWhenForced*/)) {
2204 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2205 return false;
2206 }
2207
2208 if (Hints.getInterleave() > 1) {
2209 // TODO: Interleave support is future work.
2210 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2211 "outer loops.\n");
2212 Hints.emitRemarkWithHints();
2213 return false;
2214 }
2215
2216 return true;
2217}
2218
2222 // Collect inner loops and outer loops without irreducible control flow. For
2223 // now, only collect outer loops that have explicit vectorization hints. If we
2224 // are stress testing the VPlan H-CFG construction, we collect the outermost
2225 // loop of every loop nest.
2226 if (L.isInnermost() || VPlanBuildStressTest ||
2228 LoopBlocksRPO RPOT(&L);
2229 RPOT.perform(LI);
2230 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2231 V.push_back(&L);
2232 // TODO: Collect inner loops inside marked outer loops in case
2233 // vectorization fails for the outer loop. Do not invoke
2234 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2235 // already known to be reducible. We can use an inherited attribute for
2236 // that.
2237 return;
2238 }
2239 }
2240 for (Loop *InnerL : L)
2241 collectSupportedLoops(*InnerL, LI, ORE, V);
2242}
2243
2244//===----------------------------------------------------------------------===//
2245// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2246// LoopVectorizationCostModel and LoopVectorizationPlanner.
2247//===----------------------------------------------------------------------===//
2248
2249/// Compute the transformed value of Index at offset StartValue using step
2250/// StepValue.
2251/// For integer induction, returns StartValue + Index * StepValue.
2252/// For pointer induction, returns StartValue[Index * StepValue].
2253/// FIXME: The newly created binary instructions should contain nsw/nuw
2254/// flags, which can be found from the original scalar operations.
2255static Value *
2257 Value *Step,
2259 const BinaryOperator *InductionBinOp) {
2260 Type *StepTy = Step->getType();
2261 Value *CastedIndex = StepTy->isIntegerTy()
2262 ? B.CreateSExtOrTrunc(Index, StepTy)
2263 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2264 if (CastedIndex != Index) {
2265 CastedIndex->setName(CastedIndex->getName() + ".cast");
2266 Index = CastedIndex;
2267 }
2268
2269 // Note: the IR at this point is broken. We cannot use SE to create any new
2270 // SCEV and then expand it, hoping that SCEV's simplification will give us
2271 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2272 // lead to various SCEV crashes. So all we can do is to use builder and rely
2273 // on InstCombine for future simplifications. Here we handle some trivial
2274 // cases only.
2275 auto CreateAdd = [&B](Value *X, Value *Y) {
2276 assert(X->getType() == Y->getType() && "Types don't match!");
2277 if (auto *CX = dyn_cast<ConstantInt>(X))
2278 if (CX->isZero())
2279 return Y;
2280 if (auto *CY = dyn_cast<ConstantInt>(Y))
2281 if (CY->isZero())
2282 return X;
2283 return B.CreateAdd(X, Y);
2284 };
2285
2286 // We allow X to be a vector type, in which case Y will potentially be
2287 // splatted into a vector with the same element count.
2288 auto CreateMul = [&B](Value *X, Value *Y) {
2289 assert(X->getType()->getScalarType() == Y->getType() &&
2290 "Types don't match!");
2291 if (auto *CX = dyn_cast<ConstantInt>(X))
2292 if (CX->isOne())
2293 return Y;
2294 if (auto *CY = dyn_cast<ConstantInt>(Y))
2295 if (CY->isOne())
2296 return X;
2297 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2298 if (XVTy && !isa<VectorType>(Y->getType()))
2299 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2300 return B.CreateMul(X, Y);
2301 };
2302
2303 switch (InductionKind) {
2305 assert(!isa<VectorType>(Index->getType()) &&
2306 "Vector indices not supported for integer inductions yet");
2307 assert(Index->getType() == StartValue->getType() &&
2308 "Index type does not match StartValue type");
2309 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2310 return B.CreateSub(StartValue, Index);
2311 auto *Offset = CreateMul(Index, Step);
2312 return CreateAdd(StartValue, Offset);
2313 }
2315 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2317 assert(!isa<VectorType>(Index->getType()) &&
2318 "Vector indices not supported for FP inductions yet");
2319 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2320 assert(InductionBinOp &&
2321 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2322 InductionBinOp->getOpcode() == Instruction::FSub) &&
2323 "Original bin op should be defined for FP induction");
2324
2325 Value *MulExp = B.CreateFMul(Step, Index);
2326 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2327 "induction");
2328 }
2330 return nullptr;
2331 }
2332 llvm_unreachable("invalid enum");
2333}
2334
2335std::optional<unsigned> getMaxVScale(const Function &F,
2336 const TargetTransformInfo &TTI) {
2337 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2338 return MaxVScale;
2339
2340 if (F.hasFnAttribute(Attribute::VScaleRange))
2341 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2342
2343 return std::nullopt;
2344}
2345
2346/// For the given VF and UF and maximum trip count computed for the loop, return
2347/// whether the induction variable might overflow in the vectorized loop. If not,
2348/// then we know a runtime overflow check always evaluates to false and can be
2349/// removed.
2352 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2353 // Always be conservative if we don't know the exact unroll factor.
2354 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2355
2356 Type *IdxTy = Cost->Legal->getWidestInductionType();
2357 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2358
2359 // We know the runtime overflow check is known false iff the (max) trip-count
2360 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2361 // the vector loop induction variable.
2362 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2363 uint64_t MaxVF = VF.getKnownMinValue();
2364 if (VF.isScalable()) {
2365 std::optional<unsigned> MaxVScale =
2366 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2367 if (!MaxVScale)
2368 return false;
2369 MaxVF *= *MaxVScale;
2370 }
2371
2372 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2373 }
2374
2375 return false;
2376}
2377
2378// Return whether we allow using masked interleave-groups (for dealing with
2379// strided loads/stores that reside in predicated blocks, or for dealing
2380// with gaps).
2382 // If an override option has been passed in for interleaved accesses, use it.
2385
2387}
2388
2390 VPReplicateRecipe *RepRecipe,
2391 const VPLane &Lane,
2392 VPTransformState &State) {
2393 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2394
2395 // Does this instruction return a value ?
2396 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2397
2398 Instruction *Cloned = Instr->clone();
2399 if (!IsVoidRetTy) {
2400 Cloned->setName(Instr->getName() + ".cloned");
2401#if !defined(NDEBUG)
2402 // Verify that VPlan type inference results agree with the type of the
2403 // generated values.
2404 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2405 "inferred type and type from generated instructions do not match");
2406#endif
2407 }
2408
2409 RepRecipe->setFlags(Cloned);
2410
2411 if (auto DL = Instr->getDebugLoc())
2412 State.setDebugLocFrom(DL);
2413
2414 // Replace the operands of the cloned instructions with their scalar
2415 // equivalents in the new loop.
2416 for (const auto &I : enumerate(RepRecipe->operands())) {
2417 auto InputLane = Lane;
2418 VPValue *Operand = I.value();
2420 InputLane = VPLane::getFirstLane();
2421 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2422 }
2423 State.addNewMetadata(Cloned, Instr);
2424
2425 // Place the cloned scalar in the new loop.
2426 State.Builder.Insert(Cloned);
2427
2428 State.set(RepRecipe, Cloned, Lane);
2429
2430 // If we just cloned a new assumption, add it the assumption cache.
2431 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2433
2434 // End if-block.
2435 VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2436 bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2437 assert(
2438 (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2439 all_of(RepRecipe->operands(),
2440 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2441 "Expected a recipe is either within a region or all of its operands "
2442 "are defined outside the vectorized region.");
2443 if (IfPredicateInstr)
2444 PredicatedInstructions.push_back(Cloned);
2445}
2446
2447Value *
2449 if (VectorTripCount)
2450 return VectorTripCount;
2451
2452 Value *TC = getTripCount();
2453 IRBuilder<> Builder(InsertBlock->getTerminator());
2454
2455 Type *Ty = TC->getType();
2456 // This is where we can make the step a runtime constant.
2457 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2458
2459 // If the tail is to be folded by masking, round the number of iterations N
2460 // up to a multiple of Step instead of rounding down. This is done by first
2461 // adding Step-1 and then rounding down. Note that it's ok if this addition
2462 // overflows: the vector induction variable will eventually wrap to zero given
2463 // that it starts at zero and its Step is a power of two; the loop will then
2464 // exit, with the last early-exit vector comparison also producing all-true.
2465 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2466 // is accounted for in emitIterationCountCheck that adds an overflow check.
2467 if (Cost->foldTailByMasking()) {
2469 "VF*UF must be a power of 2 when folding tail by masking");
2470 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2471 "n.rnd.up");
2472 }
2473
2474 // Now we need to generate the expression for the part of the loop that the
2475 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2476 // iterations are not required for correctness, or N - Step, otherwise. Step
2477 // is equal to the vectorization factor (number of SIMD elements) times the
2478 // unroll factor (number of SIMD instructions).
2479 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2480
2481 // There are cases where we *must* run at least one iteration in the remainder
2482 // loop. See the cost model for when this can happen. If the step evenly
2483 // divides the trip count, we set the remainder to be equal to the step. If
2484 // the step does not evenly divide the trip count, no adjustment is necessary
2485 // since there will already be scalar iterations. Note that the minimum
2486 // iterations check ensures that N >= Step.
2487 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2488 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2489 R = Builder.CreateSelect(IsZero, Step, R);
2490 }
2491
2492 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2493
2494 return VectorTripCount;
2495}
2496
2498 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2500 if (PreVectorPH->getNumSuccessors() != 1) {
2501 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2502 assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2503 "Unexpected successor");
2504 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2505 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2506 PreVectorPH = CheckVPIRBB;
2507 }
2508 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2509 PreVectorPH->swapSuccessors();
2510
2511 // We just connected a new block to the scalar preheader. Update all
2512 // ResumePhis by adding an incoming value for it.
2513 for (VPRecipeBase &R : *cast<VPBasicBlock>(ScalarPH)) {
2514 auto *ResumePhi = dyn_cast<VPInstruction>(&R);
2515 if (!ResumePhi || ResumePhi->getOpcode() != VPInstruction::ResumePhi)
2516 continue;
2517 ResumePhi->addOperand(ResumePhi->getOperand(1));
2518 }
2519}
2520
2522 Value *Count = getTripCount();
2523 // Reuse existing vector loop preheader for TC checks.
2524 // Note that new preheader block is generated for vector loop.
2525 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2526 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2527
2528 // Generate code to check if the loop's trip count is less than VF * UF, or
2529 // equal to it in case a scalar epilogue is required; this implies that the
2530 // vector trip count is zero. This check also covers the case where adding one
2531 // to the backedge-taken count overflowed leading to an incorrect trip count
2532 // of zero. In this case we will also jump to the scalar loop.
2533 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2535
2536 // If tail is to be folded, vector loop takes care of all iterations.
2537 Type *CountTy = Count->getType();
2538 Value *CheckMinIters = Builder.getFalse();
2539 auto CreateStep = [&]() -> Value * {
2540 // Create step with max(MinProTripCount, UF * VF).
2542 return createStepForVF(Builder, CountTy, VF, UF);
2543
2544 Value *MinProfTC =
2546 if (!VF.isScalable())
2547 return MinProfTC;
2549 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2550 };
2551
2552 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2553 if (Style == TailFoldingStyle::None) {
2554 Value *Step = CreateStep();
2555 ScalarEvolution &SE = *PSE.getSE();
2556 // TODO: Emit unconditional branch to vector preheader instead of
2557 // conditional branch with known condition.
2558 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2559 // Check if the trip count is < the step.
2560 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2561 // TODO: Ensure step is at most the trip count when determining max VF and
2562 // UF, w/o tail folding.
2563 CheckMinIters = Builder.getTrue();
2565 TripCountSCEV, SE.getSCEV(Step))) {
2566 // Generate the minimum iteration check only if we cannot prove the
2567 // check is known to be true, or known to be false.
2568 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2569 } // else step known to be < trip count, use CheckMinIters preset to false.
2570 } else if (VF.isScalable() &&
2573 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2574 // an overflow to zero when updating induction variables and so an
2575 // additional overflow check is required before entering the vector loop.
2576
2577 // Get the maximum unsigned value for the type.
2578 Value *MaxUIntTripCount =
2579 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2580 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2581
2582 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2583 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2584 }
2585
2586 // Create new preheader for vector loop.
2588 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2589 "vector.ph");
2590
2591 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2592 DT->getNode(Bypass)->getIDom()) &&
2593 "TC check is expected to dominate Bypass");
2594
2595 BranchInst &BI =
2596 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2598 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2599 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2600 LoopBypassBlocks.push_back(TCCheckBlock);
2601
2602 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2603 introduceCheckBlockInVPlan(TCCheckBlock);
2604}
2605
2607 BasicBlock *const SCEVCheckBlock =
2608 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2609 if (!SCEVCheckBlock)
2610 return nullptr;
2611
2612 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2614 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2615 "Cannot SCEV check stride or overflow when optimizing for size");
2616 assert(!LoopBypassBlocks.empty() &&
2617 "Should already be a bypass block due to iteration count check");
2618 LoopBypassBlocks.push_back(SCEVCheckBlock);
2619 AddedSafetyChecks = true;
2620
2621 introduceCheckBlockInVPlan(SCEVCheckBlock);
2622 return SCEVCheckBlock;
2623}
2624
// Emit runtime memory-aliasing checks (array overlap), if the legality
// analysis required them.
// NOTE(review): the extraction dropped the signature (original line 2625)
// and the VPlan-native guard condition (line 2627) preceding this
// `return nullptr;` — restore from the original source before compiling.
2626 // VPlan-native path does not do any analysis for runtime checks currently.
2628 return nullptr;
2629
2630 BasicBlock *const MemCheckBlock =
2631 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2632
2633 // Check if we generated code that checks in runtime if arrays overlap. We put
2634 // the checks into a separate block to make the more common case of few
2635 // elements faster.
2636 if (!MemCheckBlock)
2637 return nullptr;
2638
// Memory checks under -Os/profile-based size optimization are only allowed
// when vectorization was explicitly forced; emit a remark so the user knows
// the size cost came from forcing.
2639 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2640 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2641 "Cannot emit memory checks when optimizing for size, unless forced "
2642 "to vectorize.");
2643 ORE->emit([&]() {
2644 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2647 << "Code-size may be reduced by not forcing "
2648 "vectorization, or by source-code modifications "
2649 "eliminating the need for runtime checks "
2650 "(e.g., adding 'restrict').";
2651 });
2652 }
2653
// Track the check block as a bypass predecessor of the scalar loop.
2654 LoopBypassBlocks.push_back(MemCheckBlock);
2655
2656 AddedSafetyChecks = true;
2657
2658 introduceCheckBlockInVPlan(MemCheckBlock);
2659 return MemCheckBlock;
2660}
2661
2662/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2663/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2664/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2665/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
// NOTE(review): the function's signature line (original line 2666) was
// dropped by the extraction; restore it from the original source.
2667 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
// Move every recipe over; phis are not expected here because they must stay
// at the start of a block, and we append at the end.
2668 for (auto &R : make_early_inc_range(*VPBB)) {
2669 assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2670 R.moveBefore(*IRVPBB, IRVPBB->end());
2671 }
2672
// Rewire predecessor/successor edges from the old block to the wrapper.
2673 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2674 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2675}
2676
// Create the basic-block skeleton around the vector loop: a middle block and
// a scalar preheader, named with the given prefix.
// NOTE(review): heavily garbled by the extraction — the signature
// (original lines 2677-2678), part of the assert (line 2680) and the
// SplitBlock calls that produce "middle.block"/"scalar.ph" (lines
// 2684-2685, 2687-2689, 2691) are missing; restore before compiling.
2679 assert(LoopVectorPreHeader && "Invalid loop structure");
2681 Cost->requiresScalarEpilogue(VF.isVector())) &&
2682 "loops not exiting via the latch without required epilogue?");
2683
2686 LI, nullptr, Twine(Prefix) + "middle.block");
2690 nullptr, Twine(Prefix) + "scalar.ph");
2692}
2693
2694/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2695/// expansion results.
// NOTE(review): the first signature line (original line 2696) was dropped
// by the extraction.
2697 const SCEV2ValueTy &ExpandedSCEVs) {
2698 const SCEV *Step = ID.getStep();
// Constants and unknowns already have an IR value; no expansion needed.
2699 if (auto *C = dyn_cast<SCEVConstant>(Step))
2700 return C->getValue();
2701 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2702 return U->getValue();
// Anything else must have been materialized earlier by SCEV expansion.
2703 auto I = ExpandedSCEVs.find(Step);
2704 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2705 return I->second;
2706}
2707
2708/// Knowing that loop \p L executes a single vector iteration, add instructions
2709/// that will get simplified and thus should not have any cost to \p
2710/// InstsToIgnore.
// NOTE(review): the signature (original lines 2711-2712) was dropped by the
// extraction; `IL` is presumably the loop's induction-variable map — confirm
// against the original source.
2713 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
// The latch compare will fold away when the loop runs exactly once.
2714 auto *Cmp = L->getLatchCmpInst();
2715 if (Cmp)
2716 InstsToIgnore.insert(Cmp);
2717 for (const auto &KV : IL) {
2718 // Extract the key by hand so that it can be used in the lambda below. Note
2719 // that captured structured bindings are a C++20 extension.
2720 const PHINode *IV = KV.first;
2721
2722 // Get next iteration value of the induction variable.
2723 Instruction *IVInst =
2724 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
// The IV update is dead after unrolling iff its only users are the IV phi
// itself and the (also-simplified) latch compare.
2725 if (all_of(IVInst->users(),
2726 [&](const User *U) { return U == IV || U == Cmp; }))
2727 InstsToIgnore.insert(IVInst);
2728 }
2729}
2730
// Compute, for every induction variable, the resume value to use on the
// additional (epilogue) bypass edge, given the main vector loop's trip count.
// NOTE(review): the first signature line (original line 2731) was dropped by
// the extraction; restore it from the original source.
2732 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2733 assert(MainVectorTripCount && "Must have bypass information");
2734
2735 Instruction *OldInduction = Legal->getPrimaryInduction();
// All bypass-end computations are inserted at the top of the additional
// bypass block.
2736 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2737 getAdditionalBypassBlock()->getFirstInsertionPt());
2738 for (const auto &InductionEntry : Legal->getInductionVars()) {
2739 PHINode *OrigPhi = InductionEntry.first;
2740 const InductionDescriptor &II = InductionEntry.second;
2741 Value *Step = getExpandedStep(II, ExpandedSCEVs);
2742 // For the primary induction the additional bypass end value is known.
2743 // Otherwise it is computed.
2744 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2745 if (OrigPhi != OldInduction) {
2746 auto *BinOp = II.getInductionBinOp();
2747 // Fast-math-flags propagate from the original induction instruction.
2748 if (isa_and_nonnull<FPMathOperator>(BinOp))
2749 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2750
2751 // Compute the end value for the additional bypass.
2752 EndValueFromAdditionalBypass =
2753 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2754 II.getStartValue(), Step, II.getKind(), BinOp);
2755 EndValueFromAdditionalBypass->setName("ind.end");
2756 }
2757
2758 // Store the bypass value here, as it needs to be added as operand to its
2759 // scalar preheader phi node after the epilogue skeleton has been created.
2760 // TODO: Directly add as extra operand to the VPResumePHI recipe.
2761 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2762 "entry for OrigPhi already exists");
2763 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2764 }
2765}
2766
// Build the full vectorized-loop skeleton (preheaders, runtime-check bypass
// blocks, middle block) around the yet-to-be-filled vector loop.
// NOTE(review): the extraction dropped the first signature line (original
// line 2767) and the four statements that actually do the work — the
// createVectorLoopSkeleton call (line 2806), the iteration-count check
// (line 2813), the SCEV checks (line 2817) and the memory runtime checks
// (line 2822). Restore them from the original source before compiling.
2768 const SCEV2ValueTy &ExpandedSCEVs) {
2769 /*
2770 In this function we generate a new loop. The new loop will contain
2771 the vectorized instructions while the old loop will continue to run the
2772 scalar remainder.
2773
2774 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2775 / | preheader are expanded here. Eventually all required SCEV
2776 / | expansion should happen here.
2777 / v
2778 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2779 | / |
2780 | / v
2781 || [ ] <-- vector pre header.
2782 |/ |
2783 | v
2784 | [ ] \
2785 | [ ]_| <-- vector loop (created during VPlan execution).
2786 | |
2787 | v
2788 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2789 | | successors created during VPlan execution)
2790 \/ |
2791 /\ v
2792 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2793 | |
2794 (opt) v <-- edge from middle to exit iff epilogue is not required.
2795 | [ ] \
2796 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2797 | | wrapped in VPIRBasicBlock).
2798 \ |
2799 \ v
2800 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2801 ...
2802 */
2803
2804 // Create an empty vector loop, and prepare basic blocks for the runtime
2805 // checks.
2807
2808 // Now, compare the new count to zero. If it is zero skip the vector loop and
2809 // jump to the scalar loop. This check also covers the case where the
2810 // backedge-taken count is uint##_max: adding one to it will overflow leading
2811 // to an incorrect trip count of zero. In this (rare) case we will also jump
2812 // to the scalar loop.
2814
2815 // Generate the code to check any assumptions that we've made for SCEV
2816 // expressions.
2818
2819 // Generate the code that checks in runtime if arrays overlap. We put the
2820 // checks into a separate block to make the more common case of few elements
2821 // faster.
2823
2824 return LoopVectorPreHeader;
2825}
2826
// DenseMap traits that let structurally-identical instructions (inserts,
// extracts, shuffles, GEPs) hash to the same bucket, enabling the simple
// CSE pass below to deduplicate them.
2827namespace {
2828
2829struct CSEDenseMapInfo {
// Only these side-effect-free, value-like instructions participate in CSE.
2830 static bool canHandle(const Instruction *I) {
2831 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2832 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2833 }
2834
// NOTE(review): the bodies of getEmptyKey/getTombstoneKey (original lines
// 2836 and 2840, delegating to DenseMapInfo<Instruction *>) were dropped by
// the extraction; restore before compiling.
2835 static inline Instruction *getEmptyKey() {
2837 }
2838
2839 static inline Instruction *getTombstoneKey() {
2841 }
2842
// Hash on opcode plus all operand values, so identical computations collide.
2843 static unsigned getHashValue(const Instruction *I) {
2844 assert(canHandle(I) && "Unknown instruction!");
2845 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2846 I->value_op_end()));
2847 }
2848
// Sentinel keys compare by pointer; real keys compare structurally.
2849 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2850 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2851 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2852 return LHS == RHS;
2853 return LHS->isIdenticalTo(RHS);
2854 }
2855};
2856
2857} // end anonymous namespace
2858
2859/// Perform common-subexpression elimination (CSE) on induction-variable instructions.
2860static void cse(BasicBlock *BB) {
2861 // Perform simple cse.
// NOTE(review): the CSEMap declaration (original line 2862, a
// SmallDenseMap keyed with CSEDenseMapInfo) was dropped by the extraction;
// restore before compiling.
// Walk the block; early-inc range so erasing the current instruction is safe.
2863 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2864 if (!CSEDenseMapInfo::canHandle(&In))
2865 continue;
2866
2867 // Check if we can replace this instruction with any of the
2868 // visited instructions.
2869 if (Instruction *V = CSEMap.lookup(&In)) {
2870 In.replaceAllUsesWith(V);
2871 In.eraseFromParent();
2872 continue;
2873 }
2874
// First occurrence of this computation: remember it as the canonical copy.
2875 CSEMap[&In] = &In;
2876 }
2877}
2878
// Cost of executing call \p CI at vectorization factor \p VF. For vector VFs
// the decision (and cost) was already computed; for scalar VF compute it here.
// NOTE(review): the extraction dropped the signature (original lines
// 2879-2880) and several statements — the ScalarFuncTy/Tys setup (lines
// 2888, 2892), the getCallInstrCost call (line 2897) and the intrinsic-ID
// guard (line 2900). Restore from the original source before compiling.
2881 ElementCount VF) const {
2882 // We only need to calculate a cost if the VF is scalar; for actual vectors
2883 // we should already have a pre-calculated cost at each VF.
2884 if (!VF.isScalar())
2885 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2886
2887 Type *RetTy = CI->getType();
// A call that is part of a recognized reduction pattern uses the pattern's
// (cheaper) cost instead.
2889 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2890 return *RedCost;
2891
2893 for (auto &ArgOp : CI->args())
2894 Tys.push_back(ArgOp->getType());
2895
2896 InstructionCost ScalarCallCost =
2898
2899 // If this is an intrinsic we may have a lower cost for it.
2901 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2902 return std::min(ScalarCallCost, IntrinsicCost);
2903 }
2904 return ScalarCallCost;
2905}
2906
// Widen element type \p Elt to a vector of \p VF elements, unless VF is
// scalar or the type is not vectorizable (only int/ptr/FP are widened).
// NOTE(review): the signature line (original line 2907) was dropped by the
// extraction.
2908 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2909 return Elt;
2910 return VectorType::get(Elt, VF);
2911}
2912
// Cost of widening call \p CI to its vector intrinsic form at factor \p VF.
// NOTE(review): the extraction dropped the signature (original lines
// 2913-2914), the intrinsic-ID lookup (line 2916) and the Arguments/FTy
// setup (lines 2923-2924). Restore from the original source.
2915 ElementCount VF) const {
2917 assert(ID && "Expected intrinsic call!");
// Widen the return and parameter types to the requested VF.
2918 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
// Fast-math flags on the call carry over into the cost query.
2919 FastMathFlags FMF;
2920 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2921 FMF = FPMO->getFastMathFlags();
2922
2925 SmallVector<Type *> ParamTys;
2926 std::transform(FTy->param_begin(), FTy->param_end(),
2927 std::back_inserter(ParamTys),
2928 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2929
2930 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2931 dyn_cast<IntrinsicInst>(CI));
2932 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2933}
2934
// Post-processing after VPlan execution: patch up widened phis, invalidate
// stale SCEVs, sink scalar operands, CSE the header, and fix profile weights.
// NOTE(review): the extraction dropped the signature (original line 2935)
// and several statements — the OrigPHIsToFix guard (line 2937), the
// SE->forgetLcssaPhiWithNewPredecessor call (line 2947), the
// forgetLoop/forgetBlockAndLoopDispositions calls (lines 2950-2951), the
// PredicatedInstructions loop header (line 2958) and the
// setProfileInfoAfterVectorization call (line 2982). Restore before
// compiling.
2936 // Fix widened non-induction PHIs by setting up the PHI operands.
2938 fixNonInductionPHIs(State);
2939
2940 // After vectorization, the exit blocks of the original loop will have
2941 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2942 // looked through single-entry phis.
2943 SmallVector<BasicBlock *> ExitBlocks;
2944 OrigLoop->getExitBlocks(ExitBlocks);
2945 for (BasicBlock *Exit : ExitBlocks)
2946 for (PHINode &PN : Exit->phis())
2948
2949 // Forget the original basic block.
2952
2953 // Don't apply optimizations below when no vector region remains, as they all
2954 // require a vector loop at the moment.
2955 if (!State.Plan->getVectorLoopRegion())
2956 return;
2957
2959 sinkScalarOperands(&*PI);
2960
2961 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2962 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
2963 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2964
2965 // Remove redundant induction instructions.
2966 cse(HeaderBB);
2967
2968 // Set/update profile weights for the vector and remainder loops as original
2969 // loop iterations are now distributed among them. Note that original loop
2970 // becomes the scalar remainder loop after vectorization.
2971 //
2972 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2973 // end up getting slightly roughened result but that should be OK since
2974 // profile is not inherently precise anyway. Note also possible bypass of
2975 // vector code caused by legality checks is ignored, assigning all the weight
2976 // to the vector loop, optimistically.
2977 //
2978 // For scalable vectorization we can't know at compile time how many
2979 // iterations of the loop are handled in one vector iteration, so instead
2980 // assume a pessimistic vscale of '1'.
2981 Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2983 VF.getKnownMinValue() * UF);
2984}
2985
// Iteratively sink scalarized operands of a predicated instruction into the
// predicated block, so they only execute when the predicate holds.
// NOTE(review): the signature line (original line 2986,
// InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) judging by
// the body) was dropped by the extraction.
2987 // The basic block and loop containing the predicated instruction.
2988 auto *PredBB = PredInst->getParent();
2989 auto *VectorLoop = LI->getLoopFor(PredBB);
2990
2991 // Initialize a worklist with the operands of the predicated instruction.
2992 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
2993
2994 // Holds instructions that we need to analyze again. An instruction may be
2995 // reanalyzed if we don't yet know if we can sink it or not.
2996 SmallVector<Instruction *, 8> InstsToReanalyze;
2997
2998 // Returns true if a given use occurs in the predicated block. Phi nodes use
2999 // their operands in their corresponding predecessor blocks.
3000 auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
3001 auto *I = cast<Instruction>(U.getUser());
3002 BasicBlock *BB = I->getParent();
3003 if (auto *Phi = dyn_cast<PHINode>(I))
3004 BB = Phi->getIncomingBlock(
3005 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3006 return BB == PredBB;
3007 };
3008
3009 // Iteratively sink the scalarized operands of the predicated instruction
3010 // into the block we created for it. When an instruction is sunk, it's
3011 // operands are then added to the worklist. The algorithm ends after one pass
3012 // through the worklist doesn't sink a single instruction.
3013 bool Changed;
3014 do {
3015 // Add the instructions that need to be reanalyzed to the worklist, and
3016 // reset the changed indicator.
3017 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3018 InstsToReanalyze.clear();
3019 Changed = false;
3020
3021 while (!Worklist.empty()) {
3022 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3023
3024 // We can't sink an instruction if it is a phi node, is not in the loop,
3025 // may have side effects or may read from memory.
3026 // TODO: Could do more granular checking to allow sinking
3027 // a load past non-store instructions.
3028 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3029 I->mayHaveSideEffects() || I->mayReadFromMemory())
3030 continue;
3031
3032 // If the instruction is already in PredBB, check if we can sink its
3033 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3034 // sinking the scalar instruction I, hence it appears in PredBB; but it
3035 // may have failed to sink I's operands (recursively), which we try
3036 // (again) here.
3037 if (I->getParent() == PredBB) {
3038 Worklist.insert(I->op_begin(), I->op_end());
3039 continue;
3040 }
3041
3042 // It's legal to sink the instruction if all its uses occur in the
3043 // predicated block. Otherwise, there's nothing to do yet, and we may
3044 // need to reanalyze the instruction.
3045 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3046 InstsToReanalyze.push_back(I);
3047 continue;
3048 }
3049
3050 // Move the instruction to the beginning of the predicated block, and add
3051 // it's operands to the worklist.
3052 I->moveBefore(PredBB->getFirstInsertionPt());
3053 Worklist.insert(I->op_begin(), I->op_end());
3054
3055 // The sinking may have enabled other instructions to be sunk, so we will
3056 // need to iterate.
3057 Changed = true;
3058 }
3059 } while (Changed);
3060}
3061
// Fill in the incoming values/blocks of widened non-induction PHI nodes,
// which could not be completed while their predecessors were still being
// generated.
// NOTE(review): the signature line (original line 3062) was dropped by the
// extraction.
3063 auto Iter = vp_depth_first_deep(Plan.getEntry());
3064 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3065 for (VPRecipeBase &P : VPBB->phis()) {
3066 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3067 if (!VPPhi)
3068 continue;
3069 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3070 // Make sure the builder has a valid insert point.
3071 Builder.SetInsertPoint(NewPhi);
// Translate each VPlan incoming (value, block) pair into its generated IR
// counterpart via the VPlan-to-IR maps.
3072 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3073 VPValue *Inc = VPPhi->getIncomingValue(Idx);
3074 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3075 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3076 }
3077 }
3078 }
3079}
3080
// NOTE(review): the extraction dropped the Worklist declaration (original
// line 3096), the ScalarPtrs declaration (line 3100) and half of the
// IK_PtrInduction comparison (line 3227). Restore before compiling.
3081void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3082 // We should not collect Scalars more than once per VF. Right now, this
3083 // function is called from collectUniformsAndScalars(), which already does
3084 // this check. Collecting Scalars for VF=1 does not make any sense.
3085 assert(VF.isVector() && !Scalars.contains(VF) &&
3086 "This function should not be visited twice for the same VF");
3087
3088 // This avoids any chances of creating a REPLICATE recipe during planning
3089 // since that would result in generation of scalarized code during execution,
3090 // which is not supported for scalable vectors.
3091 if (VF.isScalable()) {
3092 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3093 return;
3094 }
3095
3097
3098 // These sets are used to seed the analysis with pointers used by memory
3099 // accesses that will remain scalar.
3101 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3102 auto *Latch = TheLoop->getLoopLatch();
3103
3104 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3105 // The pointer operands of loads and stores will be scalar as long as the
3106 // memory access is not a gather or scatter operation. The value operand of a
3107 // store will remain scalar if the store is scalarized.
3108 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3109 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3110 assert(WideningDecision != CM_Unknown &&
3111 "Widening decision should be ready at this moment");
3112 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3113 if (Ptr == Store->getValueOperand())
3114 return WideningDecision == CM_Scalarize;
3115 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3116 "Ptr is neither a value or pointer operand");
3117 return WideningDecision != CM_GatherScatter;
3118 };
3119
3120 // A helper that returns true if the given value is a getelementptr
3121 // instruction contained in the loop.
3122 auto IsLoopVaryingGEP = [&](Value *V) {
3123 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3124 };
3125
3126 // A helper that evaluates a memory access's use of a pointer. If the use will
3127 // be a scalar use and the pointer is only used by memory accesses, we place
3128 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3129 // PossibleNonScalarPtrs.
3130 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3131 // We only care about bitcast and getelementptr instructions contained in
3132 // the loop.
3133 if (!IsLoopVaryingGEP(Ptr))
3134 return;
3135
3136 // If the pointer has already been identified as scalar (e.g., if it was
3137 // also identified as uniform), there's nothing to do.
3138 auto *I = cast<Instruction>(Ptr);
3139 if (Worklist.count(I))
3140 return;
3141
3142 // If the use of the pointer will be a scalar use, and all users of the
3143 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3144 // place the pointer in PossibleNonScalarPtrs.
3145 if (IsScalarUse(MemAccess, Ptr) &&
3146 all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3147 ScalarPtrs.insert(I);
3148 else
3149 PossibleNonScalarPtrs.insert(I);
3150 };
3151
3152 // We seed the scalars analysis with three classes of instructions: (1)
3153 // instructions marked uniform-after-vectorization and (2) bitcast,
3154 // getelementptr and (pointer) phi instructions used by memory accesses
3155 // requiring a scalar use.
3156 //
3157 // (1) Add to the worklist all instructions that have been identified as
3158 // uniform-after-vectorization.
3159 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3160
3161 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3162 // memory accesses requiring a scalar use. The pointer operands of loads and
3163 // stores will be scalar unless the operation is a gather or scatter.
3164 // The value operand of a store will remain scalar if the store is scalarized.
3165 for (auto *BB : TheLoop->blocks())
3166 for (auto &I : *BB) {
3167 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3168 EvaluatePtrUse(Load, Load->getPointerOperand());
3169 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3170 EvaluatePtrUse(Store, Store->getPointerOperand());
3171 EvaluatePtrUse(Store, Store->getValueOperand());
3172 }
3173 }
// Pointers are scalar only if no use was flagged as possibly non-scalar.
3174 for (auto *I : ScalarPtrs)
3175 if (!PossibleNonScalarPtrs.count(I)) {
3176 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3177 Worklist.insert(I);
3178 }
3179
3180 // Insert the forced scalars.
3181 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3182 // induction variable when the PHI user is scalarized.
3183 auto ForcedScalar = ForcedScalars.find(VF);
3184 if (ForcedScalar != ForcedScalars.end())
3185 for (auto *I : ForcedScalar->second) {
3186 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3187 Worklist.insert(I);
3188 }
3189
3190 // Expand the worklist by looking through any bitcasts and getelementptr
3191 // instructions we've already identified as scalar. This is similar to the
3192 // expansion step in collectLoopUniforms(); however, here we're only
3193 // expanding to include additional bitcasts and getelementptr instructions.
3194 unsigned Idx = 0;
3195 while (Idx != Worklist.size()) {
3196 Instruction *Dst = Worklist[Idx++];
3197 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3198 continue;
3199 auto *Src = cast<Instruction>(Dst->getOperand(0));
// Src itself stays scalar if every in-loop user is already scalar or is a
// load/store using Src as a scalar pointer.
3200 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3201 auto *J = cast<Instruction>(U);
3202 return !TheLoop->contains(J) || Worklist.count(J) ||
3203 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3204 IsScalarUse(J, Src));
3205 })) {
3206 Worklist.insert(Src);
3207 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3208 }
3209 }
3210
3211 // An induction variable will remain scalar if all users of the induction
3212 // variable and induction variable update remain scalar.
3213 for (const auto &Induction : Legal->getInductionVars()) {
3214 auto *Ind = Induction.first;
3215 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3216
3217 // If tail-folding is applied, the primary induction variable will be used
3218 // to feed a vector compare.
3219 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3220 continue;
3221
3222 // Returns true if \p Indvar is a pointer induction that is used directly by
3223 // load/store instruction \p I.
3224 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3225 Instruction *I) {
3226 return Induction.second.getKind() ==
3228 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3229 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3230 };
3231
3232 // Determine if all users of the induction variable are scalar after
3233 // vectorization.
3234 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3235 auto *I = cast<Instruction>(U);
3236 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3237 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3238 });
3239 if (!ScalarInd)
3240 continue;
3241
3242 // If the induction variable update is a fixed-order recurrence, neither the
3243 // induction variable or its update should be marked scalar after
3244 // vectorization.
3245 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3246 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3247 continue;
3248
3249 // Determine if all users of the induction variable update instruction are
3250 // scalar after vectorization.
3251 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3252 auto *I = cast<Instruction>(U);
3253 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3254 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3255 });
3256 if (!ScalarIndUpdate)
3257 continue;
3258
3259 // The induction variable and its update instruction will remain scalar.
3260 Worklist.insert(Ind);
3261 Worklist.insert(IndUpdate);
3262 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3263 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3264 << "\n");
3265 }
3266
3267 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3268}
3269
// Return true if \p I must be scalarized with predication at factor \p VF,
// i.e. it is predicated and no non-scalar (masked/widened) lowering exists.
// NOTE(review): the extraction dropped the signature's first line (original
// line 3270) and the pointer-operand lookup (line 3287) used by the
// Load/Store case. Restore before compiling.
3271 Instruction *I, ElementCount VF) const {
3272 if (!isPredicatedInst(I))
3273 return false;
3274
3275 // Do we have a non-scalar lowering for this predicated
3276 // instruction? No - it is scalar with predication.
3277 switch(I->getOpcode()) {
3278 default:
3279 return true;
3280 case Instruction::Call:
// Calls are scalar-with-predication unless the widening decision chose a
// non-scalarized lowering.
3281 if (VF.isScalar())
3282 return true;
3283 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3284 .Kind == CM_Scalarize;
3285 case Instruction::Load:
3286 case Instruction::Store: {
// A predicated memory op avoids scalarization only if the target supports
// a masked contiguous or masked gather/scatter form for its type.
3288 auto *Ty = getLoadStoreType(I);
3289 Type *VTy = Ty;
3290 if (VF.isVector())
3291 VTy = VectorType::get(Ty, VF);
3292 const Align Alignment = getLoadStoreAlignment(I);
3293 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3294 TTI.isLegalMaskedGather(VTy, Alignment))
3295 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3296 TTI.isLegalMaskedScatter(VTy, Alignment));
3297 }
3298 case Instruction::UDiv:
3299 case Instruction::SDiv:
3300 case Instruction::SRem:
3301 case Instruction::URem: {
3302 // We have the option to use the safe-divisor idiom to avoid predication.
3303 // The cost based decision here will always select safe-divisor for
3304 // scalable vectors as scalarization isn't legal.
3305 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3306 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3307 }
3308 }
3309}
3310
3311// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
// Return true if \p I needs to execute under a mask in the vector loop.
// NOTE(review): the extraction dropped the signature (original line 3312)
// and several condition/assert lines (3317, 3334, 3339, 3344, 3350) —
// notably the Load case's invariant-address check and the Store case's
// invariant-address half of the condition. Restore before compiling.
3313 // If predication is not needed, avoid it.
3314 // TODO: We can use the loop-preheader as context point here and get
3315 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3316 if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3318 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3319 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3320 return false;
3321
3322 // If the instruction was executed conditionally in the original scalar loop,
3323 // predication is needed with a mask whose lanes are all possibly inactive.
3324 if (Legal->blockNeedsPredication(I->getParent()))
3325 return true;
3326
3327 // All that remain are instructions with side-effects originally executed in
3328 // the loop unconditionally, but now execute under a tail-fold mask (only)
3329 // having at least one active lane (the first). If the side-effects of the
3330 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3331 // - it will cause the same side-effects as when masked.
3332 switch(I->getOpcode()) {
3333 default:
3335 "instruction should have been considered by earlier checks");
3336 case Instruction::Call:
3337 // Side-effects of a Call are assumed to be non-invariant, needing a
3338 // (fold-tail) mask.
3340 "should have returned earlier for calls not needing a mask");
3341 return true;
3342 case Instruction::Load:
3343 // If the address is loop invariant no predication is needed.
3345 case Instruction::Store: {
3346 // For stores, we need to prove both speculation safety (which follows from
3347 // the same argument as loads), but also must prove the value being stored
3348 // is correct. The easiest form of the later is to require that all values
3349 // stored are the same.
3351 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3352 }
3353 case Instruction::UDiv:
3354 case Instruction::SDiv:
3355 case Instruction::SRem:
3356 case Instruction::URem:
3357 // If the divisor is loop-invariant no predication is needed.
3358 return !TheLoop->isLoopInvariant(I->getOperand(1));
3359 }
3360}
3361
// Return the pair (scalarization cost, safe-divisor cost) for a predicated
// div/rem instruction, so the caller can pick the cheaper lowering.
// NOTE(review): the extraction dropped the signature's middle line
// (original line 3363), line 3369, the select-cost predicate argument
// (line 3408) and the Op2Info kind adjustment (line 3416). Restore before
// compiling.
3362std::pair<InstructionCost, InstructionCost>
3364 ElementCount VF) const {
3365 assert(I->getOpcode() == Instruction::UDiv ||
3366 I->getOpcode() == Instruction::SDiv ||
3367 I->getOpcode() == Instruction::SRem ||
3368 I->getOpcode() == Instruction::URem);
3370
3371 // Scalarization isn't legal for scalable vector types
3372 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3373 if (!VF.isScalable()) {
3374 // Get the scalarization cost and scale this amount by the probability of
3375 // executing the predicated block. If the instruction is not predicated,
3376 // we fall through to the next case.
3377 ScalarizationCost = 0;
3378
3379 // These instructions have a non-void type, so account for the phi nodes
3380 // that we will create. This cost is likely to be zero. The phi node
3381 // cost, if any, should be scaled by the block probability because it
3382 // models a copy at the end of each predicated block.
3383 ScalarizationCost += VF.getKnownMinValue() *
3384 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3385
3386 // The cost of the non-predicated instruction.
3387 ScalarizationCost += VF.getKnownMinValue() *
3388 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3389
3390 // The cost of insertelement and extractelement instructions needed for
3391 // scalarization.
3392 ScalarizationCost += getScalarizationOverhead(I, VF);
3393
3394 // Scale the cost by the probability of executing the predicated blocks.
3395 // This assumes the predicated block for each vector lane is equally
3396 // likely.
3397 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3398 }
3399 InstructionCost SafeDivisorCost = 0;
3400
3401 auto *VecTy = toVectorTy(I->getType(), VF);
3402
3403 // The cost of the select guard to ensure all lanes are well defined
3404 // after we speculate above any internal control flow.
3405 SafeDivisorCost +=
3406 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3407 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3409
3410 // Certain instructions can be cheaper to vectorize if they have a constant
3411 // second vector operand. One example of this are shifts on x86.
3412 Value *Op2 = I->getOperand(1);
3413 auto Op2Info = TTI.getOperandInfo(Op2);
3414 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3415 Legal->isInvariant(Op2))
3417
3418 SmallVector<const Value *, 4> Operands(I->operand_values());
3419 SafeDivisorCost += TTI.getArithmeticInstrCost(
3420 I->getOpcode(), VecTy, CostKind,
3421 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3422 Op2Info, Operands, I);
3423 return {ScalarizationCost, SafeDivisorCost};
3424}
3425
3427 Instruction *I, ElementCount VF) const {
3428 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3430 "Decision should not be set yet.");
3431 auto *Group = getInterleavedAccessGroup(I);
3432 assert(Group && "Must have a group.");
3433 unsigned InterleaveFactor = Group->getFactor();
3434
3435 // If the instruction's allocated size doesn't equal its type size, it
3436 // requires padding and will be scalarized.
3437 auto &DL = I->getDataLayout();
3438 auto *ScalarTy = getLoadStoreType(I);
3439 if (hasIrregularType(ScalarTy, DL))
3440 return false;
3441
3442 // For scalable vectors, the only interleave factor currently supported
3443 // must be power of 2 since we require the (de)interleave2 intrinsics
3444 // instead of shufflevectors.
3445 if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
3446 return false;
3447
3448 // If the group involves a non-integral pointer, we may not be able to
3449 // losslessly cast all values to a common type.
3450 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3451 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3452 Instruction *Member = Group->getMember(Idx);
3453 if (!Member)
3454 continue;
3455 auto *MemberTy = getLoadStoreType(Member);
3456 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3457 // Don't coerce non-integral pointers to integers or vice versa.
3458 if (MemberNI != ScalarNI)
3459 // TODO: Consider adding special nullptr value case here
3460 return false;
3461 if (MemberNI && ScalarNI &&
3462 ScalarTy->getPointerAddressSpace() !=
3463 MemberTy->getPointerAddressSpace())
3464 return false;
3465 }
3466
3467 // Check if masking is required.
3468 // A Group may need masking for one of two reasons: it resides in a block that
3469 // needs predication, or it was decided to use masking to deal with gaps
3470 // (either a gap at the end of a load-access that may result in a speculative
3471 // load, or any gaps in a store-access).
3472 bool PredicatedAccessRequiresMasking =
3473 blockNeedsPredicationForAnyReason(I->getParent()) &&
3475 bool LoadAccessWithGapsRequiresEpilogMasking =
3476 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3478 bool StoreAccessWithGapsRequiresMasking =
3479 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3480 if (!PredicatedAccessRequiresMasking &&
3481 !LoadAccessWithGapsRequiresEpilogMasking &&
3482 !StoreAccessWithGapsRequiresMasking)
3483 return true;
3484
3485 // If masked interleaving is required, we expect that the user/target had
3486 // enabled it, because otherwise it either wouldn't have been created or
3487 // it should have been invalidated by the CostModel.
3489 "Masked interleave-groups for predicated accesses are not enabled.");
3490
3491 if (Group->isReverse())
3492 return false;
3493
3494 auto *Ty = getLoadStoreType(I);
3495 const Align Alignment = getLoadStoreAlignment(I);
3496 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3497 : TTI.isLegalMaskedStore(Ty, Alignment);
3498}
3499
3501 Instruction *I, ElementCount VF) {
3502 // Get and ensure we have a valid memory instruction.
3503 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3504
3506 auto *ScalarTy = getLoadStoreType(I);
3507
3508 // In order to be widened, the pointer should be consecutive, first of all.
3509 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3510 return false;
3511
3512 // If the instruction is a store located in a predicated block, it will be
3513 // scalarized.
3514 if (isScalarWithPredication(I, VF))
3515 return false;
3516
3517 // If the instruction's allocated size doesn't equal it's type size, it
3518 // requires padding and will be scalarized.
3519 auto &DL = I->getDataLayout();
3520 if (hasIrregularType(ScalarTy, DL))
3521 return false;
3522
3523 return true;
3524}
3525
3526void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3527 // We should not collect Uniforms more than once per VF. Right now,
3528 // this function is called from collectUniformsAndScalars(), which
3529 // already does this check. Collecting Uniforms for VF=1 does not make any
3530 // sense.
3531
3532 assert(VF.isVector() && !Uniforms.contains(VF) &&
3533 "This function should not be visited twice for the same VF");
3534
3535 // Visit the list of Uniforms. If we find no uniform value, we won't
3536 // analyze again. Uniforms.count(VF) will return 1.
3537 Uniforms[VF].clear();
3538
3539 // Now we know that the loop is vectorizable!
3540 // Collect instructions inside the loop that will remain uniform after
3541 // vectorization.
3542
3543 // Global values, params and instructions outside of current loop are out of
3544 // scope.
3545 auto IsOutOfScope = [&](Value *V) -> bool {
3546 Instruction *I = dyn_cast<Instruction>(V);
3547 return (!I || !TheLoop->contains(I));
3548 };
3549
3550 // Worklist containing uniform instructions demanding lane 0.
3551 SetVector<Instruction *> Worklist;
3552
3553 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3554 // that require predication must not be considered uniform after
3555 // vectorization, because that would create an erroneous replicating region
3556 // where only a single instance out of VF should be formed.
3557 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3558 if (IsOutOfScope(I)) {
3559 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3560 << *I << "\n");
3561 return;
3562 }
3563 if (isPredicatedInst(I)) {
3564 LLVM_DEBUG(
3565 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3566 << "\n");
3567 return;
3568 }
3569 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3570 Worklist.insert(I);
3571 };
3572
3573 // Start with the conditional branches exiting the loop. If the branch
3574 // condition is an instruction contained in the loop that is only used by the
3575 // branch, it is uniform. Note conditions from uncountable early exits are not
3576 // uniform.
3578 TheLoop->getExitingBlocks(Exiting);
3579 for (BasicBlock *E : Exiting) {
3581 continue;
3582 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3583 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3584 AddToWorklistIfAllowed(Cmp);
3585 }
3586
3587 auto PrevVF = VF.divideCoefficientBy(2);
3588 // Return true if all lanes perform the same memory operation, and we can
3589 // thus choose to execute only one.
3590 auto IsUniformMemOpUse = [&](Instruction *I) {
3591 // If the value was already known to not be uniform for the previous
3592 // (smaller VF), it cannot be uniform for the larger VF.
3593 if (PrevVF.isVector()) {
3594 auto Iter = Uniforms.find(PrevVF);
3595 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3596 return false;
3597 }
3598 if (!Legal->isUniformMemOp(*I, VF))
3599 return false;
3600 if (isa<LoadInst>(I))
3601 // Loading the same address always produces the same result - at least
3602 // assuming aliasing and ordering which have already been checked.
3603 return true;
3604 // Storing the same value on every iteration.
3605 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3606 };
3607
3608 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3609 InstWidening WideningDecision = getWideningDecision(I, VF);
3610 assert(WideningDecision != CM_Unknown &&
3611 "Widening decision should be ready at this moment");
3612
3613 if (IsUniformMemOpUse(I))
3614 return true;
3615
3616 return (WideningDecision == CM_Widen ||
3617 WideningDecision == CM_Widen_Reverse ||
3618 WideningDecision == CM_Interleave);
3619 };
3620
3621 // Returns true if Ptr is the pointer operand of a memory access instruction
3622 // I, I is known to not require scalarization, and the pointer is not also
3623 // stored.
3624 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3625 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3626 return false;
3627 return getLoadStorePointerOperand(I) == Ptr &&
3628 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3629 };
3630
3631 // Holds a list of values which are known to have at least one uniform use.
3632 // Note that there may be other uses which aren't uniform. A "uniform use"
3633 // here is something which only demands lane 0 of the unrolled iterations;
3634 // it does not imply that all lanes produce the same value (e.g. this is not
3635 // the usual meaning of uniform)
3636 SetVector<Value *> HasUniformUse;
3637
3638 // Scan the loop for instructions which are either a) known to have only
3639 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3640 for (auto *BB : TheLoop->blocks())
3641 for (auto &I : *BB) {
3642 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3643 switch (II->getIntrinsicID()) {
3644 case Intrinsic::sideeffect:
3645 case Intrinsic::experimental_noalias_scope_decl:
3646 case Intrinsic::assume:
3647 case Intrinsic::lifetime_start:
3648 case Intrinsic::lifetime_end:
3650 AddToWorklistIfAllowed(&I);
3651 break;
3652 default:
3653 break;
3654 }
3655 }
3656
3657 // ExtractValue instructions must be uniform, because the operands are
3658 // known to be loop-invariant.
3659 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3660 assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3661 "Expected aggregate value to be loop invariant");
3662 AddToWorklistIfAllowed(EVI);
3663 continue;
3664 }
3665
3666 // If there's no pointer operand, there's nothing to do.
3668 if (!Ptr)
3669 continue;
3670
3671 if (IsUniformMemOpUse(&I))
3672 AddToWorklistIfAllowed(&I);
3673
3674 if (IsVectorizedMemAccessUse(&I, Ptr))
3675 HasUniformUse.insert(Ptr);
3676 }
3677
3678 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3679 // demanding) users. Since loops are assumed to be in LCSSA form, this
3680 // disallows uses outside the loop as well.
3681 for (auto *V : HasUniformUse) {
3682 if (IsOutOfScope(V))
3683 continue;
3684 auto *I = cast<Instruction>(V);
3685 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3686 auto *UI = cast<Instruction>(U);
3687 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3688 });
3689 if (UsersAreMemAccesses)
3690 AddToWorklistIfAllowed(I);
3691 }
3692
3693 // Expand Worklist in topological order: whenever a new instruction
3694 // is added , its users should be already inside Worklist. It ensures
3695 // a uniform instruction will only be used by uniform instructions.
3696 unsigned Idx = 0;
3697 while (Idx != Worklist.size()) {
3698 Instruction *I = Worklist[Idx++];
3699
3700 for (auto *OV : I->operand_values()) {
3701 // isOutOfScope operands cannot be uniform instructions.
3702 if (IsOutOfScope(OV))
3703 continue;
3704 // First order recurrence Phi's should typically be considered
3705 // non-uniform.
3706 auto *OP = dyn_cast<PHINode>(OV);
3708 continue;
3709 // If all the users of the operand are uniform, then add the
3710 // operand into the uniform worklist.
3711 auto *OI = cast<Instruction>(OV);
3712 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3713 auto *J = cast<Instruction>(U);
3714 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3715 }))
3716 AddToWorklistIfAllowed(OI);
3717 }
3718 }
3719
3720 // For an instruction to be added into Worklist above, all its users inside
3721 // the loop should also be in Worklist. However, this condition cannot be
3722 // true for phi nodes that form a cyclic dependence. We must process phi
3723 // nodes separately. An induction variable will remain uniform if all users
3724 // of the induction variable and induction variable update remain uniform.
3725 // The code below handles both pointer and non-pointer induction variables.
3726 BasicBlock *Latch = TheLoop->getLoopLatch();
3727 for (const auto &Induction : Legal->getInductionVars()) {
3728 auto *Ind = Induction.first;
3729 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3730
3731 // Determine if all users of the induction variable are uniform after
3732 // vectorization.
3733 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3734 auto *I = cast<Instruction>(U);
3735 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3736 IsVectorizedMemAccessUse(I, Ind);
3737 });
3738 if (!UniformInd)
3739 continue;
3740
3741 // Determine if all users of the induction variable update instruction are
3742 // uniform after vectorization.
3743 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3744 auto *I = cast<Instruction>(U);
3745 return I == Ind || Worklist.count(I) ||
3746 IsVectorizedMemAccessUse(I, IndUpdate);
3747 });
3748 if (!UniformIndUpdate)
3749 continue;
3750
3751 // The induction variable and its update instruction will remain uniform.
3752 AddToWorklistIfAllowed(Ind);
3753 AddToWorklistIfAllowed(IndUpdate);
3754 }
3755
3756 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3757}
3758
3760 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3761
3763 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3764 "runtime pointer checks needed. Enable vectorization of this "
3765 "loop with '#pragma clang loop vectorize(enable)' when "
3766 "compiling with -Os/-Oz",
3767 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3768 return true;
3769 }
3770
3771 if (!PSE.getPredicate().isAlwaysTrue()) {
3772 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3773 "runtime SCEV checks needed. Enable vectorization of this "
3774 "loop with '#pragma clang loop vectorize(enable)' when "
3775 "compiling with -Os/-Oz",
3776 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3777 return true;
3778 }
3779
3780 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3781 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3782 reportVectorizationFailure("Runtime stride check for small trip count",
3783 "runtime stride == 1 checks needed. Enable vectorization of "
3784 "this loop without such check by compiling with -Os/-Oz",
3785 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3786 return true;
3787 }
3788
3789 return false;
3790}
3791
3792bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3793 if (IsScalableVectorizationAllowed)
3794 return *IsScalableVectorizationAllowed;
3795
3796 IsScalableVectorizationAllowed = false;
3798 return false;
3799
3801 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3802 "ScalableVectorizationDisabled", ORE, TheLoop);
3803 return false;
3804 }
3805
3806 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3807
3808 auto MaxScalableVF = ElementCount::getScalable(
3809 std::numeric_limits<ElementCount::ScalarTy>::max());
3810
3811 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3812 // FIXME: While for scalable vectors this is currently sufficient, this should
3813 // be replaced by a more detailed mechanism that filters out specific VFs,
3814 // instead of invalidating vectorization for a whole set of VFs based on the
3815 // MaxVF.
3816
3817 // Disable scalable vectorization if the loop contains unsupported reductions.
3818 if (!canVectorizeReductions(MaxScalableVF)) {
3820 "Scalable vectorization not supported for the reduction "
3821 "operations found in this loop.",
3822 "ScalableVFUnfeasible", ORE, TheLoop);
3823 return false;
3824 }
3825
3826 // Disable scalable vectorization if the loop contains any instructions
3827 // with element types not supported for scalable vectors.
3828 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3829 return !Ty->isVoidTy() &&
3831 })) {
3832 reportVectorizationInfo("Scalable vectorization is not supported "
3833 "for all element types found in this loop.",
3834 "ScalableVFUnfeasible", ORE, TheLoop);
3835 return false;
3836 }
3837
3839 reportVectorizationInfo("The target does not provide maximum vscale value "
3840 "for safe distance analysis.",
3841 "ScalableVFUnfeasible", ORE, TheLoop);
3842 return false;
3843 }
3844
3845 IsScalableVectorizationAllowed = true;
3846 return true;
3847}
3848
3850LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3851 if (!isScalableVectorizationAllowed())
3852 return ElementCount::getScalable(0);
3853
3854 auto MaxScalableVF = ElementCount::getScalable(
3855 std::numeric_limits<ElementCount::ScalarTy>::max());
3857 return MaxScalableVF;
3858
3859 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3860 // Limit MaxScalableVF by the maximum safe dependence distance.
3861 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3862
3863 if (!MaxScalableVF)
3865 "Max legal vector width too small, scalable vectorization "
3866 "unfeasible.",
3867 "ScalableVFUnfeasible", ORE, TheLoop);
3868
3869 return MaxScalableVF;
3870}
3871
3872FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3873 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3875 unsigned SmallestType, WidestType;
3876 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3877
3878 // Get the maximum safe dependence distance in bits computed by LAA.
3879 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3880 // the memory accesses that is most restrictive (involved in the smallest
3881 // dependence distance).
3882 unsigned MaxSafeElements =
3884
3885 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3886 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3888 this->MaxSafeElements = MaxSafeElements;
3889
3890 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3891 << ".\n");
3892 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3893 << ".\n");
3894
3895 // First analyze the UserVF, fall back if the UserVF should be ignored.
3896 if (UserVF) {
3897 auto MaxSafeUserVF =
3898 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3899
3900 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3901 // If `VF=vscale x N` is safe, then so is `VF=N`
3902 if (UserVF.isScalable())
3903 return FixedScalableVFPair(
3904 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3905
3906 return UserVF;
3907 }
3908
3909 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3910
3911 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3912 // is better to ignore the hint and let the compiler choose a suitable VF.
3913 if (!UserVF.isScalable()) {
3914 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3915 << " is unsafe, clamping to max safe VF="
3916 << MaxSafeFixedVF << ".\n");
3917 ORE->emit([&]() {
3918 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3920 TheLoop->getHeader())
3921 << "User-specified vectorization factor "
3922 << ore::NV("UserVectorizationFactor", UserVF)
3923 << " is unsafe, clamping to maximum safe vectorization factor "
3924 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3925 });
3926 return MaxSafeFixedVF;
3927 }
3928
3930 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3931 << " is ignored because scalable vectors are not "
3932 "available.\n");
3933 ORE->emit([&]() {
3934 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3936 TheLoop->getHeader())
3937 << "User-specified vectorization factor "
3938 << ore::NV("UserVectorizationFactor", UserVF)
3939 << " is ignored because the target does not support scalable "
3940 "vectors. The compiler will pick a more suitable value.";
3941 });
3942 } else {
3943 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3944 << " is unsafe. Ignoring scalable UserVF.\n");
3945 ORE->emit([&]() {
3946 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3948 TheLoop->getHeader())
3949 << "User-specified vectorization factor "
3950 << ore::NV("UserVectorizationFactor", UserVF)
3951 << " is unsafe. Ignoring the hint to let the compiler pick a "
3952 "more suitable value.";
3953 });
3954 }
3955 }
3956
3957 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3958 << " / " << WidestType << " bits.\n");
3959
3962 if (auto MaxVF =
3963 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3964 MaxSafeFixedVF, FoldTailByMasking))
3965 Result.FixedVF = MaxVF;
3966
3967 if (auto MaxVF =
3968 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3969 MaxSafeScalableVF, FoldTailByMasking))
3970 if (MaxVF.isScalable()) {
3971 Result.ScalableVF = MaxVF;
3972 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3973 << "\n");
3974 }
3975
3976 return Result;
3977}
3978
3982 // TODO: It may be useful to do since it's still likely to be dynamically
3983 // uniform if the target can skip.
3985 "Not inserting runtime ptr check for divergent target",
3986 "runtime pointer checks needed. Not enabled for divergent target",
3987 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3989 }
3990
3991 ScalarEvolution *SE = PSE.getSE();
3992 unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3993 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3994 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3995 if (TC != MaxTC)
3996 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3997 if (TC == 1) {
3998 reportVectorizationFailure("Single iteration (non) loop",
3999 "loop trip count is one, irrelevant for vectorization",
4000 "SingleIterationLoop", ORE, TheLoop);
4002 }
4003
4004 // If BTC matches the widest induction type and is -1 then the trip count
4005 // computation will wrap to 0 and the vector trip count will be 0. Do not try
4006 // to vectorize.
4007 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
4008 if (!isa<SCEVCouldNotCompute>(BTC) &&
4009 BTC->getType()->getScalarSizeInBits() >=
4012 SE->getMinusOne(BTC->getType()))) {
4014 "Trip count computation wrapped",
4015 "backedge-taken count is -1, loop trip count wrapped to 0",
4016 "TripCountWrapped", ORE, TheLoop);
4018 }
4019
4020 switch (ScalarEpilogueStatus) {
4022 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4024 [[fallthrough]];
4026 LLVM_DEBUG(
4027 dbgs() << "LV: vector predicate hint/switch found.\n"
4028 << "LV: Not allowing scalar epilogue, creating predicated "
4029 << "vector loop.\n");
4030 break;
4032 // fallthrough as a special case of OptForSize
4034 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4035 LLVM_DEBUG(
4036 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4037 else
4038 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4039 << "count.\n");
4040
4041 // Bail if runtime checks are required, which are not good when optimising
4042 // for size.
4045
4046 break;
4047 }
4048
4049 // The only loops we can vectorize without a scalar epilogue, are loops with
4050 // a bottom-test and a single exiting block. We'd have to handle the fact
4051 // that not every instruction executes on the last iteration. This will
4052 // require a lane mask which varies through the vector loop body. (TODO)
4054 // If there was a tail-folding hint/switch, but we can't fold the tail by
4055 // masking, fallback to a vectorization with a scalar epilogue.
4056 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4057 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4058 "scalar epilogue instead.\n");
4059 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4060 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4061 }
4063 }
4064
4065 // Now try the tail folding
4066
4067 // Invalidate interleave groups that require an epilogue if we can't mask
4068 // the interleave-group.
4070 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4071 "No decisions should have been taken at this point");
4072 // Note: There is no need to invalidate any cost modeling decisions here, as
4073 // none were taken so far.
4075 }
4076
4077 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4078
4079 // Avoid tail folding if the trip count is known to be a multiple of any VF
4080 // we choose.
4081 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4082 MaxFactors.FixedVF.getFixedValue();
4083 if (MaxFactors.ScalableVF) {
4084 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4085 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4086 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4087 *MaxPowerOf2RuntimeVF,
4088 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4089 } else
4090 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4091 }
4092
4093 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4094 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4095 "MaxFixedVF must be a power of 2");
4096 unsigned MaxVFtimesIC =
4097 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4098 ScalarEvolution *SE = PSE.getSE();
4099 // Currently only loops with countable exits are vectorized, but calling
4100 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4101 // uncountable exits whilst also ensuring the symbolic maximum and known
4102 // back-edge taken count remain identical for loops with countable exits.
4103 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4104 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4105 "Invalid loop count");
4106 const SCEV *ExitCount = SE->getAddExpr(
4107 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4108 const SCEV *Rem = SE->getURemExpr(
4109 SE->applyLoopGuards(ExitCount, TheLoop),
4110 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4111 if (Rem->isZero()) {
4112 // Accept MaxFixedVF if we do not have a tail.
4113 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4114 return MaxFactors;
4115 }
4116 }
4117
4118 // If we don't know the precise trip count, or if the trip count that we
4119 // found modulo the vectorization factor is not zero, try to fold the tail
4120 // by masking.
4121 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4122 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
4123 setTailFoldingStyles(ContainsScalableVF, UserIC);
4124 if (foldTailByMasking()) {
4126 LLVM_DEBUG(
4127 dbgs()
4128 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4129 "try to generate VP Intrinsics with scalable vector "
4130 "factors only.\n");
4131 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4132 // for now.
4133 // TODO: extend it for fixed vectors, if required.
4134 assert(ContainsScalableVF && "Expected scalable vector factor.");
4135
4136 MaxFactors.FixedVF = ElementCount::getFixed(1);
4137 }
4138 return MaxFactors;
4139 }
4140
4141 // If there was a tail-folding hint/switch, but we can't fold the tail by
4142 // masking, fallback to a vectorization with a scalar epilogue.
4143 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4144 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4145 "scalar epilogue instead.\n");
4146 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4147 return MaxFactors;
4148 }
4149
4150 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4151 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4153 }
4154
4155 if (TC == 0) {
4157 "unable to calculate the loop count due to complex control flow",
4158 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4160 }
4161
4163 "Cannot optimize for size and vectorize at the same time.",
4164 "cannot optimize for size and vectorize at the same time. "
4165 "Enable vectorization of this loop with '#pragma clang loop "
4166 "vectorize(enable)' when compiling with -Os/-Oz",
4167 "NoTailLoopWithOptForSize", ORE, TheLoop);
4169}
4170
4171ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4172 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4173 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4174 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4175 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4176 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4178
4179 // Convenience function to return the minimum of two ElementCounts.
4180 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4181 assert((LHS.isScalable() == RHS.isScalable()) &&
4182 "Scalable flags must match");
4183 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4184 };
4185
4186 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4187 // Note that both WidestRegister and WidestType may not be a powers of 2.
4188 auto MaxVectorElementCount = ElementCount::get(
4189 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4190 ComputeScalableMaxVF);
4191 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4192 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4193 << (MaxVectorElementCount * WidestType) << " bits.\n");
4194
4195 if (!MaxVectorElementCount) {
4196 LLVM_DEBUG(dbgs() << "LV: The target has no "
4197 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4198 << " vector registers.\n");
4199 return ElementCount::getFixed(1);
4200 }
4201
4202 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4203 if (MaxVectorElementCount.isScalable() &&
4204 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4205 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4206 auto Min = Attr.getVScaleRangeMin();
4207 WidestRegisterMinEC *= Min;
4208 }
4209
4210 // When a scalar epilogue is required, at least one iteration of the scalar
4211 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4212 // max VF that results in a dead vector loop.
4213 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4214 MaxTripCount -= 1;
4215
4216 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4217 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4218 // If upper bound loop trip count (TC) is known at compile time there is no
4219 // point in choosing VF greater than TC (as done in the loop below). Select
4220 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4221 // scalable, we only fall back on a fixed VF when the TC is less than or
4222 // equal to the known number of lanes.
4223 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4224 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4225 "exceeding the constant trip count: "
4226 << ClampedUpperTripCount << "\n");
4227 return ElementCount::get(
4228 ClampedUpperTripCount,
4229 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4230 }
4231
4233 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4235 ElementCount MaxVF = MaxVectorElementCount;
4236 if (MaximizeBandwidth ||
4240 auto MaxVectorElementCountMaxBW = ElementCount::get(
4241 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4242 ComputeScalableMaxVF);
4243 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4244
4245 // Collect all viable vectorization factors larger than the default MaxVF
4246 // (i.e. MaxVectorElementCount).
4248 for (ElementCount VS = MaxVectorElementCount * 2;
4249 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4250 VFs.push_back(VS);
4251
4252 // For each VF calculate its register usage.
4253 auto RUs = calculateRegisterUsage(VFs);
4254
4255 // Select the largest VF which doesn't require more registers than existing
4256 // ones.
4257 for (int I = RUs.size() - 1; I >= 0; --I) {
4258 const auto &MLU = RUs[I].MaxLocalUsers;
4259 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4260 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4261 })) {
4262 MaxVF = VFs[I];
4263 break;
4264 }
4265 }
4266 if (ElementCount MinVF =
4267 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4268 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4269 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4270 << ") with target's minimum: " << MinVF << '\n');
4271 MaxVF = MinVF;
4272 }
4273 }
4274
4275 // Invalidate any widening decisions we might have made, in case the loop
4276 // requires prediction (decided later), but we have already made some
4277 // load/store widening decisions.
4279 }
4280 return MaxVF;
4281}
4282
4283/// This function attempts to return a value that represents the vectorization
4284/// factor at runtime. For fixed-width VFs we know this precisely at compile
4285/// time, but for scalable VFs we calculate it based on an estimate of the
4286/// vscale value.
4288 std::optional<unsigned> VScale) {
4289 unsigned EstimatedVF = VF.getKnownMinValue();
4290 if (VF.isScalable())
4291 if (VScale)
4292 EstimatedVF *= *VScale;
4293 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4294 return EstimatedVF;
4295}
4296
4297bool LoopVectorizationPlanner::isMoreProfitable(
4299 const unsigned MaxTripCount) const {
4300 InstructionCost CostA = A.Cost;
4301 InstructionCost CostB = B.Cost;
4302
4303 // Improve estimate for the vector width if it is scalable.
4304 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4305 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4306 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
4307 if (A.Width.isScalable())
4308 EstimatedWidthA *= *VScale;
4309 if (B.Width.isScalable())
4310 EstimatedWidthB *= *VScale;
4311 }
4312
4313 // Assume vscale may be larger than 1 (or the value being tuned for),
4314 // so that scalable vectorization is slightly favorable over fixed-width
4315 // vectorization.
4316 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4317 A.Width.isScalable() && !B.Width.isScalable();
4318
4319 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4320 const InstructionCost &RHS) {
4321 return PreferScalable ? LHS <= RHS : LHS < RHS;
4322 };
4323
4324 // To avoid the need for FP division:
4325 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4326 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4327 if (!MaxTripCount)
4328 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4329
4330 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4331 InstructionCost VectorCost,
4332 InstructionCost ScalarCost) {
4333 // If the trip count is a known (possibly small) constant, the trip count
4334 // will be rounded up to an integer number of iterations under
4335 // FoldTailByMasking. The total cost in that case will be
4336 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4337 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4338 // some extra overheads, but for the purpose of comparing the costs of
4339 // different VFs we can use this to compare the total loop-body cost
4340 // expected after vectorization.
4341 if (CM.foldTailByMasking())
4342 return VectorCost * divideCeil(MaxTripCount, VF);
4343 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4344 };
4345
4346 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4347 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4348 return CmpFn(RTCostA, RTCostB);
4349}
4350
4351bool LoopVectorizationPlanner::isMoreProfitable(
4352 const VectorizationFactor &A, const VectorizationFactor &B) const {
4353 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4354 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4355}
4356
4359 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4360 SmallVector<RecipeVFPair> InvalidCosts;
4361 for (const auto &Plan : VPlans) {
4362 for (ElementCount VF : Plan->vectorFactors()) {
4363 // The VPlan-based cost model is designed for computing vector cost.
4364 // Querying VPlan-based cost model with a scarlar VF will cause some
4365 // errors because we expect the VF is vector for most of the widen
4366 // recipes.
4367 if (VF.isScalar())
4368 continue;
4369
4370 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4371 CM, CM.CostKind);
4372 precomputeCosts(*Plan, VF, CostCtx);
4373 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4374 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4375 for (auto &R : *VPBB) {
4376 if (!R.cost(VF, CostCtx).isValid())
4377 InvalidCosts.emplace_back(&R, VF);
4378 }
4379 }
4380 }
4381 }
4382 if (InvalidCosts.empty())
4383 return;
4384
4385 // Emit a report of VFs with invalid costs in the loop.
4386
4387 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4389 unsigned I = 0;
4390 for (auto &Pair : InvalidCosts)
4391 if (!Numbering.count(Pair.first))
4392 Numbering[Pair.first] = I++;
4393
4394 // Sort the list, first on recipe(number) then on VF.
4395 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4396 if (Numbering[A.first] != Numbering[B.first])
4397 return Numbering[A.first] < Numbering[B.first];
4398 const auto &LHS = A.second;
4399 const auto &RHS = B.second;
4400 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4401 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4402 });
4403
4404 // For a list of ordered recipe-VF pairs:
4405 // [(load, VF1), (load, VF2), (store, VF1)]
4406 // group the recipes together to emit separate remarks for:
4407 // load (VF1, VF2)
4408 // store (VF1)
4409 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4410 auto Subset = ArrayRef<RecipeVFPair>();
4411 do {
4412 if (Subset.empty())
4413 Subset = Tail.take_front(1);
4414
4415 VPRecipeBase *R = Subset.front().first;
4416
4417 unsigned Opcode =
4420 [](const auto *R) { return Instruction::PHI; })
4421 .Case<VPWidenSelectRecipe>(
4422 [](const auto *R) { return Instruction::Select; })
4423 .Case<VPWidenStoreRecipe>(
4424 [](const auto *R) { return Instruction::Store; })
4425 .Case<VPWidenLoadRecipe>(
4426 [](const auto *R) { return Instruction::Load; })
4427 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4428 [](const auto *R) { return Instruction::Call; })
4431 [](const auto *R) { return R->getOpcode(); })
4432 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4433 return R->getStoredValues().empty() ? Instruction::Load
4434 : Instruction::Store;
4435 });
4436
4437 // If the next recipe is different, or if there are no other pairs,
4438 // emit a remark for the collated subset. e.g.
4439 // [(load, VF1), (load, VF2))]
4440 // to emit:
4441 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4442 if (Subset == Tail || Tail[Subset.size()].first != R) {
4443 std::string OutString;
4444 raw_string_ostream OS(OutString);
4445 assert(!Subset.empty() && "Unexpected empty range");
4446 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4447 for (const auto &Pair : Subset)
4448 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4449 OS << "):";
4450 if (Opcode == Instruction::Call) {
4451 StringRef Name = "";
4452 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4453 Name = Int->getIntrinsicName();
4454 } else {
4455 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4456 Function *CalledFn =
4457 WidenCall ? WidenCall->getCalledScalarFunction()
4458 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4459 ->getLiveInIRValue());
4460 Name = CalledFn->getName();
4461 }
4462 OS << " call to " << Name;
4463 } else
4464 OS << " " << Instruction::getOpcodeName(Opcode);
4465 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4466 R->getDebugLoc());
4467 Tail = Tail.drop_front(Subset.size());
4468 Subset = {};
4469 } else
4470 // Grow the subset by one element
4471 Subset = Tail.take_front(Subset.size() + 1);
4472 } while (!Tail.empty());
4473}
4474
4475/// Check if any recipe of \p Plan will generate a vector value, which will be
4476/// assigned a vector register.
4478 const TargetTransformInfo &TTI) {
4479 assert(VF.isVector() && "Checking a scalar VF?");
4480 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4481 DenseSet<VPRecipeBase *> EphemeralRecipes;
4482 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4483 // Set of already visited types.
4484 DenseSet<Type *> Visited;
4485 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4487 for (VPRecipeBase &R : *VPBB) {
4488 if (EphemeralRecipes.contains(&R))
4489 continue;
4490 // Continue early if the recipe is considered to not produce a vector
4491 // result. Note that this includes VPInstruction where some opcodes may
4492 // produce a vector, to preserve existing behavior as VPInstructions model
4493 // aspects not directly mapped to existing IR instructions.
4494 switch (R.getVPDefID()) {
4495 case VPDef::VPDerivedIVSC:
4496 case VPDef::VPScalarIVStepsSC:
4497 case VPDef::VPScalarCastSC:
4498 case VPDef::VPReplicateSC:
4499 case VPDef::VPInstructionSC:
4500 case VPDef::VPCanonicalIVPHISC:
4501 case VPDef::VPVectorPointerSC:
4502 case VPDef::VPReverseVectorPointerSC:
4503 case VPDef::VPExpandSCEVSC:
4504 case VPDef::VPEVLBasedIVPHISC:
4505 case VPDef::VPPredInstPHISC:
4506 case VPDef::VPBranchOnMaskSC:
4507 continue;
4508 case VPDef::VPReductionSC:
4509 case VPDef::VPActiveLaneMaskPHISC:
4510 case VPDef::VPWidenCallSC:
4511 case VPDef::VPWidenCanonicalIVSC:
4512 case VPDef::VPWidenCastSC:
4513 case VPDef::VPWidenGEPSC:
4514 case VPDef::VPWidenIntrinsicSC:
4515 case VPDef::VPWidenSC:
4516 case VPDef::VPWidenSelectSC:
4517 case VPDef::VPBlendSC:
4518 case VPDef::VPFirstOrderRecurrencePHISC:
4519 case VPDef::VPWidenPHISC:
4520 case VPDef::VPWidenIntOrFpInductionSC:
4521 case VPDef::VPWidenPointerInductionSC:
4522 case VPDef::VPReductionPHISC:
4523 case VPDef::VPInterleaveSC:
4524 case VPDef::VPWidenLoadEVLSC:
4525 case VPDef::VPWidenLoadSC:
4526 case VPDef::VPWidenStoreEVLSC:
4527 case VPDef::VPWidenStoreSC:
4528 break;
4529 default:
4530 llvm_unreachable("unhandled recipe");
4531 }
4532
4533 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4534 Type *VectorTy = toVectorTy(ScalarTy, VF);
4535 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4536 if (!NumLegalParts)
4537 return false;
4538 if (VF.isScalable()) {
4539 // <vscale x 1 x iN> is assumed to be profitable over iN because
4540 // scalable registers are a distinct register class from scalar
4541 // ones. If we ever find a target which wants to lower scalable
4542 // vectors back to scalars, we'll need to update this code to
4543 // explicitly ask TTI about the register class uses for each part.
4544 return NumLegalParts <= VF.getKnownMinValue();
4545 }
4546 // Two or more parts that share a register - are vectorized.
4547 return NumLegalParts < VF.getKnownMinValue();
4548 };
4549
4550 // If no def nor is a store, e.g., branches, continue - no value to check.
4551 if (R.getNumDefinedValues() == 0 &&
4552 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4553 &R))
4554 continue;
4555 // For multi-def recipes, currently only interleaved loads, suffice to
4556 // check first def only.
4557 // For stores check their stored value; for interleaved stores suffice
4558 // the check first stored value only. In all cases this is the second
4559 // operand.
4560 VPValue *ToCheck =
4561 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4562 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4563 if (!Visited.insert({ScalarTy}).second)
4564 continue;
4565 if (WillWiden(ScalarTy))
4566 return true;
4567 }
4568 }
4569
4570 return false;
4571}
4572
4573#ifndef NDEBUG
4574VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4576 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4577 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4578 assert(
4579 any_of(VPlans,
4580 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4581 "Expected Scalar VF to be a candidate");
4582
4583 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4584 ExpectedCost);
4585 VectorizationFactor ChosenFactor = ScalarCost;
4586
4587 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4588 if (ForceVectorization &&
4589 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4590 // Ignore scalar width, because the user explicitly wants vectorization.
4591 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4592 // evaluation.
4593 ChosenFactor.Cost = InstructionCost::getMax();
4594 }
4595
4596 for (auto &P : VPlans) {
4597 for (ElementCount VF : P->vectorFactors()) {
4598 // The cost for scalar VF=1 is already calculated, so ignore it.
4599 if (VF.isScalar())
4600 continue;
4601
4603 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4604
4605 unsigned Width =
4606 getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4607 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4608 << " costs: " << (Candidate.Cost / Width));
4609 if (VF.isScalable())
4610 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4611 << CM.getVScaleForTuning().value_or(1) << ")");
4612 LLVM_DEBUG(dbgs() << ".\n");
4613
4614 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4615 LLVM_DEBUG(
4616 dbgs()
4617 << "LV: Not considering vector loop of width " << VF
4618 << " because it will not generate any vector instructions.\n");
4619 continue;
4620 }
4621
4622 if (isMoreProfitable(Candidate, ChosenFactor))
4623 ChosenFactor = Candidate;
4624 }
4625 }
4626
4629 "There are conditional stores.",
4630 "store that is conditionally executed prevents vectorization",
4631 "ConditionalStore", ORE, OrigLoop);
4632 ChosenFactor = ScalarCost;
4633 }
4634
4635 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4636 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4637 << "LV: Vectorization seems to be not beneficial, "
4638 << "but was forced by a user.\n");
4639 return ChosenFactor;
4640}
4641#endif
4642
4643bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4644 ElementCount VF) const {
4645 // Cross iteration phis such as reductions need special handling and are
4646 // currently unsupported.
4647 if (any_of(OrigLoop->getHeader()->phis(),
4648 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4649 return false;
4650
4651 // Phis with uses outside of the loop require special handling and are
4652 // currently unsupported.
4653 for (const auto &Entry : Legal->getInductionVars()) {
4654 // Look for uses of the value of the induction at the last iteration.
4655 Value *PostInc =
4656 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4657 for (User *U : PostInc->users())
4658 if (!OrigLoop->contains(cast<Instruction>(U)))
4659 return false;
4660 // Look for uses of penultimate value of the induction.
4661 for (User *U : Entry.first->users())
4662 if (!OrigLoop->contains(cast<Instruction>(U)))
4663 return false;
4664 }
4665
4666 // Epilogue vectorization code has not been auditted to ensure it handles
4667 // non-latch exits properly. It may be fine, but it needs auditted and
4668 // tested.
4669 // TODO: Add support for loops with an early exit.
4670 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4671 return false;
4672
4673 return true;
4674}
4675
4677 const ElementCount VF, const unsigned IC) const {
4678 // FIXME: We need a much better cost-model to take different parameters such
4679 // as register pressure, code size increase and cost of extra branches into
4680 // account. For now we apply a very crude heuristic and only consider loops
4681 // with vectorization factors larger than a certain value.
4682
4683 // Allow the target to opt out entirely.
4685 return false;
4686
4687 // We also consider epilogue vectorization unprofitable for targets that don't
4688 // consider interleaving beneficial (eg. MVE).
4689 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4690 return false;
4691
4692 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4693 // VFs when deciding profitability.
4694 // See related "TODO: extend to support scalable VFs." in
4695 // selectEpilogueVectorizationFactor.
4696 unsigned Multiplier = VF.isFixed() ? IC : 1;
4697 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4700 return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
4701 MinVFThreshold;
4702}
4703
4705 const ElementCount MainLoopVF, unsigned IC) {
4708 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4709 return Result;
4710 }
4711
4712 if (!CM.isScalarEpilogueAllowed()) {
4713 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4714 "epilogue is allowed.\n");
4715 return Result;
4716 }
4717
4718 // Not really a cost consideration, but check for unsupported cases here to
4719 // simplify the logic.
4720 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4721 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4722 "is not a supported candidate.\n");
4723 return Result;
4724 }
4725
4727 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4729 if (hasPlanWithVF(ForcedEC))
4730 return {ForcedEC, 0, 0};
4731
4732 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4733 "viable.\n");
4734 return Result;
4735 }
4736
4737 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4738 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4739 LLVM_DEBUG(
4740 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4741 return Result;
4742 }
4743
4744 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4745 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4746 "this loop\n");
4747 return Result;
4748 }
4749
4750 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4751 // the main loop handles 8 lanes per iteration. We could still benefit from
4752 // vectorizing the epilogue loop with VF=4.
4753 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4754 getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
4755
4756 ScalarEvolution &SE = *PSE.getSE();
4757 Type *TCType = Legal->getWidestInductionType();
4758 const SCEV *RemainingIterations = nullptr;
4759 unsigned MaxTripCount = 0;
4760 for (auto &NextVF : ProfitableVFs) {
4761 // Skip candidate VFs without a corresponding VPlan.
4762 if (!hasPlanWithVF(NextVF.Width))
4763 continue;
4764
4765 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4766 // vectors) or > the VF of the main loop (fixed vectors).
4767 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4768 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4769 (NextVF.Width.isScalable() &&
4770 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4771 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4772 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4773 continue;
4774
4775 // If NextVF is greater than the number of remaining iterations, the
4776 // epilogue loop would be dead. Skip such factors.
4777 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4778 // TODO: extend to support scalable VFs.
4779 if (!RemainingIterations) {
4781 getPlanFor(NextVF.Width).getTripCount(), SE);
4782 assert(!isa<SCEVCouldNotCompute>(TC) &&
4783 "Trip count SCEV must be computable");
4784 RemainingIterations = SE.getURemExpr(
4785 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4786 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4787 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4788 SE.getConstant(TCType, MaxTripCount))) {
4789 MaxTripCount =
4790 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4791 }
4792 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4793 << MaxTripCount << "\n");
4794 }
4795 if (SE.isKnownPredicate(
4797 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4798 RemainingIterations))
4799 continue;
4800 }
4801
4802 if (Result.Width.isScalar() ||
4803 isMoreProfitable(NextVF, Result, MaxTripCount))
4804 Result = NextVF;
4805 }
4806
4807 if (Result != VectorizationFactor::Disabled())
4808 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4809 << Result.Width << "\n");
4810 return Result;
4811}
4812
4813std::pair<unsigned, unsigned>
4815 unsigned MinWidth = -1U;
4816 unsigned MaxWidth = 8;
4818 // For in-loop reductions, no element types are added to ElementTypesInLoop
4819 // if there are no loads/stores in the loop. In this case, check through the
4820 // reduction variables to determine the maximum width.
4821 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4822 // Reset MaxWidth so that we can find the smallest type used by recurrences
4823 // in the loop.
4824 MaxWidth = -1U;
4825 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4826 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4827 // When finding the min width used by the recurrence we need to account
4828 // for casts on the input operands of the recurrence.
4829 MaxWidth = std::min<unsigned>(
4830 MaxWidth, std::min<unsigned>(
4833 }
4834 } else {
4835 for (Type *T : ElementTypesInLoop) {
4836 MinWidth = std::min<unsigned>(
4837 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4838 MaxWidth = std::max<unsigned>(
4839 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4840 }
4841 }
4842 return {MinWidth, MaxWidth};
4843}
4844
4846 ElementTypesInLoop.clear();
4847 // For each block.
4848 for (BasicBlock *BB : TheLoop->blocks()) {
4849 // For each instruction in the loop.
4850 for (Instruction &I : BB->instructionsWithoutDebug()) {
4851 Type *T = I.getType();
4852
4853 // Skip ignored values.
4854 if (ValuesToIgnore.count(&I))
4855 continue;
4856
4857 // Only examine Loads, Stores and PHINodes.
4858 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4859 continue;
4860
4861 // Examine PHI nodes that are reduction variables. Update the type to
4862 // account for the recurrence type.
4863 if (auto *PN = dyn_cast<PHINode>(&I)) {
4864 if (!Legal->isReductionVariable(PN))
4865 continue;
4866 const RecurrenceDescriptor &RdxDesc =
4867 Legal->getReductionVars().find(PN)->second;
4870 RdxDesc.getRecurrenceType(),
4872 continue;
4873 T = RdxDesc.getRecurrenceType();
4874 }
4875
4876 // Examine the stored values.
4877 if (auto *ST = dyn_cast<StoreInst>(&I))
4878 T = ST->getValueOperand()->getType();
4879
4880 assert(T->isSized() &&
4881 "Expected the load/store/recurrence type to be sized");
4882
4883 ElementTypesInLoop.insert(T);
4884 }
4885 }
4886}
4887
4888unsigned
4890 InstructionCost LoopCost) {
4891 // -- The interleave heuristics --
4892 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4893 // There are many micro-architectural considerations that we can't predict
4894 // at this level. For example, frontend pressure (on decode or fetch) due to
4895 // code size, or the number and capabilities of the execution ports.
4896 //
4897 // We use the following heuristics to select the interleave count:
4898 // 1. If the code has reductions, then we interleave to break the cross
4899 // iteration dependency.
4900 // 2. If the loop is really small, then we interleave to reduce the loop
4901 // overhead.
4902 // 3. We don't interleave if we think that we will spill registers to memory
4903 // due to the increased register pressure.
4904
4906 return 1;
4907
4908 // Do not interleave if EVL is preferred and no User IC is specified.
4909 if (foldTailWithEVL()) {
4910 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4911 "Unroll factor forced to be 1.\n");
4912 return 1;
4913 }
4914
4915 // We used the distance for the interleave count.
4917 return 1;
4918
4919 // We don't attempt to perform interleaving for loops with uncountable early
4920 // exits because the VPInstruction::AnyOf code cannot currently handle
4921 // multiple parts.
4923 return 1;
4924
4925 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4926 const bool HasReductions = !Legal->getReductionVars().empty();
4927
4928 // If we did not calculate the cost for VF (because the user selected the VF)
4929 // then we calculate the cost of VF here.
4930 if (LoopCost == 0) {
4931 LoopCost = expectedCost(VF);
4932 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4933
4934 // Loop body is free and there is no need for interleaving.
4935 if (LoopCost == 0)
4936 return 1;
4937 }
4938
4940 // We divide by these constants so assume that we have at least one
4941 // instruction that uses at least one register.
4942 for (auto &Pair : R.MaxLocalUsers) {
4943 Pair.second = std::max(Pair.second, 1U);
4944 }
4945
4946 // We calculate the interleave count using the following formula.
4947 // Subtract the number of loop invariants from the number of available
4948 // registers. These registers are used by all of the interleaved instances.
4949 // Next, divide the remaining registers by the number of registers that is
4950 // required by the loop, in order to estimate how many parallel instances
4951 // fit without causing spills. All of this is rounded down if necessary to be
4952 // a power of two. We want power of two interleave count to simplify any
4953 // addressing operations or alignment considerations.
4954 // We also want power of two interleave counts to ensure that the induction
4955 // variable of the vector loop wraps to zero, when tail is folded by masking;
4956 // this currently happens when OptForSize, in which case IC is set to 1 above.
4957 unsigned IC = UINT_MAX;
4958
4959 for (const auto &Pair : R.MaxLocalUsers) {
4960 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4961 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4962 << " registers of "
4963 << TTI.getRegisterClassName(Pair.first)
4964 << " register class\n");
4965 if (VF.isScalar()) {
4966 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4967 TargetNumRegisters = ForceTargetNumScalarRegs;
4968 } else {
4969 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4970 TargetNumRegisters = ForceTargetNumVectorRegs;
4971 }
4972 unsigned MaxLocalUsers = Pair.second;
4973 unsigned LoopInvariantRegs = 0;
4974 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
4975 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4976
4977 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4978 MaxLocalUsers);
4979 // Don't count the induction variable as interleaved.
4981 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4982 std::max(1U, (MaxLocalUsers - 1)));
4983 }
4984
4985 IC = std::min(IC, TmpIC);
4986 }
4987
4988 // Clamp the interleave ranges to reasonable counts.
4989 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4990
4991 // Check if the user has overridden the max.
4992 if (VF.isScalar()) {
4993 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4994 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4995 } else {
4996 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4997 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4998 }
4999
5000 unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
5001 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5002 if (KnownTC > 0) {
5003 // At least one iteration must be scalar when this constraint holds. So the
5004 // maximum available iterations for interleaving is one less.
5005 unsigned AvailableTC =
5006 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5007
5008 // If trip count is known we select between two prospective ICs, where
5009 // 1) the aggressive IC is capped by the trip count divided by VF
5010 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5011 // The final IC is selected in a way that the epilogue loop trip count is
5012 // minimized while maximizing the IC itself, so that we either run the
5013 // vector loop at least once if it generates a small epilogue loop, or else
5014 // we run the vector loop at least twice.
5015
5016 unsigned InterleaveCountUB = bit_floor(
5017 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5018 unsigned InterleaveCountLB = bit_floor(std::max(
5019 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5020 MaxInterleaveCount = InterleaveCountLB;
5021
5022 if (InterleaveCountUB != InterleaveCountLB) {
5023 unsigned TailTripCountUB =
5024 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5025 unsigned TailTripCountLB =
5026 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5027 // If both produce same scalar tail, maximize the IC to do the same work
5028 // in fewer vector loop iterations
5029 if (TailTripCountUB == TailTripCountLB)
5030 MaxInterleaveCount = InterleaveCountUB;
5031 }
5032 } else if (BestKnownTC && *BestKnownTC > 0) {
5033 // At least one iteration must be scalar when this constraint holds. So the
5034 // maximum available iterations for interleaving is one less.
5035 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5036 ? (*BestKnownTC) - 1
5037 : *BestKnownTC;
5038
5039 // If trip count is an estimated compile time constant, limit the
5040 // IC to be capped by the trip count divided by VF * 2, such that the vector
5041 // loop runs at least twice to make interleaving seem profitable when there
5042 // is an epilogue loop present. Since exact Trip count is not known we
5043 // choose to be conservative in our IC estimate.
5044 MaxInterleaveCount = bit_floor(std::max(
5045 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5046 }
5047
5048 assert(MaxInterleaveCount > 0 &&
5049 "Maximum interleave count must be greater than 0");
5050
5051 // Clamp the calculated IC to be between the 1 and the max interleave count
5052 // that the target and trip count allows.
5053 if (IC > MaxInterleaveCount)
5054 IC = MaxInterleaveCount;
5055 else
5056 // Make sure IC is greater than 0.
5057 IC = std::max(1u, IC);
5058
5059 assert(IC > 0 && "Interleave count must be greater than 0.");
5060
5061 // Interleave if we vectorized this loop and there is a reduction that could
5062 // benefit from interleaving.
5063 if (VF.isVector() && HasReductions) {
5064 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5065 return IC;
5066 }
5067
5068 // For any scalar loop that either requires runtime checks or predication we
5069 // are better off leaving this to the unroller. Note that if we've already
5070 // vectorized the loop we will have done the runtime check and so interleaving
5071 // won't require further checks.
5072 bool ScalarInterleavingRequiresPredication =
5073 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5074 return Legal->blockNeedsPredication(BB);
5075 }));
5076 bool ScalarInterleavingRequiresRuntimePointerCheck =
5078
5079 // We want to interleave small loops in order to reduce the loop overhead and
5080 // potentially expose ILP opportunities.
5081 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5082 << "LV: IC is " << IC << '\n'
5083 << "LV: VF is " << VF << '\n');
5084 const bool AggressivelyInterleaveReductions =
5085 TTI.enableAggressiveInterleaving(HasReductions);
5086 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5087 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5088 // We assume that the cost overhead is 1 and we use the cost model
5089 // to estimate the cost of the loop and interleave until the cost of the
5090 // loop overhead is about 5% of the cost of the loop.
5091 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5092 SmallLoopCost / *LoopCost.getValue()));
5093
5094 // Interleave until store/load ports (estimated by max interleave count) are
5095 // saturated.
5096 unsigned NumStores = Legal->getNumStores();
5097 unsigned NumLoads = Legal->getNumLoads();
5098 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5099 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5100
5101 // There is little point in interleaving for reductions containing selects
5102 // and compares when VF=1 since it may just create more overhead than it's
5103 // worth for loops with small trip counts. This is because we still have to
5104 // do the final reduction after the loop.
5105 bool HasSelectCmpReductions =
5106 HasReductions &&
5107 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5108 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5109 RecurKind RK = RdxDesc.getRecurrenceKind();
5110 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5111 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5112 });
5113 if (HasSelectCmpReductions) {
5114 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5115 return 1;
5116 }
5117
5118 // If we have a scalar reduction (vector reductions are already dealt with
5119 // by this point), we can increase the critical path length if the loop
5120 // we're interleaving is inside another loop. For tree-wise reductions
5121 // set the limit to 2, and for ordered reductions it's best to disable
5122 // interleaving entirely.
5123 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5124 bool HasOrderedReductions =
5125 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5126 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5127 return RdxDesc.isOrdered();
5128 });
5129 if (HasOrderedReductions) {
5130 LLVM_DEBUG(
5131 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5132 return 1;
5133 }
5134
5135 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5136 SmallIC = std::min(SmallIC, F);
5137 StoresIC = std::min(StoresIC, F);
5138 LoadsIC = std::min(LoadsIC, F);
5139 }
5140
5142 std::max(StoresIC, LoadsIC) > SmallIC) {
5143 LLVM_DEBUG(
5144 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5145 return std::max(StoresIC, LoadsIC);
5146 }
5147
5148 // If there are scalar reductions and TTI has enabled aggressive
5149 // interleaving for reductions, we will interleave to expose ILP.
5150 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5151 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5152 // Interleave no less than SmallIC but not as aggressive as the normal IC
5153 // to satisfy the rare situation when resources are too limited.
5154 return std::max(IC / 2, SmallIC);
5155 }
5156
5157 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5158 return SmallIC;
5159 }
5160
5161 // Interleave if this is a large loop (small loops are already dealt with by
5162 // this point) that could benefit from interleaving.
5163 if (AggressivelyInterleaveReductions) {
5164 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5165 return IC;
5166 }
5167
5168 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5169 return 1;
5170}
5171
// NOTE(review): this source listing lost the function header for
// calculateRegisterUsage (the definition appears to take the candidate VFs as
// an ArrayRef<ElementCount> named VFs and return one RegisterUsage per VF in
// RUs -- verify against upstream LoopVectorize.cpp) as well as several
// declaration-only lines (internal lines 5191, 5199, 5202, 5206, 5242,
// 5249-5250, 5263, 5288, 5299, 5335). The surviving code below is kept
// verbatim; only comments have been added.
5174 // This function calculates the register usage by measuring the highest number
5175 // of values that are alive at a single location. Obviously, this is a very
5176 // rough estimation. We scan the loop in a topological order in order and
5177 // assign a number to each instruction. We use RPO to ensure that defs are
5178 // met before their users. We assume that each instruction that has in-loop
5179 // users starts an interval. We record every time that an in-loop value is
5180 // used, so we have a list of the first and last occurrences of each
5181 // instruction. Next, we transpose this data structure into a multi map that
5182 // holds the list of intervals that *end* at a specific location. This multi
5183 // map allows us to perform a linear search. We scan the instructions linearly
5184 // and record each time that a new interval starts, by placing it in a set.
5185 // If we find this value in the multi-map then we remove it from the set.
5186 // The max register usage is the maximum size of the set.
5187 // We also search for instructions that are defined outside the loop, but are
5188 // used inside the loop. We need this number separately from the max-interval
5189 // usage number because when we unroll, loop-invariant values do not take
5190 // more register.
// DFS over the loop body in reverse post-order so that a definition is always
// numbered before any of its in-loop users (see comment above).
5192 DFS.perform(LI);
5193
5194 RegisterUsage RU;
5195
5196 // Each 'key' in the map opens a new interval. The values
5197 // of the map are the index of the 'last seen' usage of the
5198 // instruction that is the key.
5200
5201 // Maps instruction to its index.
5203 // Marks the end of each interval.
5204 IntervalMap EndPoint;
5205 // Saves the list of instruction indices that are used in the loop.
5207 // Saves the list of values that are used in the loop but are defined outside
5208 // the loop (not including non-instruction values such as arguments and
5209 // constants).
5210 SmallSetVector<Instruction *, 8> LoopInvariants;
5211
5212 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5213 for (Instruction &I : BB->instructionsWithoutDebug()) {
5214 IdxToInstr.push_back(&I);
5215
5216 // Save the end location of each USE.
5217 for (Value *U : I.operands()) {
5218 auto *Instr = dyn_cast<Instruction>(U);
5219
5220 // Ignore non-instruction values such as arguments, constants, etc.
5221 // FIXME: Might need some motivation why these values are ignored. If
5222 // for example an argument is used inside the loop it will increase the
5223 // register pressure (so shouldn't we add it to LoopInvariants).
5224 if (!Instr)
5225 continue;
5226
5227 // If this instruction is outside the loop then record it and continue.
5228 if (!TheLoop->contains(Instr)) {
5229 LoopInvariants.insert(Instr);
5230 continue;
5231 }
5232
5233 // Overwrite previous end points.
5234 EndPoint[Instr] = IdxToInstr.size();
5235 Ends.insert(Instr);
5236 }
5237 }
5238 }
5239
5240 // Saves the list of intervals that end with the index in 'key'.
5241 using InstrList = SmallVector<Instruction *, 2>;
5243
5244 // Transpose the EndPoints to a list of values that end at each index.
5245 for (auto &Interval : EndPoint)
5246 TransposeEnds[Interval.second].push_back(Interval.first);
5247
5248 SmallPtrSet<Instruction *, 8> OpenIntervals;
5251
5252 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5253
// GetRegUsage returns 0 for types that cannot live in a vector register
// (token types, invalid vector element types, and element types the target
// cannot put in a scalable vector), so such values contribute nothing to the
// estimated pressure.
5254 const auto &TTICapture = TTI;
5255 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5256 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5257 (VF.isScalable() &&
5258 !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5259 return 0;
5260 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5261 };
5262
5264
// Linear sweep in instruction-index order: close the intervals ending here,
// then record, for every candidate VF and register class, the maximum number
// of simultaneously open intervals seen so far (MaxUsages).
5265 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5266 Instruction *I = IdxToInstr[Idx];
5267
5268 // Remove all of the instructions that end at this location.
5269 InstrList &List = TransposeEnds[Idx];
5270 for (Instruction *ToRemove : List)
5271 OpenIntervals.erase(ToRemove);
5272
5273 // Ignore instructions that are never used within the loop.
5274 if (!Ends.count(I))
5275 continue;
5276
5277 // Skip ignored values.
5278 if (ValuesToIgnore.count(I))
5279 continue;
5280
5281 // For each VF find the maximum usage of registers.
5282 for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5283 // Count the number of registers used, per register class, given all open
5284 // intervals.
5285 // Note that elements in this SmallMapVector will be default constructed
5286 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5287 // there is no previous entry for ClassID.
5289
5290 if (VFs[J].isScalar()) {
5291 for (auto *Inst : OpenIntervals) {
5292 unsigned ClassID =
5293 TTI.getRegisterClassForType(false, Inst->getType());
5294 // FIXME: The target might use more than one register for the type
5295 // even in the scalar case.
5296 RegUsage[ClassID] += 1;
5297 }
5298 } else {
5300 for (auto *Inst : OpenIntervals) {
5301 // Skip ignored values for VF > 1.
5302 if (VecValuesToIgnore.count(Inst))
5303 continue;
5304 if (isScalarAfterVectorization(Inst, VFs[J])) {
5305 unsigned ClassID =
5306 TTI.getRegisterClassForType(false, Inst->getType());
5307 // FIXME: The target might use more than one register for the type
5308 // even in the scalar case.
5309 RegUsage[ClassID] += 1;
5310 } else {
5311 unsigned ClassID =
5312 TTI.getRegisterClassForType(true, Inst->getType());
5313 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5314 }
5315 }
5316 }
5317
5318 for (const auto &Pair : RegUsage) {
5319 auto &Entry = MaxUsages[J][Pair.first];
5320 Entry = std::max(Entry, Pair.second);
5321 }
5322 }
5323
5324 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5325 << OpenIntervals.size() << '\n');
5326
5327 // Add the current instruction to the list of open intervals.
5328 OpenIntervals.insert(I);
5329 }
5330
// Loop invariants occupy a register for the entire loop, so they are counted
// once per VF (not per program point). An invariant is counted as scalar when
// every in-loop user is scalar after vectorization (or lives outside the
// loop); otherwise it is counted at the candidate VF.
5331 for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5332 // Note that elements in this SmallMapVector will be default constructed
5333 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5334 // there is no previous entry for ClassID.
5336
5337 for (auto *Inst : LoopInvariants) {
5338 // FIXME: The target might use more than one register for the type
5339 // even in the scalar case.
5340 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5341 auto *I = cast<Instruction>(U);
5342 return TheLoop != LI->getLoopFor(I->getParent()) ||
5343 isScalarAfterVectorization(I, VFs[Idx]);
5344 });
5345
5346 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5347 unsigned ClassID =
5348 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5349 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5350 }
5351
5352 LLVM_DEBUG({
5353 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5354 dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5355 << " item\n";
5356 for (const auto &pair : MaxUsages[Idx]) {
5357 dbgs() << "LV(REG): RegisterClass: "
5358 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5359 << " registers\n";
5360 }
5361 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5362 << " item\n";
5363 for (const auto &pair : Invariant) {
5364 dbgs() << "LV(REG): RegisterClass: "
5365 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5366 << " registers\n";
5367 }
5368 });
5369
5370 RU.LoopInvariantRegs = Invariant;
5371 RU.MaxLocalUsers = MaxUsages[Idx];
5372 RUs[Idx] = RU;
5373 }
5374
5375 return RUs;
5376}
5377
5378bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5379 ElementCount VF) {
5380 // TODO: Cost model for emulated masked load/store is completely
5381 // broken. This hack guides the cost model to use an artificially
5382 // high enough value to practically disable vectorization with such
5383 // operations, except where previously deployed legality hack allowed
5384 // using very low cost values. This is to avoid regressions coming simply
5385 // from moving "masked load/store" check from legality to cost model.
5386 // Masked Load/Gather emulation was previously never allowed.
5387 // Limited number of Masked Store/Scatter emulation was allowed.
5389 "Expecting a scalar emulated instruction");
5390 return isa<LoadInst>(I) ||
5391 (isa<StoreInst>(I) &&
5392 NumPredStores > NumberOfStoresToPredicate);
5393}
5394
5396 // If we aren't vectorizing the loop, or if we've already collected the
5397 // instructions to scalarize, there's nothing to do. Collection may already
5398 // have occurred if we have a user-selected VF and are now computing the
5399 // expected cost for interleaving.
5400 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5401 return;
5402
5403 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5404 // not profitable to scalarize any instructions, the presence of VF in the
5405 // map will indicate that we've analyzed it already.
5406 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5407
5408 PredicatedBBsAfterVectorization[VF].clear();
5409
5410 // Find all the instructions that are scalar with predication in the loop and
5411 // determine if it would be better to not if-convert the blocks they are in.
5412 // If so, we also record the instructions to scalarize.
5413 for (BasicBlock *BB : TheLoop->blocks()) {
5415 continue;
5416 for (Instruction &I : *BB)
5417 if (isScalarWithPredication(&I, VF)) {
5418 ScalarCostsTy ScalarCosts;
5419 // Do not apply discount logic for:
5420 // 1. Scalars after vectorization, as there will only be a single copy
5421 // of the instruction.
5422 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5423 // 3. Emulated masked memrefs, if a hacked cost is needed.
5424 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5425 !useEmulatedMaskMemRefHack(&I, VF) &&
5426 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5427 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5428 // Check if we decided to scalarize a call. If so, update the widening
5429 // decision of the call to CM_Scalarize with the computed scalar cost.
5430 for (const auto &[I, _] : ScalarCosts) {
5431 auto *CI = dyn_cast<CallInst>(I);
5432 if (!CI || !CallWideningDecisions.contains({CI, VF}))
5433 continue;
5434 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5435 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5436 }
5437 }
5438 // Remember that BB will remain after vectorization.
5439 PredicatedBBsAfterVectorization[VF].insert(BB);
5440 for (auto *Pred : predecessors(BB)) {
5441 if (Pred->getSingleSuccessor() == BB)
5442 PredicatedBBsAfterVectorization[VF].insert(Pred);
5443 }
5444 }
5445 }
5446}
5447
5448InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5449 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5450 assert(!isUniformAfterVectorization(PredInst, VF) &&
5451 "Instruction marked uniform-after-vectorization will be predicated");
5452
5453 // Initialize the discount to zero, meaning that the scalar version and the
5454 // vector version cost the same.
5455 InstructionCost Discount = 0;
5456
5457 // Holds instructions to analyze. The instructions we visit are mapped in
5458 // ScalarCosts. Those instructions are the ones that would be scalarized if
5459 // we find that the scalar version costs less.
5461
5462 // Returns true if the given instruction can be scalarized.
5463 auto CanBeScalarized = [&](Instruction *I) -> bool {
5464 // We only attempt to scalarize instructions forming a single-use chain
5465 // from the original predicated block that would otherwise be vectorized.
5466 // Although not strictly necessary, we give up on instructions we know will
5467 // already be scalar to avoid traversing chains that are unlikely to be
5468 // beneficial.
5469 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5471 return false;
5472
5473 // If the instruction is scalar with predication, it will be analyzed
5474 // separately. We ignore it within the context of PredInst.
5475 if (isScalarWithPredication(I, VF))
5476 return false;
5477
5478 // If any of the instruction's operands are uniform after vectorization,
5479 // the instruction cannot be scalarized. This prevents, for example, a
5480 // masked load from being scalarized.
5481 //
5482 // We assume we will only emit a value for lane zero of an instruction
5483 // marked uniform after vectorization, rather than VF identical values.
5484 // Thus, if we scalarize an instruction that uses a uniform, we would
5485 // create uses of values corresponding to the lanes we aren't emitting code
5486 // for. This behavior can be changed by allowing getScalarValue to clone
5487 // the lane zero values for uniforms rather than asserting.
5488 for (Use &U : I->operands())
5489 if (auto *J = dyn_cast<Instruction>(U.get()))
5490 if (isUniformAfterVectorization(J, VF))
5491 return false;
5492
5493 // Otherwise, we can scalarize the instruction.
5494 return true;
5495 };
5496
5497 // Compute the expected cost discount from scalarizing the entire expression
5498 // feeding the predicated instruction. We currently only consider expressions
5499 // that are single-use instruction chains.
5500 Worklist.push_back(PredInst);
5501 while (!Worklist.empty()) {
5502 Instruction *I = Worklist.pop_back_val();
5503
5504 // If we've already analyzed the instruction, there's nothing to do.
5505 if (ScalarCosts.contains(I))
5506 continue;
5507
5508 // Compute the cost of the vector instruction. Note that this cost already
5509 // includes the scalarization overhead of the predicated instruction.
5510 InstructionCost VectorCost = getInstructionCost(I, VF);
5511
5512 // Compute the cost of the scalarized instruction. This cost is the cost of
5513 // the instruction as if it wasn't if-converted and instead remained in the
5514 // predicated block. We will scale this cost by block probability after
5515 // computing the scalarization overhead.
5516 InstructionCost ScalarCost =
5518
5519 // Compute the scalarization overhead of needed insertelement instructions
5520 // and phi nodes.
5521 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5522 ScalarCost += TTI.getScalarizationOverhead(
5523 cast<VectorType>(toVectorTy(I->getType(), VF)),
5524 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5525 /*Extract*/ false, CostKind);
5526 ScalarCost +=
5527 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5528 }
5529
5530 // Compute the scalarization overhead of needed extractelement
5531 // instructions. For each of the instruction's operands, if the operand can
5532 // be scalarized, add it to the worklist; otherwise, account for the
5533 // overhead.
5534 for (Use &U : I->operands())
5535 if (auto *J = dyn_cast<Instruction>(U.get())) {
5536 assert(VectorType::isValidElementType(J->getType()) &&
5537 "Instruction has non-scalar type");
5538 if (CanBeScalarized(J))
5539 Worklist.push_back(J);
5540 else if (needsExtract(J, VF)) {
5541 ScalarCost += TTI.getScalarizationOverhead(
5542 cast<VectorType>(toVectorTy(J->getType(), VF)),
5543 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5544 /*Extract*/ true, CostKind);
5545 }
5546 }
5547
5548 // Scale the total scalar cost by block probability.
5549 ScalarCost /= getReciprocalPredBlockProb();
5550
5551 // Compute the discount. A non-negative discount means the vector version
5552 // of the instruction costs more, and scalarizing would be beneficial.
5553 Discount += VectorCost - ScalarCost;
5554 ScalarCosts[I] = ScalarCost;
5555 }
5556
5557 return Discount;
5558}
5559
5562
5563 // If the vector loop gets executed exactly once with the given VF, ignore the
5564 // costs of comparison and induction instructions, as they'll get simplified
5565 // away.
5566 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5568 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5570 ValuesToIgnoreForVF);
5571
5572 // For each block.
5573 for (BasicBlock *BB : TheLoop->blocks()) {
5574 InstructionCost BlockCost;
5575
5576 // For each instruction in the old loop.
5577 for (Instruction &I : BB->instructionsWithoutDebug()) {
5578 // Skip ignored values.
5579 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5580 (VF.isVector() && VecValuesToIgnore.count(&I)))
5581 continue;
5582
5584
5585 // Check if we should override the cost.
5586 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5588
5589 BlockCost += C;
5590 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5591 << VF << " For instruction: " << I << '\n');
5592 }
5593
5594 // If we are vectorizing a predicated block, it will have been
5595 // if-converted. This means that the block's instructions (aside from
5596 // stores and instructions that may divide by zero) will now be
5597 // unconditionally executed. For the scalar case, we may not always execute
5598 // the predicated block, if it is an if-else block. Thus, scale the block's
5599 // cost by the probability of executing it. blockNeedsPredication from
5600 // Legal is used so as to not include all blocks in tail folded loops.
5601 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5602 BlockCost /= getReciprocalPredBlockProb();
5603
5604 Cost += BlockCost;
5605 }
5606
5607 return Cost;
5608}
5609
5610/// Gets Address Access SCEV after verifying that the access pattern
5611/// is loop invariant except the induction variable dependence.
5612///
5613/// This SCEV can be sent to the Target in order to estimate the address
5614/// calculation cost.
5616 Value *Ptr,
5619 const Loop *TheLoop) {
5620
5621 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5622 if (!Gep)
5623 return nullptr;
5624
5625 // We are looking for a gep with all loop invariant indices except for one
5626 // which should be an induction variable.
5627 auto *SE = PSE.getSE();
5628 unsigned NumOperands = Gep->getNumOperands();
5629 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5630 Value *Opd = Gep->getOperand(Idx);
5631 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5632 !Legal->isInductionVariable(Opd))
5633 return nullptr;
5634 }
5635
5636 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5637 return PSE.getSCEV(Ptr);
5638}
5639
5641LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5642 ElementCount VF) {
5643 assert(VF.isVector() &&
5644 "Scalarization cost of instruction implies vectorization.");
5645 if (VF.isScalable())
5647
5648 Type *ValTy = getLoadStoreType(I);
5649 auto *SE = PSE.getSE();
5650
5651 unsigned AS = getLoadStoreAddressSpace(I);
5653 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5654 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5655 // that it is being called from this specific place.
5656
5657 // Figure out whether the access is strided and get the stride value
5658 // if it's known in compile time
5659 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5660
5661 // Get the cost of the scalar memory instruction and address computation.
5663 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5664
5665 // Don't pass *I here, since it is scalar but will actually be part of a
5666 // vectorized loop where the user of it is a vectorized instruction.
5667 const Align Alignment = getLoadStoreAlignment(I);
5668 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5669 ValTy->getScalarType(),
5670 Alignment, AS, CostKind);
5671
5672 // Get the overhead of the extractelement and insertelement instructions
5673 // we might create due to scalarization.
5674 Cost += getScalarizationOverhead(I, VF);
5675
5676 // If we have a predicated load/store, it will need extra i1 extracts and
5677 // conditional branches, but may not be executed for each vector lane. Scale
5678 // the cost by the probability of executing the predicated block.
5679 if (isPredicatedInst(I)) {
5681
5682 // Add the cost of an i1 extract and a branch
5683 auto *VecI1Ty =
5686 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5687 /*Insert=*/false, /*Extract=*/true, CostKind);
5688 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5689
5690 if (useEmulatedMaskMemRefHack(I, VF))
5691 // Artificially setting to a high enough value to practically disable
5692 // vectorization with such operations.
5693 Cost = 3000000;
5694 }
5695
5696 return Cost;
5697}
5698
5700LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5701 ElementCount VF) {
5702 Type *ValTy = getLoadStoreType(I);
5703 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5705 unsigned AS = getLoadStoreAddressSpace(I);
5706 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5707
5708 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5709 "Stride should be 1 or -1 for consecutive memory access");
5710 const Align Alignment = getLoadStoreAlignment(I);
5712 if (Legal->isMaskRequired(I)) {
5713 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5714 CostKind);
5715 } else {
5716 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5717 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5718 CostKind, OpInfo, I);
5719 }
5720
5721 bool Reverse = ConsecutiveStride < 0;
5722 if (Reverse)
5724 CostKind, 0);
5725 return Cost;
5726}
5727
5729LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5730 ElementCount VF) {
5731 assert(Legal->isUniformMemOp(*I, VF));
5732
5733 Type *ValTy = getLoadStoreType(I);
5734 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5735 const Align Alignment = getLoadStoreAlignment(I);
5736 unsigned AS = getLoadStoreAddressSpace(I);
5737 if (isa<LoadInst>(I)) {
5738 return TTI.getAddressComputationCost(ValTy) +
5739 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5740 CostKind) +
5742 CostKind);
5743 }
5744 StoreInst *SI = cast<StoreInst>(I);
5745
5746 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5747 return TTI.getAddressComputationCost(ValTy) +
5748 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5749 CostKind) +
5750 (IsLoopInvariantStoreValue
5751 ? 0
5752 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5753 CostKind, VF.getKnownMinValue() - 1));
5754}
5755
5757LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5758 ElementCount VF) {
5759 Type *ValTy = getLoadStoreType(I);
5760 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5761 const Align Alignment = getLoadStoreAlignment(I);
5763
5764 return TTI.getAddressComputationCost(VectorTy) +
5765 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5766 Legal->isMaskRequired(I), Alignment,
5767 CostKind, I);
5768}
5769
5771LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5772 ElementCount VF) {
5773 const auto *Group = getInterleavedAccessGroup(I);
5774 assert(Group && "Fail to get an interleaved access group.");
5775
5776 Instruction *InsertPos = Group->getInsertPos();
5777 Type *ValTy = getLoadStoreType(InsertPos);
5778 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5779 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5780
5781 unsigned InterleaveFactor = Group->getFactor();
5782 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5783
5784 // Holds the indices of existing members in the interleaved group.
5786 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5787 if (Group->getMember(IF))
5788 Indices.push_back(IF);
5789
5790 // Calculate the cost of the whole interleaved group.
5791 bool UseMaskForGaps =
5792 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5793 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5795 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5796 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5797 UseMaskForGaps);
5798
5799 if (Group->isReverse()) {
5800 // TODO: Add support for reversed masked interleaved access.
5802 "Reverse masked interleaved access not supported.");
5803 Cost += Group->getNumMembers() *
5805 CostKind, 0);
5806 }
5807 return Cost;
5808}
5809
5810std::optional<InstructionCost>
5812 ElementCount VF,
5813 Type *Ty) const {
5814 using namespace llvm::PatternMatch;
5815 // Early exit for no inloop reductions
5816 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5817 return std::nullopt;
5818 auto *VectorTy = cast<VectorType>(Ty);
5819
5820 // We are looking for a pattern of, and finding the minimal acceptable cost:
5821 // reduce(mul(ext(A), ext(B))) or
5822 // reduce(mul(A, B)) or
5823 // reduce(ext(A)) or
5824 // reduce(A).
5825 // The basic idea is that we walk down the tree to do that, finding the root
5826 // reduction instruction in InLoopReductionImmediateChains. From there we find
5827 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5828 // of the components. If the reduction cost is lower then we return it for the
5829 // reduction instruction and 0 for the other instructions in the pattern. If
5830 // it is not we return an invalid cost specifying the orignal cost method
5831 // should be used.
5832 Instruction *RetI = I;
5833 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5834 if (!RetI->hasOneUser())
5835 return std::nullopt;
5836 RetI = RetI->user_back();
5837 }
5838
5839 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5840 RetI->user_back()->getOpcode() == Instruction::Add) {
5841 RetI = RetI->user_back();
5842 }
5843
5844 // Test if the found instruction is a reduction, and if not return an invalid
5845 // cost specifying the parent to use the original cost modelling.
5846 if (!InLoopReductionImmediateChains.count(RetI))
5847 return std::nullopt;
5848
5849 // Find the reduction this chain is a part of and calculate the basic cost of
5850 // the reduction on its own.
5851 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5852 Instruction *ReductionPhi = LastChain;
5853 while (!isa<PHINode>(ReductionPhi))
5854 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5855
5856 const RecurrenceDescriptor &RdxDesc =
5857 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5858
5859 InstructionCost BaseCost;
5860 RecurKind RK = RdxDesc.getRecurrenceKind();
5863 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5864 RdxDesc.getFastMathFlags(), CostKind);
5865 } else {
5867 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5868 }
5869
5870 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5871 // normal fmul instruction to the cost of the fadd reduction.
5872 if (RK == RecurKind::FMulAdd)
5873 BaseCost +=
5874 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5875
5876 // If we're using ordered reductions then we can just return the base cost
5877 // here, since getArithmeticReductionCost calculates the full ordered
5878 // reduction cost when FP reassociation is not allowed.
5879 if (useOrderedReductions(RdxDesc))
5880 return BaseCost;
5881
5882 // Get the operand that was not the reduction chain and match it to one of the
5883 // patterns, returning the better cost if it is found.
5884 Instruction *RedOp = RetI->getOperand(1) == LastChain
5885 ? dyn_cast<Instruction>(RetI->getOperand(0))
5886 : dyn_cast<Instruction>(RetI->getOperand(1));
5887
5888 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5889
5890 Instruction *Op0, *Op1;
5891 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5892 match(RedOp,
5894 match(Op0, m_ZExtOrSExt(m_Value())) &&
5895 Op0->getOpcode() == Op1->getOpcode() &&
5896 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5898 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5899
5900 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5901 // Note that the extend opcodes need to all match, or if A==B they will have
5902 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5903 // which is equally fine.
5904 bool IsUnsigned = isa<ZExtInst>(Op0);
5905 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5906 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5907
5908 InstructionCost ExtCost =
5909 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5911 InstructionCost MulCost =
5912 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5913 InstructionCost Ext2Cost =
5914 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5916
5918 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5919
5920 if (RedCost.isValid() &&
5921 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5922 return I == RetI ? RedCost : 0;
5923 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5924 !TheLoop->isLoopInvariant(RedOp)) {
5925 // Matched reduce(ext(A))
5926 bool IsUnsigned = isa<ZExtInst>(RedOp);
5927 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5929 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5930 RdxDesc.getFastMathFlags(), CostKind);
5931
5932 InstructionCost ExtCost =
5933 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5935 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5936 return I == RetI ? RedCost : 0;
5937 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5938 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5939 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5940 Op0->getOpcode() == Op1->getOpcode() &&
5942 bool IsUnsigned = isa<ZExtInst>(Op0);
5943 Type *Op0Ty = Op0->getOperand(0)->getType();
5944 Type *Op1Ty = Op1->getOperand(0)->getType();
5945 Type *LargestOpTy =
5946 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5947 : Op0Ty;
5948 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5949
5950 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5951 // different sizes. We take the largest type as the ext to reduce, and add
5952 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5954 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5957 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5959 InstructionCost MulCost =
5960 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5961
5963 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5964 InstructionCost ExtraExtCost = 0;
5965 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5966 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5967 ExtraExtCost = TTI.getCastInstrCost(
5968 ExtraExtOp->getOpcode(), ExtType,
5969 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5971 }
5972
5973 if (RedCost.isValid() &&
5974 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5975 return I == RetI ? RedCost : 0;
5976 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5977 // Matched reduce.add(mul())
5978 InstructionCost MulCost =
5979 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5980
5982 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5983
5984 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5985 return I == RetI ? RedCost : 0;
5986 }
5987 }
5988
5989 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5990}
5991
5993LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5994 ElementCount VF) {
5995 // Calculate scalar cost only. Vectorization cost should be ready at this
5996 // moment.
5997 if (VF.isScalar()) {
5998 Type *ValTy = getLoadStoreType(I);
5999 const Align Alignment = getLoadStoreAlignment(I);
6000 unsigned AS = getLoadStoreAddressSpace(I);
6001
6002 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6003 return TTI.getAddressComputationCost(ValTy) +
6004 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
6005 OpInfo, I);
6006 }
6007 return getWideningCost(I, VF);
6008}
6009
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) const {
  // Estimates the cost of packing scalar results into a vector (Insert) and
  // unpacking vector operands into scalars (Extract) when I is scalarized.

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())

  // Scalar VF: nothing is packed or unpacked.
  if (VF.isScalar())
    return 0;

  // Cost of inserting the VF scalar results into the vector result, unless
  // the result is unused (void) or the target can load vector elements
  // efficiently.
  Type *RetTy = toVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert*/ true,
        /*Extract*/ false, CostKind);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider. For calls, only the actual arguments are
  // scalarized (not e.g. the callee operand).
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(maybeVectorizeType(V->getType(), VF));
    filterExtractingOperands(Ops, VF), Tys, CostKind);
}
6051
  // Widening decisions are only computed for vector VFs; scalar memory ops
  // are costed directly elsewhere.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only memory instructions (those with a pointer operand) get a
      // widening decision.
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
        NumPredStores++;

      // Uniform memory ops: choose between gather/scatter and scalarization.
      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(I))
            return true;

          // A uniform store isn't necessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(I);
          return TheLoop->isLoopInvariant(SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)

        // Choose better solution for the current VF, Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
        else
          setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        // Stride -1 means the access walks backwards through memory.
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        const auto *Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        // Per-member costs below are scaled by the group size so the
        // alternatives are compared for the whole group.
        NumAccesses = Group->getNumMembers();
        InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
              ? getGatherScatterCost(&I, VF) * NumAccesses

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (const auto *Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
    return;

  // Start with all scalar pointer uses.
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // Walk the operand chains (within the same block, phis excluded) that feed
  // the collected pointer definitions.
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address. The cost is VF scalar copies
        // of the scalar (VF=1) memory cost.
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (const auto *Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
6238
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF");

  auto ForcedScalar = ForcedScalars.find(VF);
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(&I);

      // Only calls get a call-widening decision.
      if (!CI)
        continue;

      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(ArgOp->getType());

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      InstructionCost ScalarCallCost =
          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);

      // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
      InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

      // VF scalar calls plus the pack/unpack overhead.
      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
      // Honor ForcedScalars and UniformAfterVectorization decisions.
      // TODO: For calls, it might still be more profitable to widen. Use
      // VPlan-based cost model to compare different options.
      if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                             ForcedScalar->second.contains(CI)) ||
                            isUniformAfterVectorization(CI, VF))) {
        setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
                                Intrinsic::not_intrinsic, std::nullopt,
                                ScalarCost);
        continue;
      }

      bool MaskRequired = Legal->isMaskRequired(CI);
      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = toVectorTy(ScalarRetTy, VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(toVectorTy(ScalarTy, VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
            std::nullopt, *RedCost);
        continue;
      }

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
            break;
            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
                                              TheLoop))
              ParamsOk = false;
            break;
          }
            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            const auto *SAR =
                dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));

            // The parameter must be an add recurrence of this loop for its
            // stride to be meaningful here.
            if (!SAR || SAR->getLoop() != TheLoop) {
              ParamsOk = false;
              break;
            }

            const SCEVConstant *Step =
                dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));

            // Only a constant step matching the variant's declared linear
            // step is acceptable.
            if (!Step ||
                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
              ParamsOk = false;

            break;
          }
            break;
          default:
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Info.VectorName);
        FuncInfo = Info;
        break;
      }

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Pick the cheapest option; the use of <= means that on cost ties a
      // vector call is preferred over scalarization, and an intrinsic over
      // both.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
    }
  }
}
6397
  // Values that legality does not already consider invariant are out.
  if (!Legal->isInvariant(Op))
    return false;
  // Consider Op invariant, if it or its operands aren't predicated
  // instructions in the loop. In that case, it is not trivially hoistable.
  // This recurses through the operand chain: an in-loop instruction only
  // qualifies if it is unpredicated, is not a header phi, and all of its
  // operands qualify in turn.
  auto *OpI = dyn_cast<Instruction>(Op);
  return !OpI || !TheLoop->contains(OpI) ||
         (!isPredicatedInst(OpI) &&
          (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
          all_of(OpI->operands(),
                 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
}
6410
                                             ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
             VF.getKnownMinValue();
  }

  // If the instruction was truncated to a minimal bit width, cost it at
  // that narrower type.
  Type *RetTy = I->getType();
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto *SE = PSE.getSE();

  auto HasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  // Only referenced from the assert below; silence unused warnings in
  // no-assert builds.
  (void)HasSingleCopyAfterVectorization;

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorTy(RetTy, VF);

  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(VectorTy))

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced by
    // a single branch controlling the loop, so there is no extra overhead from
    // scalarization.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
              VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    if (VF.isScalar())
      return TTI.getCFInstrCost(Instruction::Switch, CostKind);
    // A vectorized switch is modelled as one vector compare per case.
    auto *Switch = cast<SwitchInst>(I);
    return Switch->getNumCases() *
               Instruction::ICmp,
               toVectorTy(Switch->getCondition()->getType(), VF),
               toVectorTy(Type::getInt1Ty(I->getContext()), VF),
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
      // penultimate value of the recurrence.
      // TODO: Consider vscale_range info.
      if (VF.isScalable() && VF.getKnownMinValue() == 1)
      std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
          cast<VectorType>(VectorTy), Mask, CostKind,
          VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(HeaderUser);
        if (Iter != ReductionVars.end() &&
                Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
                 Instruction::Select, toVectorTy(ResultTy, VF),
                 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
    }

    // When tail folding with EVL, if the phi is part of an out of loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
          Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
          {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Predicated div/rem cannot be speculated; pick the cheaper of
    // scalarizing with predication vs. using a "safe divisor" select.
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
          PSE.getSCEV(I->getOperand(0))->isOne()) ||
         (TheLoop->isLoopInvariant(I->getOperand(1)) &&
          PSE.getSCEV(I->getOperand(1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
        isa<SCEVConstant>(PSE.getSCEV(Op2))) {
      Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&

    SmallVector<const Value *, 4> Operands(I->operand_values());
        I->getOpcode(), VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        Op2Info, Operands, I, TLI);
  }
  case Instruction::FNeg: {
        I->getOpcode(), VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
              Op1->getType()->getScalarSizeInBits() == 1);

        match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
        CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
    }

    // A loop-invariant condition stays scalar; otherwise it is widened too.
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
                                  CostKind, {TTI::OK_AnyValue, TTI::OP_None},
                                  {TTI::OK_AnyValue, TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();

    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    (void)Op0AsInstruction;
    assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
            MinBWs[I] == MinBWs[Op0AsInstruction]) &&
           "if both the operand and the compare are marked for "
           "truncation, they must have the same bitwidth");
    ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
  }

    VectorTy = toVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
                                  cast<CmpInst>(I)->getPredicate(), CostKind,
                                  {TTI::OK_AnyValue, TTI::OP_None},
                                  {TTI::OK_AnyValue, TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = toVectorTy(getLoadStoreType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free; they disappear during codegen.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))

      switch (getWideningDecision(I, VF)) {
        llvm_unreachable("Instr did not go through cost modelling?");
        llvm_unreachable_internal("Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;

    // If the result type is <= the source type, there will be no extend
    // after truncating the users to the minimal required bitwidth.
    if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
        (I->getOpcode() == Instruction::ZExt ||
         I->getOpcode() == Instruction::SExt))
      return 0;
  }

    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(cast<CallInst>(I), VF);
  case Instruction::ExtractValue:
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}
6852
6854 // Ignore ephemeral values.
// NOTE(review): the defining line of this function (6853) is absent from this
// extract -- presumably LoopVectorizationCostModel::collectValuesToIgnore();
// lines 6855, 6858, 6869, 6889 and 6973 are also missing, so the body below is
// not contiguous. Confirm against the full source before editing code.
6856
// Worklist of addresses of interleave-group members (other than the group's
// insert position) whose computation may become dead.
6857 SmallVector<Value *, 4> DeadInterleavePointerOps;
6859
6860 // If a scalar epilogue is required, users outside the loop won't use
6861 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6862 // that is the case.
6863 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6864 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6865 return RequiresScalarEpilogue &&
6866 !TheLoop->contains(cast<Instruction>(U)->getParent());
6867 };
6868
// Visit blocks in reverse RPO and instructions bottom-up so users are seen
// before their operands, letting single passes seed the worklists below.
6870 DFS.perform(LI);
6871 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6872 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6873 for (Instruction &I : reverse(*BB)) {
6874 // Find all stores to invariant variables. Since they are going to sink
6875 // outside the loop we do not need to calculate their cost.
6876 StoreInst *SI;
6877 if ((SI = dyn_cast<StoreInst>(&I)) &&
6878 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6879 ValuesToIgnore.insert(&I);
6880 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6881 SI->getValueOperand());
6882 }
6883
6884 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6885 continue;
6886
6887 // Add instructions that would be trivially dead and are only used by
6888 // values already ignored to DeadOps to seed worklist.
6890 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6891 return VecValuesToIgnore.contains(U) ||
6892 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6893 }))
6894 DeadOps.push_back(&I);
6895
6896 // For interleave groups, we only create a pointer for the start of the
6897 // interleave group. Queue up addresses of group members except the insert
6898 // position for further processing.
6899 if (isAccessInterleaved(&I)) {
6900 auto *Group = getInterleavedAccessGroup(&I);
6901 if (Group->getInsertPos() == &I)
6902 continue;
6903 Value *PointerOp = getLoadStorePointerOperand(&I);
6904 DeadInterleavePointerOps.push_back(PointerOp);
6905 }
6906
6907 // Queue branches for analysis. They are dead, if their successors only
6908 // contain dead instructions.
6909 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6910 if (Br->isConditional())
6911 DeadOps.push_back(&I);
6912 }
6913 }
6914
6915 // Mark ops feeding interleave group members as free, if they are only used
6916 // by other dead computations.
// Fixed-point worklist: DeadInterleavePointerOps grows while iterating, as
// operands of newly-ignored instructions are appended below.
6917 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6918 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6919 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6920 Instruction *UI = cast<Instruction>(U);
6921 return !VecValuesToIgnore.contains(U) &&
6922 (!isAccessInterleaved(UI) ||
6923 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6924 }))
6925 continue;
6926 VecValuesToIgnore.insert(Op);
6927 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6928 }
6929
// Only the last value stored to each invariant address survives the loop;
// all earlier stored values are dead (drop_back keeps the final one).
6930 for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6931 for (Value *Op : ArrayRef(Ops).drop_back())
6932 DeadOps.push_back(Op);
6933 }
6934 // Mark ops that would be trivially dead and are only used by ignored
6935 // instructions as free.
6936 BasicBlock *Header = TheLoop->getHeader();
6937
6938 // Returns true if the block contains only dead instructions. Such blocks will
6939 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6940 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6941 auto IsEmptyBlock = [this](BasicBlock *BB) {
6942 return all_of(*BB, [this](Instruction &I) {
6943 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6944 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6945 });
6946 };
// Fixed-point worklist over DeadOps; it grows as operands/conditions of dead
// instructions are appended below.
6947 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6948 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6949
6950 // Check if the branch should be considered dead.
6951 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6952 BasicBlock *ThenBB = Br->getSuccessor(0);
6953 BasicBlock *ElseBB = Br->getSuccessor(1);
6954 // Don't consider branches leaving the loop for simplification.
6955 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6956 continue;
6957 bool ThenEmpty = IsEmptyBlock(ThenBB);
6958 bool ElseEmpty = IsEmptyBlock(ElseBB);
// A conditional branch is dead if both successors are empty, or one successor
// is an empty block that just falls through to the other (and the target has
// no phis that would observe the edge).
6959 if ((ThenEmpty && ElseEmpty) ||
6960 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6961 ElseBB->phis().empty()) ||
6962 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6963 ThenBB->phis().empty())) {
6964 VecValuesToIgnore.insert(Br);
6965 DeadOps.push_back(Br->getCondition());
6966 }
6967 continue;
6968 }
6969
6970 // Skip any op that shouldn't be considered dead.
6971 if (!Op || !TheLoop->contains(Op) ||
6972 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6974 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6975 return !VecValuesToIgnore.contains(U) &&
6976 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6977 }))
6978 continue;
6979
6980 if (!TheLoop->contains(Op->getParent()))
6981 continue;
6982
6983 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6984 // which applies for both scalar and vector versions. Otherwise it is only
6985 // dead in vector versions, so only add it to VecValuesToIgnore.
6986 if (all_of(Op->users(),
6987 [this](User *U) { return ValuesToIgnore.contains(U); }))
6988 ValuesToIgnore.insert(Op);
6989
6990 VecValuesToIgnore.insert(Op);
6991 DeadOps.append(Op->op_begin(), Op->op_end());
6992 }
6993
6994 // Ignore type-promoting instructions we identified during reduction
6995 // detection.
6996 for (const auto &Reduction : Legal->getReductionVars()) {
6997 const RecurrenceDescriptor &RedDes = Reduction.second;
6998 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6999 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7000 }
7001 // Ignore type-casting instructions we identified during induction
7002 // detection.
7003 for (const auto &Induction : Legal->getInductionVars()) {
7004 const InductionDescriptor &IndDes = Induction.second;
7005 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7006 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7007 }
7008}
7009
// NOTE(review): the defining line (7010) is absent from this extract --
// presumably LoopVectorizationCostModel::collectInLoopReductions(); line 7028
// (continuation of the preferInLoopReduction() call) is also missing.
7011 // Avoid duplicating work finding in-loop reductions.
7012 if (!InLoopReductions.empty())
7013 return;
7014
7015 for (const auto &Reduction : Legal->getReductionVars()) {
7016 PHINode *Phi = Reduction.first;
7017 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7018
7019 // We don't collect reductions that are type promoted (yet).
7020 if (RdxDesc.getRecurrenceType() != Phi->getType())
7021 continue;
7022
7023 // If the target would prefer this reduction to happen "in-loop", then we
7024 // want to record it as such.
7025 unsigned Opcode = RdxDesc.getOpcode();
7026 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7027 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7029 continue;
7030
7031 // Check that we can correctly put the reductions into the loop, by
7032 // finding the chain of operations that leads from the phi to the loop
7033 // exit value.
7034 SmallVector<Instruction *, 4> ReductionOperations =
7035 RdxDesc.getReductionOpChain(Phi, TheLoop);
7036 bool InLoop = !ReductionOperations.empty();
7037
7038 if (InLoop) {
7039 InLoopReductions.insert(Phi);
7040 // Add the elements to InLoopReductionImmediateChains for cost modelling.
// Each chain instruction maps to its immediate predecessor in the chain,
// starting from the reduction phi itself.
7041 Instruction *LastChain = Phi;
7042 for (auto *I : ReductionOperations) {
7043 InLoopReductionImmediateChains[I] = LastChain;
7044 LastChain = I;
7045 }
7046 }
7047 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7048 << " reduction for phi: " << *Phi << "\n");
7049 }
7050}
7051
7052// This function will select a scalable VF if the target supports scalable
7053// vectors and a fixed one otherwise.
7054// TODO: we could return a pair of values that specify the max VF and
7055// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7056// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7057// doesn't have a cost model that can choose which plan to execute if
7058// more than one is generated.
// NOTE(review): the signature lines (7059-7060) are absent from this extract
// -- callers use determineVPlanVF(TTI, CM); lines 7064-7067, which presumably
// define RegSize (a register-width query on TTI), are also missing. Confirm
// against the full source.
7061 unsigned WidestType;
7062 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7063
7068
// VF = (known minimum register width) / (widest element type in the loop);
// the result is scalable iff the register size is scalable.
7070 unsigned N = RegSize.getKnownMinValue() / WidestType;
7071 return ElementCount::get(N, RegSize.isScalable());
7072}
7073
// NOTE(review): the signature lines (7074-7075) are absent from this extract
// -- presumably LoopVectorizationPlanner::planInVPlanNativePath(ElementCount
// UserVF); lines 7095, 7098, 7104, 7107, 7114-7115 and 7123 are also missing
// (reportVectorizationFailure call continuation, asserts, stress-test bailout
// and the final return). Confirm against the full source.
7076 ElementCount VF = UserVF;
7077 // Outer loop handling: They may require CFG and instruction level
7078 // transformations before even evaluating whether vectorization is profitable.
7079 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7080 // the vectorization pipeline.
7081 if (!OrigLoop->isInnermost()) {
7082 // If the user doesn't provide a vectorization factor, determine a
7083 // reasonable one.
7084 if (UserVF.isZero()) {
7085 VF = determineVPlanVF(TTI, CM);
7086 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7087
7088 // Make sure we have a VF > 1 for stress testing.
7089 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7090 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7091 << "overriding computed VF.\n");
7092 VF = ElementCount::getFixed(4);
7093 }
7094 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7096 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7097 << "not supported by the target.\n");
7099 "Scalable vectorization requested but not supported by the target",
7100 "the scalable user-specified vectorization width for outer-loop "
7101 "vectorization cannot be used because the target does not support "
7102 "scalable vectors.",
7103 "ScalableVFUnfeasible", ORE, OrigLoop);
7105 }
7106 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7108 "VF needs to be a power of two");
7109 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7110 << "VF " << VF << " to build VPlans.\n");
7111 buildVPlans(VF, VF);
7112
7113 // For VPlan build stress testing, we bail out after VPlan construction.
7116
// Cost/ScalarCost are not computed on the native path; return zeros.
7117 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7118 }
7119
7120 LLVM_DEBUG(
7121 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7122 "VPlan-native path.\n");
7124}
7125
// Entry point for inner-loop planning: compute the maximal safe VFs, honor a
// user-requested VF if feasible, otherwise enumerate power-of-two fixed and
// scalable VF candidates and build VPlans for them.
// NOTE(review): lines 7128-7129, 7137, 7142, 7146, 7150, 7156, 7160, 7164,
// 7168, 7185, 7188, 7193 and 7199 are absent from this extract (e.g. the
// interleave-group invalidation and per-VF collect* calls), so the body below
// is not contiguous. Confirm against the full source before editing code.
7126void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7127 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7130
7131 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7132 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7133 return;
7134
7135 // Invalidate interleave groups if all blocks of loop will be predicated.
7136 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7138 LLVM_DEBUG(
7139 dbgs()
7140 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7141 "which requires masked-interleaved support.\n");
7143 // Invalidating interleave groups also requires invalidating all decisions
7144 // based on them, which includes widening decisions and uniform and scalar
7145 // values.
7147 }
7148
7149 if (CM.foldTailByMasking())
7151
// A user VF is only compared against the matching (fixed/scalable) maximum.
7152 ElementCount MaxUserVF =
7153 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7154 if (UserVF) {
7155 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7157 "UserVF ignored because it may be larger than the maximal safe VF",
7158 "InvalidUserVF", ORE, OrigLoop);
7159 } else {
7161 "VF needs to be a power of two");
7162 // Collect the instructions (and their associated costs) that will be more
7163 // profitable to scalarize.
7165 if (CM.selectUserVectorizationFactor(UserVF)) {
7166 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7167 buildVPlansWithVPRecipes(UserVF, UserVF);
7169 return;
7170 }
7171 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7172 "InvalidCost", ORE, OrigLoop);
7173 }
7174 }
7175
7176 // Collect the Vectorization Factor Candidates.
7177 SmallVector<ElementCount> VFCandidates;
7178 for (auto VF = ElementCount::getFixed(1);
7179 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7180 VFCandidates.push_back(VF);
7181 for (auto VF = ElementCount::getScalable(1);
7182 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7183 VFCandidates.push_back(VF);
7184
7186 for (const auto &VF : VFCandidates) {
7187 // Collect Uniform and Scalar instructions after vectorization with VF.
7189
7190 // Collect the instructions (and their associated costs) that will be more
7191 // profitable to scalarize.
7192 if (VF.isVector())
7194 }
7195
7196 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7197 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7198
7200}
7201
// Return the legacy cost-model cost for instruction UI at VF. (The first
// signature line, 7202, is absent from this extract.)
7203 ElementCount VF) const {
// -force-target-instruction-cost overrides the modelled cost.
// NOTE(review): this returns getNumOccurrences() -- i.e. how many times the
// flag was passed on the command line -- rather than the option's *value*.
// That looks suspicious; confirm whether
// InstructionCost(ForceTargetInstructionCost) was intended.
7204 if (ForceTargetInstructionCost.getNumOccurrences())
7205 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7206 return CM.getInstructionCost(UI, VF);
7207}
7208
7209bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7210 return CM.ValuesToIgnore.contains(UI) ||
7211 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7212 SkipCostComputation.contains(UI);
7213}
7214
// Pre-compute, via the legacy cost model, the costs of instructions whose
// VPlan-based costing is not yet accurate (inductions, exit conditions,
// in-loop reduction chains, non-latch branches, forced/profitable scalars),
// recording each in CostCtx.SkipCostComputation so it is not counted again.
// NOTE(review): lines 7215 (return type), 7218 (Cost declaration), 7258 and
// 7278 are absent from this extract, so the body below is not contiguous.
7216LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7217 VPCostContext &CostCtx) const {
7219 // Cost modeling for inductions is inaccurate in the legacy cost model
7220 // compared to the recipes that are generated. To match here initially during
7221 // VPlan cost model bring up directly use the induction costs from the legacy
7222 // cost model. Note that we do this as pre-processing; the VPlan may not have
7223 // any recipes associated with the original induction increment instruction
7224 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7225 // the cost of induction phis and increments (both that are represented by
7226 // recipes and those that are not), to avoid distinguishing between them here,
7227 // and skip all recipes that represent induction phis and increments (the
7228 // former case) later on, if they exist, to avoid counting them twice.
7229 // Similarly we pre-compute the cost of any optimized truncates.
7230 // TODO: Switch to more accurate costing based on VPlan.
7231 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7232 Instruction *IVInc = cast<Instruction>(
7233 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7234 SmallVector<Instruction *> IVInsts = {IVInc};
// Worklist: pull in single-use in-loop operands feeding the increment.
7235 for (unsigned I = 0; I != IVInsts.size(); I++) {
7236 for (Value *Op : IVInsts[I]->operands()) {
7237 auto *OpI = dyn_cast<Instruction>(Op);
7238 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7239 continue;
7240 IVInsts.push_back(OpI);
7241 }
7242 }
7243 IVInsts.push_back(IV);
// Also pre-compute truncates of the IV that the cost model can optimize.
7244 for (User *U : IV->users()) {
7245 auto *CI = cast<Instruction>(U);
7246 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7247 continue;
7248 IVInsts.push_back(CI);
7249 }
7250
7251 // If the vector loop gets executed exactly once with the given VF, ignore
7252 // the costs of comparison and induction instructions, as they'll get
7253 // simplified away.
7254 // TODO: Remove this code after stepping away from the legacy cost model and
7255 // adding code to simplify VPlans before calculating their costs.
7256 auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7257 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7259 CostCtx.SkipCostComputation);
7260
7261 for (Instruction *IVInst : IVInsts) {
7262 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7263 continue;
7264 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7265 LLVM_DEBUG({
7266 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7267 << ": induction instruction " << *IVInst << "\n";
7268 });
7269 Cost += InductionCost;
7270 CostCtx.SkipCostComputation.insert(IVInst);
7271 }
7272 }
7273
7274 /// Compute the cost of all exiting conditions of the loop using the legacy
7275 /// cost model. This is to match the legacy behavior, which adds the cost of
7276 /// all exit conditions. Note that this over-estimates the cost, as there will
7277 /// be a single condition to control the vector loop.
7279 CM.TheLoop->getExitingBlocks(Exiting);
7280 SetVector<Instruction *> ExitInstrs;
7281 // Collect all exit conditions.
7282 for (BasicBlock *EB : Exiting) {
7283 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7284 if (!Term)
7285 continue;
7286 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7287 ExitInstrs.insert(CondI);
7288 }
7289 }
7290 // Compute the cost of all instructions only feeding the exit conditions.
// Worklist: ExitInstrs grows as operands used solely by exit conditions are
// added below; insert() into SkipCostComputation also de-duplicates.
7291 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7292 Instruction *CondI = ExitInstrs[I];
7293 if (!OrigLoop->contains(CondI) ||
7294 !CostCtx.SkipCostComputation.insert(CondI).second)
7295 continue;
7296 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7297 LLVM_DEBUG({
7298 dbgs() << "Cost of " << CondICost << " for VF " << VF
7299 << ": exit condition instruction " << *CondI << "\n";
7300 });
7301 Cost += CondICost;
7302 for (Value *Op : CondI->operands()) {
7303 auto *OpI = dyn_cast<Instruction>(Op);
// Only follow operands whose in-loop users are all exit-condition
// instructions themselves.
7304 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7305 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7306 !ExitInstrs.contains(cast<Instruction>(U));
7307 }))
7308 continue;
7309 ExitInstrs.insert(OpI);
7310 }
7311 }
7312
7313 // The legacy cost model has special logic to compute the cost of in-loop
7314 // reductions, which may be smaller than the sum of all instructions involved
7315 // in the reduction.
7316 // TODO: Switch to costing based on VPlan once the logic has been ported.
7317 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7318 if (ForceTargetInstructionCost.getNumOccurrences())
7319 continue;
7320
7321 if (!CM.isInLoopReduction(RedPhi))
7322 continue;
7323
7324 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7325 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7326 ChainOps.end());
7327 auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7328 return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7329 };
7330 // Also include the operands of instructions in the chain, as the cost-model
7331 // may mark extends as free.
7332 //
7333 // For ARM, some of the instructions can be folded into the reduction
7334 // instruction. So we need to mark all folded instructions free.
7335 // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7336 // instruction.
7337 for (auto *ChainOp : ChainOps) {
7338 for (Value *Op : ChainOp->operands()) {
7339 if (auto *I = dyn_cast<Instruction>(Op)) {
7340 ChainOpsAndOperands.insert(I);
7341 if (I->getOpcode() == Instruction::Mul) {
7342 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7343 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
// Only matching same-kind extends (both zext or both sext) fold.
7344 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7345 Ext0->getOpcode() == Ext1->getOpcode()) {
7346 ChainOpsAndOperands.insert(Ext0);
7347 ChainOpsAndOperands.insert(Ext1);
7348 }
7349 }
7350 }
7351 }
7352 }
7353
7354 // Pre-compute the cost for I, if it has a reduction pattern cost.
7355 for (Instruction *I : ChainOpsAndOperands) {
7356 auto ReductionCost =
7357 CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7358 if (!ReductionCost)
7359 continue;
7360
7361 assert(!CostCtx.SkipCostComputation.contains(I) &&
7362 "reduction op visited multiple times");
7363 CostCtx.SkipCostComputation.insert(I);
7364 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7365 << ":\n in-loop reduction " << *I << "\n");
7366 Cost += *ReductionCost;
7367 }
7368 }
7369
7370 // Pre-compute the costs for branches except for the backedge, as the number
7371 // of replicate regions in a VPlan may not directly match the number of
7372 // branches, which would lead to different decisions.
7373 // TODO: Compute cost of branches for each replicate region in the VPlan,
7374 // which is more accurate than the legacy cost model.
7375 for (BasicBlock *BB : OrigLoop->blocks()) {
7376 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7377 continue;
7378 CostCtx.SkipCostComputation.insert(BB->getTerminator());
// The latch terminator is skipped but still marked, so the VPlan-based cost
// model does not count it either.
7379 if (BB == OrigLoop->getLoopLatch())
7380 continue;
7381 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7382 Cost += BranchCost;
7383 }
7384
7385 // Pre-compute costs for instructions that are forced-scalar or profitable to
7386 // scalarize. Their costs will be computed separately in the legacy cost
7387 // model.
7388 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7389 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7390 continue;
7391 CostCtx.SkipCostComputation.insert(ForcedScalar);
7392 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7393 LLVM_DEBUG({
7394 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7395 << ": forced scalar " << *ForcedScalar << "\n";
7396 });
7397 Cost += ForcedCost;
7398 }
7399 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7400 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7401 continue;
7402 CostCtx.SkipCostComputation.insert(Scalarized);
7403 LLVM_DEBUG({
7404 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7405 << ": profitable to scalarize " << *Scalarized << "\n";
7406 });
7407 Cost += ScalarCost;
7408 }
7409
7410 return Cost;
7411}
7412
7413InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7414 ElementCount VF) const {
7415 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7416 CM.CostKind);
7417 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7418
7419 // Now compute and add the VPlan-based cost.
7420 Cost += Plan.cost(VF, CostCtx);
7421#ifndef NDEBUG
7422 unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
7423 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7424 << " (Estimated cost per lane: ");
7425 if (Cost.isValid()) {
7426 double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7427 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7428 } else /* No point dividing an invalid cost - it will still be invalid */
7429 LLVM_DEBUG(dbgs() << "Invalid");
7430 LLVM_DEBUG(dbgs() << ")\n");
7431#endif
7432 return Cost;
7433}
7434
7435#ifndef NDEBUG
7436/// Return true if the original loop \p TheLoop contains any instructions that do
7437/// not have corresponding recipes in \p Plan and are not marked to be ignored
7438/// in \p CostCtx. This means the VPlan contains simplifications that the legacy
7439/// cost-model did not account for.
// NOTE(review): the first signature line (7440) is absent from this extract --
// presumably "static bool planContainsAdditionalSimplifications(VPlan &Plan,".
7441 VPCostContext &CostCtx,
7442 Loop *TheLoop) {
7443 // First collect all instructions for the recipes in Plan.
7444 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7445 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7446 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7447 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7448 return &WidenMem->getIngredient();
7449 return nullptr;
7450 };
7451
7452 DenseSet<Instruction *> SeenInstrs;
7453 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7454 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7455 for (VPRecipeBase &R : *VPBB) {
// An interleave recipe covers every member of its group, not just the
// recipe's own underlying instruction.
7456 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7457 auto *IG = IR->getInterleaveGroup();
7458 unsigned NumMembers = IG->getNumMembers();
7459 for (unsigned I = 0; I != NumMembers; ++I) {
7460 if (Instruction *M = IG->getMember(I))
7461 SeenInstrs.insert(M);
7462 }
7463 continue;
7464 }
7465 // The VPlan-based cost model is more accurate for partial reduction and
7466 // comparing against the legacy cost isn't desirable.
7467 if (isa<VPPartialReductionRecipe>(&R))
7468 return true;
7469 if (Instruction *UI = GetInstructionForCost(&R))
7470 SeenInstrs.insert(UI);
7471 }
7472 }
7473
7474 // Return true if the loop contains any instructions that are not also part of
7475 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7476 // that the VPlan contains extra simplifications.
7477 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7478 TheLoop](BasicBlock *BB) {
7479 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
// Header phis are represented differently in VPlan; exclude them here.
7480 if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7481 return false;
7482 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7483 });
7484 });
7485}
7486#endif
7487
// Pick the most profitable VF across all built VPlans using the VPlan-based
// cost model, comparing against the scalar-loop cost; in debug builds the
// result is cross-checked against the legacy cost model.
// NOTE(review): the defining line (7488) is absent from this extract --
// presumably LoopVectorizationPlanner::computeBestVF(); lines 7490, 7497,
// 7499, 7502, 7506, 7567 and 7570-7572 are also missing (the empty-VPlans
// return, CostKind comparisons, ScalarVF definition and parts of the
// cross-check assert), so the body below is not contiguous.
7489 if (VPlans.empty())
7491 // If there is a single VPlan with a single VF, return it directly.
7492 VPlan &FirstPlan = *VPlans[0];
7493 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7494 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7495
7496 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7498 ? "Reciprocal Throughput\n"
7500 ? "Instruction Latency\n"
7501 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7503 ? "Code Size and Latency\n"
7504 : "Unknown\n"));
7505
7507 assert(hasPlanWithVF(ScalarVF) &&
7508 "More than a single plan/VF w/o any plan having scalar VF");
7509
7510 // TODO: Compute scalar cost using VPlan-based cost model.
7511 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7512 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7513 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7514 VectorizationFactor BestFactor = ScalarFactor;
7515
7516 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7517 if (ForceVectorization) {
7518 // Ignore scalar width, because the user explicitly wants vectorization.
7519 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7520 // evaluation.
7521 BestFactor.Cost = InstructionCost::getMax();
7522 }
7523
7524 for (auto &P : VPlans) {
7525 for (ElementCount VF : P->vectorFactors()) {
7526 if (VF.isScalar())
7527 continue;
7528 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7529 LLVM_DEBUG(
7530 dbgs()
7531 << "LV: Not considering vector loop of width " << VF
7532 << " because it will not generate any vector instructions.\n");
7533 continue;
7534 }
7535
7536 InstructionCost Cost = cost(*P, VF);
7537 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7538 if (isMoreProfitable(CurrentFactor, BestFactor))
7539 BestFactor = CurrentFactor;
7540
7541 // If profitable add it to ProfitableVF list.
7542 if (isMoreProfitable(CurrentFactor, ScalarFactor))
7543 ProfitableVFs.push_back(CurrentFactor);
7544 }
7545 }
7546
7547#ifndef NDEBUG
7548 // Select the optimal vectorization factor according to the legacy cost-model.
7549 // This is now only used to verify the decisions by the new VPlan-based
7550 // cost-model and will be retired once the VPlan-based cost-model is
7551 // stabilized.
7552 VectorizationFactor LegacyVF = selectVectorizationFactor();
7553 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7554
7555 // Pre-compute the cost and use it to check if BestPlan contains any
7556 // simplifications not accounted for in the legacy cost model. If that's the
7557 // case, don't trigger the assertion, as the extra simplifications may cause a
7558 // different VF to be picked by the VPlan-based cost model.
7559 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7560 CM.CostKind);
7561 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7562 // Set PlanForEarlyExitLoop to true if the BestPlan has been built from a
7563 // loop with an uncountable early exit. The legacy cost model doesn't
7564 // properly model costs for such loops.
7565 bool PlanForEarlyExitLoop =
7566 BestPlan.getVectorLoopRegion() &&
7568 BestPlan.getMiddleBlock();
7569 assert((BestFactor.Width == LegacyVF.Width || PlanForEarlyExitLoop ||
7571 CostCtx, OrigLoop) ||
7573 CostCtx, OrigLoop)) &&
7574 " VPlan cost model and legacy cost model disagreed");
7575 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7576 "when vectorizing, the scalar cost must be computed.");
7577#endif
7578
7579 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7580 return BestFactor;
7581}
7582
// Add "llvm.loop.unroll.runtime.disable" to the loop's !llvm.loop metadata,
// preserving all existing operands, unless unroll-disable metadata is already
// present. (The signature and the MDs declaration, lines 7583-7584, are
// absent from this extract.)
7585 // Reserve first location for self reference to the LoopID metadata node.
7586 MDs.push_back(nullptr);
7587 bool IsUnrollMetadata = false;
7588 MDNode *LoopID = L->getLoopID();
7589 if (LoopID) {
7590 // First find existing loop unrolling disable metadata.
// NOTE(review): IsUnrollMetadata is overwritten on every MDNode operand (no
// break and no |=), so a later, unrelated metadata node resets it to false
// and a redundant runtime-disable node may be appended. Harmless duplicate,
// but confirm whether "IsUnrollMetadata |= ..." / an early break was intended.
7591 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7592 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7593 if (MD) {
7594 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7595 IsUnrollMetadata =
7596 S && S->getString().starts_with("llvm.loop.unroll.disable");
7597 }
7598 MDs.push_back(LoopID->getOperand(I));
7599 }
7600 }
7601
7602 if (!IsUnrollMetadata) {
7603 // Add runtime unroll disable metadata.
7604 LLVMContext &Context = L->getHeader()->getContext();
7605 SmallVector<Metadata *, 1> DisableOperands;
7606 DisableOperands.push_back(
7607 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7608 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7609 MDs.push_back(DisableNode);
7610 MDNode *NewLoopID = MDNode::get(Context, MDs);
7611 // Set operand 0 to refer to the loop id itself.
7612 NewLoopID->replaceOperandWith(0, NewLoopID);
7613 L->setLoopID(NewLoopID);
7614 }
7615}
7616
7617// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7618// fix the reduction's scalar PHI node by adding the incoming value from the
7619// main vector loop.
// NOTE(review): the signature line (7620) is absent from this extract, as are
// lines 7634, 7643 and 7652-7653 (the RecurrenceDescriptor::isAnyOf/
// isFindLastIV kind checks and the tail of the second pattern match), so the
// body below is not contiguous. Confirm against the full source.
7621 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7622 BasicBlock *BypassBlock) {
7623 auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7624 if (!EpiRedResult ||
7625 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7626 return;
7627
7628 auto *EpiRedHeaderPhi =
7629 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7630 const RecurrenceDescriptor &RdxDesc =
7631 EpiRedHeaderPhi->getRecurrenceDescriptor();
7632 Value *MainResumeValue =
7633 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
// For some recurrence kinds the resume value is wrapped (icmp/select); peel
// the wrapper back to the underlying resume phi of the main loop.
7635 RdxDesc.getRecurrenceKind())) {
7636 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7637 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7638 "AnyOf expected to start with ICMP_NE");
7639 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7640 "AnyOf expected to start by comparing main resume value to original "
7641 "start value");
7642 MainResumeValue = Cmp->getOperand(0);
7644 RdxDesc.getRecurrenceKind())) {
7645 using namespace llvm::PatternMatch;
7646 Value *Cmp, *OrigResumeV;
7647 bool IsExpectedPattern =
7648 match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7649 m_Specific(RdxDesc.getSentinelValue()),
7650 m_Value(OrigResumeV))) &&
7651 match(Cmp,
7654 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7655 (void)IsExpectedPattern;
7656 MainResumeValue = OrigResumeV;
7657 }
7658 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7659
7660 // When fixing reductions in the epilogue loop we should already have
7661 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7662 // over the incoming values correctly.
7663 using namespace VPlanPatternMatch;
7664 auto IsResumePhi = [](VPUser *U) {
7665 auto *VPI = dyn_cast<VPInstruction>(U);
7666 return VPI && VPI->getOpcode() == VPInstruction::ResumePhi;
7667 };
7668 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7669 "ResumePhi must have a single user");
7670 auto *EpiResumePhiVPI =
7671 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7672 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
// Wire the bypass edge of the epilogue resume phi to the value the main
// loop's resume phi carries on that same edge.
7673 EpiResumePhi->setIncomingValueForBlock(
7674 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7675}
7676
// LoopVectorizationPlanner::executePlan — generate IR for BestVPlan at the
// chosen vectorization factor (BestVF) and unroll factor (BestUF): run late
// VPlan transforms, build the transform state, execute the plan, and patch up
// epilogue-vectorization bookkeeping.  Returns State.ExpandedSCEVs so a
// subsequent epilogue pass can reuse the SCEV expansions.
// NOTE(review): this extract is missing several source lines (gaps in the
// embedded line numbers: 7677, 7692, 7695-7697, 7723, 7726, 7733, 7743, 7746,
// 7759, 7768, 7781, 7786, 7798-7799, 7814, 7817, 7824) — verify against the
// full upstream LoopVectorize.cpp before relying on the code below.
7678 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7679 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7680 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
// Sanity: the plan must have been built to cover the requested VF and UF,
// and ExpandedSCEVs is passed exactly when vectorizing the epilogue.
7681 assert(BestVPlan.hasVF(BestVF) &&
7682 "Trying to execute plan with unsupported VF");
7683 assert(BestVPlan.hasUF(BestUF) &&
7684 "Trying to execute plan with unsupported UF");
7685 assert(
7686 ((VectorizingEpilogue && ExpandedSCEVs) ||
7687 (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7688 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7689
7690 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7691 // cost model is complete for better cost estimates.
7693 OrigLoop->getHeader()->getContext());
7694 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7698
7699 // Perform the actual loop transformation.
7700 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7701 &BestVPlan, OrigLoop->getParentLoop(),
7702 Legal->getWidestInductionType());
7703
7704#ifdef EXPENSIVE_CHECKS
7705 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7706#endif
7707
7708 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7709 // making any changes to the CFG.
7710 if (!BestVPlan.getEntry()->empty())
7711 BestVPlan.getEntry()->execute(&State);
7712
7713 if (!ILV.getTripCount())
7714 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7715 else
7716 assert(VectorizingEpilogue && "should only re-use the existing trip "
7717 "count during epilogue vectorization");
7718
7719 // 1. Set up the skeleton for vectorization, including vector pre-header and
7720 // middle block. The vector loop is created during VPlan execution.
7721 VPBasicBlock *VectorPH =
7722 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7724 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7725 if (VectorizingEpilogue)
7727
7728 // Only use noalias metadata when using memory checks guaranteeing no overlap
7729 // across all iterations.
7730 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7731 std::unique_ptr<LoopVersioning> LVer = nullptr;
7732 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7734
7735 // We currently don't use LoopVersioning for the actual loop cloning but we
7736 // still use it to add the noalias metadata.
7737 // TODO: Find a better way to re-use LoopVersioning functionality to add
7738 // metadata.
7739 LVer = std::make_unique<LoopVersioning>(
7740 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7741 PSE.getSE());
7742 State.LVer = &*LVer;
7744 }
7745
7747
7748 //===------------------------------------------------===//
7749 //
7750 // Notice: any optimization or new instruction that go
7751 // into the code below should also be implemented in
7752 // the cost-model.
7753 //
7754 //===------------------------------------------------===//
7755
7756 // 2. Copy and widen instructions from the old loop into the new loop.
7757 BestVPlan.prepareToExecute(
7758 ILV.getTripCount(),
7760 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7761
7762 BestVPlan.execute(&State);
7763
7764 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7765 // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7766 // values from the additional bypass block.
7767 if (VectorizingEpilogue) {
7769 "Epilogue vectorisation not yet supported with early exits");
7770 BasicBlock *PH = OrigLoop->getLoopPreheader();
7771 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
// Give every scalar-preheader phi an incoming value for each predecessor
// it does not yet cover, reusing the value flowing in from BypassBlock.
7772 for (auto *Pred : predecessors(PH)) {
7773 for (PHINode &Phi : PH->phis()) {
7774 if (Phi.getBasicBlockIndex(Pred) != -1)
7775 continue;
7776 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
7777 }
7778 }
7779
7780 for (VPRecipeBase &R : *MiddleVPBB) {
7782 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7783 }
// Redirect each induction increment's bypass value to come from the
// additional bypass block.
7784 for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7785 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7787 Inc->setIncomingValueForBlock(BypassBlock, V);
7788 }
7789 }
7790
7791 // 2.6. Maintain Loop Hints
7792 // Keep all loop hints from the original loop on the vector loop (we'll
7793 // replace the vectorizer-specific hints below).
7794 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7795 MDNode *OrigLoopID = OrigLoop->getLoopID();
7796
7797 std::optional<MDNode *> VectorizedLoopID =
7800
7801 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7802 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7803 if (VectorizedLoopID) {
7804 L->setLoopID(*VectorizedLoopID);
7805 } else {
7806 // Keep all loop hints from the original loop on the vector loop (we'll
7807 // replace the vectorizer-specific hints below).
7808 if (MDNode *LID = OrigLoop->getLoopID())
7809 L->setLoopID(LID);
7810
7811 LoopVectorizeHints Hints(L, true, *ORE);
7812 Hints.setAlreadyVectorized();
7813 }
7815 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7816 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7818 }
7819
7820 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7821 // predication, updating analyses.
7822 ILV.fixVectorizedLoop(State);
7823
7825
7826 // 4. Adjust branch weight of the branch in the middle block.
7827 if (BestVPlan.getVectorLoopRegion()) {
7828 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7829 auto *MiddleTerm =
7830 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7831 if (MiddleTerm->isConditional() &&
7832 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7833 // Assume that `Count % VectorTripCount` is equally distributed.
7834 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7835 assert(TripCount > 0 && "trip count should not be zero");
7836 const uint32_t Weights[] = {1, TripCount - 1};
7837 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7838 }
7839 }
7840
7841 return State.ExpandedSCEVs;
7842}
7843
7844//===--------------------------------------------------------------------===//
7845// EpilogueVectorizerMainLoop
7846//===--------------------------------------------------------------------===//
7847
7848/// This function is partially responsible for generating the control flow
7849/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
// EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton (first
// pass): emits the epilogue min-iteration check, SCEV/memory runtime checks,
// and the main-loop iteration-count check, then returns the vector preheader.
// NOTE(review): the function-signature line (7850) and the bodies of several
// numbered steps (7852, 7856-7858, 7862, 7867, 7875-7876, 7879) are missing
// from this extract; verify against upstream LoopVectorize.cpp.
7851 const SCEV2ValueTy &ExpandedSCEVs) {
7853
7854 // Generate the code to check the minimum iteration count of the vector
7855 // epilogue (see below).
7859
7860 // Generate the code to check any assumptions that we've made for SCEV
7861 // expressions.
7863
7864 // Generate the code that checks at runtime if arrays overlap. We put the
7865 // checks into a separate block to make the more common case of few elements
7866 // faster.
7868
7869 // Generate the iteration count check for the main loop, *after* the check
7870 // for the epilogue loop, so that the path-length is shorter for the case
7871 // that goes directly through the vector epilogue. The longer-path length for
7872 // the main loop is compensated for, by the gain from vectorizing the larger
7873 // trip count. Note: the branch will get updated later on when we vectorize
7874 // the epilogue.
7877
7878 // Generate the induction variable.
7880
7881 return LoopVectorPreHeader;
7882}
7883
// EpilogueVectorizerMainLoop::printDebugTracesAtStart — debug-only dump of
// the main/epilogue VF and UF chosen for the two-pass epilogue vectorization.
// NOTE(review): the signature line (7884) is missing from this extract.
7885 LLVM_DEBUG({
7886 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7887 << "Main Loop VF:" << EPI.MainLoopVF
7888 << ", Main Loop UF:" << EPI.MainLoopUF
7889 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7890 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7891 });
7892}
7893
// EpilogueVectorizerMainLoop::printDebugTracesAtEnd — debug-only dump of the
// whole function after the first (main-loop) vectorization pass.
// NOTE(review): lines 7894-7895 (signature and LLVM_DEBUG opener) are missing
// from this extract.
7896 dbgs() << "intermediate fn:\n"
7897 << *OrigLoop->getHeader()->getParent() << "\n";
7898 });
7899}
7900
// EpilogueVectorizerMainLoop::emitIterationCountCheck — emit a guard that
// branches to \p Bypass when the trip count is below VF * UF (of the main
// loop, or of the epilogue loop when ForEpilogue is set).  Splits the current
// preheader to create a fresh "vector.ph" and hooks the check block into the
// VPlan.  Returns the check block.
// NOTE(review): lines 7902 (rest of signature) and 7917-7918 / 7946 are
// missing from this extract; verify against upstream LoopVectorize.cpp.
7901BasicBlock *
7903 bool ForEpilogue) {
7904 assert(Bypass && "Expected valid bypass basic block.");
7905 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7906 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7907 Value *Count = getTripCount();
7908 // Reuse existing vector loop preheader for TC checks.
7909 // Note that new preheader block is generated for vector loop.
7910 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7911 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7912
7913 // Generate code to check if the loop's trip count is less than VF * UF of the
7914 // main vector loop.
7915 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7916 : VF.isVector())
7919
7920 Value *CheckMinIters = Builder.CreateICmp(
7921 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7922 "min.iters.check");
7923
7924 if (!ForEpilogue)
7925 TCCheckBlock->setName("vector.main.loop.iter.check");
7926
7927 // Create new preheader for vector loop.
7928 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7929 DT, LI, nullptr, "vector.ph");
7930
7931 if (ForEpilogue) {
7932 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7933 DT->getNode(Bypass)->getIDom()) &&
7934 "TC check is expected to dominate Bypass");
7935
7936 LoopBypassBlocks.push_back(TCCheckBlock);
7937
7938 // Save the trip count so we don't have to regenerate it in the
7939 // vec.epilog.iter.check. This is safe to do because the trip count
7940 // generated here dominates the vector epilog iter check.
7941 EPI.TripCount = Count;
7942 }
7943
7944 BranchInst &BI =
7945 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7947 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7948 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7949
7950 introduceCheckBlockInVPlan(TCCheckBlock);
7951 return TCCheckBlock;
7952}
7953
7954//===--------------------------------------------------------------------===//
7955// EpilogueVectorizerEpilogueLoop
7956//===--------------------------------------------------------------------===//
7957
7958/// This function is partially responsible for generating the control flow
7959/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
// EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton
// (second pass): re-wires the control flow created by the first pass — emits
// the epilogue min-iteration check, retargets the saved main-loop and
// SCEV/memory check branches, migrates phis out of the epilogue check block,
// and returns the epilogue vector preheader.
// NOTE(review): several lines are missing from this extract (7961, 7969,
// 7971, 7977, 7979, 7982, 7986, 7989, 7992-7993, 7997, 7999-8000, 8034);
// verify against upstream LoopVectorize.cpp.
7960BasicBlock *
7962 const SCEV2ValueTy &ExpandedSCEVs) {
7963 createVectorLoopSkeleton("vec.epilog.");
7964
7965 // Now, compare the remaining count and if there aren't enough iterations to
7966 // execute the vectorized epilogue skip to the scalar part.
7967 LoopVectorPreHeader->setName("vec.epilog.ph");
7968 BasicBlock *VecEpilogueIterationCountCheck =
7970 nullptr, "vec.epilog.iter.check", true);
7972 VecEpilogueIterationCountCheck);
7973 AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7974
7975 // Adjust the control flow taking the state info from the main loop
7976 // vectorization into account.
7978 "expected this to be saved from the previous pass.");
7980 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7981
7983 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7984
7985 if (EPI.SCEVSafetyCheck)
7987 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7988 if (EPI.MemSafetyCheck)
7990 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7991
7994 // Keep track of bypass blocks, as they feed start values to the induction and
7995 // reduction phis in the scalar loop preheader.
7996 if (EPI.SCEVSafetyCheck)
7998 if (EPI.MemSafetyCheck)
8001
8002 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
8003 // reductions which merge control-flow from the latch block and the middle
8004 // block. Update the incoming values here and move the Phi into the preheader.
8005 SmallVector<PHINode *, 4> PhisInBlock;
8006 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8007 PhisInBlock.push_back(&Phi);
8008
8009 for (PHINode *Phi : PhisInBlock) {
8010 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
8011 Phi->replaceIncomingBlockWith(
8012 VecEpilogueIterationCountCheck->getSinglePredecessor(),
8013 VecEpilogueIterationCountCheck);
8014
8015 // If the phi doesn't have an incoming value from the
8016 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
8017 // value and also those from other check blocks. This is needed for
8018 // reduction phis only.
8019 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
8020 return EPI.EpilogueIterationCountCheck == IncB;
8021 }))
8022 continue;
8023 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8024 if (EPI.SCEVSafetyCheck)
8025 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8026 if (EPI.MemSafetyCheck)
8027 Phi->removeIncomingValue(EPI.MemSafetyCheck);
8028 }
8029
8030 // Generate bypass values from the additional bypass block. Note that when the
8031 // vectorized epilogue is skipped due to iteration count check, then the
8032 // resume value for the induction variable comes from the trip count of the
8033 // main vector loop, passed as the second argument.
8035 return LoopVectorPreHeader;
8036}
8037
// EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck —
// emit the check `n.vec.remaining = TC - VectorTripCount` vs the epilogue's
// VF * UF step and branch to \p Bypass when there are too few iterations
// left.  Also re-anchors the VPlan entry at \p Insert.  Returns \p Insert.
// NOTE(review): lines 8039/8042, 8055-8056, 8060-8061, 8066, 8069 and 8090
// are missing from this extract; verify against upstream LoopVectorize.cpp.
8038BasicBlock *
8040 BasicBlock *Bypass, BasicBlock *Insert) {
8041
8043 "Expected trip count to have been saved in the first pass.");
8044 assert(
8045 (!isa<Instruction>(EPI.TripCount) ||
8046 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8047 "saved trip count does not dominate insertion point.");
8048 Value *TC = EPI.TripCount;
8049 IRBuilder<> Builder(Insert->getTerminator());
8050 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8051
8052 // Generate code to check if the loop's trip count is less than VF * UF of the
8053 // vector epilogue loop.
8054 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8057
8058 Value *CheckMinIters =
8059 Builder.CreateICmp(P, Count,
8062 "min.epilog.iters.check");
8063
8064 BranchInst &BI =
8065 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8067 unsigned MainLoopStep = UF * VF.getKnownMinValue();
8068 unsigned EpilogueLoopStep =
8070 // We assume the remaining `Count` is equally distributed in
8071 // [0, MainLoopStep)
8072 // So the probability for `Count < EpilogueLoopStep` should be
8073 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8074 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8075 const uint32_t Weights[] = {EstimatedSkipCount,
8076 MainLoopStep - EstimatedSkipCount};
8077 setBranchWeights(BI, Weights, /*IsExpected=*/false);
8078 }
8079 ReplaceInstWithInst(Insert->getTerminator(), &BI);
8080 LoopBypassBlocks.push_back(Insert);
8081
8082 // A new entry block has been created for the epilogue VPlan. Hook it in, as
8083 // otherwise we would try to modify the entry to the main vector loop.
8084 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8085 VPBasicBlock *OldEntry = Plan.getEntry();
8086 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8087 Plan.setEntry(NewEntry);
8088 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8089
8091 return Insert;
8092}
8093
// EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart — debug-only dump
// of the epilogue VF/UF at the start of the second pass.
// NOTE(review): the signature line (8094) is missing from this extract.
8095 LLVM_DEBUG({
8096 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8097 << "Epilogue Loop VF:" << EPI.EpilogueVF
8098 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8099 });
8100}
8101
// EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd — debug-only dump of
// the final function after both vectorization passes.
// NOTE(review): lines 8102-8103 (signature and LLVM_DEBUG opener) are missing
// from this extract.
8104 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8105 });
8106}
8107
// VPRecipeBuilder::mapToVPValues — lazily map a range of IR operands to their
// VPValues, creating live-ins on demand via getVPValueOrAddLiveIn.
// NOTE(review): line 8109 (rest of the signature) is missing from this
// extract.
8108iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8110 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8111 return getVPValueOrAddLiveIn(Op);
8112 };
8113 return map_range(Operands, Fn);
8114}
8115
// VPRecipeBuilder::createSwitchEdgeMasks — create and cache edge masks for
// all outgoing edges of a block terminated by a SwitchInst.  Each case
// destination's mask is the OR of its case compares ANDed with the block-in
// mask; the default destination's mask is the negation of all non-default
// case masks.
// NOTE(review): the signature line (8116) and line 8129 (likely the
// declaration of Dst2Compares) are missing from this extract.
8117 BasicBlock *Src = SI->getParent();
8118 assert(!OrigLoop->isLoopExiting(Src) &&
8119 all_of(successors(Src),
8120 [this](BasicBlock *Succ) {
8121 return OrigLoop->getHeader() != Succ;
8122 }) &&
8123 "unsupported switch either exiting loop or continuing to header");
8124 // Create masks where the terminator in Src is a switch. We create mask for
8125 // all edges at the same time. This is more efficient, as we can create and
8126 // collect compares for all cases once.
8127 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8128 BasicBlock *DefaultDst = SI->getDefaultDest();
8130 for (auto &C : SI->cases()) {
8131 BasicBlock *Dst = C.getCaseSuccessor();
8132 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8133 // Cases whose destination is the same as default are redundant and can be
8134 // ignored - they will get there anyhow.
8135 if (Dst == DefaultDst)
8136 continue;
8137 auto &Compares = Dst2Compares[Dst];
8138 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8139 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8140 }
8141
8142 // We need to handle 2 separate cases below for all entries in Dst2Compares,
8143 // which excludes destinations matching the default destination.
8144 VPValue *SrcMask = getBlockInMask(Src);
8145 VPValue *DefaultMask = nullptr;
8146 for (const auto &[Dst, Conds] : Dst2Compares) {
8147 // 1. Dst is not the default destination. Dst is reached if any of the cases
8148 // with destination == Dst are taken. Join the conditions for each case
8149 // whose destination == Dst using an OR.
8150 VPValue *Mask = Conds[0];
8151 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8152 Mask = Builder.createOr(Mask, V);
8153 if (SrcMask)
8154 Mask = Builder.createLogicalAnd(SrcMask, Mask);
8155 EdgeMaskCache[{Src, Dst}] = Mask;
8156
8157 // 2. Create the mask for the default destination, which is reached if none
8158 // of the cases with destination != default destination are taken. Join the
8159 // conditions for each case where the destination is != Dst using an OR and
8160 // negate it.
8161 DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8162 }
8163
8164 if (DefaultMask) {
8165 DefaultMask = Builder.createNot(DefaultMask);
8166 if (SrcMask)
8167 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8168 }
8169 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8170}
8171
// VPRecipeBuilder::createEdgeMask — compute (and cache) the mask for the
// CFG edge Src -> Dst: the source block's in-mask ANDed with the branch
// condition (negated for the false successor).  Switch terminators are
// delegated to createSwitchEdgeMasks.  A nullptr mask means all-true.
// NOTE(review): the signature line (8172) and line 8182 (the call inside the
// SwitchInst branch) are missing from this extract.
8173 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8174
8175 // Look for cached value.
8176 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8177 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8178 if (ECEntryIt != EdgeMaskCache.end())
8179 return ECEntryIt->second;
8180
8181 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8183 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8184 return EdgeMaskCache[Edge];
8185 }
8186
8187 VPValue *SrcMask = getBlockInMask(Src);
8188
8189 // The terminator has to be a branch inst!
8190 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8191 assert(BI && "Unexpected terminator found");
8192 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8193 return EdgeMaskCache[Edge] = SrcMask;
8194
8195 // If source is an exiting block, we know the exit edge is dynamically dead
8196 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8197 // adding uses of an otherwise potentially dead instruction unless we are
8198 // vectorizing a loop with uncountable exits. In that case, we always
8199 // materialize the mask.
8200 if (OrigLoop->isLoopExiting(Src) &&
8201 Src != Legal->getUncountableEarlyExitingBlock())
8202 return EdgeMaskCache[Edge] = SrcMask;
8203
8204 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8205 assert(EdgeMask && "No Edge Mask found for condition");
8206
8207 if (BI->getSuccessor(0) != Dst)
8208 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8209
8210 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8211 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8212 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8213 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8214 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8215 }
8216
8217 return EdgeMaskCache[Edge] = EdgeMask;
8218}
8219
// VPRecipeBuilder::getEdgeMask — const lookup of an edge mask that must have
// been created earlier by createEdgeMask/createSwitchEdgeMasks; asserts if it
// is absent.
// NOTE(review): the signature line (8220) is missing from this extract.
8221 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8222
8223 // Look for cached value.
8224 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8225 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8226 assert(ECEntryIt != EdgeMaskCache.end() &&
8227 "looking up mask for edge which has not been created");
8228 return ECEntryIt->second;
8229}
8230
// VPRecipeBuilder::createHeaderMask — build the loop-header block mask.  When
// not folding the tail it is all-true (nullptr); otherwise it is the compare
// "widened canonical IV <= backedge-taken count", inserted as the first
// non-phi recipe of the header VPBB.
// NOTE(review): the signature line (8231) and line 8253 (likely obtaining the
// backedge-taken count, BTC) are missing from this extract.
8232 BasicBlock *Header = OrigLoop->getHeader();
8233
8234 // When not folding the tail, use nullptr to model all-true mask.
8235 if (!CM.foldTailByMasking()) {
8236 BlockMaskCache[Header] = nullptr;
8237 return;
8238 }
8239
8240 // Introduce the early-exit compare IV <= BTC to form header block mask.
8241 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8242 // constructing the desired canonical IV in the header block as its first
8243 // non-phi instructions.
8244
8245 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8246 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8247 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8248 HeaderVPBB->insert(IV, NewInsertionPoint);
8249
8250 VPBuilder::InsertPointGuard Guard(Builder);
8251 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8252 VPValue *BlockMask = nullptr;
8254 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8255 BlockMaskCache[Header] = BlockMask;
8256}
8257
// VPRecipeBuilder::getBlockInMask — const lookup of a block's in-mask that
// must have been created earlier (createHeaderMask/createBlockInMask);
// asserts if absent.  A nullptr result models an all-true mask.
// NOTE(review): the signature line (8258) is missing from this extract.
8259 // Return the cached value.
8260 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8261 assert(BCEntryIt != BlockMaskCache.end() &&
8262 "Trying to access mask for block without one.");
8263 return BCEntryIt->second;
8264}
8265
// VPRecipeBuilder::createBlockInMask — compute a non-header block's in-mask
// as the OR of all incoming edge masks; any all-true (nullptr) edge makes the
// block mask all-true.  The result is cached in BlockMaskCache.
// NOTE(review): the signature line (8266) and line 8277 (likely the
// predecessor-range expression) are missing from this extract.
8267 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8268 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8269 assert(OrigLoop->getHeader() != BB &&
8270 "Loop header must have cached block mask");
8271
8272 // All-one mask is modelled as no-mask following the convention for masked
8273 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8274 VPValue *BlockMask = nullptr;
8275 // This is the block mask. We OR all unique incoming edges.
8276 for (auto *Predecessor :
8278 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8279 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8280 BlockMaskCache[BB] = EdgeMask;
8281 return;
8282 }
8283
8284 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8285 BlockMask = EdgeMask;
8286 continue;
8287 }
8288
8289 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8290 }
8291
8292 BlockMaskCache[BB] = BlockMask;
8293}
8294
// VPRecipeBuilder::tryToWidenMemory — build a widened load/store recipe for
// \p I if the cost model decided to widen it over the whole VF range;
// otherwise return nullptr.  Attaches a mask when required and, for
// consecutive accesses, materializes a (possibly reverse) vector pointer.
// NOTE(review): lines 8295 (return type), 8302, 8304, 8306, 8311, 8314,
// 8323, 8325, 8327 and 8340-8341/8347 are missing from this extract; verify
// against upstream LoopVectorize.cpp.
8296VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8297 VFRange &Range) {
8298 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8299 "Must be called with either a load or store");
8300
8301 auto WillWiden = [&](ElementCount VF) -> bool {
8303 CM.getWideningDecision(I, VF);
8305 "CM decision should be taken at this point.");
8307 return true;
8308 if (CM.isScalarAfterVectorization(I, VF) ||
8309 CM.isProfitableToScalarize(I, VF))
8310 return false;
8312 };
8313
8315 return nullptr;
8316
8317 VPValue *Mask = nullptr;
8318 if (Legal->isMaskRequired(I))
8319 Mask = getBlockInMask(I->getParent());
8320
8321 // Determine if the pointer operand of the access is either consecutive or
8322 // reverse consecutive.
8324 CM.getWideningDecision(I, Range.Start);
8326 bool Consecutive =
8328
8329 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8330 if (Consecutive) {
8331 auto *GEP = dyn_cast<GetElementPtrInst>(
8332 Ptr->getUnderlyingValue()->stripPointerCasts());
8333 VPSingleDefRecipe *VectorPtr;
8334 if (Reverse) {
8335 // When folding the tail, we may compute an address that we don't in the
8336 // original scalar loop and it may not be inbounds. Drop Inbounds in that
8337 // case.
8338 GEPNoWrapFlags Flags =
8339 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8342 VectorPtr = new VPReverseVectorPointerRecipe(
8343 Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8344 } else {
8345 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8346 GEP ? GEP->getNoWrapFlags()
8348 I->getDebugLoc());
8349 }
8350 Builder.insert(VectorPtr);
8351 Ptr = VectorPtr;
8352 }
8353 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8354 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8355 I->getDebugLoc());
8356
8357 StoreInst *Store = cast<StoreInst>(I);
// Note: for stores, Operands[0] is the stored value and Operands[1] the
// pointer (mirroring the IR operand order asserted above).
8358 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8359 Reverse, I->getDebugLoc());
8360}
8361
8362/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8363/// insert a recipe to expand the step for the induction recipe.
// NOTE(review): lines 8364-8365 (the function signature head) and 8374 (the
// step-expansion expression assigned to Step) are missing from this extract.
8366 VPValue *Start, const InductionDescriptor &IndDesc,
8367 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8368 assert(IndDesc.getStartValue() ==
8369 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8370 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8371 "step must be loop invariant");
8372
8373 VPValue *Step =
// When the induction is consumed through a truncate, attach the truncate
// (and its debug location) to the recipe.
8375 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8376 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8377 IndDesc, TruncI,
8378 TruncI->getDebugLoc());
8379 }
8380 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8381 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8382 IndDesc, Phi->getDebugLoc());
8383}
8384
// VPRecipeBuilder::tryToOptimizeInductionPHI — if \p Phi is an int/FP or
// pointer induction, build the corresponding widened induction recipe;
// otherwise return nullptr so generic phi handling applies.
// NOTE(review): lines 8386 (rest of signature), 8398 and 8400 (the
// VPWidenPointerInductionRecipe construction head) are missing from this
// extract.
8385VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8387
8388 // Check if this is an integer or fp induction. If so, build the recipe that
8389 // produces its scalar and vector values.
8390 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8391 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8392 *PSE.getSE(), *OrigLoop);
8393
8394 // Check if this is pointer induction. If so, build the recipe for it.
8395 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8396 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8397 *PSE.getSE());
8399 Phi, Operands[0], Step, *II,
8401 [&](ElementCount VF) {
8402 return CM.isScalarAfterVectorization(Phi, VF);
8403 },
8404 Range),
8405 Phi->getDebugLoc());
8406 }
8407 return nullptr;
8408}
8409
// VPRecipeBuilder::tryToOptimizeInductionTruncate — recognize a trunc of an
// integer induction that can be folded into a widened induction recipe over
// the clamped VF range; returns nullptr when not applicable.
// NOTE(review): lines 8411 (rest of signature), 8426 (the
// getDecisionAndClampRange call head) and 8430 (obtaining the induction
// descriptor II) are missing from this extract.
8410VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8412 // Optimize the special case where the source is a constant integer
8413 // induction variable. Notice that we can only optimize the 'trunc' case
8414 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8415 // (c) other casts depend on pointer size.
8416
8417 // Determine whether \p K is a truncation based on an induction variable that
8418 // can be optimized.
8419 auto IsOptimizableIVTruncate =
8420 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8421 return [=](ElementCount VF) -> bool {
8422 return CM.isOptimizableIVTruncate(K, VF);
8423 };
8424 };
8425
8427 IsOptimizableIVTruncate(I), Range)) {
8428
8429 auto *Phi = cast<PHINode>(I->getOperand(0));
8431 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8432 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8433 *OrigLoop);
8434 }
8435 return nullptr;
8436}
8437
// VPRecipeBuilder::tryToBlend — convert a non-header phi into a
// VPBlendRecipe, interleaving each incoming value with the mask of its
// incoming edge.  A nullptr edge mask (all-true) is only legal for the first
// incoming value, in which case the remaining operands are dropped.
// NOTE(review): lines 8439 (rest of signature) and 8455 (an assertion body)
// are missing from this extract.
8438VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8440 unsigned NumIncoming = Phi->getNumIncomingValues();
8441
8442 // We know that all PHIs in non-header blocks are converted into selects, so
8443 // we don't have to worry about the insertion order and we can just use the
8444 // builder. At this point we generate the predication tree. There may be
8445 // duplications since this is a simple recursive scan, but future
8446 // optimizations will clean it up.
8447 SmallVector<VPValue *, 2> OperandsWithMask;
8448
8449 for (unsigned In = 0; In < NumIncoming; In++) {
8450 OperandsWithMask.push_back(Operands[In]);
8451 VPValue *EdgeMask =
8452 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8453 if (!EdgeMask) {
8454 assert(In == 0 && "Both null and non-null edge masks found");
8456 "Distinct incoming values with one having a full mask");
8457 break;
8458 }
8459 OperandsWithMask.push_back(EdgeMask);
8460 }
8461 return new VPBlendRecipe(Phi, OperandsWithMask);
8462}
8463
// VPRecipeBuilder::tryToWidenCall — widen a call either as a vector
// intrinsic (VPWidenIntrinsicRecipe) or as a call to a vector library
// variant (VPWidenCallRecipe), per the cost model's per-VF call-widening
// decision; returns nullptr when the call must be scalarized/predicated or
// is an ignorable intrinsic (assume, lifetime markers, etc.).
// NOTE(review): lines 8465 (rest of signature), 8467, 8476, 8487, 8490,
// 8517, 8519 and 8542 are missing from this extract; verify against upstream
// LoopVectorize.cpp.
8464VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8466 VFRange &Range) {
8468 [this, CI](ElementCount VF) {
8469 return CM.isScalarWithPredication(CI, VF);
8470 },
8471 Range);
8472
8473 if (IsPredicated)
8474 return nullptr;
8475
8477 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8478 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8479 ID == Intrinsic::pseudoprobe ||
8480 ID == Intrinsic::experimental_noalias_scope_decl))
8481 return nullptr;
8482
8483 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8484
8485 // Is it beneficial to perform intrinsic call compared to lib call?
8486 bool ShouldUseVectorIntrinsic =
8488 [&](ElementCount VF) -> bool {
8489 return CM.getCallWideningDecision(CI, VF).Kind ==
8491 },
8492 Range);
8493 if (ShouldUseVectorIntrinsic)
8494 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8495 CI->getDebugLoc());
8496
8497 Function *Variant = nullptr;
8498 std::optional<unsigned> MaskPos;
8499 // Is better to call a vectorized version of the function than to to scalarize
8500 // the call?
8501 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8502 [&](ElementCount VF) -> bool {
8503 // The following case may be scalarized depending on the VF.
8504 // The flag shows whether we can use a usual Call for vectorized
8505 // version of the instruction.
8506
8507 // If we've found a variant at a previous VF, then stop looking. A
8508 // vectorized variant of a function expects input in a certain shape
8509 // -- basically the number of input registers, the number of lanes
8510 // per register, and whether there's a mask required.
8511 // We store a pointer to the variant in the VPWidenCallRecipe, so
8512 // once we have an appropriate variant it's only valid for that VF.
8513 // This will force a different vplan to be generated for each VF that
8514 // finds a valid variant.
8515 if (Variant)
8516 return false;
8518 CM.getCallWideningDecision(CI, VF);
8520 Variant = Decision.Variant;
8521 MaskPos = Decision.MaskPos;
8522 return true;
8523 }
8524
8525 return false;
8526 },
8527 Range);
8528 if (ShouldUseVectorCall) {
8529 if (MaskPos.has_value()) {
8530 // We have 2 cases that would require a mask:
8531 // 1) The block needs to be predicated, either due to a conditional
8532 // in the scalar loop or use of an active lane mask with
8533 // tail-folding, and we use the appropriate mask for the block.
8534 // 2) No mask is required for the block, but the only available
8535 // vector variant at this VF requires a mask, so we synthesize an
8536 // all-true mask.
8537 VPValue *Mask = nullptr;
8538 if (Legal->isMaskRequired(CI))
8539 Mask = getBlockInMask(CI->getParent());
8540 else
8541 Mask = Plan.getOrAddLiveIn(
8543
8544 Ops.insert(Ops.begin() + *MaskPos, Mask);
8545 }
8546
// The called-function operand (last operand) is forwarded unchanged.
8547 Ops.push_back(Operands.back());
8548 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8549 }
8550
8551 return nullptr;
8552}
8553
// VPRecipeBuilder::shouldWiden — decide whether \p I should be widened over
// the clamped VF range, i.e. it is not scalar-after-vectorization, not
// profitable to scalarize, and not scalar-with-predication at any VF.
// NOTE(review): line 8564 (the return expression head with the negated
// getDecisionAndClampRange call) is missing from this extract.
8554bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8555 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8556 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8557 // Instruction should be widened, unless it is scalar after vectorization,
8558 // scalarization is profitable or it is predicated.
8559 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8560 return CM.isScalarAfterVectorization(I, VF) ||
8561 CM.isProfitableToScalarize(I, VF) ||
8562 CM.isScalarWithPredication(I, VF);
8563 };
8565 Range);
8566}
8567
// VPRecipeBuilder::tryToWiden — build a VPWidenRecipe for a supported
// arithmetic/logic/compare/select/freeze opcode; for predicated div/rem the
// divisor is first made safe via `select mask, rhs, 1`.  Returns nullptr for
// unsupported opcodes.
// NOTE(review): lines 8569 (rest of signature), 8580 (operand vector setup in
// the div/rem branch) and 8609 (operand vector setup for the generic case)
// are missing from this extract.
8568VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8570 switch (I->getOpcode()) {
8571 default:
8572 return nullptr;
8573 case Instruction::SDiv:
8574 case Instruction::UDiv:
8575 case Instruction::SRem:
8576 case Instruction::URem: {
8577 // If not provably safe, use a select to form a safe divisor before widening the
8578 // div/rem operation itself. Otherwise fall through to general handling below.
8579 if (CM.isPredicatedInst(I)) {
8581 VPValue *Mask = getBlockInMask(I->getParent());
8582 VPValue *One =
8583 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8584 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8585 Ops[1] = SafeRHS;
8586 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8587 }
8588 [[fallthrough]];
8589 }
8590 case Instruction::Add:
8591 case Instruction::And:
8592 case Instruction::AShr:
8593 case Instruction::FAdd:
8594 case Instruction::FCmp:
8595 case Instruction::FDiv:
8596 case Instruction::FMul:
8597 case Instruction::FNeg:
8598 case Instruction::FRem:
8599 case Instruction::FSub:
8600 case Instruction::ICmp:
8601 case Instruction::LShr:
8602 case Instruction::Mul:
8603 case Instruction::Or:
8604 case Instruction::Select:
8605 case Instruction::Shl:
8606 case Instruction::Sub:
8607 case Instruction::Xor:
8608 case Instruction::Freeze:
8610 if (Instruction::isBinaryOp(I->getOpcode())) {
8611 // The legacy cost model uses SCEV to check if some of the operands are
8612 // constants. To match the legacy cost model's behavior, use SCEV to try
8613 // to replace operands with constants.
8614 ScalarEvolution &SE = *PSE.getSE();
8615 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8616 if (!Op->isLiveIn())
8617 return Op;
8618 Value *V = Op->getUnderlyingValue();
8619 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8620 return Op;
8621 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8622 if (!C)
8623 return Op;
8624 return Plan.getOrAddLiveIn(C->getValue());
8625 };
8626 // For Mul, the legacy cost model checks both operands.
8627 if (I->getOpcode() == Instruction::Mul)
8628 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8629 // For other binops, the legacy cost model only checks the second operand.
8630 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8631 }
8632 return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8633 };
8634}
8635
// Build a VPHistogramRecipe for a recognized histogram pattern: the bucket
// address comes from Operands[1], the increment from the update
// instruction's second operand, and a block-in mask is appended only when
// the store requires predication.
// NOTE(review): the return-type line, the second parameter line, and the
// HGramOps declaration are elided in this excerpt — confirm against the
// full source before relying on exact signatures.
8637VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8639 // FIXME: Support other operations.
8640 unsigned Opcode = HI->Update->getOpcode();
// Only add/sub updates are currently supported, enforced below.
8641 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8642 "Histogram update operation must be an Add or Sub");
8643
8645 // Bucket address.
8646 HGramOps.push_back(Operands[1]);
8647 // Increment value.
8648 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8649
8650 // In case of predicated execution (due to tail-folding, or conditional
8651 // execution, or both), pass the relevant mask.
8652 if (Legal->isMaskRequired(HI->Store))
8653 HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8654
// Debug location is taken from the store, which anchors the histogram.
8655 return new VPHistogramRecipe(Opcode,
8656 make_range(HGramOps.begin(), HGramOps.end()),
8657 HI->Store->getDebugLoc());
8658}
8659
// NOTE(review): the enclosing function header is elided in this excerpt —
// presumably VPRecipeBuilder::fixHeaderPhis(); verify in the full source.
// For every header-phi recipe recorded in PhisToFix, look up the recipe
// created for the phi's incoming value from the original loop latch and
// add its single VPValue as the recipe's backedge operand.
8661 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8662 for (VPHeaderPHIRecipe *R : PhisToFix) {
8663 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8664 VPRecipeBase *IncR =
8665 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8666 R->addOperand(IncR->getVPSingleValue());
8667 }
8668}
8669
// NOTE(review): the function header is elided in this excerpt (presumably
// VPRecipeBuilder::handleReplication, including the line initializing
// IsUniform via getDecisionAndClampRange) — confirm against the full
// source. Builds a VPReplicateRecipe for I, marking it uniform and/or
// predicated as determined below.
8672 VFRange &Range) {
8674 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8675 Range);
8676
8677 bool IsPredicated = CM.isPredicatedInst(I);
8678
8679 // Even if the instruction is not marked as uniform, there are certain
8680 // intrinsic calls that can be effectively treated as such, so we check for
8681 // them here. Conservatively, we only do this for scalable vectors, since
8682 // for fixed-width VFs we can always fall back on full scalarization.
8683 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8684 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8685 case Intrinsic::assume:
8686 case Intrinsic::lifetime_start:
8687 case Intrinsic::lifetime_end:
8688 // For scalable vectors if one of the operands is variant then we still
8689 // want to mark as uniform, which will generate one instruction for just
8690 // the first lane of the vector. We can't scalarize the call in the same
8691 // way as for fixed-width vectors because we don't know how many lanes
8692 // there are.
8693 //
8694 // The reasons for doing it this way for scalable vectors are:
8695 // 1. For the assume intrinsic generating the instruction for the first
8696 // lane is still better than not generating any at all. For
8697 // example, the input may be a splat across all lanes.
8698 // 2. For the lifetime start/end intrinsics the pointer operand only
8699 // does anything useful when the input comes from a stack object,
8700 // which suggests it should always be uniform. For non-stack objects
8701 // the effect is to poison the object, which still allows us to
8702 // remove the call.
8703 IsUniform = true;
8704 break;
8705 default:
8706 break;
8707 }
8708 }
8709 VPValue *BlockInMask = nullptr;
8710 if (!IsPredicated) {
8711 // Finalize the recipe for Instr, first if it is not predicated.
8712 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8713 } else {
8714 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8715 // Instructions marked for predication are replicated and a mask operand is
8716 // added initially. Masked replicate recipes will later be placed under an
8717 // if-then construct to prevent side-effects. Generate recipes to compute
8718 // the block mask for this region.
8719 BlockInMask = getBlockInMask(I->getParent());
8720 }
8721
8722 // Note that there is some custom logic to mark some intrinsics as uniform
8723 // manually above for scalable vectors, which this assert needs to account for
8724 // as well.
8725 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8726 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8727 "Should not predicate a uniform recipe");
// The mask is null for non-predicated replication.
8728 auto *Recipe = new VPReplicateRecipe(
8729 I, make_range(Operands.begin(), Operands.end()), IsUniform, BlockInMask);
8730 return Recipe;
8731}
8732
8733/// Find all possible partial reductions in the loop and track all of those that
8734/// are valid so recipes can be formed later.
// NOTE(review): the function signature (presumably
// VPRecipeBuilder::collectScaledReductions(VFRange &Range)) and the
// declaration of PartialReductionChains are elided in this excerpt —
// confirm against the full source.
8736 // Find all possible partial reductions.
8738 PartialReductionChains;
// Seed candidate chains from every reduction variable's exit instruction.
8739 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8740 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
8741 PartialReductionChains);
8742 }
8743
8744 // A partial reduction is invalid if any of its extends are used by
8745 // something that isn't another partial reduction. This is because the
8746 // extends are intended to be lowered along with the reduction itself.
8747
8748 // Build up a set of partial reduction bin ops for efficient use checking.
8749 SmallSet<User *, 4> PartialReductionBinOps;
8750 for (const auto &[PartialRdx, _] : PartialReductionChains)
8751 PartialReductionBinOps.insert(PartialRdx.BinOp);
8752
// A chain's extend is usable only if every user of it is one of the
// candidate partial-reduction bin ops collected above.
8753 auto ExtendIsOnlyUsedByPartialReductions =
8754 [&PartialReductionBinOps](Instruction *Extend) {
8755 return all_of(Extend->users(), [&](const User *U) {
8756 return PartialReductionBinOps.contains(U);
8757 });
8758 };
8759
8760 // Check if each use of a chain's two extends is a partial reduction
8761 // and only add those that don't have non-partial reduction users.
8762 for (auto Pair : PartialReductionChains) {
8763 PartialReductionChain Chain = Pair.first;
8764 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8765 ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8766 ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
8767 }
8768}
8769
/// Try to recognize \p RdxExitInstr (the loop-exit instruction of the
/// reduction rooted at \p PHI) as a scaled (partial) reduction: an
/// accumulation of a binary op whose two operands are zext/sext extends.
/// On success, appends the chain plus its scale factor (ratio of the phi
/// width to the extended source width) to \p Chains and returns true.
8770bool VPRecipeBuilder::getScaledReductions(
8771 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
8772 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
8773
// Only instructions inside the loop can form the reduction update.
8774 if (!CM.TheLoop->contains(RdxExitInstr))
8775 return false;
8776
8777 auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8778 if (!Update)
8779 return false;
8780
// Normalize so Op is the non-phi operand and PhiOp feeds back to PHI.
8781 Value *Op = Update->getOperand(0);
8782 Value *PhiOp = Update->getOperand(1);
8783 if (Op == PHI)
8784 std::swap(Op, PhiOp);
8785
8786 // Try and get a scaled reduction from the first non-phi operand.
8787 // If one is found, we use the discovered reduction instruction in
8788 // place of the accumulator for costing.
8789 if (auto *OpInst = dyn_cast<Instruction>(Op)) {
// Recursion allows chains of multiple partial reductions feeding each
// other; the most recently discovered reduction stands in for PHI.
8790 if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8791 PHI = Chains.rbegin()->first.Reduction;
8792
8793 Op = Update->getOperand(0);
8794 PhiOp = Update->getOperand(1);
8795 if (Op == PHI)
8796 std::swap(Op, PhiOp);
8797 }
8798 }
// The second operand must close the recurrence, otherwise this is not a
// reduction update we can handle.
8799 if (PhiOp != PHI)
8800 return false;
8801
8802 auto *BinOp = dyn_cast<BinaryOperator>(Op);
8803 if (!BinOp || !BinOp->hasOneUse())
8804 return false;
8805
// Both inputs of the bin op must be (z|s)ext so the whole pattern can be
// lowered as a partial reduction of narrower elements.
8806 using namespace llvm::PatternMatch;
8807 Value *A, *B;
8808 if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8809 !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8810 return false;
8811
8812 Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8813 Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8814
// NOTE(review): the lines deriving OpAExtend/OpBExtend from ExtA/ExtB are
// elided in this excerpt — confirm against the full source.
8819
8820 PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
8821
// Scale factor = phi element width / extended-source element width.
8822 unsigned TargetScaleFactor =
8823 PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8824 A->getType()->getPrimitiveSizeInBits());
8825
// NOTE(review): the getDecisionAndClampRange call and the opening of the
// TTI partial-reduction cost query are elided here; the chain is accepted
// only for the VF sub-range where the target reports a valid cost.
8827 [&](ElementCount VF) {
8829 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8830 VF, OpAExtend, OpBExtend,
8831 std::make_optional(BinOp->getOpcode()));
8832 return Cost.isValid();
8833 },
8834 Range)) {
8835 Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
8836 return true;
8837 }
8838
8839 return false;
8840}
8841
// NOTE(review): the function signature (presumably
// VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
// ArrayRef<VPValue *> Operands, VFRange &Range)) is elided in this
// excerpt — confirm against the full source. Dispatches Instr to the
// most specific widening recipe builder; returns nullptr if no widening
// recipe applies (callers then fall back to replication).
8844 // First, check for specific widening recipes that deal with inductions, Phi
8845 // nodes, calls and memory operations.
8846 VPRecipeBase *Recipe;
8847 if (auto *Phi = dyn_cast<PHINode>(Instr)) {
// Non-header phis become blends; header phis are inductions,
// reductions, or fixed-order recurrences.
8848 if (Phi->getParent() != OrigLoop->getHeader())
8849 return tryToBlend(Phi, Operands);
8850
8851 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8852 return Recipe;
8853
8854 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8855 assert((Legal->isReductionVariable(Phi) ||
8856 Legal->isFixedOrderRecurrence(Phi)) &&
8857 "can only widen reductions and fixed-order recurrences here");
8858 VPValue *StartV = Operands[0];
8859 if (Legal->isReductionVariable(Phi)) {
8860 const RecurrenceDescriptor &RdxDesc =
8861 Legal->getReductionVars().find(Phi)->second;
8862 assert(RdxDesc.getRecurrenceStartValue() ==
8863 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8864
8865 // If the PHI is used by a partial reduction, set the scale factor.
8866 unsigned ScaleFactor =
8867 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8868 PhiRecipe = new VPReductionPHIRecipe(
8869 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8870 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8871 } else {
8872 // TODO: Currently fixed-order recurrences are modeled as chains of
8873 // first-order recurrences. If there are no users of the intermediate
8874 // recurrences in the chain, the fixed order recurrence should be modeled
8875 // directly, enabling more efficient codegen.
8876 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8877 }
// Backedge operands are wired up later by fixHeaderPhis().
8878
8879 PhisToFix.push_back(PhiRecipe);
8880 return PhiRecipe;
8881 }
8882
8883 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8884 cast<TruncInst>(Instr), Operands, Range)))
8885 return Recipe;
8886
8887 // All widen recipes below deal only with VF > 1.
8888 // NOTE(review): the getDecisionAndClampRange call opening is elided here.
8889 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8890 return nullptr;
8891
8892 if (auto *CI = dyn_cast<CallInst>(Instr))
8893 return tryToWidenCall(CI, Operands, Range);
8894
8895 if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8896 if (auto HistInfo = Legal->getHistogramInfo(SI))
8897 return tryToWidenHistogram(*HistInfo, Operands);
8898
8899 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8900 return tryToWidenMemory(Instr, Operands, Range);
8901
// NOTE(review): the return statement for the partial-reduction case is
// elided here (presumably returning tryToCreatePartialReduction).
8902 if (getScalingForReduction(Instr))
8904
8905 if (!shouldWiden(Instr, Range))
8906 return nullptr;
8907
8908 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8909 return new VPWidenGEPRecipe(GEP,
8910 make_range(Operands.begin(), Operands.end()));
8911
8912 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8913 return new VPWidenSelectRecipe(
8914 *SI, make_range(Operands.begin(), Operands.end()));
8915 }
8916
8917 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8918 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8919 *CI);
8920 }
8921
// Fall back to the generic widening path for remaining opcodes.
8922 return tryToWiden(Instr, Operands);
8923}
8924
// NOTE(review): the function signature (presumably
// VPRecipeBuilder::tryToCreatePartialReduction) and the declaration of
// Accumulator (presumably Operands[1]) are elided in this excerpt —
// confirm against the full source. Builds a VPPartialReductionRecipe
// from a (bin-op, accumulator) operand pair, masking the bin-op input
// with zero under predication.
8928 assert(Operands.size() == 2 &&
8929 "Unexpected number of operands for partial reduction");
8930
8931 VPValue *BinOp = Operands[0];
// If the first operand is itself the reduction chain (a reduction phi or
// an earlier partial reduction), swap so BinOp is the multiplied value
// and Accumulator carries the chain.
8933 VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8934 if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8935 isa<VPPartialReductionRecipe>(BinOpRecipe))
8936 std::swap(BinOp, Accumulator);
8937
8938 unsigned ReductionOpcode = Reduction->getOpcode();
8939 if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
8940 assert((ReductionOpcode == Instruction::Add ||
8941 ReductionOpcode == Instruction::Sub) &&
8942 "Expected an ADD or SUB operation for predicated partial "
8943 "reductions (because the neutral element in the mask is zero)!");
// Masked-off lanes contribute the add/sub neutral element (zero).
8944 VPValue *Mask = getBlockInMask(Reduction->getParent());
8945 VPValue *Zero =
8946 Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
8947 BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
8948 }
8949 return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
8950 Reduction);
8951}
8952
/// Build VPlans covering all VFs in [MinVF, MaxVF]: repeatedly try to
/// build a plan for the sub-range starting at VF; each successful plan
/// clamps SubRange.End, which becomes the next VF to try.
8953void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8954 ElementCount MaxVF) {
8955 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8956
// MaxVF*2 makes the range's exclusive upper bound include MaxVF itself.
8957 auto MaxVFTimes2 = MaxVF * 2;
8958 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8959 VFRange SubRange = {VF, MaxVFTimes2};
8960 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8961 bool HasScalarVF = Plan->hasScalarVFOnly();
8962 // Now optimize the initial VPlan.
// NOTE(review): the VPlanTransforms call narrowing to minimal
// bitwidths is elided in this excerpt.
8963 if (!HasScalarVF)
8965 *Plan, CM.getMinimalBitwidths());
8967 // TODO: try to put it close to addActiveLaneMask().
8968 // Discard the plan if it is not EVL-compatible
// NOTE(review): the EVL transform call guarded here is elided in this
// excerpt — confirm against the full source.
8969 if (CM.foldTailWithEVL() && !HasScalarVF &&
8971 *Plan, CM.getMaxSafeElements()))
8972 break;
8973 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8974 VPlans.push_back(std::move(Plan));
8975 }
// Advance to the first VF not covered by the plan just built.
8976 VF = SubRange.End;
8977 }
8978}
8979
8980// Add the necessary canonical IV and branch recipes required to control the
8981// loop.
// The canonical IV starts at 0 in the vector-loop header and is bumped by
// VF*UF in the exiting block; HasNUW sets the no-unsigned-wrap flag on the
// increment.
8982static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8983 DebugLoc DL) {
8984 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8985 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8986
8987 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8988 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8989 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8990 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8991 Header->insert(CanonicalIVPHI, Header->begin());
8992
8993 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8994 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8995 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8996 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8997 "index.next");
// The increment becomes the phi's backedge operand.
8998 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8999
9000 // Add the BranchOnCount VPInstruction to the latch.
// NOTE(review): the createNaryOp(VPInstruction::BranchOnCount, ...) call
// opening is elided in this excerpt — confirm against the full source.
9002 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
9003}
9004
9005/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
9006/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
9007/// the end value of the induction.
// NOTE(review): the static return-type line and the line obtaining the
// InductionDescriptor `ID` are elided in this excerpt — confirm against
// the full source.
9009 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
9010 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
9011 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
9012 // Truncated wide inductions resume from the last lane of their vector value
9013 // in the last vector iteration which is handled elsewhere.
9014 if (WideIntOrFp && WideIntOrFp->getTruncInst())
9015 return nullptr;
9016
9017 VPValue *Start = WideIV->getStartValue();
9018 VPValue *Step = WideIV->getStepValue();
// For the canonical induction the end value is simply the vector trip
// count; otherwise derive it from start/step in the vector preheader.
9020 VPValue *EndValue = VectorTC;
9021 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
9022 EndValue = VectorPHBuilder.createDerivedIV(
9023 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
9024 Start, VectorTC, Step);
9025 }
9026
9027 // EndValue is derived from the vector trip count (which has the same type as
9028 // the widest induction) and thus may be wider than the induction here.
9029 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
9030 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
9031 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
9032 ScalarTypeOfWideIV,
9033 WideIV->getDebugLoc());
9034 }
9035
// ResumePhi selects EndValue when resuming from the vector loop and the
// original Start when the vector loop is bypassed.
9036 auto *ResumePhiRecipe =
9037 ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
9038 WideIV->getDebugLoc(), "bc.resume.val");
9039 return ResumePhiRecipe;
9040}
9041
9042/// Create resume phis in the scalar preheader for first-order recurrences,
9043/// reductions and inductions, and update the VPIRInstructions wrapping the
9044/// original phis in the scalar header. End values for inductions are added to
9045/// \p IVEndValues.
9046static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
9047 DenseMap<VPValue *, VPValue *> &IVEndValues) {
9048 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9049 auto *ScalarPH = Plan.getScalarPreheader();
9050 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9051 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9052 VPBuilder VectorPHBuilder(
9053 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
9054 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9055 VPBuilder ScalarPHBuilder(ScalarPH);
// Constant 1 is used below to extract the last element (offset 1 from
// the end) of a FOR's vector value.
9056 VPValue *OneVPV = Plan.getOrAddLiveIn(
9057 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9058 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9059 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9060 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
// Phis are grouped at the top of the block; stop at the first non-phi.
9061 if (!ScalarPhiI)
9062 break;
9063
9064 // TODO: Extract final value from induction recipe initially, optimize to
9065 // pre-computed end value together in optimizeInductionExitUsers.
9066 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9067 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
// NOTE(review): the opening of the addResumePhiRecipeForInduction call
// is elided in this excerpt — confirm against the full source.
9069 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9070 &Plan.getVectorTripCount())) {
9071 assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
9072 "Expected a ResumePhi");
9073 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
9074 ScalarPhiIRI->addOperand(ResumePhi);
9075 continue;
9076 }
9077 // TODO: Also handle truncated inductions here. Computing end-values
9078 // separately should be done as VPlan-to-VPlan optimization, after
9079 // legalizing all resume values to use the last lane from the loop.
9080 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9081 "should only skip truncated wide inductions");
9082 continue;
9083 }
9084
9085 // The backedge value provides the value to resume coming out of a loop,
9086 // which for FORs is a vector whose last element needs to be extracted. The
9087 // start value provides the value if the loop is bypassed.
9088 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9089 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9090 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9091 "Cannot handle loops with uncountable early exits");
9092 if (IsFOR)
9093 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9094 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9095 "vector.recur.extract");
9096 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9097 auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
// NOTE(review): the VPInstruction::ResumePhi opcode argument line is
// elided in this excerpt.
9099 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9100 ScalarPhiIRI->addOperand(ResumePhiR);
9101 }
9102}
9103
9104// Collect VPIRInstructions for phis in the exit blocks that are modeled
9105// in VPlan and add the exiting VPValue as operand.
// NOTE(review): the signature lines (presumably static
// SetVector<VPIRInstruction *> collectUsersInExitBlocks(Loop *OrigLoop,
// VPRecipeBuilder &Builder, ...)) are elided in this excerpt — confirm
// against the full source. Returns the set of exit phis that still need
// fixing (those fed by values defined inside the loop region).
9108 VPlan &Plan) {
9109 SetVector<VPIRInstruction *> ExitUsersToFix;
9110 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9111 for (VPRecipeBase &R : *ExitVPBB) {
9112 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9113 if (!ExitIRI)
9114 continue;
9115 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
// Phis are grouped first in the block; stop at the first non-phi.
9116 if (!ExitPhi)
9117 break;
// Exit blocks not reached via the middle block belong to early exits,
// whose exit values were already set during construction.
9118 if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) {
9119 assert(ExitIRI->getNumOperands() ==
9120 ExitVPBB->getPredecessors().size() &&
9121 "early-exit must update exit values on construction");
9122 continue;
9123 }
9124 BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9125 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9126 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9127 ExitIRI->addOperand(V);
// Live-ins are loop-invariant and need no extraction.
9128 if (V->isLiveIn())
9129 continue;
9130 assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
9131 "Only recipes defined inside a region should need fixing.");
9132 ExitUsersToFix.insert(ExitIRI);
9133 }
9134 }
9135 return ExitUsersToFix;
9136}
9137
9138// Add exit values to \p Plan. Extracts are added for each entry in \p
9139// ExitUsersToFix if needed and their operands are updated.
// NOTE(review): the signature continuation line (presumably
// addUsersInExitBlocks(VPlan &Plan, ...)) is elided in this excerpt.
9140static void
9142 const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9143 if (ExitUsersToFix.empty())
9144 return;
9145
// Extracts are inserted in the middle block, after its phis.
9146 auto *MiddleVPBB = Plan.getMiddleBlock();
9147 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9148
9149 // Introduce extract for exiting values and update the VPIRInstructions
9150 // modeling the corresponding LCSSA phis.
9151 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9152 assert(ExitIRI->getNumOperands() == 1 &&
9153 ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
9154 "exit values from early exits must be fixed when branch to "
9155 "early-exit is added");
9156 ExitIRI->extractLastLaneOfOperand(B);
9157 }
9158}
9159
9160/// Handle users in the exit block for first order reductions in the original
9161/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
9162/// users in the original exit block using the VPIRInstruction wrapping to the
9163/// LCSSA phi.
// NOTE(review): the signature line (presumably static void
// addExitUsersForFirstOrderRecurrences(...)) is elided in this excerpt.
9165 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9166 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9167 auto *ScalarPHVPBB = Plan.getScalarPreheader();
9168 auto *MiddleVPBB = Plan.getMiddleBlock();
9169 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9170 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
// Constant 2 selects the penultimate element via ExtractFromEnd below.
9171 VPValue *TwoVPV = Plan.getOrAddLiveIn(
9172 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9173
9174 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9175 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9176 if (!FOR)
9177 continue;
9178
9179 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9180 "Cannot handle loops with uncountable early exits");
9181
9182 // This is the second phase of vectorizing first-order recurrences, creating
9183 // extract for users outside the loop. An overview of the transformation is
9184 // described below. Suppose we have the following loop with some use after
9185 // the loop of the last a[i-1],
9186 //
9187 // for (int i = 0; i < n; ++i) {
9188 // t = a[i - 1];
9189 // b[i] = a[i] - t;
9190 // }
9191 // use t;
9192 //
9193 // There is a first-order recurrence on "a". For this loop, the shorthand
9194 // scalar IR looks like:
9195 //
9196 // scalar.ph:
9197 // s.init = a[-1]
9198 // br scalar.body
9199 //
9200 // scalar.body:
9201 // i = phi [0, scalar.ph], [i+1, scalar.body]
9202 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9203 // s2 = a[i]
9204 // b[i] = s2 - s1
9205 // br cond, scalar.body, exit.block
9206 //
9207 // exit.block:
9208 // use = lcssa.phi [s1, scalar.body]
9209 //
9210 // In this example, s1 is a recurrence because it's value depends on the
9211 // previous iteration. In the first phase of vectorization, we created a
9212 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9213 // for users in the scalar preheader and exit block.
9214 //
9215 // vector.ph:
9216 // v_init = vector(..., ..., ..., a[-1])
9217 // br vector.body
9218 //
9219 // vector.body
9220 // i = phi [0, vector.ph], [i+4, vector.body]
9221 // v1 = phi [v_init, vector.ph], [v2, vector.body]
9222 // v2 = a[i, i+1, i+2, i+3]
9223 // b[i] = v2 - v1
9224 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9225 // b[i, i+1, i+2, i+3] = v2 - v1
9226 // br cond, vector.body, middle.block
9227 //
9228 // middle.block:
9229 // vector.recur.extract.for.phi = v2(2)
9230 // vector.recur.extract = v2(3)
9231 // br cond, scalar.ph, exit.block
9232 //
9233 // scalar.ph:
9234 // scalar.recur.init = phi [vector.recur.extract, middle.block],
9235 // [s.init, otherwise]
9236 // br scalar.body
9237 //
9238 // scalar.body:
9239 // i = phi [0, scalar.ph], [i+1, scalar.body]
9240 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9241 // s2 = a[i]
9242 // b[i] = s2 - s1
9243 // br cond, scalar.body, exit.block
9244 //
9245 // exit.block:
9246 // lo = lcssa.phi [s1, scalar.body],
9247 // [vector.recur.extract.for.phi, middle.block]
9248 //
9249 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9250 // Extract the penultimate value of the recurrence and use it as operand for
9251 // the VPIRInstruction modeling the phi.
9252 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9253 if (ExitIRI->getOperand(0) != FOR)
9254 continue;
9255 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9256 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9257 "vector.recur.extract.for.phi");
9258 ExitIRI->setOperand(0, PenultimateElement);
// Handled here, so the generic exit-user fixing must skip this phi.
9259 ExitUsersToFix.remove(ExitIRI);
9260 }
9261 }
9262}
9263
9265LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9266
9268
9269 // ---------------------------------------------------------------------------
9270 // Build initial VPlan: Scan the body of the loop in a topological order to
9271 // visit each basic block after having visited its predecessor basic blocks.
9272 // ---------------------------------------------------------------------------
9273
9274 // Create initial VPlan skeleton, having a basic block for the pre-header
9275 // which contains SCEV expansions that need to happen before the CFG is
9276 // modified; a basic block for the vector pre-header, followed by a region for
9277 // the vector loop, followed by the middle basic block. The skeleton vector
9278 // loop region contains a header and latch basic blocks.
9279
9280 bool RequiresScalarEpilogueCheck =
9282 [this](ElementCount VF) {
9283 return !CM.requiresScalarEpilogue(VF.isVector());
9284 },
9285 Range);
9287 PSE, RequiresScalarEpilogueCheck,
9288 CM.foldTailByMasking(), OrigLoop);
9289
9290 // Don't use getDecisionAndClampRange here, because we don't know the UF
9291 // so this function is better to be conservative, rather than to split
9292 // it up into different VPlans.
9293 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9294 bool IVUpdateMayOverflow = false;
9295 for (ElementCount VF : Range)
9296 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9297
9299 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9300 // Use NUW for the induction increment if we proved that it won't overflow in
9301 // the vector loop or when not folding the tail. In the later case, we know
9302 // that the canonical induction increment will not overflow as the vector trip
9303 // count is >= increment and a multiple of the increment.
9304 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9305 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9306
9307 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9308 Builder);
9309
9310 // ---------------------------------------------------------------------------
9311 // Pre-construction: record ingredients whose recipes we'll need to further
9312 // process after constructing the initial VPlan.
9313 // ---------------------------------------------------------------------------
9314
9315 // For each interleave group which is relevant for this (possibly trimmed)
9316 // Range, add it to the set of groups to be later applied to the VPlan and add
9317 // placeholders for its members' Recipes which we'll be replacing with a
9318 // single VPInterleaveRecipe.
9320 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9321 bool Result = (VF.isVector() && // Query is illegal for VF == 1
9322 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9324 // For scalable vectors, the only interleave factor currently supported
9325 // must be power of 2 since we require the (de)interleave2 intrinsics
9326 // instead of shufflevectors.
9327 assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
9328 "Unsupported interleave factor for scalable vectors");
9329 return Result;
9330 };
9331 if (!getDecisionAndClampRange(ApplyIG, Range))
9332 continue;
9333 InterleaveGroups.insert(IG);
9334 }
9335
9336 // ---------------------------------------------------------------------------
9337 // Construct recipes for the instructions in the loop
9338 // ---------------------------------------------------------------------------
9339
9340 // Scan the body of the loop in a topological order to visit each basic block
9341 // after having visited its predecessor basic blocks.
9342 LoopBlocksDFS DFS(OrigLoop);
9343 DFS.perform(LI);
9344
9345 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9346 VPBasicBlock *VPBB = HeaderVPBB;
9347 BasicBlock *HeaderBB = OrigLoop->getHeader();
9348 bool NeedsMasks =
9349 CM.foldTailByMasking() ||
9350 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9351 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9352 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9353 });
9354
9355 RecipeBuilder.collectScaledReductions(Range);
9356
9357 auto *MiddleVPBB = Plan->getMiddleBlock();
9358 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9359 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9360 // Relevant instructions from basic block BB will be grouped into VPRecipe
9361 // ingredients and fill a new VPBasicBlock.
9362 if (VPBB != HeaderVPBB)
9363 VPBB->setName(BB->getName());
9364 Builder.setInsertPoint(VPBB);
9365
9366 if (VPBB == HeaderVPBB)
9367 RecipeBuilder.createHeaderMask();
9368 else if (NeedsMasks)
9369 RecipeBuilder.createBlockInMask(BB);
9370
9371 // Introduce each ingredient into VPlan.
9372 // TODO: Model and preserve debug intrinsics in VPlan.
9373 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9374 Instruction *Instr = &I;
9376 auto *Phi = dyn_cast<PHINode>(Instr);
9377 if (Phi && Phi->getParent() == HeaderBB) {
9378 Operands.push_back(Plan->getOrAddLiveIn(
9379 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9380 } else {
9381 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9382 Operands = {OpRange.begin(), OpRange.end()};
9383 }
9384
9385 // The stores with invariant address inside the loop will be deleted, and
9386 // in the exit block, a uniform store recipe will be created for the final
9387 // invariant store of the reduction.
9388 StoreInst *SI;
9389 if ((SI = dyn_cast<StoreInst>(&I)) &&
9390 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9391 // Only create recipe for the final invariant store of the reduction.
9392 if (!Legal->isInvariantStoreOfReduction(SI))
9393 continue;
9394 auto *Recipe = new VPReplicateRecipe(
9395 SI, make_range(Operands.begin(), Operands.end()),
9396 true /* IsUniform */);
9397 Recipe->insertBefore(*MiddleVPBB, MBIP);
9398 continue;
9399 }
9400
9401 VPRecipeBase *Recipe =
9402 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range);
9403 if (!Recipe)
9404 Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
9405
9406 RecipeBuilder.setRecipe(Instr, Recipe);
9407 if (isa<VPHeaderPHIRecipe>(Recipe)) {
9408 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9409 // the following cases, VPHeaderPHIRecipes may be created after non-phi
9410 // recipes and need to be moved to the phi section of HeaderVPBB:
9411 // * tail-folding (non-phi recipes computing the header mask are
9412 // introduced earlier than regular header phi recipes, and should appear
9413 // after them)
9414 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9415
9416 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9417 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9418 "unexpected recipe needs moving");
9419 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9420 } else
9421 VPBB->appendRecipe(Recipe);
9422 }
9423
9424 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9425 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9426 }
9427
9428 // After here, VPBB should not be used.
9429 VPBB = nullptr;
9430
9431 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9432 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9433 "entry block must be set to a VPRegionBlock having a non-empty entry "
9434 "VPBasicBlock");
9435 RecipeBuilder.fixHeaderPhis();
9436
9437 // Update wide induction increments to use the same step as the corresponding
9438 // wide induction. This enables detecting induction increments directly in
9439 // VPlan and removes redundant splats.
9440 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9441 auto *IVInc = cast<Instruction>(
9442 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9443 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9444 continue;
9445 VPWidenInductionRecipe *WideIV =
9446 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9447 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9448 R->setOperand(1, WideIV->getStepValue());
9449 }
9450
9451 if (auto *UncountableExitingBlock =
9454 *PSE.getSE(), OrigLoop, UncountableExitingBlock,
9455 RecipeBuilder);
9456 }
9458 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9459 SetVector<VPIRInstruction *> ExitUsersToFix =
9460 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9461 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9462 addUsersInExitBlocks(*Plan, ExitUsersToFix);
9463
9464 // ---------------------------------------------------------------------------
9465 // Transform initial VPlan: Apply previously taken decisions, in order, to
9466 // bring the VPlan to its final state.
9467 // ---------------------------------------------------------------------------
9468
9469 // Adjust the recipes for any inloop reductions.
9470 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9471
9472 // Interleave memory: for each Interleave Group we marked earlier as relevant
9473 // for this VPlan, replace the Recipes widening its memory instructions with a
9474 // single VPInterleaveRecipe at its insertion point.
9476 InterleaveGroups, RecipeBuilder,
9478
9479 for (ElementCount VF : Range)
9480 Plan->addVF(VF);
9481 Plan->setName("Initial VPlan");
9482
9483 // Replace VPValues for known constant strides guaranteed by predicate scalar
9484 // evolution.
9485 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9486 auto *R = cast<VPRecipeBase>(&U);
9487 return R->getParent()->getParent() ||
9488 R->getParent() ==
9489 Plan->getVectorLoopRegion()->getSinglePredecessor();
9490 };
9491 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9492 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9493 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9494 // Only handle constant strides for now.
9495 if (!ScevStride)
9496 continue;
9497
9498 auto *CI = Plan->getOrAddLiveIn(
9499 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9500 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9501 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9502
9503 // The versioned value may not be used in the loop directly but through a
9504 // sext/zext. Add new live-ins in those cases.
9505 for (Value *U : StrideV->users()) {
9506 if (!isa<SExtInst, ZExtInst>(U))
9507 continue;
9508 VPValue *StrideVPV = Plan->getLiveIn(U);
9509 if (!StrideVPV)
9510 continue;
9511 unsigned BW = U->getType()->getScalarSizeInBits();
9512 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9513 : ScevStride->getAPInt().zext(BW);
9514 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9515 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9516 }
9517 }
9518
9519 auto BlockNeedsPredication = [this](BasicBlock *BB) {
9520 return Legal->blockNeedsPredication(BB);
9521 };
9523 BlockNeedsPredication);
9524
9525 // Sink users of fixed-order recurrence past the recipe defining the previous
9526 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9528 *Plan, Builder))
9529 return nullptr;
9530
9531 if (useActiveLaneMask(Style)) {
9532 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9533 // TailFoldingStyle is visible there.
9534 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9535 bool WithoutRuntimeCheck =
9537 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9538 WithoutRuntimeCheck);
9539 }
9541
9542 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9543 return Plan;
9544}
9545
 9546VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 9547 // Outer loop handling: They may require CFG and instruction level
 9548 // transformations before even evaluating whether vectorization is profitable.
 9549 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
 9550 // the vectorization pipeline.
 9551 assert(!OrigLoop->isInnermost());
 9552 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
 9553
 9554 // Create new empty VPlan
 9555 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
 9556 true, false, OrigLoop);
 9557
 9558 // Build hierarchical CFG
 9559 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
 9560 HCFGBuilder.buildHierarchicalCFG();
 9561
 9562 for (ElementCount VF : Range)
 9563 Plan->addVF(VF);
 9564
 // NOTE(review): the callee taking the argument list below (rendered line 9565)
 // is not visible in this excerpt — presumably a VPlanTransforms call; confirm
 // against upstream LoopVectorize.cpp.
 9566 Plan,
 9567 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
 9568 *PSE.getSE(), *TLI);
 9569
 9570 // Tail folding is not supported for outer loops, so the induction increment
 9571 // is guaranteed to not wrap.
 9572 bool HasNUW = true;
 9573 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
 9574 DebugLoc());
 9575
 9576 // Collect mapping of IR header phis to header phi recipes, to be used in
 9577 // addScalarResumePhis.
 9578 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
 9579 Builder);
 9580 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
 9581 if (isa<VPCanonicalIVPHIRecipe>(&R))
 9582 continue;
 9583 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
 9584 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
 9585 }
 // NOTE(review): the declaration of IVEndValues (rendered line 9586) is not
 // visible here — confirm against upstream LoopVectorize.cpp.
 9587 // TODO: IVEndValues are not used yet in the native path, to optimize exit
 9588 // values.
 9589 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
 9590
 9591 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
 9592 return Plan;
 9593}
9594
9595// Adjust the recipes for reductions. For in-loop reductions the chain of
9596// instructions leading from the loop exit instr to the phi need to be converted
9597// to reductions, with one operand being vector and the other being the scalar
9598// reduction chain. For other reductions, a select is introduced between the phi
9599// and users outside the vector region when folding the tail.
9600//
9601// A ComputeReductionResult recipe is added to the middle block, also for
9602// in-loop reductions which compute their result in-loop, because generating
9603// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9604//
9605// Adjust AnyOf reductions; replace the reduction phi for the selected value
9606// with a boolean reduction phi node to check if the condition is true in any
9607// iteration. The final value is selected by the final ComputeReductionResult.
 9608void LoopVectorizationPlanner::adjustRecipesForReductions(
 9609 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
 9610 using namespace VPlanPatternMatch;
 9611 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
 9612 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
 9613 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
 // NOTE(review): the declaration of ToDelete (rendered line 9614) is elided in
 // this excerpt; it is populated and drained at the bottom of this function.
 9615
 9616 for (VPRecipeBase &R : Header->phis()) {
 9617 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
 9618 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
 9619 continue;
 9620
 9621 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
 9622 RecurKind Kind = RdxDesc.getRecurrenceKind();
 9623 assert(
 9626 "AnyOf and FindLast reductions are not allowed for in-loop reductions");
 9627
 9628 // Collect the chain of "link" recipes for the reduction starting at PhiR.
 // NOTE(review): the declaration of Worklist (rendered line 9629) is elided;
 // it is an insertion-ordered set seeded with PhiR and grown breadth-first.
 9630 Worklist.insert(PhiR);
 9631 for (unsigned I = 0; I != Worklist.size(); ++I) {
 9632 VPSingleDefRecipe *Cur = Worklist[I];
 9633 for (VPUser *U : Cur->users()) {
 9634 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
 9635 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
 9636 assert((UserRecipe->getParent() == MiddleVPBB ||
 9637 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
 9638 "U must be either in the loop region, the middle block or the "
 9639 "scalar preheader.");
 9640 continue;
 9641 }
 9642 Worklist.insert(UserRecipe);
 9643 }
 9644 }
 9645
 9646 // Visit operation "Links" along the reduction chain top-down starting from
 9647 // the phi until LoopExitValue. We keep track of the previous item
 9648 // (PreviousLink) to tell which of the two operands of a Link will remain
 9649 // scalar and which will be reduced. For minmax by select(cmp), Link will be
 9650 // the select instructions. Blend recipes of in-loop reduction phi's will
 9651 // get folded to their non-phi operand, as the reduction recipe handles the
 9652 // condition directly.
 9653 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
 9654 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
 9655 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
 9656
 9657 // Index of the first operand which holds a non-mask vector operand.
 9658 unsigned IndexOfFirstOperand;
 9659 // Recognize a call to the llvm.fmuladd intrinsic.
 9660 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
 9661 VPValue *VecOp;
 9662 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
 9663 if (IsFMulAdd) {
 9664 assert(
 9666 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
 9667 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
 9668 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
 9669 CurrentLink->getOperand(2) == PreviousLink &&
 9670 "expected a call where the previous link is the added operand");
 9671
 9672 // If the instruction is a call to the llvm.fmuladd intrinsic then we
 9673 // need to create an fmul recipe (multiplying the first two operands of
 9674 // the fmuladd together) to use as the vector operand for the fadd
 9675 // reduction.
 9676 VPInstruction *FMulRecipe = new VPInstruction(
 9677 Instruction::FMul,
 9678 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
 9679 CurrentLinkI->getFastMathFlags());
 9680 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
 9681 VecOp = FMulRecipe;
 9682 } else {
 9683 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
 9684 if (PhiR->isInLoop() && Blend) {
 9685 assert(Blend->getNumIncomingValues() == 2 &&
 9686 "Blend must have 2 incoming values");
 9687 if (Blend->getIncomingValue(0) == PhiR)
 9688 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
 9689 else {
 9690 assert(Blend->getIncomingValue(1) == PhiR &&
 9691 "PhiR must be an operand of the blend");
 9692 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
 9693 }
 9694 continue;
 9695 }
 9696
 // NOTE(review): the enclosing condition (rendered line 9697, likely a
 // min/max-recurrence check) is elided in this excerpt — confirm upstream.
 9698 if (isa<VPWidenRecipe>(CurrentLink)) {
 9699 assert(isa<CmpInst>(CurrentLinkI) &&
 9700 "need to have the compare of the select");
 9701 continue;
 9702 }
 9703 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
 9704 "must be a select recipe");
 9705 IndexOfFirstOperand = 1;
 9706 } else {
 9707 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
 9708 "Expected to replace a VPWidenSC");
 9709 IndexOfFirstOperand = 0;
 9710 }
 9711 // Note that for non-commutable operands (cmp-selects), the semantics of
 9712 // the cmp-select are captured in the recurrence kind.
 9713 unsigned VecOpId =
 9714 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
 9715 ? IndexOfFirstOperand + 1
 9716 : IndexOfFirstOperand;
 9717 VecOp = CurrentLink->getOperand(VecOpId);
 9718 assert(VecOp != PreviousLink &&
 9719 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
 9720 (VecOpId - IndexOfFirstOperand)) ==
 9721 PreviousLink &&
 9722 "PreviousLink must be the operand other than VecOp");
 9723 }
 9724
 9725 BasicBlock *BB = CurrentLinkI->getParent();
 9726 VPValue *CondOp = nullptr;
 // NOTE(review): the guard condition (rendered line 9727) is elided —
 // presumably it tests whether BB needs predication; confirm upstream.
 9728 CondOp = RecipeBuilder.getBlockInMask(BB);
 9729
 9730 auto *RedRecipe = new VPReductionRecipe(
 9731 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
 9732 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
 9733 // Append the recipe to the end of the VPBasicBlock because we need to
 9734 // ensure that it comes after all of it's inputs, including CondOp.
 9735 // Delete CurrentLink as it will be invalid if its operand is replaced
 9736 // with a reduction defined at the bottom of the block in the next link.
 9737 LinkVPBB->appendRecipe(RedRecipe);
 9738 CurrentLink->replaceAllUsesWith(RedRecipe);
 9739 ToDelete.push_back(CurrentLink);
 9740 PreviousLink = RedRecipe;
 9741 }
 9742 }
 9743 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
 9744 Builder.setInsertPoint(&*LatchVPBB->begin());
 9745 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
 9746 for (VPRecipeBase &R :
 9747 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
 9748 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
 9749 if (!PhiR)
 9750 continue;
 9751
 9752 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
 9753 // If tail is folded by masking, introduce selects between the phi
 9754 // and the users outside the vector region of each reduction, at the
 9755 // beginning of the dedicated latch block.
 9756 auto *OrigExitingVPV = PhiR->getBackedgeValue();
 9757 auto *NewExitingVPV = PhiR->getBackedgeValue();
 9758 // Don't output selects for partial reductions because they have an output
 9759 // with fewer lanes than the VF. So the operands of the select would have
 9760 // different numbers of lanes. Partial reductions mask the input instead.
 9761 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
 9762 !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
 9763 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
 9764 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
 9765 "reduction recipe must be defined before latch");
 9766 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
 9767 std::optional<FastMathFlags> FMFs =
 9768 PhiTy->isFloatingPointTy()
 9769 ? std::make_optional(RdxDesc.getFastMathFlags())
 9770 : std::nullopt;
 9771 NewExitingVPV =
 9772 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
 9773 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
 9774 return isa<VPInstruction>(&U) &&
 9775 cast<VPInstruction>(&U)->getOpcode() ==
 // NOTE(review): the compared opcode constant (rendered line 9776) and the
 // guard beginning at rendered line 9778 are elided — confirm upstream.
 9777 });
 9779 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
 9780 PhiR->setOperand(1, NewExitingVPV);
 9781 }
 9782
 9783 // If the vector reduction can be performed in a smaller type, we truncate
 9784 // then extend the loop exit value to enable InstCombine to evaluate the
 9785 // entire expression in the smaller type.
 9786 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
 9787 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
 9789 RdxDesc.getRecurrenceKind())) {
 9790 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
 9791 Type *RdxTy = RdxDesc.getRecurrenceType();
 9792 auto *Trunc =
 9793 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
 9794 auto *Extnd =
 9795 RdxDesc.isSigned()
 9796 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
 9797 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
 9798
 9799 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
 9800 Extnd->insertAfter(Trunc);
 9801 if (PhiR->getOperand(1) == NewExitingVPV)
 9802 PhiR->setOperand(1, Extnd->getVPSingleValue());
 9803 NewExitingVPV = Extnd;
 9804 }
 9805
 9806 // We want code in the middle block to appear to execute on the location of
 9807 // the scalar loop's latch terminator because: (a) it is all compiler
 9808 // generated, (b) these instructions are always executed after evaluating
 9809 // the latch conditional branch, and (c) other passes may add new
 9810 // predecessors which terminate on this line. This is the easiest way to
 9811 // ensure we don't accidentally cause an extra step back into the loop while
 9812 // debugging.
 9813 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
 9814
 9815 // TODO: At the moment ComputeReductionResult also drives creation of the
 9816 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
 9817 // even for in-loop reductions, until the reduction resume value handling is
 9818 // also modeled in VPlan.
 9819 auto *FinalReductionResult = new VPInstruction(
 9820 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
 9821 // Update all users outside the vector region.
 9822 OrigExitingVPV->replaceUsesWithIf(
 9823 FinalReductionResult, [](VPUser &User, unsigned) {
 9824 auto *Parent = cast<VPRecipeBase>(&User)->getParent();
 9825 return Parent && !Parent->getParent();
 9826 });
 9827 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
 9828
 9829 // Adjust AnyOf reductions; replace the reduction phi for the selected value
 9830 // with a boolean reduction phi node to check if the condition is true in
 9831 // any iteration. The final value is selected by the final
 9832 // ComputeReductionResult.
 // NOTE(review): the condition head (rendered line 9833, presumably an
 // AnyOf-recurrence-kind check) is elided — confirm upstream.
 9834 RdxDesc.getRecurrenceKind())) {
 9835 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
 9836 return isa<VPWidenSelectRecipe>(U) ||
 9837 (isa<VPReplicateRecipe>(U) &&
 9838 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
 9839 Instruction::Select);
 9840 }));
 9841 VPValue *Cmp = Select->getOperand(0);
 9842 // If the compare is checking the reduction PHI node, adjust it to check
 9843 // the start value.
 9844 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
 9845 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
 9846 if (CmpR->getOperand(I) == PhiR)
 9847 CmpR->setOperand(I, PhiR->getStartValue());
 9848 }
 9849 VPBuilder::InsertPointGuard Guard(Builder);
 9850 Builder.setInsertPoint(Select);
 9851
 9852 // If the true value of the select is the reduction phi, the new value is
 9853 // selected if the negated condition is true in any iteration.
 9854 if (Select->getOperand(1) == PhiR)
 9855 Cmp = Builder.createNot(Cmp);
 9856 VPValue *Or = Builder.createOr(PhiR, Cmp);
 9857 Select->getVPSingleValue()->replaceAllUsesWith(Or);
 9858 // Delete Select now that it has invalid types.
 9859 ToDelete.push_back(Select);
 9860
 9861 // Convert the reduction phi to operate on bools.
 9862 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
 9863 OrigLoop->getHeader()->getContext())));
 9864 continue;
 9865 }
 9866
 // NOTE(review): the condition head (rendered line 9867, presumably a
 // FindLastIV-recurrence-kind check) is elided — confirm upstream.
 9868 RdxDesc.getRecurrenceKind())) {
 9869 // Adjust the start value for FindLastIV recurrences to use the sentinel
 9870 // value after generating the ResumePhi recipe, which uses the original
 9871 // start value.
 9872 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
 9873 }
 9874 }
 9875 for (VPRecipeBase *R : ToDelete)
 9876 R->eraseFromParent();
 9877
 9879}
9880
 9882 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
 // NOTE(review): the enclosing signature (rendered line 9881) is elided; per
 // the assert message this is VPDerivedIVRecipe's execute body — it derives a
 // scalar IV value for lane 0 from a start value, step and index.
 9883
 9884 // Fast-math-flags propagate from the original induction instruction.
 // NOTE(review): the definition of FPBinOp (rendered line 9885) is elided —
 // confirm against upstream LoopVectorize.cpp.
 9886 if (FPBinOp)
 9887 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
 9888
 9889 Value *Step = State.get(getStepValue(), VPLane(0));
 9890 Value *Index = State.get(getOperand(1), VPLane(0));
 9891 Value *DerivedIV = emitTransformedIndex(
 9892 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
 9893 cast_if_present<BinaryOperator>(FPBinOp));
 9894 DerivedIV->setName(Name);
 9895 // If index is the vector trip count, the concrete value will only be set in
 9896 // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
 9897 // TODO: Remove the special case for the vector trip count once it is computed
 9898 // in VPlan and can be used during VPlan simplification.
 9899 assert((DerivedIV != Index ||
 9900 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
 9901 "IV didn't need transforming?");
 9902 State.set(this, DerivedIV, VPLane(0));
 9903}
9904
 9907 if (State.Lane) { // Generate a single instance.
 // NOTE(review): the enclosing signature (rendered lines 9905-9906) is elided;
 // the body scalarizes the underlying instruction UI per lane, so this is
 // presumably a replicate recipe's execute — confirm upstream.
 9908 assert((State.VF.isScalar() || !isUniform()) &&
 9909 "uniform recipe shouldn't be predicated");
 9910 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
 9911 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
 9912 // Insert scalar instance packing it into a vector.
 9913 if (State.VF.isVector() && shouldPack()) {
 9914 // If we're constructing lane 0, initialize to start from poison.
 9915 if (State.Lane->isFirstLane()) {
 9916 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
 // NOTE(review): the definition of Poison (rendered line 9917) is elided —
 // confirm against upstream LoopVectorize.cpp.
 9918 VectorType::get(UI->getType(), State.VF));
 9919 State.set(this, Poison);
 9920 }
 9921 State.packScalarIntoVectorValue(this, *State.Lane);
 9922 }
 9923 return;
 9924 }
 9925
 9926 if (IsUniform) {
 9927 // Uniform within VL means we need to generate lane 0.
 9928 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
 9929 return;
 9930 }
 9931
 9932 // A store of a loop varying value to a uniform address only needs the last
 9933 // copy of the store.
 9934 if (isa<StoreInst>(UI) &&
 // NOTE(review): the second half of this condition (rendered line 9935) is
 // elided — confirm against upstream LoopVectorize.cpp.
 9936 auto Lane = VPLane::getLastLaneForVF(State.VF);
 9937 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
 9938 return;
 9939 }
 9940
 9941 // Generate scalar instances for all VF lanes.
 9942 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
 9943 const unsigned EndLane = State.VF.getKnownMinValue();
 9944 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
 9945 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
 9946}
9947
9948// Determine how to lower the scalar epilogue, which depends on 1) optimising
9949// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9950// predication, and 4) a TTI hook that analyses whether the loop is suitable
9951// for predication.
 9956 // 1) OptSize takes precedence over all other options, i.e. if this is set,
 9957 // don't look at hints or options, and don't request a scalar epilogue.
 9958 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
 9959 // LoopAccessInfo (due to code dependency and not being able to reliably get
 9960 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
 9961 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
 9962 // versioning when the vectorization is forced, unlike hasOptSize. So revert
 9963 // back to the old way and vectorize with versioning when forced. See D81345.)
 // NOTE(review): this is the body of getScalarEpilogueLowering (per the
 // comment above it); the signature and several branch bodies (rendered lines
 // 9952-9955, 9965-9967, 9971-9977, 9983-9986, 9991-9994) are elided in this
 // excerpt — confirm against upstream LoopVectorize.cpp before editing.
 9964 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
 9968
 9969 // 2) If set, obey the directives
 9970 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
 9978 };
 9979 }
 9980
 9981 // 3) If set, obey the hints
 9982 switch (Hints.getPredicate()) {
 9987 };
 9988
 9989 // 4) if the TTI hook indicates this is profitable, request predication.
 9990 TailFoldingInfo TFI(TLI, &LVL, IAI);
 9993
 9995}
9996
9997// Process the loop in the VPlan-native vectorization path. This path builds
9998// VPlan upfront in the vectorization pipeline, which allows to apply
9999// VPlan-to-VPlan transformations from the very beginning without modifying the
10000// input LLVM IR.
 10007 LoopVectorizationRequirements &Requirements) {
 // NOTE(review): the leading part of the signature (rendered lines
 // 10001-10006) is elided; per the comment preceding this block, this is the
 // VPlan-native-path loop-processing entry point — confirm upstream.
 10008
 10009 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
 10010 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
 10011 return false;
 10012 }
 10013 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
 10014 Function *F = L->getHeader()->getParent();
 10015 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
 10016
 // NOTE(review): the declaration receiving this call's result (rendered line
 // 10017, presumably a ScalarEpilogueLowering SEL) is elided — SEL is used in
 // the cost-model constructor below; confirm upstream.
 10018 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
 10019
 10020 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
 10021 &Hints, IAI);
 10022 // Use the planner for outer loop vectorization.
 10023 // TODO: CM is not used at this point inside the planner. Turn CM into an
 10024 // optional argument if we don't need it in the future.
 10025 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
 10026 ORE);
 10027
 10028 // Get user vectorization factor.
 10029 ElementCount UserVF = Hints.getWidth();
 10030
 10032
 10033 // Plan how to best vectorize, return the best VF and its cost.
 10034 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
 10035
 10036 // If we are stress testing VPlan builds, do not attempt to generate vector
 10037 // code. Masked vector code generation support will follow soon.
 10038 // Also, do not attempt to vectorize if no vector code will be produced.
 // NOTE(review): the condition guarding this early return (rendered line
 // 10039) is elided — confirm against upstream LoopVectorize.cpp.
 10040 return false;
 10041
 10042 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
 10043
 10044 {
 10045 bool AddBranchWeights =
 10046 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
 10047 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
 10048 AddBranchWeights, CM.CostKind);
 10049 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
 10050 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
 10051 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
 10052 << L->getHeader()->getParent()->getName() << "\"\n");
 10053 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
 10054 }
 10055
 10056 reportVectorization(ORE, L, VF, 1);
 10057
 10058 // Mark the loop as already vectorized to avoid vectorizing again.
 10059 Hints.setAlreadyVectorized();
 10060 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
 10061 return true;
 10062}
10063
10064// Emit a remark if there are stores to floats that required a floating point
10065// extension. If the vectorized loop was generated with floating point there
10066// will be a performance penalty from the conversion overhead and the change in
10067// the vector width.
 10070 for (BasicBlock *BB : L->getBlocks()) {
 // NOTE(review): the signature and the Worklist declaration (rendered lines
 // 10068-10069) are elided; per the comment preceding this block, this helper
 // emits remarks for float stores fed by floating point conversions.
 10071 for (Instruction &Inst : *BB) {
 10072 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
 10073 if (S->getValueOperand()->getType()->isFloatTy())
 10074 Worklist.push_back(S);
 10075 }
 10076 }
 10077 }
 10078
 10079 // Traverse the floating point stores upwards searching, for floating point
 10080 // conversions.
 // NOTE(review): the declarations of Visited and EmittedRemark (rendered
 // lines 10081-10082) are elided — both behave as insert-once sets below.
 10083 while (!Worklist.empty()) {
 10084 auto *I = Worklist.pop_back_val();
 10085 if (!L->contains(I))
 10086 continue;
 10087 if (!Visited.insert(I).second)
 10088 continue;
 10089
 10090 // Emit a remark if the floating point store required a floating
 10091 // point conversion.
 10092 // TODO: More work could be done to identify the root cause such as a
 10093 // constant or a function return type and point the user to it.
 10094 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
 10095 ORE->emit([&]() {
 10096 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
 10097 I->getDebugLoc(), L->getHeader())
 10098 << "floating point conversion changes vector width. "
 10099 << "Mixed floating point precision requires an up/down "
 10100 << "cast that will negatively impact performance.";
 10101 });
 10102
 10103 for (Use &Op : I->operands())
 10104 if (auto *OpI = dyn_cast<Instruction>(Op))
 10105 Worklist.push_back(OpI);
 10106 }
 10107}
10108
 10109static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
 10110 VectorizationFactor &VF, Loop *L,
 // NOTE(review): two parameter lines (rendered 10111-10112) are elided; the
 // body references PSE and SEL, so they are presumably among the missing
 // parameters — confirm against upstream LoopVectorize.cpp.
 10113 std::optional<unsigned> VScale) {
 10114 InstructionCost CheckCost = Checks.getCost();
 10115 if (!CheckCost.isValid())
 10116 return false;
 10117
 10118 // When interleaving only scalar and vector cost will be equal, which in turn
 10119 // would lead to a divide by 0. Fall back to hard threshold.
 10120 if (VF.Width.isScalar()) {
 10121 if (CheckCost > VectorizeMemoryCheckThreshold) {
 10122 LLVM_DEBUG(
 10123 dbgs()
 10124 << "LV: Interleaving only is not profitable due to runtime checks\n");
 10125 return false;
 10126 }
 10127 return true;
 10128 }
 10129
 10130 // The scalar cost should only be 0 when vectorizing with a user specified
 // VF/IC. In those cases, runtime checks should always be generated.
 10131 uint64_t ScalarC = *VF.ScalarCost.getValue();
 10132 if (ScalarC == 0)
 10133 return true;
 10134
 10135 // First, compute the minimum iteration count required so that the vector
 10136 // loop outperforms the scalar loop.
 10137 // The total cost of the scalar loop is
 10138 // ScalarC * TC
 10139 // where
 10140 // * TC is the actual trip count of the loop.
 10141 // * ScalarC is the cost of a single scalar iteration.
 10142 //
 10143 // The total cost of the vector loop is
 10144 // RtC + VecC * (TC / VF) + EpiC
 10145 // where
 10146 // * RtC is the cost of the generated runtime checks
 10147 // * VecC is the cost of a single vector iteration.
 10148 // * TC is the actual trip count of the loop
 10149 // * VF is the vectorization factor
 10150 // * EpiCost is the cost of the generated epilogue, including the cost
 10151 // of the remaining scalar operations.
 10152 //
 10153 // Vectorization is profitable once the total vector cost is less than the
 10154 // total scalar cost:
 10155 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
 10156 //
 10157 // Now we can compute the minimum required trip count TC as
 10158 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
 10159 //
 10160 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
 10161 // the computations are performed on doubles, not integers and the result
 10162 // is rounded up, hence we get an upper estimate of the TC.
 10163 unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
 10164 uint64_t RtC = *CheckCost.getValue();
 10165 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
 10166 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
 10167
 10168 // Second, compute a minimum iteration count so that the cost of the
 10169 // runtime checks is only a fraction of the total scalar loop cost. This
 10170 // adds a loop-dependent bound on the overhead incurred if the runtime
 10171 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
 10172 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
 10173 // cost, compute
 10174 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
 10175 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
 10176
 10177 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
 10178 // epilogue is allowed, choose the next closest multiple of VF. This should
 10179 // partly compensate for ignoring the epilogue cost.
 10180 uint64_t MinTC = std::max(MinTC1, MinTC2);
 10181 if (SEL == CM_ScalarEpilogueAllowed)
 10182 MinTC = alignTo(MinTC, IntVF);
 // NOTE(review): the assignment storing MinTC into VF.MinProfitableTripCount
 // (rendered line 10183) is elided — the debug output below reads that field.
 10184
 10185 LLVM_DEBUG(
 10186 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
 10187 << VF.MinProfitableTripCount << "\n");
 10188
 10189 // Skip vectorization if the expected trip count is less than the minimum
 10190 // required trip count.
 10191 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
 // NOTE(review): the inner comparison (rendered lines 10192-10193) is elided —
 // presumably it compares *ExpectedTC against the minimum profitable trip
 // count; confirm against upstream LoopVectorize.cpp.
 10194 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
 10195 "trip count < minimum profitable VF ("
 10196 << *ExpectedTC << " < " << VF.MinProfitableTripCount
 10197 << ")\n");
 10198
 10199 return false;
 10200 }
 10201 }
 10202 return true;
 10203}
10204
10206 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10208 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10210
10211/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10212/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10213/// don't have a corresponding wide induction in \p EpiPlan.
 10214static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
 10215 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
 10216 // will need their resume-values computed in the main vector loop. Others
 10217 // can be removed from the main VPlan.
 10218 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
 10219 for (VPRecipeBase &R :
 // NOTE(review): the range expression of this loop (rendered line 10220,
 // presumably the epilogue plan's header phis) is elided — confirm upstream.
 10221 if (isa<VPCanonicalIVPHIRecipe>(&R))
 10222 continue;
 10223 EpiWidenedPhis.insert(
 10224 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
 10225 }
 // NOTE(review): the head of this loop (rendered line 10226) is elided; it
 // iterates the recipes of the main plan's scalar header shown below.
 10227 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
 10228 auto *VPIRInst = cast<VPIRInstruction>(&R);
 10229 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
 10230 if (!IRI)
 10231 break;
 10232 if (EpiWidenedPhis.contains(IRI))
 10233 continue;
 10234 // There is no corresponding wide induction in the epilogue plan that would
 10235 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
 10236 // together with the corresponding ResumePhi. The resume values for the
 10237 // scalar loop will be created during execution of EpiPlan.
 10238 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
 10239 VPIRInst->eraseFromParent();
 10240 ResumePhi->eraseFromParent();
 10241 }
 10243
 10244 using namespace VPlanPatternMatch;
 10245 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
 10246 VPValue *VectorTC = &MainPlan.getVectorTripCount();
 10247 // If there is a suitable resume value for the canonical induction in the
 10248 // scalar (which will become vector) epilogue loop we are done. Otherwise
 10249 // create it below.
 10250 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
 10251 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
 10252 m_Specific(VectorTC), m_SpecificInt(0)));
 10253 }))
 10254 return;
 10255 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
 10256 ScalarPHBuilder.createNaryOp(
 // NOTE(review): the opcode argument of this call (rendered line 10257,
 // presumably VPInstruction::ResumePhi) is elided — confirm upstream.
 10258 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
 10259 "vec.epilog.resume.val");
 10260}
10261
10262/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10263/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10264static void
// NOTE(review): original line 10265 (the function name and leading
// parameters -- the body below uses `Plan` and a Loop pointer `L`, so the
// dropped line presumably read
// `preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,`) is missing.
10266                                 const SCEV2ValueTy &ExpandedSCEVs,
10267                                 const EpilogueLoopVectorizationInfo &EPI) {
10268  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10269  VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10270  Header->setName("vec.epilog.vector.body");
10271
10272  // Re-use the trip count and steps expanded for the main loop, as
10273  // skeleton creation needs it as a value that dominates both the scalar
10274  // and vector epilogue loops
10275  // TODO: This is a workaround needed for epilogue vectorization and it
10276  // should be removed once induction resume value creation is done
10277  // directly in VPlan.
10278  for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10279    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10280    if (!ExpandR)
10281      continue;
    // Replace the recipe with the live-in value already expanded for the
    // main loop, keeping the plan's trip count pointing at a live value.
10282    auto *ExpandedVal =
10283        Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10284    ExpandR->replaceAllUsesWith(ExpandedVal);
10285    if (Plan.getTripCount() == ExpandR)
10286      Plan.resetTripCount(ExpandedVal);
10287    ExpandR->eraseFromParent();
10288  }
10289
10290  // Ensure that the start values for all header phi recipes are updated before
10291  // vectorizing the epilogue loop.
10292  for (VPRecipeBase &R : Header->phis()) {
10293    if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10294      // When vectorizing the epilogue loop, the canonical induction start
10295      // value needs to be changed from zero to the value after the main
10296      // vector loop. Find the resume value created during execution of the main
10297      // VPlan.
10298      // FIXME: Improve modeling for canonical IV start values in the epilogue
10299      // loop.
      // The main loop's middle block is the one preheader predecessor that is
      // none of the known check blocks recorded in EPI.
10300      BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10301          predecessors(L->getLoopPreheader()),
10302          [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10303            if (BB != EPI.MainLoopIterationCountCheck &&
10304                BB != EPI.EpilogueIterationCountCheck &&
10305                BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10306              return BB;
10307            return nullptr;
10308          });
10309      using namespace llvm::PatternMatch;
10310      Type *IdxTy = IV->getScalarType();
      // The resume phi is the unique preheader phi of the IV's type that
      // merges the main loop's vector trip count with a zero from the
      // iteration-count check.
10311      PHINode *EPResumeVal = find_singleton<PHINode>(
10312          L->getLoopPreheader()->phis(),
10313          [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10314            if (P.getType() == IdxTy &&
10315                P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10316                match(
10317                    P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10318                    m_SpecificInt(0)))
10319              return &P;
10320            return nullptr;
10321          });
10322      assert(EPResumeVal && "must have a resume value for the canonical IV");
10323      VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10324      assert(all_of(IV->users(),
10325                    [](const VPUser *U) {
10326                      return isa<VPScalarIVStepsRecipe>(U) ||
10327                             isa<VPScalarCastRecipe>(U) ||
10328                             isa<VPDerivedIVRecipe>(U) ||
10329                             cast<VPInstruction>(U)->getOpcode() ==
10330                                 Instruction::Add;
10331                    }) &&
10332             "the canonical IV should only be used by its increment or "
10333             "ScalarIVSteps when resetting the start value");
10334      IV->setOperand(0, VPV);
10335      continue;
10336    }
10337
10338    Value *ResumeV = nullptr;
10339    // TODO: Move setting of resume values to prepareToExecute.
10340    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10341      ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10342                    ->getIncomingValueForBlock(L->getLoopPreheader());
10343      const RecurrenceDescriptor &RdxDesc =
10344          ReductionPhi->getRecurrenceDescriptor();
10345      RecurKind RK = RdxDesc.getRecurrenceKind();
// NOTE(review): original line 10346 (presumably the opening
// `if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {` guarding the
// AnyOf branch below) is missing from the extraction.
10347      // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10348      // start value; compare the final value from the main vector loop
10349      // to the start value.
10350        BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
10351        IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
10352        ResumeV =
10353            Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
// NOTE(review): original line 10354 (presumably the
// `} else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {`
// opening the FindLastIV branch) is missing from the extraction.
10355      // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10356      // to the resume value. The resume value is adjusted to the sentinel
10357      // value when the final value from the main vector loop equals the start
10358      // value. This ensures correctness when the start value might not be
10359      // less than the minimum value of a monotonically increasing induction
10360      // variable.
10361        BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
10362        IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
10363        Value *Cmp =
10364            Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10365        ResumeV =
10366            Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10367      }
10368    } else {
10369      // Retrieve the induction resume values for wide inductions from
10370      // their original phi nodes in the scalar loop.
10371      PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10372      // Hook up to the PHINode generated by a ResumePhi recipe of main
10373      // loop VPlan, which feeds the scalar loop.
10374      ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10375    }
10376    assert(ResumeV && "Must have a resume value");
10377    VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10378    cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10379  }
10380}
10381
// LoopVectorizePass::processLoop (body): legality-checks, cost-models, and
// vectorizes/interleaves a single innermost loop; returns true if the loop
// (or its CFG) was changed.
// NOTE(review): original line 10382 (the signature, presumably
// `bool LoopVectorizePass::processLoop(Loop *L) {`) was dropped by the
// doxygen extraction, as were several other lines flagged below.
10383  assert((EnableVPlanNativePath || L->isInnermost()) &&
10384         "VPlan-native path is not enabled. Only process inner loops.");
10385
10386  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10387                    << L->getHeader()->getParent()->getName() << "' from "
10388                    << L->getLocStr() << "\n");
10389
10390  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10391
// NOTE(review): lines 10395 and 10397 of this debug print (the
// Hints.getForce() comparisons selecting "disabled"/"enabled"/"?") are
// missing from the extraction.
10392  LLVM_DEBUG(
10393      dbgs() << "LV: Loop hints:"
10394             << " force="
10396                     ? "disabled"
10398                         ? "enabled"
10399                         : "?"))
10400             << " width=" << Hints.getWidth()
10401             << " interleave=" << Hints.getInterleave() << "\n");
10402
10403  // Function containing loop
10404  Function *F = L->getHeader()->getParent();
10405
10406  // Looking at the diagnostic output is the only way to determine if a loop
10407  // was vectorized (other than looking at the IR or machine code), so it
10408  // is important to generate an optimization remark for each loop. Most of
10409  // these messages are generated as OptimizationRemarkAnalysis. Remarks
10410  // generated as OptimizationRemark and OptimizationRemarkMissed are
10411  // less verbose reporting vectorized loops and unvectorized loops that may
10412  // benefit from vectorization, respectively.
10413
10414  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10415    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10416    return false;
10417  }
10418
10419  PredicatedScalarEvolution PSE(*SE, *L);
10420
10421  // Check if it is legal to vectorize the loop.
10422  LoopVectorizationRequirements Requirements;
10423  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10424                                &Requirements, &Hints, DB, AC, BFI, PSI);
// NOTE(review): original line 10425 (the guard opening this failure block,
// presumably `if (!LVL.canVectorize(EnableVPlanNativePath)) {`) is missing.
10426    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10427    Hints.emitRemarkWithHints();
10428    return false;
10429  }
10430
// NOTE(review): original line 10431 (the condition opening this block,
// presumably checking an uncountable early exit with the feature disabled)
// is missing.
10432    reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10433                               "early exit is not enabled",
10434                               "UncountableEarlyExitLoopsDisabled", ORE, L);
10435    return false;
10436  }
10437
10438  if (LVL.hasStructVectorCall()) {
10439    reportVectorizationFailure("Auto-vectorization of calls that return struct "
10440                               "types is not yet supported",
10441                               "StructCallVectorizationUnsupported", ORE, L);
10442    return false;
10443  }
10444
10445  // Entrance to the VPlan-native vectorization path. Outer loops are processed
10446  // here. They may require CFG and instruction level transformations before
10447  // even evaluating whether vectorization is profitable. Since we cannot modify
10448  // the incoming IR, we need to build VPlan upfront in the vectorization
10449  // pipeline.
10450  if (!L->isInnermost())
10451    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10452                                        ORE, BFI, PSI, Hints, Requirements);
10453
10454  assert(L->isInnermost() && "Inner loop expected.");
10455
10456  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10457  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10458
10459  // If an override option has been passed in for interleaved accesses, use it.
// NOTE(review): original line 10460 (the condition, presumably checking
// whether the EnableInterleavedMemAccesses flag was set on the command line)
// is missing.
10461    UseInterleaved = EnableInterleavedMemAccesses;
10462
10463  // Analyze interleaved memory accesses.
10464  if (UseInterleaved)
// NOTE(review): original line 10465 (the call performing the interleaved
// access analysis on IAI) is missing.
10466
10467  if (LVL.hasUncountableEarlyExit()) {
10468    BasicBlock *LoopLatch = L->getLoopLatch();
// NOTE(review): original line 10470 (the second operand of this `||`,
// presumably an any_of over the countable exit blocks) is missing.
10469    if (IAI.requiresScalarEpilogue() ||
10471               [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10472      reportVectorizationFailure("Auto-vectorization of early exit loops "
10473                                 "requiring a scalar epilogue is unsupported",
10474                                 "UncountableEarlyExitUnsupported", ORE, L);
10475      return false;
10476    }
10477  }
10478
10479  // Check the function attributes and profiles to find out if this function
10480  // should be optimized for size.
// NOTE(review): original line 10481 (the declaration receiving this call,
// presumably `ScalarEpilogueLowering SEL =` -- SEL is used further down) is
// missing.
10482      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10483
10484  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10485  // count by optimizing for size, to minimize overheads.
10486  auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10487  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10488    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10489                      << "This loop is worth vectorizing only if no scalar "
10490                      << "iteration overheads are incurred.");
// NOTE(review): original line 10491 (the `if` this debug line belongs to,
// presumably checking Hints.getForce() == FK_Enabled) is missing.
10492      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10493    else {
10494      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10495        LLVM_DEBUG(dbgs() << "\n");
10496        // Predicate tail-folded loops are efficient even when the loop
10497        // iteration count is low. However, setting the epilogue policy to
10498        // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10499        // with runtime checks. It's more effective to let
10500        // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10501        // for the loop.
// NOTE(review): original lines 10502-10503 (the statement adjusting SEL per
// the comment above) are missing.
10504      } else {
10505        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10506                             "small to consider vectorizing.\n");
// NOTE(review): original line 10507 (the reportVectorizationFailure( call
// head for the arguments below) is missing. Pre-existing typo in the
// runtime string below: "minial" should read "minimal" (left untouched
// here; fixing it changes pass output).
10508            "The trip count is below the minial threshold value.",
10509            "loop trip count is too low, avoiding vectorization",
10510            "LowTripCount", ORE, L);
10511        Hints.emitRemarkWithHints();
10512        return false;
10513      }
10514    }
10515  }
10516
10517  // Check the function attributes to see if implicit floats or vectors are
10518  // allowed.
10519  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
// NOTE(review): original line 10520 (the reportVectorizationFailure( call
// head) is missing.
10521        "Can't vectorize when the NoImplicitFloat attribute is used",
10522        "loop not vectorized due to NoImplicitFloat attribute",
10523        "NoImplicitFloat", ORE, L);
10524    Hints.emitRemarkWithHints();
10525    return false;
10526  }
10527
10528  // Check if the target supports potentially unsafe FP vectorization.
10529  // FIXME: Add a check for the type of safety issue (denormal, signaling)
10530  // for the target we're vectorizing for, to make sure none of the
10531  // additional fp-math flags can help.
10532  if (Hints.isPotentiallyUnsafe() &&
// NOTE(review): original lines 10533-10534 (the second conjunct and the
// reportVectorizationFailure( call head) are missing.
10535        "Potentially unsafe FP op prevents vectorization",
10536        "loop not vectorized due to unsafe FP support.",
10537        "UnsafeFP", ORE, L);
10538    Hints.emitRemarkWithHints();
10539    return false;
10540  }
10541
10542  bool AllowOrderedReductions;
10543  // If the flag is set, use that instead and override the TTI behaviour.
// NOTE(review): original line 10544 (the `if`, presumably checking whether
// ForceOrderedReductions was set on the command line) is missing.
10545    AllowOrderedReductions = ForceOrderedReductions;
10546  else
10547    AllowOrderedReductions = TTI->enableOrderedReductions();
10548  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10549    ORE->emit([&]() {
10550      auto *ExactFPMathInst = Requirements.getExactFPInst();
10551      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10552                                                 ExactFPMathInst->getDebugLoc(),
10553                                                 ExactFPMathInst->getParent())
10554             << "loop not vectorized: cannot prove it is safe to reorder "
10555                "floating-point operations";
10556    });
10557    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10558                         "reorder floating-point operations\n");
10559    Hints.emitRemarkWithHints();
10560    return false;
10561  }
10562
10563  // Use the cost model.
10564  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10565                                F, &Hints, IAI);
10566  // Use the planner for vectorization.
10567  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10568                               ORE);
10569
10570  // Get user vectorization factor and interleave count.
10571  ElementCount UserVF = Hints.getWidth();
10572  unsigned UserIC = Hints.getInterleave();
10573
10574  // Plan how to best vectorize.
10575  LVP.plan(UserVF, UserIC);
// NOTE(review): original line 10576 (the declaration of the chosen
// `VectorizationFactor VF` -- used throughout the rest of the function) is
// missing from the extraction.
10577  unsigned IC = 1;
10578
// NOTE(review): original lines 10579-10580 are missing here.
10581
10582  bool AddBranchWeights =
10583      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10584  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10585                           AddBranchWeights, CM.CostKind);
10586  if (LVP.hasPlanWithVF(VF.Width)) {
10587    // Select the interleave count.
10588    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10589
10590    unsigned SelectedIC = std::max(IC, UserIC);
10591    // Optimistically generate runtime checks if they are needed. Drop them if
10592    // they turn out to not be profitable.
10593    if (VF.Width.isVector() || SelectedIC > 1)
10594      Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10595
10596    // Check if it is profitable to vectorize with runtime checks.
10597    bool ForceVectorization =
// NOTE(review): original line 10598 (the initializer, presumably comparing
// Hints.getForce() against FK_Enabled) is missing.
10599    if (!ForceVectorization &&
10600        !areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
10601                                    CM.getVScaleForTuning())) {
10602      ORE->emit([&]() {
// NOTE(review): original line 10603 (the remark constructor name for the
// arguments below) is missing.
10604            DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10605            L->getHeader())
10606               << "loop not vectorized: cannot prove it is safe to reorder "
10607                  "memory operations";
10608      });
10609      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10610      Hints.emitRemarkWithHints();
10611      return false;
10612    }
10613  }
10614
10615  // Identify the diagnostic messages that should be produced.
10616  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10617  bool VectorizeLoop = true, InterleaveLoop = true;
10618  if (VF.Width.isScalar()) {
10619    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10620    VecDiagMsg = std::make_pair(
10621        "VectorizationNotBeneficial",
10622        "the cost-model indicates that vectorization is not beneficial");
10623    VectorizeLoop = false;
10624  }
10625
10626  if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10627    // Tell the user interleaving was avoided up-front, despite being explicitly
10628    // requested.
10629    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10630                         "interleaving should be avoided up front\n");
10631    IntDiagMsg = std::make_pair(
10632        "InterleavingAvoided",
10633        "Ignoring UserIC, because interleaving was avoided up front");
10634    InterleaveLoop = false;
10635  } else if (IC == 1 && UserIC <= 1) {
10636    // Tell the user interleaving is not beneficial.
10637    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10638    IntDiagMsg = std::make_pair(
10639        "InterleavingNotBeneficial",
10640        "the cost-model indicates that interleaving is not beneficial");
10641    InterleaveLoop = false;
10642    if (UserIC == 1) {
10643      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10644      IntDiagMsg.second +=
10645          " and is explicitly disabled or interleave count is set to 1";
10646    }
10647  } else if (IC > 1 && UserIC == 1) {
10648    // Tell the user interleaving is beneficial, but it explicitly disabled.
10649    LLVM_DEBUG(
10650        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10651    IntDiagMsg = std::make_pair(
10652        "InterleavingBeneficialButDisabled",
10653        "the cost-model indicates that interleaving is beneficial "
10654        "but is explicitly disabled or interleave count is set to 1");
10655    InterleaveLoop = false;
10656  }
10657
10658  // If there is a histogram in the loop, do not just interleave without
10659  // vectorizing. The order of operations will be incorrect without the
10660  // histogram intrinsics, which are only used for recipes with VF > 1.
10661  if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10662    LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10663                      << "to histogram operations.\n");
10664    IntDiagMsg = std::make_pair(
10665        "HistogramPreventsScalarInterleaving",
10666        "Unable to interleave without vectorization due to constraints on "
10667        "the order of histogram operations");
10668    InterleaveLoop = false;
10669  }
10670
10671  // Override IC if user provided an interleave count.
10672  IC = UserIC > 0 ? UserIC : IC;
10673
10674  // Emit diagnostic messages, if any.
10675  const char *VAPassName = Hints.vectorizeAnalysisPassName();
10676  if (!VectorizeLoop && !InterleaveLoop) {
10677    // Do not vectorize or interleaving the loop.
10678    ORE->emit([&]() {
10679      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10680                                      L->getStartLoc(), L->getHeader())
10681             << VecDiagMsg.second;
10682    });
10683    ORE->emit([&]() {
10684      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10685                                      L->getStartLoc(), L->getHeader())
10686             << IntDiagMsg.second;
10687    });
10688    return false;
10689  }
10690
10691  if (!VectorizeLoop && InterleaveLoop) {
10692    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10693    ORE->emit([&]() {
10694      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10695                                        L->getStartLoc(), L->getHeader())
10696             << VecDiagMsg.second;
10697    });
10698  } else if (VectorizeLoop && !InterleaveLoop) {
10699    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10700                      << ") in " << L->getLocStr() << '\n');
10701    ORE->emit([&]() {
10702      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10703                                        L->getStartLoc(), L->getHeader())
10704             << IntDiagMsg.second;
10705    });
10706  } else if (VectorizeLoop && InterleaveLoop) {
10707    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10708                      << ") in " << L->getLocStr() << '\n');
10709    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10710  }
10711
10712  bool DisableRuntimeUnroll = false;
10713  MDNode *OrigLoopID = L->getLoopID();
10714  {
10715    using namespace ore;
10716    if (!VectorizeLoop) {
10717      assert(IC > 1 && "interleave count should not be 1 or 0");
10718      // If we decided that it is not legal to vectorize the loop, then
10719      // interleave it.
10720      VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10721      InnerLoopVectorizer Unroller(
10722          L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10723          ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10724
10725      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10726
10727      ORE->emit([&]() {
10728        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10729                                  L->getHeader())
10730               << "interleaved loop (interleaved count: "
10731               << NV("InterleaveCount", IC) << ")";
10732      });
10733    } else {
10734      // If we decided that it is *legal* to vectorize the loop, then do it.
10735
10736      VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10737      // Consider vectorizing the epilogue too if it's profitable.
10738      VectorizationFactor EpilogueVF =
// NOTE(review): original line 10739 (the initializer, presumably a call to
// the planner's epilogue-VF selection) is missing.
10740      if (EpilogueVF.Width.isVector()) {
10741        std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10742
10743        // The first pass vectorizes the main loop and creates a scalar epilogue
10744        // to be vectorized by executing the plan (potentially with a different
10745        // factor) again shortly afterwards.
10746        VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10747        preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10748        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10749                                          BestEpiPlan);
10750        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10751                                           EPI, &LVL, &CM, BFI, PSI, Checks,
10752                                           *BestMainPlan);
10753        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10754                                             *BestMainPlan, MainILV, DT, false);
10755        ++LoopsVectorized;
10756
10757        // Second pass vectorizes the epilogue and adjusts the control flow
10758        // edges from the first pass.
10759        EPI.MainLoopVF = EPI.EpilogueVF;
10760        EPI.MainLoopUF = EPI.EpilogueUF;
10761        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10762                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
10763                                                 Checks, BestEpiPlan);
10764        EpilogILV.setTripCount(MainILV.getTripCount());
10765        preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10766
10767        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10768                        DT, true, &ExpandedSCEVs);
10769        ++LoopsEpilogueVectorized;
10770
10771        if (!MainILV.areSafetyChecksAdded())
10772          DisableRuntimeUnroll = true;
10773      } else {
10774        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10775                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10776                               PSI, Checks, BestPlan);
10777        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10778        ++LoopsVectorized;
10779
10780        // Add metadata to disable runtime unrolling a scalar loop when there
10781        // are no runtime checks about strides and memory. A scalar loop that is
10782        // rarely used is not worth unrolling.
10783        if (!LB.areSafetyChecksAdded())
10784          DisableRuntimeUnroll = true;
10785      }
10786      // Report the vectorization decision.
10787      reportVectorization(ORE, L, VF, IC);
10788    }
10789
// NOTE(review): original lines 10790-10791 are missing here.
10792  }
10793
10794  assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10795         "DT not preserved correctly");
10796
10797  std::optional<MDNode *> RemainderLoopID =
// NOTE(review): original lines 10798-10799 (the initializer, presumably a
// makeFollowupLoopID call using OrigLoopID) are missing.
10800  if (RemainderLoopID) {
10801    L->setLoopID(*RemainderLoopID);
10802  } else {
10803    if (DisableRuntimeUnroll)
// NOTE(review): original line 10804 (the call attaching the
// runtime-unroll-disable metadata to L) is missing.
10805
10806    // Mark the loop as already vectorized to avoid vectorizing again.
10807    Hints.setAlreadyVectorized();
10808  }
10809
10810  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10811  return true;
10812}
10813
// LoopVectorizePass::runImpl (body): simplifies all loops, collects the
// supported inner loops, and runs processLoop on each; returns whether any
// IR / CFG change was made.
// NOTE(review): original line 10814 (the signature, presumably
// `LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {`) is
// missing from the extraction.
10815
10816  // Don't attempt if
10817  // 1. the target claims to have no vector registers, and
10818  // 2. interleaving won't help ILP.
10819  //
10820  // The second condition is necessary because, even if the target has no
10821  // vector registers, loop vectorization may still enable scalar
10822  // interleaving.
// NOTE(review): original lines 10823-10824 (the `if` condition implementing
// the bail-out described above) are missing.
10825    return LoopVectorizeResult(false, false);
10826
10827  bool Changed = false, CFGChanged = false;
10828
10829  // The vectorizer requires loops to be in simplified form.
10830  // Since simplification may add new inner loops, it has to run before the
10831  // legality and profitability checks. This means running the loop vectorizer
10832  // will simplify all loops, regardless of whether anything end up being
10833  // vectorized.
10834  for (const auto &L : *LI)
10835    Changed |= CFGChanged |=
10836        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10837
10838  // Build up a worklist of inner-loops to vectorize. This is necessary as
10839  // the act of vectorizing or partially unrolling a loop creates new loops
10840  // and can invalidate iterators across the loops.
10841  SmallVector<Loop *, 8> Worklist;
10842
10843  for (Loop *L : *LI)
10844    collectSupportedLoops(*L, LI, ORE, Worklist);
10845
10846  LoopsAnalyzed += Worklist.size();
10847
10848  // Now walk the identified inner loops.
10849  while (!Worklist.empty()) {
10850    Loop *L = Worklist.pop_back_val();
10851
10852    // For the inner loops we actually process, form LCSSA to simplify the
10853    // transform.
10854    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10855
10856    Changed |= CFGChanged |= processLoop(L);
10857
10858    if (Changed) {
      // Cached loop-access info is stale after any transformation.
10859      LAIs->clear();
10860
10861#ifndef NDEBUG
10862      if (VerifySCEV)
10863        SE->verify();
10864#endif
10865    }
10866  }
10867
10868  // Process each loop nest in the function.
10869  return LoopVectorizeResult(Changed, CFGChanged);
10870}
10871
// LoopVectorizePass::run (body): new-PM entry point; gathers analyses,
// invokes runImpl, and reports preserved analyses.
// NOTE(review): original lines 10872-10873 (the signature, presumably
// `PreservedAnalyses LoopVectorizePass::run(Function &F,
// FunctionAnalysisManager &AM) {`) are missing from the extraction.
10874    LI = &AM.getResult<LoopAnalysis>(F);
10875  // There are no loops in the function. Return before computing other
10876  // expensive analyses.
10877  if (LI->empty())
10878    return PreservedAnalyses::all();
// NOTE(review): original lines 10879-10886 (fetching the remaining analyses
// used by runImpl, e.g. SE/TTI/DT/TLI/AC/DB/ORE/LAIs) are missing.
10887
10888  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10889  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10890  BFI = nullptr;
10891  if (PSI && PSI->hasProfileSummary())
// NOTE(review): original line 10892 (the statement populating BFI when a
// profile summary is available) is missing.
10893  LoopVectorizeResult Result = runImpl(F);
10894  if (!Result.MadeAnyChange)
10895    return PreservedAnalyses::all();
// NOTE(review): original line 10896 (presumably `PreservedAnalyses PA;`,
// used below) is missing.
10897
10898  if (isAssignmentTrackingEnabled(*F.getParent())) {
10899    for (auto &BB : F)
// NOTE(review): original line 10900 (the per-block cleanup call inside this
// loop) is missing.
10901  }
10902
10903  PA.preserve<LoopAnalysis>();
// NOTE(review): original lines 10904-10906 (further PA.preserve<> calls)
// are missing.
10907
10908  if (Result.MadeCFGChange) {
10909    // Making CFG changes likely means a loop got vectorized. Indicate that
10910    // extra simplification passes should be run.
10911    // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10912    // be run if runtime checks have been added.
// NOTE(review): original lines 10913-10914 (the statements requesting the
// extra passes) are missing, as is line 10916 in the else-branch
// (presumably `PA.preserveSet<CFGAnalyses>();`).
10915  } else {
10917  }
10918  return PA;
10919}
10920
// LoopVectorizePass::printPipeline: prints this pass's textual pipeline
// representation, including its two boolean options, between '<' and '>'.
// NOTE(review): original line 10921 (the first signature line,
// `void LoopVectorizePass::printPipeline(`) is missing from the extraction.
10922    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  // Delegate to the mixin first so the pass name itself is printed.
10923  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10924      OS, MapClassName2PassName);
10925
10926  OS << '<';
10927  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10928  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10929  OS << '>';
10930}
@ Poison
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
std::string Name
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static Value * getMask(Value *WideMask, unsigned Factor, VectorType *LeafValueTy)
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void addRuntimeUnrollDisableMetaData(Loop *L)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
const char LLVMLoopVectorizeFollowupAll[]
static SetVector< VPIRInstruction * > collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, SetVector< VPIRInstruction * > &ExitUsersToFix)
Handle users in the exit block for first order recurrences in the original exit block.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static unsigned getEstimatedRuntimeVF(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the vectorization factor at runtime.
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop)
Return true if the original loop \p TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static Type * maybeVectorizeType(Type *Elt, ElementCount VF)
static std::optional< unsigned > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static void fixReductionScalarResumeWhenVectorizingEpilog(VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, BasicBlock *BypassBlock)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, const EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorization of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static void addUsersInExitBlocks(VPlan &Plan, const SetVector< VPIRInstruction * > &ExitUsersToFix)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(false), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define OP(OPC)
Definition: Instruction.h:45
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:464
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:530
InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:381
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:481
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:511
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:240
BinaryOps getOpcode() const
Definition: InstrTypes.h:370
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1892
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:338
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:105
param_iterator param_begin() const
Definition: DerivedTypes.h:130
param_iterator param_end() const
Definition: DerivedTypes.h:131
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:713
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:710
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags inBounds()
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2274
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:889
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2270
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:164
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1387
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1370
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1447
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
virtual BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
virtual BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
LoopVectorizationCostModel * Cost
The profitablity analysis.
BasicBlock * AdditionalBypassBlock
The additional bypass block which conditionally skips over the epilogue loop after executing the main...
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
DenseMap< PHINode *, Value * > Induction2AdditionalBypassValue
Mapping of induction phis to their additional bypass values.
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
Create and record the values for induction variables to resume coming from the additional bypass bloc...
VPBlockBase * VectorPHVPB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
LoopVectorizationLegality * Legal
The legality analysis.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
Value * getInductionAdditionalBypassValue(PHINode *OrigPhi) const
induction header phi.
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
unsigned UF
The vectorization unroll factor to use.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:511
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:68
bool isBinaryOp() const
Definition: Instruction.h:315
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
Definition: Instruction.h:312
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:310
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:488
uint32_t getFactor() const
Definition: VectorUtils.h:504
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:558
InstTy * getInsertPos() const
Definition: VectorUtils.h:574
uint32_t getNumMembers() const
Definition: VectorUtils.h:506
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:630
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:675
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:686
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:667
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:650
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:680
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Type * getPointerOperandType() const
Definition: Instructions.h:258
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getUniqueLatchExitBlock() const
Return the unique exit block for the latch, or null if there are multiple different exit blocks or th...
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1266
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasStructVectorCall() const
Returns true if there is at least one function call in the loop which returns a struct type and needs...
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns true if there are any known histogram operations in the loop.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block, if there is exactly one.
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition: VPlan.cpp:1606
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition: VPlan.cpp:1591
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1572
void printPlans(raw_ostream &O)
Definition: VPlan.cpp:1620
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1073
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool contains(const KeyT &Key) const
Definition: MapVector.h:163
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:692
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:77
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getSentinelValue() const
Returns the sentinel value for FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getEpilogueVectorizationMinVF() const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp=std::nullopt) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:252
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:234
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:280
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
op_iterator op_end()
Definition: User.h:282
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3202
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3277
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3229
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:482
iterator end()
Definition: VPlan.h:3239
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3237
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3290
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:210
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3268
bool empty() const
Definition: VPlan.h:3248
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2160
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:78
VPRegionBlock * getParent()
Definition: VPlan.h:170
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:180
void setName(const Twine &newName)
Definition: VPlan.h:163
size_t getNumSuccessors() const
Definition: VPlan.h:216
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:309
VPlan * getPlan()
Definition: VPlan.cpp:155
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:212
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:160
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:206
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:195
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlanUtils.h:88
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlanUtils.h:212
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:142
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition: VPlanUtils.h:169
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
void insert(VPRecipeBase *R)
Insert R at the current insertion point.
VPScalarCastRecipe * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2899
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:2930
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:394
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:3132
VPValue * getStartValue() const
Definition: VPlan.h:3131
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1689
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1737
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1726
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition: VPlan.h:1439
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3344
A recipe to wrap an original IR instruction not to be modified during execution, except for PHIs.
Definition: VPlan.h:1033
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:845
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:863
@ ComputeReductionResult
Definition: VPlan.h:869
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2227
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Definition: VPlanHelpers.h:116
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlanHelpers.h:157
static VPLane getFirstLane()
Definition: VPlanHelpers.h:141
A recipe for forming partial reductions.
Definition: VPlan.h:2113
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:366
VPBasicBlock * getParent()
Definition: VPlan.h:391
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:460
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPValue * getVPValueOrAddLiveIn(Value *V)
void createHeaderMask()
Create the mask for the vector loop header block.
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPReplicateRecipe * handleReplication(Instruction *I, ArrayRef< VPValue * > Operands, VFRange &Range)
Build a VPReplicateRecipe for I using Operands.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:759
A recipe for handling reduction phis.
Definition: VPlan.h:2047
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2106
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2098
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2322
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3379
const VPBlockBase * getEntry() const
Definition: VPlan.h:3415
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3447
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2443
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2487
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
A recipe to compute the pointers for widened memory accesses of IndexTy in reverse order.
Definition: VPlan.h:1566
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:493
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:563
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:40
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:206
operand_range operands()
Definition: VPlanValue.h:263
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:248
unsigned getNumOperands() const
Definition: VPlanValue.h:242
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:243
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:237
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:125
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1438
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1442
user_range users()
Definition: VPlanValue.h:138
A recipe to compute the pointers for widened memory accesses of IndexTy.
Definition: VPlan.h:1619
A recipe for widening Call instructions using library calls.
Definition: VPlan.h:1383
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3040
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1191
A recipe for handling GEP instructions.
Definition: VPlan.h:1517
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:1751
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:1779
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:1785
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1804
A recipe for widening vector intrinsics.
Definition: VPlan.h:1291
A common base class for widening memory operations.
Definition: VPlan.h:2616
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1964
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:2009
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:2001
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1093
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3478
bool hasVF(ElementCount VF) const
Definition: VPlan.h:3672
void prepareToExecute(Value *TripCount, Value *VectorTripCount, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:938
VPBasicBlock * getEntry()
Definition: VPlan.h:3591
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3656
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3662
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:3659
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3635
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3649
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3679
unsigned getUF() const
Definition: VPlan.h:3692
static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which cont...
Definition: VPlan.cpp:859
bool hasUF(unsigned UF) const
Definition: VPlan.h:3690
auto getExitBlocks()
Return an iterator range over the VPIRBasicBlock wrapping the exit blocks of the VPlan,...
Definition: VPlanCFG.h:310
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1070
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1064
const VPBasicBlock * getMiddleBlock() const
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:3610
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3642
void setEntry(VPBasicBlock *VPBB)
Definition: VPlan.h:3561
VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
Definition: VPlan.cpp:1270
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3710
bool hasScalarVFOnly() const
Definition: VPlan.h:3683
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:3618
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:974
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3742
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:3623
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1210
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1094
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
int getNumOccurrences() const
Definition: CommandLine.h:399
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
Definition: Path.cpp:235
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlanUtils.h:41
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:26
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:72
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1954
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:850
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
auto pred_end(const MachineBasicBlock *BB)
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7301
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:215
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:227
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:54
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:74
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:573
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2303
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
cl::opt< bool > VerifyEachVPlan("vplan-verify-each", cl::init(false), cl::Hidden, cl::desc("Verfiy VPlans after VPlan transforms."))
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1761
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto pred_begin(const MachineBasicBlock *BB)
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:2012
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlanHelpers.h:57
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
LoopVectorizeResult runImpl(Function &F)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker analysis to determine if extra passes should be run after loop vectorization.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlanHelpers.h:62
ElementCount End
Definition: VPlanHelpers.h:67
Struct to hold various analysis needed for cost computations.
Definition: VPlanHelpers.h:356
LoopVectorizationCostModel & CM
Definition: VPlanHelpers.h:361
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlanHelpers.h:362
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2015
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlanHelpers.h:304
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlanHelpers.h:312
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlanHelpers.h:196
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlanHelpers.h:349
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlanHelpers.h:352
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:395
Value * get(VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition: VPlan.cpp:251
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlanHelpers.h:345
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:354
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
Definition: VPlanHelpers.h:210
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlanHelpers.h:329
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlanHelpers.h:335
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlanHelpers.h:332
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
Definition: VPlanHelpers.h:205
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:373
void set(VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
Definition: VPlanHelpers.h:239
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2696
A recipe for widening select instructions.
Definition: VPlan.h:1480
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2774
static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder)
Update Plan to account for the uncountable early exit block in UncountableExitingBlock by.
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy)
Perform instcombine-like simplifications on recipes in Plan.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
Explicitly unroll Plan by UF.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static bool tryAddExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.