LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
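//
// For illustration only (simplified, not taken from the sources): a scalar
// loop such as
//   for (i = 0; i < n; i += 1)
//     C[i] = A[i] + B[i];
// is, for a vector width of 4, conceptually turned into
//   for (i = 0; i + 3 < n; i += 4)
//     C[i:i+3] = A[i:i+3] + B[i:i+3];
// with any remaining iterations handled by a scalar epilogue or by folding
// the tail into the vector body under a predicate.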
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cstdint>
150#include <functional>
151#include <iterator>
152#include <limits>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159using namespace SCEVPatternMatch;
160
161#define LV_NAME "loop-vectorize"
162#define DEBUG_TYPE LV_NAME
163
164#ifndef NDEBUG
165const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166#endif
167
168STATISTIC(LoopsVectorized, "Number of loops vectorized");
169STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
170STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
171STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
172
174 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
175 cl::desc("Enable vectorization of epilogue loops."));
176
178 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
179 cl::desc("When epilogue vectorization is enabled, and a value greater than "
180 "1 is specified, forces the given VF for all applicable epilogue "
181 "loops."));
182
184 "epilogue-vectorization-minimum-VF", cl::Hidden,
185 cl::desc("Only loops with vectorization factor equal to or larger than "
186 "the specified value are considered for epilogue vectorization."));
187
188/// Loops with a known constant trip count below this number are vectorized only
189/// if no scalar iteration overheads are incurred.
191 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
192 cl::desc("Loops with a constant trip count that is smaller than this "
193 "value are vectorized only if no scalar iteration overheads "
194 "are incurred."));
195
197 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
198 cl::desc("The maximum allowed number of runtime memory checks"));
199
200// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
201// that predication is preferred, and it lists all available options. I.e., the
202// vectorizer will try to fold the tail-loop (epilogue) into the vector body
203// and predicate the instructions accordingly. If tail-folding fails, there are
204// different fallback strategies depending on these values:
211} // namespace PreferPredicateTy
212
214 "prefer-predicate-over-epilogue",
217 cl::desc("Tail-folding and predication preferences over creating a scalar "
218 "epilogue loop."),
220 "scalar-epilogue",
221 "Don't tail-predicate loops, create scalar epilogue"),
223 "predicate-else-scalar-epilogue",
224 "prefer tail-folding, create scalar epilogue if tail "
225 "folding fails."),
227 "predicate-dont-vectorize",
228 "prefers tail-folding, don't attempt vectorization if "
229 "tail-folding fails.")));
230
232 "force-tail-folding-style", cl::desc("Force the tail folding style"),
235 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
238 "Create lane mask for data only, using active.lane.mask intrinsic"),
240 "data-without-lane-mask",
241 "Create lane mask with compare/stepvector"),
243 "Create lane mask using active.lane.mask intrinsic, and use "
244 "it for both data and control flow"),
246 "data-and-control-without-rt-check",
247 "Similar to data-and-control, but remove the runtime check"),
249 "Use predicated EVL instructions for tail folding. If EVL "
250 "is unsupported, fallback to data-without-lane-mask.")));
251
253 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
254 cl::desc("Maximize bandwidth when selecting vectorization factor which "
255 "will be determined by the smallest type in loop."));
256
258 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
259 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
260
261/// An interleave-group may need masking if it resides in a block that needs
262/// predication, or in order to mask away gaps.
264 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
266
268 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
269 cl::desc("A flag that overrides the target's number of scalar registers."));
270
272 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
273 cl::desc("A flag that overrides the target's number of vector registers."));
274
276 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's max interleave factor for "
278 "scalar loops."));
279
281 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "vectorized loops."));
284
286 "force-target-instruction-cost", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's expected cost for "
288 "an instruction to a single constant value. Mostly "
289 "useful for getting consistent testing."));
290
292 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
293 cl::desc(
294 "Pretend that scalable vectors are supported, even if the target does "
295 "not support them. This flag should only be used for testing."));
296
298 "small-loop-cost", cl::init(20), cl::Hidden,
299 cl::desc(
300 "The cost of a loop that is considered 'small' by the interleaver."));
301
303 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
304 cl::desc("Enable the use of the block frequency analysis to access PGO "
305 "heuristics minimizing code growth in cold regions and being more "
306 "aggressive in hot regions."));
307
308// Runtime interleave loops for load/store throughput.
310 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
311 cl::desc(
312 "Enable runtime interleaving until load/store ports are saturated"));
313
314/// The number of stores in a loop that are allowed to need predication.
316 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
317 cl::desc("Max number of stores to be predicated behind an if."));
318
320 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
321 cl::desc("Count the induction variable only once when interleaving"));
322
324 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
325 cl::desc("Enable if predication of stores during vectorization."));
326
328 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
329 cl::desc("The maximum interleave count to use when interleaving a scalar "
330 "reduction in a nested loop."));
331
332static cl::opt<bool>
333 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
335 cl::desc("Prefer in-loop vector reductions, "
336 "overriding the targets preference."));
337
339 "force-ordered-reductions", cl::init(false), cl::Hidden,
340 cl::desc("Enable the vectorisation of loops with in-order (strict) "
341 "FP reductions"));
342
344 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
345 cl::desc(
346 "Prefer predicating a reduction operation over an after loop select."));
347
349 "enable-vplan-native-path", cl::Hidden,
350 cl::desc("Enable VPlan-native vectorization path with "
351 "support for outer loop vectorization."));
352
354 llvm::VerifyEachVPlan("vplan-verify-each",
355#ifdef EXPENSIVE_CHECKS
356 cl::init(true),
357#else
358 cl::init(false),
359#endif
361 cl::desc("Verfiy VPlans after VPlan transforms."));
362
363// This flag enables the stress testing of the VPlan H-CFG construction in the
364 // VPlan-native vectorization path. It must be used in conjunction with
365// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
366// verification of the H-CFGs built.
368 "vplan-build-stress-test", cl::init(false), cl::Hidden,
369 cl::desc(
370 "Build VPlan for every supported loop nest in the function and bail "
371 "out right after the build (stress test the VPlan H-CFG construction "
372 "in the VPlan-native vectorization path)."));
373
375 "interleave-loops", cl::init(true), cl::Hidden,
376 cl::desc("Enable loop interleaving in Loop vectorization passes"));
378 "vectorize-loops", cl::init(true), cl::Hidden,
379 cl::desc("Run the Loop vectorization passes"));
380
382 "force-widen-divrem-via-safe-divisor", cl::Hidden,
383 cl::desc(
384 "Override cost based safe divisor widening for div/rem instructions"));
385
387 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
389 cl::desc("Try wider VFs if they enable the use of vector variants"));
390
392 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
393 cl::desc(
394 "Enable vectorization of early exit loops with uncountable exits."));
395
396// Likelihood of bypassing the vectorized loop because there are zero trips left
397// after the prologue. See `emitIterationCountCheck`.
398static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
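// Illustrative reading of the weights above: {1, 127} biases the branch so
// that taking the bypass (no vector iterations to execute) is assumed to
// happen roughly once in 128 executions, i.e. the vector path is expected to
// be the common case.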
399
400/// A helper function that returns true if the given type is irregular. The
401/// type is irregular if its allocated size doesn't equal the store size of an
402/// element of the corresponding vector type.
403static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
404 // Determine if an array of N elements of type Ty is "bitcast compatible"
405 // with a <N x Ty> vector.
406 // This is only true if there is no padding between the array elements.
407 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
408}
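// Illustrative examples (assuming a typical data layout): for i32 both the
// allocated size and the type size are 32 bits, so the type is regular; for
// x86_fp80 the type size is 80 bits but the allocated size is 96 or 128 bits,
// so an array of it contains padding and the type is irregular.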
409
410/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
411/// ElementCount to include loops whose trip count is a function of vscale.
413 const Loop *L) {
414 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
415 return ElementCount::getFixed(ExpectedTC);
416
417 const SCEV *BTC = SE->getBackedgeTakenCount(L);
419 return ElementCount::getFixed(0);
420
421 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
422 if (isa<SCEVVScale>(ExitCount))
424
425 const APInt *Scale;
426 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
427 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
428 if (Scale->getActiveBits() <= 32)
430
431 return ElementCount::getFixed(0);
432}
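// Illustrative examples: a loop with a constant trip count of 100 yields
// ElementCount::getFixed(100); a loop whose backedge-taken count is
// (4 * vscale) - 1 has a trip count of 4 * vscale and yields a scalable
// element count of 4, provided the multiply is known not to wrap.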
433
434/// Returns "best known" trip count, which is either a valid positive trip count
435/// or std::nullopt when an estimate cannot be made (including when the trip
436/// count would overflow), for the specified loop \p L as defined by the
437/// following procedure:
438/// 1) Returns exact trip count if it is known.
439/// 2) Returns expected trip count according to profile data if any.
440/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
441/// 4) Returns std::nullopt if all of the above failed.
442static std::optional<ElementCount>
444 bool CanUseConstantMax = true) {
445 // Check if exact trip count is known.
446 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
447 return ExpectedTC;
448
449 // Check if there is an expected trip count available from profile data.
451 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
452 return ElementCount::getFixed(*EstimatedTC);
453
454 if (!CanUseConstantMax)
455 return std::nullopt;
456
457 // Check if upper bound estimate is known.
458 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
459 return ElementCount::getFixed(ExpectedTC);
460
461 return std::nullopt;
462}
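// Illustrative example of the precedence above: a loop with a known trip
// count of 128 returns 128 (step 1); if the trip count is unknown but branch
// profile data estimates roughly 1000 iterations, 1000 is returned (step 2);
// failing that, a known constant upper bound may be used when
// CanUseConstantMax is set (step 3); otherwise the result is std::nullopt.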
463
464namespace {
465// Forward declare GeneratedRTChecks.
466class GeneratedRTChecks;
467
468using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
469} // namespace
470
471namespace llvm {
472
474
475/// InnerLoopVectorizer vectorizes loops which contain only one basic
476/// block to a specified vectorization factor (VF).
477/// This class performs the widening of scalars into vectors, or multiple
478/// scalars. This class also implements the following features:
479/// * It inserts an epilogue loop for handling loops that don't have iteration
480/// counts that are known to be a multiple of the vectorization factor.
481/// * It handles the code generation for reduction variables.
482/// * Scalarization (implementation using scalars) of un-vectorizable
483/// instructions.
484/// InnerLoopVectorizer does not perform any vectorization-legality
485/// checks, and relies on the caller to check for the different legality
486/// aspects. The InnerLoopVectorizer relies on the
487/// LoopVectorizationLegality class to provide information about the induction
488/// and reduction variables that were found, for a given vectorization factor.
490public:
494 ElementCount VecWidth, unsigned UnrollFactor,
496 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
497 VPlan &Plan)
498 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
499 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
502 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
503
504 virtual ~InnerLoopVectorizer() = default;
505
506 /// Creates a basic block for the scalar preheader. Both
507 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
508 /// the method to create additional blocks and checks needed for epilogue
509 /// vectorization.
511
512 /// Fix the vectorized code, taking care of header phi's, and more.
514
515 /// Fix the non-induction PHIs in \p Plan.
517
518 /// Returns the original loop trip count.
519 Value *getTripCount() const { return TripCount; }
520
521 /// Used to set the trip count after ILV's construction and after the
522 /// preheader block has been executed. Note that this always holds the trip
523 /// count of the original loop for both main loop and epilogue vectorization.
524 void setTripCount(Value *TC) { TripCount = TC; }
525
526protected:
528
529 /// Create and return a new IR basic block for the scalar preheader whose name
530 /// is prefixed with \p Prefix.
532
533 /// Allow subclasses to override and print debug traces before/after vplan
534 /// execution, when trace information is requested.
535 virtual void printDebugTracesAtStart() {}
536 virtual void printDebugTracesAtEnd() {}
537
538 /// The original loop.
540
541 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
542 /// dynamic knowledge to simplify SCEV expressions and converts them to a
543 /// more usable form.
545
546 /// Loop Info.
548
549 /// Dominator Tree.
551
552 /// Target Transform Info.
554
555 /// Assumption Cache.
557
558 /// The vectorization SIMD factor to use. Each vector will have this many
559 /// vector elements.
561
562 /// The vectorization unroll factor to use. Each scalar is vectorized to this
563 /// many different vector instructions.
564 unsigned UF;
565
566 /// The builder that we use
568
569 // --- Vectorization state ---
570
571 /// Trip count of the original loop.
572 Value *TripCount = nullptr;
573
574 /// The profitability analysis.
576
577 /// BFI and PSI are used to check for profile guided size optimizations.
580
581 /// Structure to hold information about generated runtime checks, responsible
582 /// for cleaning the checks, if vectorization turns out unprofitable.
583 GeneratedRTChecks &RTChecks;
584
586
587 /// The vector preheader block of \p Plan, used as target for check blocks
588 /// introduced during skeleton creation.
590};
591
592/// Encapsulate information regarding vectorization of a loop and its epilogue.
593/// This information is meant to be updated and used across two stages of
594/// epilogue vectorization.
597 unsigned MainLoopUF = 0;
599 unsigned EpilogueUF = 0;
602 Value *TripCount = nullptr;
605
607 ElementCount EVF, unsigned EUF,
609 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
611 assert(EUF == 1 &&
612 "A high UF for the epilogue loop is likely not beneficial.");
613 }
614};
615
616/// An extension of the inner loop vectorizer that creates a skeleton for a
617/// vectorized loop that has its epilogue (residual) also vectorized.
618/// The idea is to run the vplan on a given loop twice, firstly to setup the
619/// skeleton and vectorize the main loop, and secondly to complete the skeleton
620/// from the first step and vectorize the epilogue. This is achieved by
621/// deriving two concrete strategy classes from this base class and invoking
622/// them in succession from the loop vectorizer planner.
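///
/// A simplified sketch of the layout produced by the two passes
/// (illustrative only):
///
///   iteration-count check
///     -> main vector loop (MainLoopVF x MainLoopUF)
///     -> epilogue iteration-count check
///     -> epilogue vector loop (EpilogueVF, UF 1)
///     -> scalar remainder loop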
624public:
635
636 /// Holds and updates state information required to vectorize the main loop
637 /// and its epilogue in two separate passes. This setup helps us avoid
638 /// regenerating and recomputing runtime safety checks. It also helps us to
639 /// shorten the iteration-count-check path length for the cases where the
640 /// iteration count of the loop is so small that the main vector loop is
641 /// completely skipped.
643
644protected:
646};
647
648/// A specialized derived class of inner loop vectorizer that performs
649/// vectorization of *main* loops in the process of vectorizing loops and their
650/// epilogues.
652public:
664 /// Implements the interface for creating a vectorized skeleton using the
665 /// *main loop* strategy (i.e., the first pass of VPlan execution).
667
668protected:
669 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
670 /// vector preheader and its predecessor, also connecting the new block to the
671 /// scalar preheader.
672 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
673
674 // Create a check to see if the main vector loop should be executed
676 unsigned UF) const;
677
678 /// Emits an iteration count bypass check once for the main loop (when \p
679 /// ForEpilogue is false) and once for the epilogue loop (when \p
680 /// ForEpilogue is true).
682 bool ForEpilogue);
683 void printDebugTracesAtStart() override;
684 void printDebugTracesAtEnd() override;
685};
686
687// A specialized derived class of inner loop vectorizer that performs
688// vectorization of *epilogue* loops in the process of vectorizing loops and
689// their epilogues.
691 /// The additional bypass block which conditionally skips over the epilogue
692 /// loop after executing the main loop. Needed to resume inductions and
693 /// reductions during epilogue vectorization.
694 BasicBlock *AdditionalBypassBlock = nullptr;
695
696public:
708 /// Implements the interface for creating a vectorized skeleton using the
709 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
711
712 /// Return the additional bypass block which targets the scalar loop by
713 /// skipping the epilogue loop after completing the main loop.
715 assert(AdditionalBypassBlock &&
716 "Trying to access AdditionalBypassBlock but it has not been set");
717 return AdditionalBypassBlock;
718 }
719
720protected:
721 /// Emits an iteration count bypass check after the main vector loop has
722 /// finished to see if there are any iterations left to execute by either
723 /// the vector epilogue or the scalar epilogue.
724 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH,
725 BasicBlock *Bypass,
726 BasicBlock *Insert);
727 void printDebugTracesAtStart() override;
728 void printDebugTracesAtEnd() override;
729};
730} // end namespace llvm
731
732/// Look for a meaningful debug location on the instruction or its operands.
734 if (!I)
735 return DebugLoc::getUnknown();
736
738 if (I->getDebugLoc() != Empty)
739 return I->getDebugLoc();
740
741 for (Use &Op : I->operands()) {
742 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
743 if (OpInst->getDebugLoc() != Empty)
744 return OpInst->getDebugLoc();
745 }
746
747 return I->getDebugLoc();
748}
749
750/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
751/// is passed, the message relates to that particular instruction.
752#ifndef NDEBUG
753static void debugVectorizationMessage(const StringRef Prefix,
754 const StringRef DebugMsg,
755 Instruction *I) {
756 dbgs() << "LV: " << Prefix << DebugMsg;
757 if (I != nullptr)
758 dbgs() << " " << *I;
759 else
760 dbgs() << '.';
761 dbgs() << '\n';
762}
763#endif
764
765/// Create an analysis remark that explains why vectorization failed
766///
767/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
768/// RemarkName is the identifier for the remark. If \p I is passed it is an
769/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
770/// the location of the remark. If \p DL is passed, use it as debug location for
771/// the remark. \return the remark object that can be streamed to.
773createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
774 Instruction *I, DebugLoc DL = {}) {
775 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
776 // If debug location is attached to the instruction, use it. Otherwise if DL
777 // was not provided, use the loop's.
778 if (I && I->getDebugLoc())
779 DL = I->getDebugLoc();
780 else if (!DL)
781 DL = TheLoop->getStartLoc();
782
783 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
784}
785
786namespace llvm {
787
788/// Return a value for Step multiplied by VF.
790 int64_t Step) {
791 assert(Ty->isIntegerTy() && "Expected an integer step");
792 ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
793 assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
794 if (VF.isScalable() && isPowerOf2_64(Step)) {
795 return B.CreateShl(
796 B.CreateVScale(Ty),
797 ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true);
798 }
799 return B.CreateElementCount(Ty, VFxStep);
800}
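// Illustrative example: for Ty = i64, VF = <vscale x 4> and Step = 2, the
// scalable path above emits "shl(vscale, 3)", i.e. 8 * vscale; for a fixed
// VF of 4 and Step = 2 a plain constant 8 is materialized instead.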
801
802/// Return the runtime value for VF.
804 return B.CreateElementCount(Ty, VF);
805}
806
808 const StringRef OREMsg, const StringRef ORETag,
809 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
810 Instruction *I) {
811 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
812 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
813 ORE->emit(
814 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
815 << "loop not vectorized: " << OREMsg);
816}
817
818/// Reports an informative message: print \p Msg for debugging purposes as well
819/// as an optimization remark. Uses either \p I as location of the remark, or
820/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
821/// remark. If \p DL is passed, use it as debug location for the remark.
822static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
824 Loop *TheLoop, Instruction *I = nullptr,
825 DebugLoc DL = {}) {
827 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
828 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
829 I, DL)
830 << Msg);
831}
832
833/// Report successful vectorization of the loop. In case an outer loop is
834/// vectorized, prepend "outer" to the vectorization remark.
836 VectorizationFactor VF, unsigned IC) {
838 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
839 nullptr));
840 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
841 ORE->emit([&]() {
842 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
843 TheLoop->getHeader())
844 << "vectorized " << LoopType << "loop (vectorization width: "
845 << ore::NV("VectorizationFactor", VF.Width)
846 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
847 });
848}
849
850} // end namespace llvm
851
852namespace llvm {
853
854// Loop vectorization cost-model hints how the scalar epilogue loop should be
855// lowered.
857
858 // The default: allowing scalar epilogues.
860
861 // Vectorization with OptForSize: don't allow epilogues.
863
864 // A special case of vectorization with OptForSize: loops with a very small
865 // trip count are considered for vectorization under OptForSize, thereby
866 // making sure the cost of their loop body is dominant, free of runtime
867 // guards and scalar iteration overheads.
869
870 // Loop hint predicate indicating an epilogue is undesired.
872
873 // Directive indicating we must either tail fold or not vectorize
875};
876
877/// LoopVectorizationCostModel - estimates the expected speedups due to
878/// vectorization.
879/// In many cases vectorization is not profitable. This can happen because of
880/// a number of reasons. In this class we mainly attempt to predict the
881/// expected speedup/slowdowns due to the supported instruction set. We use the
882/// TargetTransformInfo to query the different backends for the cost of
883/// different operations.
886
887public:
898 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
899 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
900 Hints(Hints), InterleaveInfo(IAI) {
901 if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
902 initializeVScaleForTuning();
904 // Query this against the original loop and save it here because the profile
905 // of the original loop header may change as the transformation happens.
906 OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
908 }
909
910 /// \return An upper bound for the vectorization factors (both fixed and
911 /// scalable). If the factors are 0, vectorization and interleaving should be
912 /// avoided up front.
913 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
914
915 /// \return True if runtime checks are required for vectorization, and false
916 /// otherwise.
918
919 /// Setup cost-based decisions for user vectorization factor.
920 /// \return true if the UserVF is a feasible VF to be chosen.
925
926 /// \return True if maximizing vector bandwidth is enabled by the target or
927 /// user options, for the given register kind.
929
930 /// \return True if register pressure should be considered for the given VF.
932
933 /// \return The size (in bits) of the smallest and widest types in the code
934 /// that needs to be vectorized. We ignore values that remain scalar such as
935 /// 64 bit loop indices.
936 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
937
938 /// A memory access instruction may be vectorized in more than one way.
939 /// The form of the instruction after vectorization depends on cost.
940 /// This function takes cost-based decisions for Load/Store instructions
941 /// and collects them in a map. This decisions map is used for building
942 /// the lists of loop-uniform and loop-scalar instructions.
943 /// The calculated cost is saved with widening decision in order to
944 /// avoid redundant calculations.
946
947 /// A call may be vectorized in different ways depending on whether we have
948 /// vectorized variants available and whether the target supports masking.
949 /// This function analyzes all calls in the function at the supplied VF,
950 /// makes a decision based on the costs of available options, and stores that
951 /// decision in a map for use in planning and plan execution.
953
954 /// Collect values we want to ignore in the cost model.
956
957 /// Collect all element types in the loop for which widening is needed.
959
960 /// Split reductions into those that happen in the loop, and those that happen
961 /// outside. In-loop reductions are collected into InLoopReductions.
963
964 /// Returns true if we should use strict in-order reductions for the given
965 /// RdxDesc. This is true if the -force-ordered-reductions flag is passed,
966 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
967 /// of FP operations.
968 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
969 return !Hints->allowReordering() && RdxDesc.isOrdered();
970 }
971
972 /// \returns The smallest bitwidth each instruction can be represented with.
973 /// The vector equivalents of these instructions should be truncated to this
974 /// type.
976 return MinBWs;
977 }
978
979 /// \returns True if it is more profitable to scalarize instruction \p I for
980 /// vectorization factor \p VF.
982 assert(VF.isVector() &&
983 "Profitable to scalarize relevant only for VF > 1.");
984 assert(
985 TheLoop->isInnermost() &&
986 "cost-model should not be used for outer loops (in VPlan-native path)");
987
988 auto Scalars = InstsToScalarize.find(VF);
989 assert(Scalars != InstsToScalarize.end() &&
990 "VF not yet analyzed for scalarization profitability");
991 return Scalars->second.contains(I);
992 }
993
994 /// Returns true if \p I is known to be uniform after vectorization.
996 assert(
997 TheLoop->isInnermost() &&
998 "cost-model should not be used for outer loops (in VPlan-native path)");
999 // A pseudo probe needs to be duplicated for each unrolled iteration and
1000 // vector lane so that the profiled loop trip count can be accurately
1001 // accumulated instead of being undercounted.
1003 return false;
1004
1005 if (VF.isScalar())
1006 return true;
1007
1008 auto UniformsPerVF = Uniforms.find(VF);
1009 assert(UniformsPerVF != Uniforms.end() &&
1010 "VF not yet analyzed for uniformity");
1011 return UniformsPerVF->second.count(I);
1012 }
1013
1014 /// Returns true if \p I is known to be scalar after vectorization.
1016 assert(
1017 TheLoop->isInnermost() &&
1018 "cost-model should not be used for outer loops (in VPlan-native path)");
1019 if (VF.isScalar())
1020 return true;
1021
1022 auto ScalarsPerVF = Scalars.find(VF);
1023 assert(ScalarsPerVF != Scalars.end() &&
1024 "Scalar values are not calculated for VF");
1025 return ScalarsPerVF->second.count(I);
1026 }
1027
1028 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1029 /// for vectorization factor \p VF.
1031 return VF.isVector() && MinBWs.contains(I) &&
1032 !isProfitableToScalarize(I, VF) &&
1034 }
1035
1036 /// Decision that was taken during cost calculation for memory instruction.
1039 CM_Widen, // For consecutive accesses with stride +1.
1040 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1046 };
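// Illustrative mapping from access pattern to decision (simplified): a
// unit-stride access A[i] -> CM_Widen, a reversed access A[n - i] ->
// CM_Widen_Reverse, members of a strided group such as A[2*i] and A[2*i+1]
// -> CM_Interleave, and an indexed access A[B[i]] -> a gather/scatter or
// scalarization, whichever the cost model finds cheaper.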
1047
1048 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1049 /// instruction \p I and vector width \p VF.
1052 assert(VF.isVector() && "Expected VF >=2");
1053 WideningDecisions[{I, VF}] = {W, Cost};
1054 }
1055
1056 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1057 /// interleaving group \p Grp and vector width \p VF.
1061 assert(VF.isVector() && "Expected VF >=2");
1062 /// Broadcast this decision to all instructions inside the group.
1063 /// When interleaving, the cost will only be assigned to one instruction, the
1064 /// insert position. For other cases, add the appropriate fraction of the
1065 /// total cost to each instruction. This ensures accurate costs are used,
1066 /// even if the insert position instruction is not used.
1067 InstructionCost InsertPosCost = Cost;
1068 InstructionCost OtherMemberCost = 0;
1069 if (W != CM_Interleave)
1070 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1072 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1073 if (auto *I = Grp->getMember(Idx)) {
1074 if (Grp->getInsertPos() == I)
1075 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1076 else
1077 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1078 }
1079 }
1080 }
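// Illustrative example of the cost split above: for a group with 4 members
// and a non-interleave decision of total cost 8, every member is assigned a
// cost of 2; with CM_Interleave the full cost of 8 stays on the insert
// position and the remaining members are assigned a cost of 0.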
1081
1082 /// Return the cost model decision for the given instruction \p I and vector
1083 /// width \p VF. Return CM_Unknown if this instruction did not pass
1084 /// through the cost modeling.
1086 assert(VF.isVector() && "Expected VF to be a vector VF");
1087 assert(
1088 TheLoop->isInnermost() &&
1089 "cost-model should not be used for outer loops (in VPlan-native path)");
1090
1091 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1092 auto Itr = WideningDecisions.find(InstOnVF);
1093 if (Itr == WideningDecisions.end())
1094 return CM_Unknown;
1095 return Itr->second.first;
1096 }
1097
1098 /// Return the vectorization cost for the given instruction \p I and vector
1099 /// width \p VF.
1101 assert(VF.isVector() && "Expected VF >=2");
1102 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1103 assert(WideningDecisions.contains(InstOnVF) &&
1104 "The cost is not calculated");
1105 return WideningDecisions[InstOnVF].second;
1106 }
1107
1115
1117 Function *Variant, Intrinsic::ID IID,
1118 std::optional<unsigned> MaskPos,
1120 assert(!VF.isScalar() && "Expected vector VF");
1121 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1122 }
1123
1125 ElementCount VF) const {
1126 assert(!VF.isScalar() && "Expected vector VF");
1127 auto I = CallWideningDecisions.find({CI, VF});
1128 if (I == CallWideningDecisions.end())
1129 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
1130 return I->second;
1131 }
1132
1133 /// Return True if instruction \p I is an optimizable truncate whose operand
1134 /// is an induction variable. Such a truncate will be removed by adding a new
1135 /// induction variable with the destination type.
1137 // If the instruction is not a truncate, return false.
1138 auto *Trunc = dyn_cast<TruncInst>(I);
1139 if (!Trunc)
1140 return false;
1141
1142 // Get the source and destination types of the truncate.
1143 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1144 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1145
1146 // If the truncate is free for the given types, return false. Replacing a
1147 // free truncate with an induction variable would add an induction variable
1148 // update instruction to each iteration of the loop. We exclude from this
1149 // check the primary induction variable since it will need an update
1150 // instruction regardless.
1151 Value *Op = Trunc->getOperand(0);
1152 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1153 return false;
1154
1155 // If the truncated value is not an induction variable, return false.
1156 return Legal->isInductionPhi(Op);
1157 }
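// Illustrative example: with a primary induction "%iv = phi i64" and a use
// "%t = trunc i64 %iv to i32" that feeds 32-bit computations, the truncate
// is optimizable; rather than widening the trunc, a new induction variable
// of the narrower destination type is created.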
1158
1159 /// Collects the instructions to scalarize for each predicated instruction in
1160 /// the loop.
1162
1163 /// Collect values that will not be widened, including Uniforms, Scalars, and
1164 /// Instructions to Scalarize for the given \p VF.
1165 /// The sets depend on CM decision for Load/Store instructions
1166 /// that may be vectorized as interleave, gather-scatter or scalarized.
1167 /// Also make a decision on what to do about call instructions in the loop
1168 /// at that VF -- scalarize, call a known vector routine, or call a
1169 /// vector intrinsic.
1171 // Do the analysis once.
1172 if (VF.isScalar() || Uniforms.contains(VF))
1173 return;
1175 collectLoopUniforms(VF);
1177 collectLoopScalars(VF);
1179 }
1180
1181 /// Returns true if the target machine supports masked store operation
1182 /// for the given \p DataType and kind of access to \p Ptr.
1183 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1184 unsigned AddressSpace) const {
1185 return Legal->isConsecutivePtr(DataType, Ptr) &&
1186 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1187 }
1188
1189 /// Returns true if the target machine supports masked load operation
1190 /// for the given \p DataType and kind of access to \p Ptr.
1191 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1192 unsigned AddressSpace) const {
1193 return Legal->isConsecutivePtr(DataType, Ptr) &&
1194 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1195 }
1196
1197 /// Returns true if the target machine can represent \p V as a masked gather
1198 /// or scatter operation.
1200 bool LI = isa<LoadInst>(V);
1201 bool SI = isa<StoreInst>(V);
1202 if (!LI && !SI)
1203 return false;
1204 auto *Ty = getLoadStoreType(V);
1206 if (VF.isVector())
1207 Ty = VectorType::get(Ty, VF);
1208 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1209 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1210 }
1211
1212 /// Returns true if the target machine supports all of the reduction
1213 /// variables found for the given VF.
1215 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1216 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1217 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1218 }));
1219 }
1220
1221 /// Given costs for both strategies, return true if the scalar predication
1222 /// lowering should be used for div/rem. This incorporates an override
1223 /// option so it is not simply a cost comparison.
1225 InstructionCost SafeDivisorCost) const {
1226 switch (ForceSafeDivisor) {
1227 case cl::BOU_UNSET:
1228 return ScalarCost < SafeDivisorCost;
1229 case cl::BOU_TRUE:
1230 return false;
1231 case cl::BOU_FALSE:
1232 return true;
1233 }
1234 llvm_unreachable("impossible case value");
1235 }
1236
1237 /// Returns true if \p I is an instruction which requires predication and
1238 /// for which our chosen predication strategy is scalarization (i.e. we
1239 /// don't have an alternate strategy such as masking available).
1240 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1242
1243 /// Returns true if \p I is an instruction that needs to be predicated
1244 /// at runtime. The result is independent of the predication mechanism.
1245 /// Superset of instructions that return true for isScalarWithPredication.
1246 bool isPredicatedInst(Instruction *I) const;
1247
1248 /// Return the costs for our two available strategies for lowering a
1249 /// div/rem operation which requires speculating at least one lane.
1250 /// First result is for scalarization (will be invalid for scalable
1251 /// vectors); second is for the safe-divisor strategy.
1252 std::pair<InstructionCost, InstructionCost>
1254 ElementCount VF) const;
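// Illustrative example of the two strategies (simplified): for a predicated
// "x / y", the safe-divisor strategy computes "x / select(mask, y, 1)" so
// that inactive lanes divide by a harmless constant, while the scalarization
// strategy emits a branch-guarded scalar division for each lane.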
1255
1256 /// Returns true if \p I is a memory instruction with consecutive memory
1257 /// access that can be widened.
1259
1260 /// Returns true if \p I is a memory instruction in an interleaved-group
1261 /// of memory accesses that can be vectorized with wide vector loads/stores
1262 /// and shuffles.
1264
1265 /// Check if \p Instr belongs to any interleaved access group.
1267 return InterleaveInfo.isInterleaved(Instr);
1268 }
1269
1270 /// Get the interleaved access group that \p Instr belongs to.
1273 return InterleaveInfo.getInterleaveGroup(Instr);
1274 }
1275
1276 /// Returns true if we're required to use a scalar epilogue for at least
1277 /// the final iteration of the original loop.
1278 bool requiresScalarEpilogue(bool IsVectorizing) const {
1279 if (!isScalarEpilogueAllowed()) {
1280 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1281 return false;
1282 }
1283 // If we might exit from anywhere but the latch and early exit vectorization
1284 // is disabled, we must run the exiting iteration in scalar form.
1285 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1286 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1287 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1288 "from latch block\n");
1289 return true;
1290 }
1291 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1292 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1293 "interleaved group requires scalar epilogue\n");
1294 return true;
1295 }
1296 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1297 return false;
1298 }
1299
1300 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1301 /// loop hint annotation.
1303 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1304 }
1305
1306 /// Returns the TailFoldingStyle that is best for the current loop.
1307 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1308 if (!ChosenTailFoldingStyle)
1310 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1311 : ChosenTailFoldingStyle->second;
1312 }
1313
1314 /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1315 /// update may overflow or not.
1316 /// \param IsScalableVF true if scalable vector factors enabled.
1317 /// \param UserIC User specific interleave count.
1318 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1319 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1320 if (!Legal->canFoldTailByMasking()) {
1321 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1322 return;
1323 }
1324
1325 // Default to TTI preference, but allow command line override.
1326 ChosenTailFoldingStyle = {
1327 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1328 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1329 if (ForceTailFoldingStyle.getNumOccurrences())
1330 ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1331 ForceTailFoldingStyle.getValue()};
1332
1333 if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
1334 ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
1335 return;
1336 // Override EVL styles if needed.
1337 // FIXME: Investigate opportunity for fixed vector factor.
1338 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1339 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1340 if (EVLIsLegal)
1341 return;
1342 // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
1343 // if it's allowed, or DataWithoutLaneMask otherwise.
1344 if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
1345 ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
1346 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1347 else
1348 ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1350
1351 LLVM_DEBUG(
1352 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1353 "not try to generate VP Intrinsics "
1354 << (UserIC > 1
1355 ? "since interleave count specified is greater than 1.\n"
1356 : "due to non-interleaving reasons.\n"));
1357 }
1358
1359 /// Returns true if all loop blocks should be masked to fold tail loop.
1360 bool foldTailByMasking() const {
1361 // TODO: check if it is possible to check for None style independent of
1362 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1364 }
1365
1366 /// Return maximum safe number of elements to be processed per vector
1367 /// iteration, which do not prevent store-load forwarding and are safe with
1368 /// regard to the memory dependencies. Required for EVL-based VPlans to
1369 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1370 /// MaxSafeElements).
1371 /// TODO: need to consider adjusting cost model to use this value as a
1372 /// vectorization factor for EVL-based vectorization.
1373 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
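// Illustrative example: if a memory dependence only permits 8 elements to be
// processed per vector iteration, MaxSafeElements is 8 and an EVL-based plan
// computes AVL = min(remaining trip count, 8) on every iteration.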
1374
1375 /// Returns true if the instructions in this block requires predication
1376 /// for any reason, e.g. because tail folding now requires a predicate
1377 /// or because the block in the original loop was predicated.
1379 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1380 }
1381
1382 /// Returns true if VP intrinsics with explicit vector length support should
1383 /// be generated in the tail folded loop.
1387
1388 /// Returns true if the Phi is part of an inloop reduction.
1389 bool isInLoopReduction(PHINode *Phi) const {
1390 return InLoopReductions.contains(Phi);
1391 }
1392
1393 /// Returns true if the predicated reduction select should be used to set the
1394 /// incoming value for the reduction phi.
1396 // Force to use predicated reduction select since the EVL of the
1397 // second-to-last iteration might not be VF*UF.
1398 if (foldTailWithEVL())
1399 return true;
1401 TTI.preferPredicatedReductionSelect();
1402 }
1403
1404 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1405 /// with factor VF. Return the cost of the instruction, including
1406 /// scalarization overhead if it's needed.
1408
1409 /// Estimate cost of a call instruction CI if it were vectorized with factor
1410 /// VF. Return the cost of the instruction, including scalarization overhead
1411 /// if it's needed.
1413
1414 /// Invalidates decisions already taken by the cost model.
1416 WideningDecisions.clear();
1417 CallWideningDecisions.clear();
1418 Uniforms.clear();
1419 Scalars.clear();
1420 }
1421
1422 /// Returns the expected execution cost. The unit of the cost does
1423 /// not matter because we use the 'cost' units to compare different
1424 /// vector widths. The cost that is returned is *not* normalized by
1425 /// the factor width.
1427
1428 bool hasPredStores() const { return NumPredStores > 0; }
1429
1430 /// Returns true if epilogue vectorization is considered profitable, and
1431 /// false otherwise.
1432 /// \p VF is the vectorization factor chosen for the original loop.
1433 /// \p Multiplier is an additional scaling factor applied to VF before
1434 /// comparing to EpilogueVectorizationMinVF.
1436 const unsigned IC) const;
1437
1438 /// Returns the execution time cost of an instruction for a given vector
1439 /// width. Vector width of one means scalar.
1441
1442 /// Return the cost of instructions in an inloop reduction pattern, if I is
1443 /// part of that pattern.
1444 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1445 ElementCount VF,
1446 Type *VectorTy) const;
1447
1448 /// Returns true if \p Op should be considered invariant and if it is
1449 /// trivially hoistable.
1451
1452 /// Return the value of vscale used for tuning the cost model.
1453 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1454
1455private:
1456 unsigned NumPredStores = 0;
1457
1458 /// Used to store the value of vscale used for tuning the cost model. It is
1459 /// initialized during object construction.
1460 std::optional<unsigned> VScaleForTuning;
1461
1462 /// Initializes the value of vscale used for tuning the cost model. If
1463 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1464 /// return the value returned by the corresponding TTI method.
1465 void initializeVScaleForTuning() {
1466 const Function *Fn = TheLoop->getHeader()->getParent();
1467 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1468 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1469 auto Min = Attr.getVScaleRangeMin();
1470 auto Max = Attr.getVScaleRangeMax();
1471 if (Max && Min == Max) {
1472 VScaleForTuning = Max;
1473 return;
1474 }
1475 }
1476
1477 VScaleForTuning = TTI.getVScaleForTuning();
1478 }
1479
1480 /// \return An upper bound for the vectorization factors for both
1481 /// fixed and scalable vectorization, where the minimum-known number of
1482 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1483 /// disabled or unsupported, then the scalable part will be equal to
1484 /// ElementCount::getScalable(0).
1485 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1486 ElementCount UserVF,
1487 bool FoldTailByMasking);
1488
1489 /// If \p VF > MaxTripCount, clamps it to the next lower VF that is <=
1490 /// MaxTripCount.
1491 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
1492 bool FoldTailByMasking) const;
1493
1494 /// \return the maximized element count based on the targets vector
1495 /// registers and the loop trip-count, but limited to a maximum safe VF.
1496 /// This is a helper function of computeFeasibleMaxVF.
1497 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1498 unsigned SmallestType,
1499 unsigned WidestType,
1500 ElementCount MaxSafeVF,
1501 bool FoldTailByMasking);
1502
1503 /// Checks if scalable vectorization is supported and enabled. Caches the
1504 /// result to avoid repeated debug dumps for repeated queries.
1505 bool isScalableVectorizationAllowed();
1506
1507 /// \return the maximum legal scalable VF, based on the safe max number
1508 /// of elements.
1509 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1510
1511 /// Calculate vectorization cost of memory instruction \p I.
1512 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1513
1514 /// The cost computation for scalarized memory instruction.
1515 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1516
1517 /// The cost computation for interleaving group of memory instructions.
1518 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1519
1520 /// The cost computation for Gather/Scatter instruction.
1521 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1522
1523 /// The cost computation for widening instruction \p I with consecutive
1524 /// memory access.
1525 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1526
1527 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1528 /// Load: scalar load + broadcast.
1529 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1530 /// element)
1531 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1532
1533 /// Estimate the overhead of scalarizing an instruction. This is a
1534 /// convenience wrapper for the type-based getScalarizationOverhead API.
1535 InstructionCost getScalarizationOverhead(Instruction *I,
1536 ElementCount VF) const;
1537
1538 /// Returns true if an artificially high cost for emulated masked memrefs
1539 /// should be used.
1540 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1541
1542 /// Map of scalar integer values to the smallest bitwidth they can be legally
1543 /// represented as. The vector equivalents of these values should be truncated
1544 /// to this type.
1545 MapVector<Instruction *, uint64_t> MinBWs;
1546
1547 /// A type representing the costs for instructions if they were to be
1548 /// scalarized rather than vectorized. The entries are Instruction-Cost
1549 /// pairs.
1550 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1551
1552 /// A set containing all BasicBlocks that are known to be present after
1553 /// vectorization as a predicated block.
1554 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1555 PredicatedBBsAfterVectorization;
1556
1557 /// Records whether it is allowed to have the original scalar loop execute at
1558 /// least once. This may be needed as a fallback loop in case runtime
1559 /// aliasing/dependence checks fail, or to handle the tail/remainder
1560 /// iterations when the trip count is unknown or doesn't divide by the VF,
1561 /// or as a peel-loop to handle gaps in interleave-groups.
1562 /// Under optsize and when the trip count is very small we don't allow any
1563 /// iterations to execute in the scalar loop.
1564 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1565
1566 /// Controls the finally chosen tail folding style. The first element is used
1567 /// if the IV update may overflow; the second if it does not.
1568 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1569 ChosenTailFoldingStyle;
1570
1571 /// true if scalable vectorization is supported and enabled.
1572 std::optional<bool> IsScalableVectorizationAllowed;
1573
1574 /// Maximum safe number of elements to be processed per vector iteration,
1575 /// which do not prevent store-load forwarding and are safe with regard to the
1576 /// memory dependencies. Required for EVL-based vectorization, where this
1577 /// value is used as the upper bound of the safe AVL.
1578 std::optional<unsigned> MaxSafeElements;
1579
1580 /// A map holding scalar costs for different vectorization factors. The
1581 /// presence of a cost for an instruction in the mapping indicates that the
1582 /// instruction will be scalarized when vectorizing with the associated
1583 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1584 MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
1585
1586 /// Holds the instructions known to be uniform after vectorization.
1587 /// The data is collected per VF.
1588 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1589
1590 /// Holds the instructions known to be scalar after vectorization.
1591 /// The data is collected per VF.
1592 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1593
1594 /// Holds the instructions (address computations) that are forced to be
1595 /// scalarized.
1596 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1597
1598 /// PHINodes of the reductions that should be expanded in-loop.
1599 SmallPtrSet<PHINode *, 4> InLoopReductions;
1600
1601 /// A Map of inloop reduction operations and their immediate chain operand.
1602 /// FIXME: This can be removed once reductions can be costed correctly in
1603 /// VPlan. This was added to allow quick lookup of the inloop operations.
1604 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1605
1606 /// Returns the expected difference in cost from scalarizing the expression
1607 /// feeding a predicated instruction \p PredInst. The instructions to
1608 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1609 /// non-negative return value implies the expression will be scalarized.
1610 /// Currently, only single-use chains are considered for scalarization.
1611 InstructionCost computePredInstDiscount(Instruction *PredInst,
1612 ScalarCostsTy &ScalarCosts,
1613 ElementCount VF);
1614
1615 /// Collect the instructions that are uniform after vectorization. An
1616 /// instruction is uniform if we represent it with a single scalar value in
1617 /// the vectorized loop corresponding to each vector iteration. Examples of
1618 /// uniform instructions include pointer operands of consecutive or
1619 /// interleaved memory accesses. Note that although uniformity implies an
1620 /// instruction will be scalar, the reverse is not true. In general, a
1621 /// scalarized instruction will be represented by VF scalar values in the
1622 /// vectorized loop, each corresponding to an iteration of the original
1623 /// scalar loop.
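/// For example (illustrative), with VF = 4 the scalar pointer operand of a
/// consecutive load is emitted once per vector iteration, whereas a scalarized
/// instruction is emitted four times, once per lane.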
1624 void collectLoopUniforms(ElementCount VF);
1625
1626 /// Collect the instructions that are scalar after vectorization. An
1627 /// instruction is scalar if it is known to be uniform or will be scalarized
1628 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1629 /// to the list if they are used by a load/store instruction that is marked as
1630 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1631 /// VF values in the vectorized loop, each corresponding to an iteration of
1632 /// the original scalar loop.
1633 void collectLoopScalars(ElementCount VF);
1634
1635 /// Keeps cost model vectorization decision and cost for instructions.
1636 /// Right now it is used for memory instructions only.
1637 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1638 std::pair<InstWidening, InstructionCost>>;
1639
1640 DecisionList WideningDecisions;
1641
1642 using CallDecisionList =
1643 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1644
1645 CallDecisionList CallWideningDecisions;
1646
1647 /// Returns true if \p V is expected to be vectorized and it needs to be
1648 /// extracted.
1649 bool needsExtract(Value *V, ElementCount VF) const {
1650 Instruction *I = dyn_cast<Instruction>(V);
1651 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1652 TheLoop->isLoopInvariant(I) ||
1654 (isa<CallInst>(I) &&
1656 return false;
1657
1658 // Assume we can vectorize V (and hence we need extraction) if the
1659 // scalars are not computed yet. This can happen, because it is called
1660 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1661 // the scalars are collected. That should be a safe assumption in most
1662 // cases, because we check if the operands have vectorizable types
1663 // beforehand in LoopVectorizationLegality.
1664 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1665 };
1666
1667 /// Returns a range containing only operands needing to be extracted.
1668 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1669 ElementCount VF) const {
1670
1671 SmallPtrSet<const Value *, 4> UniqueOperands;
1672 SmallVector<Value *, 4> Res;
1673 for (Value *Op : Ops) {
1674 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1675 !needsExtract(Op, VF))
1676 continue;
1677 Res.push_back(Op);
1678 }
1679 return Res;
1680 }
1681
1682public:
1683 /// The loop that we evaluate.
1685
1686 /// Predicated scalar evolution analysis.
1688
1689 /// Loop Info analysis.
1691
1692 /// Vectorization legality.
1694
1695 /// Vector target information.
1697
1698 /// Target Library Info.
1700
1701 /// Demanded bits analysis.
1703
1704 /// Assumption cache.
1706
1707 /// Interface to emit optimization remarks.
1709
1711
1712 /// Loop Vectorize Hint.
1714
1715 /// The interleaved access information contains groups of interleaved accesses
1716 /// with the same stride that are located close to each other.
1718
1719 /// Values to ignore in the cost model.
1721
1722 /// Values to ignore in the cost model when VF > 1.
1724
1725 /// All element types found in the loop.
1727
1728 /// The kind of cost that we are calculating
1730
1731 /// Whether this loop should be optimized for size based on function attribute
1732 /// or profile information.
1734
1735 /// The highest VF possible for this loop, without using MaxBandwidth.
1737};
1738} // end namespace llvm
1739
1740namespace {
1741/// Helper struct to manage generating runtime checks for vectorization.
1742///
1743 /// The runtime checks are created up-front in temporary blocks to allow better
1744 /// estimation of their cost, and are un-linked from the existing IR. After
1745 /// deciding to vectorize, the checks are moved back into place. If we decide not
1746 /// to vectorize, the temporary blocks are removed completely.
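/// A rough lifecycle sketch: create() builds the check blocks up-front and
/// un-links them from the IR, the checks' cost is then estimated from these
/// temporary blocks, and the blocks are either re-linked during vector code
/// generation or erased by the destructor if they end up unused.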
1747class GeneratedRTChecks {
1748 /// Basic block which contains the generated SCEV checks, if any.
1749 BasicBlock *SCEVCheckBlock = nullptr;
1750
1751 /// The value representing the result of the generated SCEV checks. If it is
1752 /// nullptr no SCEV checks have been generated.
1753 Value *SCEVCheckCond = nullptr;
1754
1755 /// Basic block which contains the generated memory runtime checks, if any.
1756 BasicBlock *MemCheckBlock = nullptr;
1757
1758 /// The value representing the result of the generated memory runtime checks.
1759 /// If it is nullptr no memory runtime checks have been generated.
1760 Value *MemRuntimeCheckCond = nullptr;
1761
1762 DominatorTree *DT;
1763 LoopInfo *LI;
1765
1766 SCEVExpander SCEVExp;
1767 SCEVExpander MemCheckExp;
1768
1769 bool CostTooHigh = false;
1770
1771 Loop *OuterLoop = nullptr;
1772
1774
1775 /// The kind of cost that we are calculating
1777
1778public:
1779 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1782 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1783 MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
1784 CostKind(CostKind) {}
1785
1786 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1787 /// accurately estimate the cost of the runtime checks. The blocks are
1788 /// un-linked from the IR and are added back during vector code generation. If
1789 /// there is no vector code generation, the check blocks are removed
1790 /// completely.
1791 void create(Loop *L, const LoopAccessInfo &LAI,
1792 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1793
1794 // Hard cutoff to limit compile-time increase in case a very large number of
1795 // runtime checks needs to be generated.
1796 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1797 // profile info.
1798 CostTooHigh =
1800 if (CostTooHigh)
1801 return;
1802
1803 BasicBlock *LoopHeader = L->getHeader();
1804 BasicBlock *Preheader = L->getLoopPreheader();
1805
1806 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1807 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1808 // may be used by SCEVExpander. The blocks will be un-linked from their
1809 // predecessors and removed from LI & DT at the end of the function.
1810 if (!UnionPred.isAlwaysTrue()) {
1811 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1812 nullptr, "vector.scevcheck");
1813
1814 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1815 &UnionPred, SCEVCheckBlock->getTerminator());
1816 if (isa<Constant>(SCEVCheckCond)) {
1817 // Clean up directly after expanding the predicate to a constant, to
1818 // avoid further expansions re-using anything left over from SCEVExp.
1819 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1820 SCEVCleaner.cleanup();
1821 }
1822 }
1823
1824 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1825 if (RtPtrChecking.Need) {
1826 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1827 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1828 "vector.memcheck");
1829
1830 auto DiffChecks = RtPtrChecking.getDiffChecks();
1831 if (DiffChecks) {
1832 Value *RuntimeVF = nullptr;
1833 MemRuntimeCheckCond = addDiffRuntimeChecks(
1834 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1835 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1836 if (!RuntimeVF)
1837 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1838 return RuntimeVF;
1839 },
1840 IC);
1841 } else {
1842 MemRuntimeCheckCond = addRuntimeChecks(
1843 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1845 }
1846 assert(MemRuntimeCheckCond &&
1847 "no RT checks generated although RtPtrChecking "
1848 "claimed checks are required");
1849 }
1850
1851 SCEVExp.eraseDeadInstructions(SCEVCheckCond);
1852
1853 if (!MemCheckBlock && !SCEVCheckBlock)
1854 return;
1855
1856 // Unhook the temporary blocks containing the checks and update various
1857 // places accordingly.
1858 if (SCEVCheckBlock)
1859 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1860 if (MemCheckBlock)
1861 MemCheckBlock->replaceAllUsesWith(Preheader);
1862
1863 if (SCEVCheckBlock) {
1864 SCEVCheckBlock->getTerminator()->moveBefore(
1865 Preheader->getTerminator()->getIterator());
1866 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1867 UI->setDebugLoc(DebugLoc::getTemporary());
1868 Preheader->getTerminator()->eraseFromParent();
1869 }
1870 if (MemCheckBlock) {
1871 MemCheckBlock->getTerminator()->moveBefore(
1872 Preheader->getTerminator()->getIterator());
1873 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1874 UI->setDebugLoc(DebugLoc::getTemporary());
1875 Preheader->getTerminator()->eraseFromParent();
1876 }
1877
1878 DT->changeImmediateDominator(LoopHeader, Preheader);
1879 if (MemCheckBlock) {
1880 DT->eraseNode(MemCheckBlock);
1881 LI->removeBlock(MemCheckBlock);
1882 }
1883 if (SCEVCheckBlock) {
1884 DT->eraseNode(SCEVCheckBlock);
1885 LI->removeBlock(SCEVCheckBlock);
1886 }
1887
1888 // Outer loop is used as part of the later cost calculations.
1889 OuterLoop = L->getParentLoop();
1890 }
1891
1893 if (SCEVCheckBlock || MemCheckBlock)
1894 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1895
1896 if (CostTooHigh) {
1898 Cost.setInvalid();
1899 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1900 return Cost;
1901 }
1902
1903 InstructionCost RTCheckCost = 0;
1904 if (SCEVCheckBlock)
1905 for (Instruction &I : *SCEVCheckBlock) {
1906 if (SCEVCheckBlock->getTerminator() == &I)
1907 continue;
1909 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1910 RTCheckCost += C;
1911 }
1912 if (MemCheckBlock) {
1913 InstructionCost MemCheckCost = 0;
1914 for (Instruction &I : *MemCheckBlock) {
1915 if (MemCheckBlock->getTerminator() == &I)
1916 continue;
1918 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1919 MemCheckCost += C;
1920 }
1921
1922 // If the runtime memory checks are being created inside an outer loop,
1923 // we should find out whether these checks are outer-loop invariant. If so,
1924 // the checks will likely be hoisted out, and the effective cost is reduced
1925 // according to the outer loop trip count.
1926 if (OuterLoop) {
1927 ScalarEvolution *SE = MemCheckExp.getSE();
1928 // TODO: If profitable, we could refine this further by analysing every
1929 // individual memory check, since there could be a mixture of loop
1930 // variant and invariant checks that mean the final condition is
1931 // variant.
1932 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1933 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1934 // It seems reasonable to assume that we can reduce the effective
1935 // cost of the checks even when we know nothing about the trip
1936 // count. Assume that the outer loop executes at least twice.
1937 unsigned BestTripCount = 2;
1938
1939 // Get the best known TC estimate.
1940 if (auto EstimatedTC = getSmallBestKnownTC(
1941 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1942 if (EstimatedTC->isFixed())
1943 BestTripCount = EstimatedTC->getFixedValue();
1944
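// For example (illustrative numbers), a MemCheckCost of 8 with BestTripCount
// = 4 yields an effective NewMemCheckCost of 8 / 4 = 2.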
1945 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1946
1947 // Let's ensure the cost is always at least 1.
1948 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
1949 (InstructionCost::CostType)1);
1950
1951 if (BestTripCount > 1)
1953 << "We expect runtime memory checks to be hoisted "
1954 << "out of the outer loop. Cost reduced from "
1955 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1956
1957 MemCheckCost = NewMemCheckCost;
1958 }
1959 }
1960
1961 RTCheckCost += MemCheckCost;
1962 }
1963
1964 if (SCEVCheckBlock || MemCheckBlock)
1965 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1966 << "\n");
1967
1968 return RTCheckCost;
1969 }
1970
1971 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1972 /// unused.
1973 ~GeneratedRTChecks() {
1974 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1975 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1976 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
1977 bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
1978 if (SCEVChecksUsed)
1979 SCEVCleaner.markResultUsed();
1980
1981 if (MemChecksUsed) {
1982 MemCheckCleaner.markResultUsed();
1983 } else {
1984 auto &SE = *MemCheckExp.getSE();
1985 // Memory runtime check generation creates compares that use expanded
1986 // values. Remove them before running the SCEVExpanderCleaners.
1987 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1988 if (MemCheckExp.isInsertedInstruction(&I))
1989 continue;
1990 SE.forgetValue(&I);
1991 I.eraseFromParent();
1992 }
1993 }
1994 MemCheckCleaner.cleanup();
1995 SCEVCleaner.cleanup();
1996
1997 if (!SCEVChecksUsed)
1998 SCEVCheckBlock->eraseFromParent();
1999 if (!MemChecksUsed)
2000 MemCheckBlock->eraseFromParent();
2001 }
2002
2003 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2004 /// outside VPlan.
2005 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
2006 using namespace llvm::PatternMatch;
2007 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2008 return {nullptr, nullptr};
2009
2010 return {SCEVCheckCond, SCEVCheckBlock};
2011 }
2012
2013 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2014 /// outside VPlan.
2015 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
2016 using namespace llvm::PatternMatch;
2017 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
2018 return {nullptr, nullptr};
2019 return {MemRuntimeCheckCond, MemCheckBlock};
2020 }
2021
2022 /// Return true if any runtime checks have been added.
2023 bool hasChecks() const {
2024 return getSCEVChecks().first || getMemRuntimeChecks().first;
2025 }
2026};
2027} // namespace
2028
2034
2039
2040// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2041// vectorization. The loop needs to be annotated with #pragma omp simd
2042// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2043// vector length information is not provided, vectorization is not considered
2044// explicit. Interleave hints are not allowed either. These limitations will be
2045// relaxed in the future.
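// For example, an outer loop annotated with
// '#pragma clang loop vectorize(enable) vectorize_width(4)' or
// '#pragma omp simd simdlen(4)' is considered for explicit vectorization,
// whereas an annotation without a vector length is not.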
2046 // Please note that we are currently forced to abuse the pragma 'clang
2047// vectorize' semantics. This pragma provides *auto-vectorization hints*
2048// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2049// provides *explicit vectorization hints* (LV can bypass legal checks and
2050// assume that vectorization is legal). However, both hints are implemented
2051// using the same metadata (llvm.loop.vectorize, processed by
2052// LoopVectorizeHints). This will be fixed in the future when the native IR
2053// representation for pragma 'omp simd' is introduced.
2054static bool isExplicitVecOuterLoop(Loop *OuterLp,
2056 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2057 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2058
2059 // Only outer loops with an explicit vectorization hint are supported.
2060 // Unannotated outer loops are ignored.
2062 return false;
2063
2064 Function *Fn = OuterLp->getHeader()->getParent();
2065 if (!Hints.allowVectorization(Fn, OuterLp,
2066 true /*VectorizeOnlyWhenForced*/)) {
2067 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2068 return false;
2069 }
2070
2071 if (Hints.getInterleave() > 1) {
2072 // TODO: Interleave support is future work.
2073 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2074 "outer loops.\n");
2075 Hints.emitRemarkWithHints();
2076 return false;
2077 }
2078
2079 return true;
2080}
2081
2085 // Collect inner loops and outer loops without irreducible control flow. For
2086 // now, only collect outer loops that have explicit vectorization hints. If we
2087 // are stress testing the VPlan H-CFG construction, we collect the outermost
2088 // loop of every loop nest.
2089 if (L.isInnermost() || VPlanBuildStressTest ||
2091 LoopBlocksRPO RPOT(&L);
2092 RPOT.perform(LI);
2094 V.push_back(&L);
2095 // TODO: Collect inner loops inside marked outer loops in case
2096 // vectorization fails for the outer loop. Do not invoke
2097 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2098 // already known to be reducible. We can use an inherited attribute for
2099 // that.
2100 return;
2101 }
2102 }
2103 for (Loop *InnerL : L)
2104 collectSupportedLoops(*InnerL, LI, ORE, V);
2105}
2106
2107//===----------------------------------------------------------------------===//
2108// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2109// LoopVectorizationCostModel and LoopVectorizationPlanner.
2110//===----------------------------------------------------------------------===//
2111
2112/// Compute the transformed value of Index at offset StartValue using step
2113/// StepValue.
2114/// For integer induction, returns StartValue + Index * StepValue.
2115/// For pointer induction, returns StartValue[Index * StepValue].
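/// For example, an integer induction with StartValue 7, StepValue 3 and
/// Index 4 yields the transformed value 7 + 4 * 3 = 19.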
2116/// FIXME: The newly created binary instructions should contain nsw/nuw
2117/// flags, which can be found from the original scalar operations.
2118static Value *
2120 Value *Step,
2122 const BinaryOperator *InductionBinOp) {
2123 using namespace llvm::PatternMatch;
2124 Type *StepTy = Step->getType();
2125 Value *CastedIndex = StepTy->isIntegerTy()
2126 ? B.CreateSExtOrTrunc(Index, StepTy)
2127 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2128 if (CastedIndex != Index) {
2129 CastedIndex->setName(CastedIndex->getName() + ".cast");
2130 Index = CastedIndex;
2131 }
2132
2133 // Note: the IR at this point is broken. We cannot use SE to create any new
2134 // SCEV and then expand it, hoping that SCEV's simplification will give us
2135 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2136 // lead to various SCEV crashes. So all we can do is use the builder and rely
2137 // on InstCombine for future simplifications. Here we handle only some trivial
2138 // cases.
2139 auto CreateAdd = [&B](Value *X, Value *Y) {
2140 assert(X->getType() == Y->getType() && "Types don't match!");
2141 if (match(X, m_ZeroInt()))
2142 return Y;
2143 if (match(Y, m_ZeroInt()))
2144 return X;
2145 return B.CreateAdd(X, Y);
2146 };
2147
2148 // We allow X to be a vector type, in which case Y will potentially be
2149 // splatted into a vector with the same element count.
2150 auto CreateMul = [&B](Value *X, Value *Y) {
2151 assert(X->getType()->getScalarType() == Y->getType() &&
2152 "Types don't match!");
2153 if (match(X, m_One()))
2154 return Y;
2155 if (match(Y, m_One()))
2156 return X;
2157 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2158 if (XVTy && !isa<VectorType>(Y->getType()))
2159 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2160 return B.CreateMul(X, Y);
2161 };
2162
2163 switch (InductionKind) {
2165 assert(!isa<VectorType>(Index->getType()) &&
2166 "Vector indices not supported for integer inductions yet");
2167 assert(Index->getType() == StartValue->getType() &&
2168 "Index type does not match StartValue type");
2169 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2170 return B.CreateSub(StartValue, Index);
2171 auto *Offset = CreateMul(Index, Step);
2172 return CreateAdd(StartValue, Offset);
2173 }
2175 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2177 assert(!isa<VectorType>(Index->getType()) &&
2178 "Vector indices not supported for FP inductions yet");
2179 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2180 assert(InductionBinOp &&
2181 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2182 InductionBinOp->getOpcode() == Instruction::FSub) &&
2183 "Original bin op should be defined for FP induction");
2184
2185 Value *MulExp = B.CreateFMul(Step, Index);
2186 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2187 "induction");
2188 }
2190 return nullptr;
2191 }
2192 llvm_unreachable("invalid enum");
2193}
2194
2195static std::optional<unsigned> getMaxVScale(const Function &F,
2196 const TargetTransformInfo &TTI) {
2197 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2198 return MaxVScale;
2199
2200 if (F.hasFnAttribute(Attribute::VScaleRange))
2201 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2202
2203 return std::nullopt;
2204}
2205
2206 /// For the given VF and UF and the maximum trip count computed for the loop,
2207 /// return whether the induction variable of the vectorized loop is known not to
2208 /// overflow, i.e. whether the runtime overflow check is known to always evaluate
2209 /// to false and can therefore be removed.
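/// For example (illustrative), with an i8 widest induction type (unsigned max
/// 255), a constant max trip count of 200, VF = 4 and UF = 2, we have
/// 255 - 200 = 55 > 4 * 2 = 8, so the overflow check is known to be false.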
2211 const LoopVectorizationCostModel *Cost,
2212 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2213 // Always be conservative if we don't know the exact unroll factor.
2214 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2215
2216 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2217 APInt MaxUIntTripCount = IdxTy->getMask();
2218
2219 // The runtime overflow check is known to be false iff the (max) trip-count
2220 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2221 // the vector loop induction variable.
2222 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2223 uint64_t MaxVF = VF.getKnownMinValue();
2224 if (VF.isScalable()) {
2225 std::optional<unsigned> MaxVScale =
2226 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2227 if (!MaxVScale)
2228 return false;
2229 MaxVF *= *MaxVScale;
2230 }
2231
2232 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2233 }
2234
2235 return false;
2236}
2237
2238// Return whether we allow using masked interleave-groups (for dealing with
2239// strided loads/stores that reside in predicated blocks, or for dealing
2240// with gaps).
2242 // If an override option has been passed in for interleaved accesses, use it.
2243 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2245
2246 return TTI.enableMaskedInterleavedAccessVectorization();
2247}
2248
2250 BasicBlock *CheckIRBB) {
2251 // Note: The block with the minimum trip-count check is already connected
2252 // during earlier VPlan construction.
2253 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2254 VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
2255 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2256 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
2257 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2258 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
2259 PreVectorPH = CheckVPIRBB;
2260 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2261 PreVectorPH->swapSuccessors();
2262
2263 // We just connected a new block to the scalar preheader. Update all
2264 // VPPhis by adding an incoming value for it, replicating the last value.
2265 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
2266 for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
2267 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
2268 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
2269 "must have incoming values for all operands");
2270 R.addOperand(R.getOperand(NumPredecessors - 2));
2271 }
2272}
2273
2275 BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
2276 // Generate code to check if the loop's trip count is less than VF * UF, or
2277 // equal to it in case a scalar epilogue is required; this implies that the
2278 // vector trip count is zero. This check also covers the case where adding one
2279 // to the backedge-taken count overflowed leading to an incorrect trip count
2280 // of zero. In this case we will also jump to the scalar loop.
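// For example (illustrative, assuming a fixed VF of 4, UF of 2 and no required
// scalar epilogue), this emits a comparison along the lines of
//   %min.iters.check = icmp ult i64 %trip.count, 8
// which selects the scalar loop when true.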
2281 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2283
2284 // Reuse existing vector loop preheader for TC checks.
2285 // Note that new preheader block is generated for vector loop.
2286 BasicBlock *const TCCheckBlock = VectorPH;
2288 TCCheckBlock->getContext(),
2289 InstSimplifyFolder(TCCheckBlock->getDataLayout()));
2290 Builder.SetInsertPoint(TCCheckBlock->getTerminator());
2291
2292 // If tail is to be folded, vector loop takes care of all iterations.
2294 Type *CountTy = Count->getType();
2295 Value *CheckMinIters = Builder.getFalse();
2296 auto CreateStep = [&]() -> Value * {
2297 // Create step with max(MinProTripCount, UF * VF).
2298 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2299 return createStepForVF(Builder, CountTy, VF, UF);
2300
2301 Value *MinProfTC =
2302 Builder.CreateElementCount(CountTy, MinProfitableTripCount);
2303 if (!VF.isScalable())
2304 return MinProfTC;
2305 return Builder.CreateBinaryIntrinsic(
2306 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2307 };
2308
2309 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2310 if (Style == TailFoldingStyle::None) {
2311 Value *Step = CreateStep();
2312 ScalarEvolution &SE = *PSE.getSE();
2313 // TODO: Emit unconditional branch to vector preheader instead of
2314 // conditional branch with known condition.
2315 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2316 // Check if the trip count is < the step.
2317 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2318 // TODO: Ensure step is at most the trip count when determining max VF and
2319 // UF, w/o tail folding.
2320 CheckMinIters = Builder.getTrue();
2322 TripCountSCEV, SE.getSCEV(Step))) {
2323 // Generate the minimum iteration check only if we cannot prove the
2324 // check is known to be true, or known to be false.
2325 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2326 } // else step known to be < trip count, use CheckMinIters preset to false.
2327 } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
2330 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2331 // an overflow to zero when updating induction variables and so an
2332 // additional overflow check is required before entering the vector loop.
2333
2334 // Get the maximum unsigned value for the type.
2335 Value *MaxUIntTripCount =
2336 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2337 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2338
2339 // Don't execute the vector loop if (UMax - n) < (VF * UF).
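// For example (illustrative), for an i32 trip count n with VF = vscale x 4 and
// UF = 2, the vector loop is skipped when (UINT32_MAX - n) < vscale * 8, since
// stepping the induction variable by vscale * 8 could then overflow.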
2340 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2341 }
2342 return CheckMinIters;
2343}
2344
2345/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2346/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2347/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2348/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2350 BasicBlock *IRBB) {
2351 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2352 auto IP = IRVPBB->begin();
2353 for (auto &R : make_early_inc_range(VPBB->phis()))
2354 R.moveBefore(*IRVPBB, IP);
2355
2356 for (auto &R :
2358 R.moveBefore(*IRVPBB, IRVPBB->end());
2359
2360 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2361 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2362 return IRVPBB;
2363}
2364
2366 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2367 assert(VectorPH && "Invalid loop structure");
2368 assert((OrigLoop->getUniqueLatchExitBlock() ||
2369 Cost->requiresScalarEpilogue(VF.isVector())) &&
2370 "loops not exiting via the latch without required epilogue?");
2371
2372 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2373 // wrapping the newly created scalar preheader here at the moment, because the
2374 // Plan's scalar preheader may be unreachable at this point. Instead it is
2375 // replaced in executePlan.
2376 return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
2377 Twine(Prefix) + "scalar.ph");
2378}
2379
2380/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2381/// expansion results.
2383 const SCEV2ValueTy &ExpandedSCEVs) {
2384 const SCEV *Step = ID.getStep();
2385 if (auto *C = dyn_cast<SCEVConstant>(Step))
2386 return C->getValue();
2387 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2388 return U->getValue();
2389 Value *V = ExpandedSCEVs.lookup(Step);
2390 assert(V && "SCEV must be expanded at this point");
2391 return V;
2392}
2393
2394/// Knowing that loop \p L executes a single vector iteration, add instructions
2395/// that will get simplified and thus should not have any cost to \p
2396/// InstsToIgnore.
2399 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2400 auto *Cmp = L->getLatchCmpInst();
2401 if (Cmp)
2402 InstsToIgnore.insert(Cmp);
2403 for (const auto &KV : IL) {
2404 // Extract the key by hand so that it can be used in the lambda below. Note
2405 // that captured structured bindings are a C++20 extension.
2406 const PHINode *IV = KV.first;
2407
2408 // Get next iteration value of the induction variable.
2409 Instruction *IVInst =
2410 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2411 if (all_of(IVInst->users(),
2412 [&](const User *U) { return U == IV || U == Cmp; }))
2413 InstsToIgnore.insert(IVInst);
2414 }
2415}
2416
2418 // Create a new IR basic block for the scalar preheader.
2419 BasicBlock *ScalarPH = createScalarPreheader("");
2420 return ScalarPH->getSinglePredecessor();
2421}
2422
2423namespace {
2424
2425struct CSEDenseMapInfo {
2426 static bool canHandle(const Instruction *I) {
2429 }
2430
2431 static inline Instruction *getEmptyKey() {
2433 }
2434
2435 static inline Instruction *getTombstoneKey() {
2436 return DenseMapInfo<Instruction *>::getTombstoneKey();
2437 }
2438
2439 static unsigned getHashValue(const Instruction *I) {
2440 assert(canHandle(I) && "Unknown instruction!");
2441 return hash_combine(I->getOpcode(),
2442 hash_combine_range(I->operand_values()));
2443 }
2444
2445 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2446 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2447 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2448 return LHS == RHS;
2449 return LHS->isIdenticalTo(RHS);
2450 }
2451};
2452
2453} // end anonymous namespace
2454
2455 /// Perform common subexpression elimination (CSE) of induction variable instructions.
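/// For example, if two identical induction-step computations end up in the
/// vector loop header, the second is replaced by the first and erased.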
2456static void cse(BasicBlock *BB) {
2457 // Perform simple cse.
2459 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2460 if (!CSEDenseMapInfo::canHandle(&In))
2461 continue;
2462
2463 // Check if we can replace this instruction with any of the
2464 // visited instructions.
2465 if (Instruction *V = CSEMap.lookup(&In)) {
2466 In.replaceAllUsesWith(V);
2467 In.eraseFromParent();
2468 continue;
2469 }
2470
2471 CSEMap[&In] = &In;
2472 }
2473}
2474
2475/// This function attempts to return a value that represents the ElementCount
2476/// at runtime. For fixed-width VFs we know this precisely at compile
2477/// time, but for scalable VFs we calculate it based on an estimate of the
2478/// vscale value.
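/// For example, a scalable VF of vscale x 4 with an estimated vscale of 2
/// gives an estimated element count of 8; a fixed VF of 4 is returned as-is.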
2480 std::optional<unsigned> VScale) {
2481 unsigned EstimatedVF = VF.getKnownMinValue();
2482 if (VF.isScalable())
2483 if (VScale)
2484 EstimatedVF *= *VScale;
2485 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2486 return EstimatedVF;
2487}
2488
2491 ElementCount VF) const {
2492 // We only need to calculate a cost if the VF is scalar; for actual vectors
2493 // we should already have a pre-calculated cost at each VF.
2494 if (!VF.isScalar())
2495 return getCallWideningDecision(CI, VF).Cost;
2496
2497 Type *RetTy = CI->getType();
2499 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2500 return *RedCost;
2501
2503 for (auto &ArgOp : CI->args())
2504 Tys.push_back(ArgOp->getType());
2505
2506 InstructionCost ScalarCallCost =
2507 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2508
2509 // If this is an intrinsic we may have a lower cost for it.
2512 return std::min(ScalarCallCost, IntrinsicCost);
2513 }
2514 return ScalarCallCost;
2515}
2516
2518 if (VF.isScalar() || !canVectorizeTy(Ty))
2519 return Ty;
2520 return toVectorizedTy(Ty, VF);
2521}
2522
2525 ElementCount VF) const {
2527 assert(ID && "Expected intrinsic call!");
2528 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2529 FastMathFlags FMF;
2530 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2531 FMF = FPMO->getFastMathFlags();
2532
2535 SmallVector<Type *> ParamTys;
2536 std::transform(FTy->param_begin(), FTy->param_end(),
2537 std::back_inserter(ParamTys),
2538 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2539
2540 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2543 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2544}
2545
2547 // Fix widened non-induction PHIs by setting up the PHI operands.
2548 fixNonInductionPHIs(State);
2549
2550 // Don't apply optimizations below when no (vector) loop remains, as they all
2551 // require one at the moment.
2552 VPBasicBlock *HeaderVPBB =
2553 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2554 if (!HeaderVPBB)
2555 return;
2556
2557 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2558
2559 // Remove redundant induction instructions.
2560 cse(HeaderBB);
2561}
2562
2564 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2566 for (VPRecipeBase &P : VPBB->phis()) {
2568 if (!VPPhi)
2569 continue;
2570 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2571 // Make sure the builder has a valid insert point.
2572 Builder.SetInsertPoint(NewPhi);
2573 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2574 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2575 }
2576 }
2577}
2578
2579void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2580 // We should not collect Scalars more than once per VF. Right now, this
2581 // function is called from collectUniformsAndScalars(), which already does
2582 // this check. Collecting Scalars for VF=1 does not make any sense.
2583 assert(VF.isVector() && !Scalars.contains(VF) &&
2584 "This function should not be visited twice for the same VF");
2585
2586 // This avoids any chances of creating a REPLICATE recipe during planning
2587 // since that would result in generation of scalarized code during execution,
2588 // which is not supported for scalable vectors.
2589 if (VF.isScalable()) {
2590 Scalars[VF].insert_range(Uniforms[VF]);
2591 return;
2592 }
2593
2595
2596 // These sets are used to seed the analysis with pointers used by memory
2597 // accesses that will remain scalar.
2599 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2600 auto *Latch = TheLoop->getLoopLatch();
2601
2602 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2603 // The pointer operands of loads and stores will be scalar as long as the
2604 // memory access is not a gather or scatter operation. The value operand of a
2605 // store will remain scalar if the store is scalarized.
2606 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2607 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2608 assert(WideningDecision != CM_Unknown &&
2609 "Widening decision should be ready at this moment");
2610 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2611 if (Ptr == Store->getValueOperand())
2612 return WideningDecision == CM_Scalarize;
2613 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2614 "Ptr is neither a value or pointer operand");
2615 return WideningDecision != CM_GatherScatter;
2616 };
2617
2618 // A helper that returns true if the given value is a getelementptr
2619 // instruction contained in the loop.
2620 auto IsLoopVaryingGEP = [&](Value *V) {
2621 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2622 };
2623
2624 // A helper that evaluates a memory access's use of a pointer. If the use will
2625 // be a scalar use and the pointer is only used by memory accesses, we place
2626 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2627 // PossibleNonScalarPtrs.
2628 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2629 // We only care about bitcast and getelementptr instructions contained in
2630 // the loop.
2631 if (!IsLoopVaryingGEP(Ptr))
2632 return;
2633
2634 // If the pointer has already been identified as scalar (e.g., if it was
2635 // also identified as uniform), there's nothing to do.
2636 auto *I = cast<Instruction>(Ptr);
2637 if (Worklist.count(I))
2638 return;
2639
2640 // If the use of the pointer will be a scalar use, and all users of the
2641 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2642 // place the pointer in PossibleNonScalarPtrs.
2643 if (IsScalarUse(MemAccess, Ptr) &&
2645 ScalarPtrs.insert(I);
2646 else
2647 PossibleNonScalarPtrs.insert(I);
2648 };
2649
2650 // We seed the scalars analysis with two classes of instructions: (1)
2651 // instructions marked uniform-after-vectorization and (2) bitcast,
2652 // getelementptr and (pointer) phi instructions used by memory accesses
2653 // requiring a scalar use.
2654 //
2655 // (1) Add to the worklist all instructions that have been identified as
2656 // uniform-after-vectorization.
2657 Worklist.insert_range(Uniforms[VF]);
2658
2659 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2660 // memory accesses requiring a scalar use. The pointer operands of loads and
2661 // stores will be scalar unless the operation is a gather or scatter.
2662 // The value operand of a store will remain scalar if the store is scalarized.
2663 for (auto *BB : TheLoop->blocks())
2664 for (auto &I : *BB) {
2665 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2666 EvaluatePtrUse(Load, Load->getPointerOperand());
2667 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2668 EvaluatePtrUse(Store, Store->getPointerOperand());
2669 EvaluatePtrUse(Store, Store->getValueOperand());
2670 }
2671 }
2672 for (auto *I : ScalarPtrs)
2673 if (!PossibleNonScalarPtrs.count(I)) {
2674 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2675 Worklist.insert(I);
2676 }
2677
2678 // Insert the forced scalars.
2679 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2680 // induction variable when the PHI user is scalarized.
2681 auto ForcedScalar = ForcedScalars.find(VF);
2682 if (ForcedScalar != ForcedScalars.end())
2683 for (auto *I : ForcedScalar->second) {
2684 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2685 Worklist.insert(I);
2686 }
2687
2688 // Expand the worklist by looking through any bitcasts and getelementptr
2689 // instructions we've already identified as scalar. This is similar to the
2690 // expansion step in collectLoopUniforms(); however, here we're only
2691 // expanding to include additional bitcasts and getelementptr instructions.
2692 unsigned Idx = 0;
2693 while (Idx != Worklist.size()) {
2694 Instruction *Dst = Worklist[Idx++];
2695 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2696 continue;
2697 auto *Src = cast<Instruction>(Dst->getOperand(0));
2698 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2699 auto *J = cast<Instruction>(U);
2700 return !TheLoop->contains(J) || Worklist.count(J) ||
2701 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2702 IsScalarUse(J, Src));
2703 })) {
2704 Worklist.insert(Src);
2705 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2706 }
2707 }
2708
2709 // An induction variable will remain scalar if all users of the induction
2710 // variable and induction variable update remain scalar.
2711 for (const auto &Induction : Legal->getInductionVars()) {
2712 auto *Ind = Induction.first;
2713 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2714
2715 // If tail-folding is applied, the primary induction variable will be used
2716 // to feed a vector compare.
2717 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2718 continue;
2719
2720 // Returns true if \p Indvar is a pointer induction that is used directly by
2721 // load/store instruction \p I.
2722 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2723 Instruction *I) {
2724 return Induction.second.getKind() ==
2727 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2728 };
2729
2730 // Determine if all users of the induction variable are scalar after
2731 // vectorization.
2732 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2733 auto *I = cast<Instruction>(U);
2734 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2735 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2736 });
2737 if (!ScalarInd)
2738 continue;
2739
2740 // If the induction variable update is a fixed-order recurrence, neither the
2741 // induction variable nor its update should be marked scalar after
2742 // vectorization.
2743 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2744 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2745 continue;
2746
2747 // Determine if all users of the induction variable update instruction are
2748 // scalar after vectorization.
2749 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2750 auto *I = cast<Instruction>(U);
2751 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2752 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2753 });
2754 if (!ScalarIndUpdate)
2755 continue;
2756
2757 // The induction variable and its update instruction will remain scalar.
2758 Worklist.insert(Ind);
2759 Worklist.insert(IndUpdate);
2760 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2761 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2762 << "\n");
2763 }
2764
2765 Scalars[VF].insert_range(Worklist);
2766}
2767
2769 Instruction *I, ElementCount VF) const {
2770 if (!isPredicatedInst(I))
2771 return false;
2772
2773 // Do we have a non-scalar lowering for this predicated
2774 // instruction? No - it is scalar with predication.
2775 switch(I->getOpcode()) {
2776 default:
2777 return true;
2778 case Instruction::Call:
2779 if (VF.isScalar())
2780 return true;
2782 case Instruction::Load:
2783 case Instruction::Store: {
2785 auto *Ty = getLoadStoreType(I);
2786 unsigned AS = getLoadStoreAddressSpace(I);
2787 Type *VTy = Ty;
2788 if (VF.isVector())
2789 VTy = VectorType::get(Ty, VF);
2790 const Align Alignment = getLoadStoreAlignment(I);
2791 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
2792 TTI.isLegalMaskedGather(VTy, Alignment))
2793 : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
2794 TTI.isLegalMaskedScatter(VTy, Alignment));
2795 }
2796 case Instruction::UDiv:
2797 case Instruction::SDiv:
2798 case Instruction::SRem:
2799 case Instruction::URem: {
2800 // We have the option to use the safe-divisor idiom to avoid predication.
2801 // The cost based decision here will always select safe-divisor for
2802 // scalable vectors as scalarization isn't legal.
2803 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2804 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2805 }
2806 }
2807}
2808
2809// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2811 // TODO: We can use the loop-preheader as context point here and get
2812 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2814 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
2816 return false;
2817
2818 // If the instruction was executed conditionally in the original scalar loop,
2819 // predication is needed with a mask whose lanes are all possibly inactive.
2820 if (Legal->blockNeedsPredication(I->getParent()))
2821 return true;
2822
2823 // If we're not folding the tail by masking, predication is unnecessary.
2824 if (!foldTailByMasking())
2825 return false;
2826
2827 // All that remain are instructions with side-effects originally executed in
2828 // the loop unconditionally, but now execute under a tail-fold mask (only)
2829 // having at least one active lane (the first). If the side-effects of the
2830 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2831 // - it will cause the same side-effects as when masked.
2832 switch(I->getOpcode()) {
2833 default:
2835 "instruction should have been considered by earlier checks");
2836 case Instruction::Call:
2837 // Side-effects of a Call are assumed to be non-invariant, needing a
2838 // (fold-tail) mask.
2839 assert(Legal->isMaskRequired(I) &&
2840 "should have returned earlier for calls not needing a mask");
2841 return true;
2842 case Instruction::Load:
2843 // If the address is loop invariant no predication is needed.
2844 return !Legal->isInvariant(getLoadStorePointerOperand(I));
2845 case Instruction::Store: {
2846 // For stores, we need to prove speculation safety (which follows from
2847 // the same argument as for loads), and must also prove that the value being
2848 // stored is correct. The easiest form of the latter is to require that all
2849 // values stored are the same.
2850 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
2851 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
2852 }
2853 case Instruction::UDiv:
2854 case Instruction::SDiv:
2855 case Instruction::SRem:
2856 case Instruction::URem:
2857 // If the divisor is loop-invariant no predication is needed.
2858 return !Legal->isInvariant(I->getOperand(1));
2859 }
2860}
2861
2862std::pair<InstructionCost, InstructionCost>
2864 ElementCount VF) const {
2865 assert(I->getOpcode() == Instruction::UDiv ||
2866 I->getOpcode() == Instruction::SDiv ||
2867 I->getOpcode() == Instruction::SRem ||
2868 I->getOpcode() == Instruction::URem);
2870
2871 // Scalarization isn't legal for scalable vector types
2872 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2873 if (!VF.isScalable()) {
2874 // Get the scalarization cost and scale this amount by the probability of
2875 // executing the predicated block. If the instruction is not predicated,
2876 // we fall through to the next case.
2877 ScalarizationCost = 0;
2878
2879 // These instructions have a non-void type, so account for the phi nodes
2880 // that we will create. This cost is likely to be zero. The phi node
2881 // cost, if any, should be scaled by the block probability because it
2882 // models a copy at the end of each predicated block.
2883 ScalarizationCost +=
2884 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
2885
2886 // The cost of the non-predicated instruction.
2887 ScalarizationCost +=
2888 VF.getFixedValue() *
2889 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
2890
2891 // The cost of insertelement and extractelement instructions needed for
2892 // scalarization.
2893 ScalarizationCost += getScalarizationOverhead(I, VF);
2894
2895 // Scale the cost by the probability of executing the predicated blocks.
2896 // This assumes the predicated block for each vector lane is equally
2897 // likely.
2898 ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
2899 }
2900 InstructionCost SafeDivisorCost = 0;
2901
2902 auto *VecTy = toVectorTy(I->getType(), VF);
2903
2904 // The cost of the select guard to ensure all lanes are well defined
2905 // after we speculate above any internal control flow.
2906 SafeDivisorCost +=
2907 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
2908 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
2910
2911 SmallVector<const Value *, 4> Operands(I->operand_values());
2912 SafeDivisorCost += TTI.getArithmeticInstrCost(
2913 I->getOpcode(), VecTy, CostKind,
2914 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2915 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2916 Operands, I);
2917 return {ScalarizationCost, SafeDivisorCost};
2918}
2919
2921 Instruction *I, ElementCount VF) const {
2922 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2924 "Decision should not be set yet.");
2925 auto *Group = getInterleavedAccessGroup(I);
2926 assert(Group && "Must have a group.");
2927 unsigned InterleaveFactor = Group->getFactor();
2928
2929 // If the instruction's allocated size doesn't equal its type size, it
2930 // requires padding and will be scalarized.
2931 auto &DL = I->getDataLayout();
2932 auto *ScalarTy = getLoadStoreType(I);
2933 if (hasIrregularType(ScalarTy, DL))
2934 return false;
2935
2936 // For scalable vectors, the interleave factors must be <= 8 since we require
2937 // the (de)interleaveN intrinsics instead of shufflevectors.
2938 if (VF.isScalable() && InterleaveFactor > 8)
2939 return false;
2940
2941 // If the group involves a non-integral pointer, we may not be able to
2942 // losslessly cast all values to a common type.
2943 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
2944 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
2945 Instruction *Member = Group->getMember(Idx);
2946 if (!Member)
2947 continue;
2948 auto *MemberTy = getLoadStoreType(Member);
2949 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
2950 // Don't coerce non-integral pointers to integers or vice versa.
2951 if (MemberNI != ScalarNI)
2952 // TODO: Consider adding special nullptr value case here
2953 return false;
2954 if (MemberNI && ScalarNI &&
2955 ScalarTy->getPointerAddressSpace() !=
2956 MemberTy->getPointerAddressSpace())
2957 return false;
2958 }
2959
2960 // Check if masking is required.
2961 // A Group may need masking for one of two reasons: it resides in a block that
2962 // needs predication, or it was decided to use masking to deal with gaps
2963 // (either a gap at the end of a load-access that may result in a speculative
2964 // load, or any gaps in a store-access).
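// For example (illustrative), a load group with factor 2 where only the first
// member is accessed has a gap: the wide load for the last group may touch the
// trailing gap element speculatively, so epilogue masking is required.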
2965 bool PredicatedAccessRequiresMasking =
2966 blockNeedsPredicationForAnyReason(I->getParent()) &&
2967 Legal->isMaskRequired(I);
2968 bool LoadAccessWithGapsRequiresEpilogMasking =
2969 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
2971 bool StoreAccessWithGapsRequiresMasking =
2972 isa<StoreInst>(I) && !Group->isFull();
2973 if (!PredicatedAccessRequiresMasking &&
2974 !LoadAccessWithGapsRequiresEpilogMasking &&
2975 !StoreAccessWithGapsRequiresMasking)
2976 return true;
2977
2978 // If masked interleaving is required, we expect that the user/target had
2979 // enabled it, because otherwise it either wouldn't have been created or
2980 // it should have been invalidated by the CostModel.
2982 "Masked interleave-groups for predicated accesses are not enabled.");
2983
2984 if (Group->isReverse())
2985 return false;
2986
2987 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
2988 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
2989 StoreAccessWithGapsRequiresMasking;
2990 if (VF.isScalable() && NeedsMaskForGaps)
2991 return false;
2992
2993 auto *Ty = getLoadStoreType(I);
2994 const Align Alignment = getLoadStoreAlignment(I);
2995 unsigned AS = getLoadStoreAddressSpace(I);
2996 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
2997 : TTI.isLegalMaskedStore(Ty, Alignment, AS);
2998}
2999
3001 Instruction *I, ElementCount VF) {
3002 // Get and ensure we have a valid memory instruction.
3003 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3004
3006 auto *ScalarTy = getLoadStoreType(I);
3007
3008 // In order to be widened, the pointer should be consecutive, first of all.
3009 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3010 return false;
3011
3012 // If the instruction is a store located in a predicated block, it will be
3013 // scalarized.
3014 if (isScalarWithPredication(I, VF))
3015 return false;
3016
3017 // If the instruction's allocated size doesn't equal its type size, it
3018 // requires padding and will be scalarized.
3019 auto &DL = I->getDataLayout();
3020 if (hasIrregularType(ScalarTy, DL))
3021 return false;
3022
3023 return true;
3024}
3025
3026void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3027 // We should not collect Uniforms more than once per VF. Right now,
3028 // this function is called from collectUniformsAndScalars(), which
3029 // already does this check. Collecting Uniforms for VF=1 does not make any
3030 // sense.
3031
3032 assert(VF.isVector() && !Uniforms.contains(VF) &&
3033 "This function should not be visited twice for the same VF");
3034
3035 // Create an (empty) entry for this VF so that, even if we find no uniform
3036 // values, we won't analyze it again; Uniforms.count(VF) will return 1.
3037 Uniforms[VF].clear();
3038
3039 // Now we know that the loop is vectorizable!
3040 // Collect instructions inside the loop that will remain uniform after
3041 // vectorization.
3042
3043 // Global values, params and instructions outside of current loop are out of
3044 // scope.
3045 auto IsOutOfScope = [&](Value *V) -> bool {
3047 return (!I || !TheLoop->contains(I));
3048 };
3049
3050 // Worklist containing uniform instructions demanding lane 0.
3051 SetVector<Instruction *> Worklist;
3052
3053 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3054 // that require predication must not be considered uniform after
3055 // vectorization, because that would create an erroneous replicating region
3056 // where only a single instance out of VF should be formed.
3057 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3058 if (IsOutOfScope(I)) {
3059 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3060 << *I << "\n");
3061 return;
3062 }
3063 if (isPredicatedInst(I)) {
3064 LLVM_DEBUG(
3065 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3066 << "\n");
3067 return;
3068 }
3069 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3070 Worklist.insert(I);
3071 };
3072
3073 // Start with the conditional branches exiting the loop. If the branch
3074 // condition is an instruction contained in the loop that is only used by the
3075 // branch, it is uniform. Note conditions from uncountable early exits are not
3076 // uniform.
3078 TheLoop->getExitingBlocks(Exiting);
3079 for (BasicBlock *E : Exiting) {
3080 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3081 continue;
3082 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3083 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3084 AddToWorklistIfAllowed(Cmp);
3085 }
3086
3087 auto PrevVF = VF.divideCoefficientBy(2);
3088 // Return true if all lanes perform the same memory operation, and we can
3089 // thus choose to execute only one.
3090 auto IsUniformMemOpUse = [&](Instruction *I) {
3091 // If the value was already known to not be uniform for the previous
3092 // (smaller VF), it cannot be uniform for the larger VF.
3093 if (PrevVF.isVector()) {
3094 auto Iter = Uniforms.find(PrevVF);
3095 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3096 return false;
3097 }
3098 if (!Legal->isUniformMemOp(*I, VF))
3099 return false;
3100 if (isa<LoadInst>(I))
3101 // Loading the same address always produces the same result - at least
3102 // assuming aliasing and ordering which have already been checked.
3103 return true;
3104 // Storing the same value on every iteration.
3105 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3106 };
3107
3108 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3109 InstWidening WideningDecision = getWideningDecision(I, VF);
3110 assert(WideningDecision != CM_Unknown &&
3111 "Widening decision should be ready at this moment");
3112
3113 if (IsUniformMemOpUse(I))
3114 return true;
3115
3116 return (WideningDecision == CM_Widen ||
3117 WideningDecision == CM_Widen_Reverse ||
3118 WideningDecision == CM_Interleave);
3119 };
3120
3121 // Returns true if Ptr is the pointer operand of a memory access instruction
3122 // I, I is known to not require scalarization, and the pointer is not also
3123 // stored.
3124 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3125 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3126 return false;
3127 return getLoadStorePointerOperand(I) == Ptr &&
3128 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3129 };
3130
3131 // Holds a list of values which are known to have at least one uniform use.
3132 // Note that there may be other uses which aren't uniform. A "uniform use"
3133 // here is something which only demands lane 0 of the unrolled iterations;
3134 // it does not imply that all lanes produce the same value (e.g. this is not
3135 // the usual meaning of uniform)
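// For example, an address computation that feeds only the pointer operand of
// a consecutive widened load has a uniform use: the memory recipe only needs
// the lane-0 address to form the wide access, even though the address itself
// differs per lane.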
3136 SetVector<Value *> HasUniformUse;
3137
3138 // Scan the loop for instructions which are either a) known to have only
3139 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3140 for (auto *BB : TheLoop->blocks())
3141 for (auto &I : *BB) {
3142 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3143 switch (II->getIntrinsicID()) {
3144 case Intrinsic::sideeffect:
3145 case Intrinsic::experimental_noalias_scope_decl:
3146 case Intrinsic::assume:
3147 case Intrinsic::lifetime_start:
3148 case Intrinsic::lifetime_end:
3149 if (TheLoop->hasLoopInvariantOperands(&I))
3150 AddToWorklistIfAllowed(&I);
3151 break;
3152 default:
3153 break;
3154 }
3155 }
3156
3157 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3158 if (IsOutOfScope(EVI->getAggregateOperand())) {
3159 AddToWorklistIfAllowed(EVI);
3160 continue;
3161 }
3162 // Only ExtractValue instructions where the aggregate value comes from a
3163 // call are allowed to be non-uniform.
3164 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3165 "Expected aggregate value to be call return value");
3166 }
3167
3168 // If there's no pointer operand, there's nothing to do.
3169 auto *Ptr = getLoadStorePointerOperand(&I);
3170 if (!Ptr)
3171 continue;
3172
3173 if (IsUniformMemOpUse(&I))
3174 AddToWorklistIfAllowed(&I);
3175
3176 if (IsVectorizedMemAccessUse(&I, Ptr))
3177 HasUniformUse.insert(Ptr);
3178 }
3179
3180 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3181 // demanding) users. Since loops are assumed to be in LCSSA form, this
3182 // disallows uses outside the loop as well.
3183 for (auto *V : HasUniformUse) {
3184 if (IsOutOfScope(V))
3185 continue;
3186 auto *I = cast<Instruction>(V);
3187 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3188 auto *UI = cast<Instruction>(U);
3189 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3190 });
3191 if (UsersAreMemAccesses)
3192 AddToWorklistIfAllowed(I);
3193 }
3194
3195 // Expand Worklist in topological order: whenever a new instruction
3196 // is added, its users should already be inside Worklist. This ensures that
3197 // a uniform instruction will only be used by uniform instructions.
3198 unsigned Idx = 0;
3199 while (Idx != Worklist.size()) {
3200 Instruction *I = Worklist[Idx++];
3201
3202 for (auto *OV : I->operand_values()) {
3203 // isOutOfScope operands cannot be uniform instructions.
3204 if (IsOutOfScope(OV))
3205 continue;
3206 // First order recurrence Phi's should typically be considered
3207 // non-uniform.
3208 auto *OP = dyn_cast<PHINode>(OV);
3209 if (OP && Legal->isFixedOrderRecurrence(OP))
3210 continue;
3211 // If all the users of the operand are uniform, then add the
3212 // operand into the uniform worklist.
3213 auto *OI = cast<Instruction>(OV);
3214 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3215 auto *J = cast<Instruction>(U);
3216 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3217 }))
3218 AddToWorklistIfAllowed(OI);
3219 }
3220 }
3221
3222 // For an instruction to be added into Worklist above, all its users inside
3223 // the loop should also be in Worklist. However, this condition cannot be
3224 // true for phi nodes that form a cyclic dependence. We must process phi
3225 // nodes separately. An induction variable will remain uniform if all users
3226 // of the induction variable and induction variable update remain uniform.
3227 // The code below handles both pointer and non-pointer induction variables.
3228 BasicBlock *Latch = TheLoop->getLoopLatch();
3229 for (const auto &Induction : Legal->getInductionVars()) {
3230 auto *Ind = Induction.first;
3231 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3232
3233 // Determine if all users of the induction variable are uniform after
3234 // vectorization.
3235 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3236 auto *I = cast<Instruction>(U);
3237 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3238 IsVectorizedMemAccessUse(I, Ind);
3239 });
3240 if (!UniformInd)
3241 continue;
3242
3243 // Determine if all users of the induction variable update instruction are
3244 // uniform after vectorization.
3245 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3246 auto *I = cast<Instruction>(U);
3247 return I == Ind || Worklist.count(I) ||
3248 IsVectorizedMemAccessUse(I, IndUpdate);
3249 });
3250 if (!UniformIndUpdate)
3251 continue;
3252
3253 // The induction variable and its update instruction will remain uniform.
3254 AddToWorklistIfAllowed(Ind);
3255 AddToWorklistIfAllowed(IndUpdate);
3256 }
3257
3258 Uniforms[VF].insert_range(Worklist);
3259}
3260
3261 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3262 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3263
3264 if (Legal->getRuntimePointerChecking()->Need) {
3265 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3266 "runtime pointer checks needed. Enable vectorization of this "
3267 "loop with '#pragma clang loop vectorize(enable)' when "
3268 "compiling with -Os/-Oz",
3269 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3270 return true;
3271 }
3272
3273 if (!PSE.getPredicate().isAlwaysTrue()) {
3274 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3275 "runtime SCEV checks needed. Enable vectorization of this "
3276 "loop with '#pragma clang loop vectorize(enable)' when "
3277 "compiling with -Os/-Oz",
3278 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3279 return true;
3280 }
3281
3282 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3283 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3284 reportVectorizationFailure("Runtime stride check for small trip count",
3285 "runtime stride == 1 checks needed. Enable vectorization of "
3286 "this loop without such check by compiling with -Os/-Oz",
3287 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3288 return true;
3289 }
3290
3291 return false;
3292}
3293
3294bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3295 if (IsScalableVectorizationAllowed)
3296 return *IsScalableVectorizationAllowed;
3297
3298 IsScalableVectorizationAllowed = false;
3299 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3300 return false;
3301
3302 if (Hints->isScalableVectorizationDisabled()) {
3303 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3304 "ScalableVectorizationDisabled", ORE, TheLoop);
3305 return false;
3306 }
3307
3308 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3309
3310 auto MaxScalableVF = ElementCount::getScalable(
3311 std::numeric_limits<ElementCount::ScalarTy>::max());
3312
3313 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3314 // FIXME: While for scalable vectors this is currently sufficient, this should
3315 // be replaced by a more detailed mechanism that filters out specific VFs,
3316 // instead of invalidating vectorization for a whole set of VFs based on the
3317 // MaxVF.
3318
3319 // Disable scalable vectorization if the loop contains unsupported reductions.
3320 if (!canVectorizeReductions(MaxScalableVF)) {
3321 reportVectorizationInfo(
3322 "Scalable vectorization not supported for the reduction "
3323 "operations found in this loop.",
3324 "ScalableVFUnfeasible", ORE, TheLoop);
3325 return false;
3326 }
3327
3328 // Disable scalable vectorization if the loop contains any instructions
3329 // with element types not supported for scalable vectors.
3330 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3331 return !Ty->isVoidTy() &&
3332 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3333 })) {
3334 reportVectorizationInfo("Scalable vectorization is not supported "
3335 "for all element types found in this loop.",
3336 "ScalableVFUnfeasible", ORE, TheLoop);
3337 return false;
3338 }
3339
3340 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3341 reportVectorizationInfo("The target does not provide maximum vscale value "
3342 "for safe distance analysis.",
3343 "ScalableVFUnfeasible", ORE, TheLoop);
3344 return false;
3345 }
3346
3347 IsScalableVectorizationAllowed = true;
3348 return true;
3349}
3350
3352LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3353 if (!isScalableVectorizationAllowed())
3354 return ElementCount::getScalable(0);
3355
3356 auto MaxScalableVF = ElementCount::getScalable(
3357 std::numeric_limits<ElementCount::ScalarTy>::max());
3358 if (Legal->isSafeForAnyVectorWidth())
3359 return MaxScalableVF;
3360
3361 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3362 // Limit MaxScalableVF by the maximum safe dependence distance.
3363 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
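// For illustration (hypothetical values): with MaxSafeElements = 32 and a
// target maximum vscale of 16, the result is vscale x 2, i.e. at most 32
// elements per vector iteration even when vscale takes its largest value.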
3364
3365 if (!MaxScalableVF)
3366 reportVectorizationInfo(
3367 "Max legal vector width too small, scalable vectorization "
3368 "unfeasible.",
3369 "ScalableVFUnfeasible", ORE, TheLoop);
3370
3371 return MaxScalableVF;
3372}
3373
3374FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3375 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3376 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3377 unsigned SmallestType, WidestType;
3378 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3379
3380 // Get the maximum safe dependence distance in bits computed by LAA.
3381 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3382 // the memory accesses that is most restrictive (involved in the smallest
3383 // dependence distance).
3384 unsigned MaxSafeElementsPowerOf2 =
3385 bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3386 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3387 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3388 MaxSafeElementsPowerOf2 =
3389 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
3390 }
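// Worked example (hypothetical numbers): a 512-bit max safe dependence
// distance with a 32-bit widest type allows 16 elements per iteration; a
// non-power-of-2 bound such as 48 elements would be clamped to 32 by the
// bit_floor above.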
3391 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
3392 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
3393
3394 if (!Legal->isSafeForAnyVectorWidth())
3395 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3396
3397 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3398 << ".\n");
3399 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3400 << ".\n");
3401
3402 // First analyze the UserVF, fall back if the UserVF should be ignored.
3403 if (UserVF) {
3404 auto MaxSafeUserVF =
3405 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3406
3407 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3408 // If `VF=vscale x N` is safe, then so is `VF=N`
3409 if (UserVF.isScalable())
3410 return FixedScalableVFPair(
3411 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3412
3413 return UserVF;
3414 }
3415
3416 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3417
3418 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3419 // is better to ignore the hint and let the compiler choose a suitable VF.
3420 if (!UserVF.isScalable()) {
3421 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3422 << " is unsafe, clamping to max safe VF="
3423 << MaxSafeFixedVF << ".\n");
3424 ORE->emit([&]() {
3425 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3426 TheLoop->getStartLoc(),
3427 TheLoop->getHeader())
3428 << "User-specified vectorization factor "
3429 << ore::NV("UserVectorizationFactor", UserVF)
3430 << " is unsafe, clamping to maximum safe vectorization factor "
3431 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3432 });
3433 return MaxSafeFixedVF;
3434 }
3435
3436 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3437 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3438 << " is ignored because scalable vectors are not "
3439 "available.\n");
3440 ORE->emit([&]() {
3441 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3442 TheLoop->getStartLoc(),
3443 TheLoop->getHeader())
3444 << "User-specified vectorization factor "
3445 << ore::NV("UserVectorizationFactor", UserVF)
3446 << " is ignored because the target does not support scalable "
3447 "vectors. The compiler will pick a more suitable value.";
3448 });
3449 } else {
3450 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3451 << " is unsafe. Ignoring scalable UserVF.\n");
3452 ORE->emit([&]() {
3453 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3454 TheLoop->getStartLoc(),
3455 TheLoop->getHeader())
3456 << "User-specified vectorization factor "
3457 << ore::NV("UserVectorizationFactor", UserVF)
3458 << " is unsafe. Ignoring the hint to let the compiler pick a "
3459 "more suitable value.";
3460 });
3461 }
3462 }
3463
3464 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3465 << " / " << WidestType << " bits.\n");
3466
3467 FixedScalableVFPair Result(ElementCount::getFixed(1),
3468 ElementCount::getScalable(0));
3469 if (auto MaxVF =
3470 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3471 MaxSafeFixedVF, FoldTailByMasking))
3472 Result.FixedVF = MaxVF;
3473
3474 if (auto MaxVF =
3475 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3476 MaxSafeScalableVF, FoldTailByMasking))
3477 if (MaxVF.isScalable()) {
3478 Result.ScalableVF = MaxVF;
3479 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3480 << "\n");
3481 }
3482
3483 return Result;
3484}
3485
3486 FixedScalableVFPair
3487 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3488 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3489 // TODO: It may be useful to do this, since the check is still likely to be
3490 // dynamically uniform if the target can skip it.
3492 "Not inserting runtime ptr check for divergent target",
3493 "runtime pointer checks needed. Not enabled for divergent target",
3494 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3496 }
3497
3498 ScalarEvolution *SE = PSE.getSE();
3499 ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
3500 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3501 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3502 if (TC != ElementCount::getFixed(MaxTC))
3503 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3504 if (TC.isScalar()) {
3505 reportVectorizationFailure("Single iteration (non) loop",
3506 "loop trip count is one, irrelevant for vectorization",
3507 "SingleIterationLoop", ORE, TheLoop);
3509 }
3510
3511 // If BTC matches the widest induction type and is -1 then the trip count
3512 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3513 // to vectorize.
3514 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3515 if (!isa<SCEVCouldNotCompute>(BTC) &&
3516 BTC->getType()->getScalarSizeInBits() >=
3517 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3518 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3519 SE->getMinusOne(BTC->getType()))) {
3520 reportVectorizationFailure(
3521 "Trip count computation wrapped",
3522 "backedge-taken count is -1, loop trip count wrapped to 0",
3523 "TripCountWrapped", ORE, TheLoop);
3524 return FixedScalableVFPair::getNone();
3525 }
3526
3527 switch (ScalarEpilogueStatus) {
3528 case CM_ScalarEpilogueAllowed:
3529 return computeFeasibleMaxVF(MaxTC, UserVF, false);
3530 case CM_ScalarEpilogueNotAllowedUsePredicate:
3531 [[fallthrough]];
3532 case CM_ScalarEpilogueNotNeededUsePredicate:
3533 LLVM_DEBUG(
3534 dbgs() << "LV: vector predicate hint/switch found.\n"
3535 << "LV: Not allowing scalar epilogue, creating predicated "
3536 << "vector loop.\n");
3537 break;
3538 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3539 // fallthrough as a special case of OptForSize
3540 case CM_ScalarEpilogueNotAllowedOptSize:
3541 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3542 LLVM_DEBUG(
3543 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3544 else
3545 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3546 << "count.\n");
3547
3548 // Bail if runtime checks are required, which are not good when optimising
3549 // for size.
3550 if (runtimeChecksRequired())
3551 return FixedScalableVFPair::getNone();
3552
3553 break;
3554 }
3555
3556 // Now try the tail folding
3557
3558 // Invalidate interleave groups that require an epilogue if we can't mask
3559 // the interleave-group.
3560 if (!useMaskedInterleavedAccesses(TTI)) {
3561 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3562 "No decisions should have been taken at this point");
3563 // Note: There is no need to invalidate any cost modeling decisions here, as
3564 // none were taken so far.
3565 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3566 }
3567
3568 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
3569
3570 // Avoid tail folding if the trip count is known to be a multiple of any VF
3571 // we choose.
3572 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3573 MaxFactors.FixedVF.getFixedValue();
3574 if (MaxFactors.ScalableVF) {
3575 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3576 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3577 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3578 *MaxPowerOf2RuntimeVF,
3579 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3580 } else
3581 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3582 }
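// For illustration of the scalable case above (hypothetical target): for
// MaxFactors.ScalableVF = vscale x 4 with a known power-of-two vscale of at
// most 16, the largest runtime VF considered here is 16 * 4 = 64 lanes.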
3583
3584 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3585 // Return false if the loop is neither a single-latch-exit loop nor an
3586 // early-exit loop as tail-folding is not supported in that case.
3587 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3588 !Legal->hasUncountableEarlyExit())
3589 return false;
3590 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3591 ScalarEvolution *SE = PSE.getSE();
3592 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3593 // with uncountable exits. For countable loops, the symbolic maximum must
3594 // remain identical to the known back-edge taken count.
3595 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3596 assert((Legal->hasUncountableEarlyExit() ||
3597 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3598 "Invalid loop count");
3599 const SCEV *ExitCount = SE->getAddExpr(
3600 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
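// For illustration (hypothetical counts): a backedge-taken count of 63 gives
// an exit count of 64; with MaxVF = 16 and UserIC = 4 the remainder
// 64 % (16 * 4) is 0, so no scalar epilogue (and no tail folding) is needed.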
3601 const SCEV *Rem = SE->getURemExpr(
3602 SE->applyLoopGuards(ExitCount, TheLoop),
3603 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3604 return Rem->isZero();
3605 };
3606
3607 if (MaxPowerOf2RuntimeVF > 0u) {
3608 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3609 "MaxFixedVF must be a power of 2");
3610 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3611 // Accept MaxFixedVF if we do not have a tail.
3612 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3613 return MaxFactors;
3614 }
3615 }
3616
3617 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3618 if (ExpectedTC && ExpectedTC->isFixed() &&
3619 ExpectedTC->getFixedValue() <=
3620 TTI.getMinTripCountTailFoldingThreshold()) {
3621 if (MaxPowerOf2RuntimeVF > 0u) {
3622 // If we have a low-trip-count, and the fixed-width VF is known to divide
3623 // the trip count but the scalable factor does not, use the fixed-width
3624 // factor in preference to allow the generation of a non-predicated loop.
3625 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3626 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3627 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3628 "remain for any chosen VF.\n");
3629 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3630 return MaxFactors;
3631 }
3632 }
3633
3635 "The trip count is below the minial threshold value.",
3636 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3637 ORE, TheLoop);
3638 return FixedScalableVFPair::getNone();
3639 }
3640
3641 // If we don't know the precise trip count, or if the trip count that we
3642 // found modulo the vectorization factor is not zero, try to fold the tail
3643 // by masking.
3644 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3645 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3646 setTailFoldingStyles(ContainsScalableVF, UserIC);
3647 if (foldTailByMasking()) {
3648 if (foldTailWithEVL()) {
3649 LLVM_DEBUG(
3650 dbgs()
3651 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3652 "try to generate VP Intrinsics with scalable vector "
3653 "factors only.\n");
3654 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3655 // for now.
3656 // TODO: extend it for fixed vectors, if required.
3657 assert(ContainsScalableVF && "Expected scalable vector factor.");
3658
3659 MaxFactors.FixedVF = ElementCount::getFixed(1);
3660 }
3661 return MaxFactors;
3662 }
3663
3664 // If there was a tail-folding hint/switch, but we can't fold the tail by
3665 // masking, fallback to a vectorization with a scalar epilogue.
3666 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3667 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3668 "scalar epilogue instead.\n");
3669 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3670 return MaxFactors;
3671 }
3672
3673 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3674 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3676 }
3677
3678 if (TC.isZero()) {
3679 reportVectorizationFailure(
3680 "unable to calculate the loop count due to complex control flow",
3681 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3682 return FixedScalableVFPair::getNone();
3683 }
3684
3686 "Cannot optimize for size and vectorize at the same time.",
3687 "cannot optimize for size and vectorize at the same time. "
3688 "Enable vectorization of this loop with '#pragma clang loop "
3689 "vectorize(enable)' when compiling with -Os/-Oz",
3690 "NoTailLoopWithOptForSize", ORE, TheLoop);
3692}
3693
3694 bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3695 ElementCount VF) {
3696 if (!useMaxBandwidth(VF.isScalable()
3697 ? TargetTransformInfo::RGK_ScalableVector
3698 : TargetTransformInfo::RGK_FixedWidthVector))
3699 return false;
3700 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3701 return ElementCount::isKnownGT(
3702 VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3703 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3704 }
3705
3708 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3709 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
3711 Legal->hasVectorCallVariants())));
3712}
3713
3714ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3715 ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
3716 unsigned EstimatedVF = VF.getKnownMinValue();
3717 if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3718 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3719 auto Min = Attr.getVScaleRangeMin();
3720 EstimatedVF *= Min;
3721 }
3722
3723 // When a scalar epilogue is required, at least one iteration of the scalar
3724 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3725 // max VF that results in a dead vector loop.
3726 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3727 MaxTripCount -= 1;
3728
3729 if (MaxTripCount && MaxTripCount <= EstimatedVF &&
3730 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3731 // If upper bound loop trip count (TC) is known at compile time there is no
3732 // point in choosing VF greater than TC (as done in the loop below). Select
3733 // maximum power of two which doesn't exceed TC. If VF is
3734 // scalable, we only fall back on a fixed VF when the TC is less than or
3735 // equal to the known number of lanes.
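// For illustration (hypothetical counts): with MaxTripCount = 20 and an
// estimated VF of 32 lanes, the VF is clamped to bit_floor(20) = 16 so the
// vector loop is not dead; when folding the tail, the clamp is only applied
// if MaxTripCount is itself a power of two.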
3736 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
3737 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3738 "exceeding the constant trip count: "
3739 << ClampedUpperTripCount << "\n");
3740 return ElementCount::get(ClampedUpperTripCount,
3741 FoldTailByMasking ? VF.isScalable() : false);
3742 }
3743 return VF;
3744}
3745
3746ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3747 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3748 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3749 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3750 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3751 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3752 : TargetTransformInfo::RGK_FixedWidthVector);
3753
3754 // Convenience function to return the minimum of two ElementCounts.
3755 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3756 assert((LHS.isScalable() == RHS.isScalable()) &&
3757 "Scalable flags must match");
3758 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3759 };
3760
3761 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3762 // Note that both WidestRegister and WidestType may not be powers of 2.
3763 auto MaxVectorElementCount = ElementCount::get(
3764 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
3765 ComputeScalableMaxVF);
3766 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3767 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3768 << (MaxVectorElementCount * WidestType) << " bits.\n");
3769
3770 if (!MaxVectorElementCount) {
3771 LLVM_DEBUG(dbgs() << "LV: The target has no "
3772 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3773 << " vector registers.\n");
3774 return ElementCount::getFixed(1);
3775 }
3776
3777 ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount,
3778 MaxTripCount, FoldTailByMasking);
3779 // If the MaxVF was already clamped, there's no point in trying to pick a
3780 // larger one.
3781 if (MaxVF != MaxVectorElementCount)
3782 return MaxVF;
3783
3784 TargetTransformInfo::RegisterKind RegKind =
3785 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3786 : TargetTransformInfo::RGK_FixedWidthVector;
3787
3788 if (MaxVF.isScalable())
3789 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3790 else
3791 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3792
3793 if (useMaxBandwidth(RegKind)) {
3794 auto MaxVectorElementCountMaxBW = ElementCount::get(
3795 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
3796 ComputeScalableMaxVF);
3797 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3798
3799 if (ElementCount MinVF =
3800 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
3801 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
3802 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3803 << ") with target's minimum: " << MinVF << '\n');
3804 MaxVF = MinVF;
3805 }
3806 }
3807
3808 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking);
3809
3810 if (MaxVectorElementCount != MaxVF) {
3811 // Invalidate any widening decisions we might have made, in case the loop
3812 // requires predication (decided later), but we have already made some
3813 // load/store widening decisions.
3814 invalidateCostModelingDecisions();
3815 }
3816 }
3817 return MaxVF;
3818}
3819
3820bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3821 const VectorizationFactor &B,
3822 const unsigned MaxTripCount,
3823 bool HasTail,
3824 bool IsEpilogue) const {
3825 InstructionCost CostA = A.Cost;
3826 InstructionCost CostB = B.Cost;
3827
3828 // Improve estimate for the vector width if it is scalable.
3829 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3830 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3831 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3832 if (A.Width.isScalable())
3833 EstimatedWidthA *= *VScale;
3834 if (B.Width.isScalable())
3835 EstimatedWidthB *= *VScale;
3836 }
3837
3838 // When optimizing for size choose whichever is smallest, which will be the
3839 // one with the smallest cost for the whole loop. On a tie pick the larger
3840 // vector width, on the assumption that throughput will be greater.
3841 if (CM.CostKind == TTI::TCK_CodeSize)
3842 return CostA < CostB ||
3843 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3844
3845 // Assume vscale may be larger than 1 (or the value being tuned for),
3846 // so that scalable vectorization is slightly favorable over fixed-width
3847 // vectorization.
3848 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3849 A.Width.isScalable() && !B.Width.isScalable();
3850
3851 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3852 const InstructionCost &RHS) {
3853 return PreferScalable ? LHS <= RHS : LHS < RHS;
3854 };
3855
3856 // To avoid the need for FP division:
3857 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3858 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
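// For illustration (hypothetical costs): CostA = 8 at estimated width 4 and
// CostB = 5 at estimated width 2 compare as 8 * 2 = 16 vs 5 * 4 = 20, so A
// (2 per lane) is preferred over B (2.5 per lane) without any FP division.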
3859 if (!MaxTripCount)
3860 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3861
3862 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3863 InstructionCost VectorCost,
3864 InstructionCost ScalarCost) {
3865 // If the trip count is a known (possibly small) constant, the trip count
3866 // will be rounded up to an integer number of iterations under
3867 // FoldTailByMasking. The total cost in that case will be
3868 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3869 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3870 // some extra overheads, but for the purpose of comparing the costs of
3871 // different VFs we can use this to compare the total loop-body cost
3872 // expected after vectorization.
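// For illustration (hypothetical numbers): MaxTripCount = 10, VF = 4,
// VectorCost = 20, ScalarCost = 6 gives 20 * 2 + 6 * 2 = 52 with a scalar
// tail, versus 20 * ceil(10 / 4) = 60 when the tail is folded.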
3873 if (HasTail)
3874 return VectorCost * (MaxTripCount / VF) +
3875 ScalarCost * (MaxTripCount % VF);
3876 return VectorCost * divideCeil(MaxTripCount, VF);
3877 };
3878
3879 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3880 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3881 return CmpFn(RTCostA, RTCostB);
3882}
3883
3884bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3885 const VectorizationFactor &B,
3886 bool HasTail,
3887 bool IsEpilogue) const {
3888 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3889 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3890 IsEpilogue);
3891}
3892
3895 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3896 SmallVector<RecipeVFPair> InvalidCosts;
3897 for (const auto &Plan : VPlans) {
3898 for (ElementCount VF : Plan->vectorFactors()) {
3899 // The VPlan-based cost model is designed for computing vector cost.
3900 // Querying the VPlan-based cost model with a scalar VF will cause some
3901 // errors because we expect the VF to be a vector for most of the widen
3902 // recipes.
3903 if (VF.isScalar())
3904 continue;
3905
3906 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
3907 precomputeCosts(*Plan, VF, CostCtx);
3908 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
3909 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3910 for (auto &R : *VPBB) {
3911 if (!R.cost(VF, CostCtx).isValid())
3912 InvalidCosts.emplace_back(&R, VF);
3913 }
3914 }
3915 }
3916 }
3917 if (InvalidCosts.empty())
3918 return;
3919
3920 // Emit a report of VFs with invalid costs in the loop.
3921
3922 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
3923 DenseMap<VPRecipeBase *, unsigned> Numbering;
3924 unsigned I = 0;
3925 for (auto &Pair : InvalidCosts)
3926 if (Numbering.try_emplace(Pair.first, I).second)
3927 ++I;
3928
3929 // Sort the list, first on recipe(number) then on VF.
3930 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
3931 unsigned NA = Numbering[A.first];
3932 unsigned NB = Numbering[B.first];
3933 if (NA != NB)
3934 return NA < NB;
3935 return ElementCount::isKnownLT(A.second, B.second);
3936 });
3937
3938 // For a list of ordered recipe-VF pairs:
3939 // [(load, VF1), (load, VF2), (store, VF1)]
3940 // group the recipes together to emit separate remarks for:
3941 // load (VF1, VF2)
3942 // store (VF1)
3943 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
3944 auto Subset = ArrayRef<RecipeVFPair>();
3945 do {
3946 if (Subset.empty())
3947 Subset = Tail.take_front(1);
3948
3949 VPRecipeBase *R = Subset.front().first;
3950
3951 unsigned Opcode =
3954 [](const auto *R) { return Instruction::PHI; })
3955 .Case<VPWidenSelectRecipe>(
3956 [](const auto *R) { return Instruction::Select; })
3957 .Case<VPWidenStoreRecipe>(
3958 [](const auto *R) { return Instruction::Store; })
3959 .Case<VPWidenLoadRecipe>(
3960 [](const auto *R) { return Instruction::Load; })
3961 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
3962 [](const auto *R) { return Instruction::Call; })
3965 [](const auto *R) { return R->getOpcode(); })
3966 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
3967 return R->getStoredValues().empty() ? Instruction::Load
3968 : Instruction::Store;
3969 });
3970
3971 // If the next recipe is different, or if there are no other pairs,
3972 // emit a remark for the collated subset. e.g.
3973 // [(load, VF1), (load, VF2))]
3974 // to emit:
3975 // remark: invalid costs for 'load' at VF=(VF1, VF2)
3976 if (Subset == Tail || Tail[Subset.size()].first != R) {
3977 std::string OutString;
3978 raw_string_ostream OS(OutString);
3979 assert(!Subset.empty() && "Unexpected empty range");
3980 OS << "Recipe with invalid costs prevented vectorization at VF=(";
3981 for (const auto &Pair : Subset)
3982 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
3983 OS << "):";
3984 if (Opcode == Instruction::Call) {
3985 StringRef Name = "";
3986 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
3987 Name = Int->getIntrinsicName();
3988 } else {
3989 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
3990 Function *CalledFn =
3991 WidenCall ? WidenCall->getCalledScalarFunction()
3992 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
3993 ->getLiveInIRValue());
3994 Name = CalledFn->getName();
3995 }
3996 OS << " call to " << Name;
3997 } else
3998 OS << " " << Instruction::getOpcodeName(Opcode);
3999 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4000 R->getDebugLoc());
4001 Tail = Tail.drop_front(Subset.size());
4002 Subset = {};
4003 } else
4004 // Grow the subset by one element
4005 Subset = Tail.take_front(Subset.size() + 1);
4006 } while (!Tail.empty());
4007}
4008
4009/// Check if any recipe of \p Plan will generate a vector value, which will be
4010 /// assigned a vector register.
4011 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4012 const TargetTransformInfo &TTI) {
4013 assert(VF.isVector() && "Checking a scalar VF?");
4014 VPTypeAnalysis TypeInfo(Plan);
4015 DenseSet<VPRecipeBase *> EphemeralRecipes;
4016 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4017 // Set of already visited types.
4018 DenseSet<Type *> Visited;
4019 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4020 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4021 for (VPRecipeBase &R : *VPBB) {
4022 if (EphemeralRecipes.contains(&R))
4023 continue;
4024 // Continue early if the recipe is considered to not produce a vector
4025 // result. Note that this includes VPInstruction where some opcodes may
4026 // produce a vector, to preserve existing behavior as VPInstructions model
4027 // aspects not directly mapped to existing IR instructions.
4028 switch (R.getVPDefID()) {
4029 case VPDef::VPDerivedIVSC:
4030 case VPDef::VPScalarIVStepsSC:
4031 case VPDef::VPReplicateSC:
4032 case VPDef::VPInstructionSC:
4033 case VPDef::VPCanonicalIVPHISC:
4034 case VPDef::VPVectorPointerSC:
4035 case VPDef::VPVectorEndPointerSC:
4036 case VPDef::VPExpandSCEVSC:
4037 case VPDef::VPEVLBasedIVPHISC:
4038 case VPDef::VPPredInstPHISC:
4039 case VPDef::VPBranchOnMaskSC:
4040 continue;
4041 case VPDef::VPReductionSC:
4042 case VPDef::VPActiveLaneMaskPHISC:
4043 case VPDef::VPWidenCallSC:
4044 case VPDef::VPWidenCanonicalIVSC:
4045 case VPDef::VPWidenCastSC:
4046 case VPDef::VPWidenGEPSC:
4047 case VPDef::VPWidenIntrinsicSC:
4048 case VPDef::VPWidenSC:
4049 case VPDef::VPWidenSelectSC:
4050 case VPDef::VPBlendSC:
4051 case VPDef::VPFirstOrderRecurrencePHISC:
4052 case VPDef::VPHistogramSC:
4053 case VPDef::VPWidenPHISC:
4054 case VPDef::VPWidenIntOrFpInductionSC:
4055 case VPDef::VPWidenPointerInductionSC:
4056 case VPDef::VPReductionPHISC:
4057 case VPDef::VPInterleaveEVLSC:
4058 case VPDef::VPInterleaveSC:
4059 case VPDef::VPWidenLoadEVLSC:
4060 case VPDef::VPWidenLoadSC:
4061 case VPDef::VPWidenStoreEVLSC:
4062 case VPDef::VPWidenStoreSC:
4063 break;
4064 default:
4065 llvm_unreachable("unhandled recipe");
4066 }
4067
4068 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4069 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4070 if (!NumLegalParts)
4071 return false;
4072 if (VF.isScalable()) {
4073 // <vscale x 1 x iN> is assumed to be profitable over iN because
4074 // scalable registers are a distinct register class from scalar
4075 // ones. If we ever find a target which wants to lower scalable
4076 // vectors back to scalars, we'll need to update this code to
4077 // explicitly ask TTI about the register class uses for each part.
4078 return NumLegalParts <= VF.getKnownMinValue();
4079 }
4080 // Two or more elements that share a register are vectorized.
4081 return NumLegalParts < VF.getFixedValue();
4082 };
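// For instance (hypothetical targets): a <4 x i32> on a machine with 128-bit
// vector registers legalizes to 1 part and 1 < 4, so it counts as a real
// vector; a <2 x i64> on a machine with only 64-bit registers splits into 2
// parts and 2 < 2 fails, so it would be treated as scalarized.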
4083
4084 // If no def nor is a store, e.g., branches, continue - no value to check.
4085 if (R.getNumDefinedValues() == 0 &&
4087 continue;
4088 // For multi-def recipes, currently only interleaved loads, suffice to
4089 // check first def only.
4090 // For stores check their stored value; for interleaved stores suffice
4091 // the check first stored value only. In all cases this is the second
4092 // operand.
4093 VPValue *ToCheck =
4094 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4095 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4096 if (!Visited.insert({ScalarTy}).second)
4097 continue;
4098 Type *WideTy = toVectorizedTy(ScalarTy, VF);
4099 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
4100 return true;
4101 }
4102 }
4103
4104 return false;
4105}
4106
4107 static bool hasReplicatorRegion(VPlan &Plan) {
4108 return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_deep(
4109 Plan.getVectorLoopRegion()->getEntry())),
4110 [](auto *VPRB) { return VPRB->isReplicator(); });
4111}
4112
4113#ifndef NDEBUG
4114VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4115 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4116 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4117 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4118 assert(
4119 any_of(VPlans,
4120 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4121 "Expected Scalar VF to be a candidate");
4122
4123 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4124 ExpectedCost);
4125 VectorizationFactor ChosenFactor = ScalarCost;
4126
4127 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4128 if (ForceVectorization &&
4129 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4130 // Ignore scalar width, because the user explicitly wants vectorization.
4131 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4132 // evaluation.
4133 ChosenFactor.Cost = InstructionCost::getMax();
4134 }
4135
4136 for (auto &P : VPlans) {
4137 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4138 P->vectorFactors().end());
4139 SmallVector<VPRegisterUsage, 8> RUs;
4140
4141 if (any_of(VFs, [this](ElementCount VF) {
4142 return CM.shouldConsiderRegPressureForVF(VF);
4143 }))
4144 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4145
4146 for (unsigned I = 0; I < VFs.size(); I++) {
4147 ElementCount VF = VFs[I];
4148 // The cost for scalar VF=1 is already calculated, so ignore it.
4149 if (VF.isScalar())
4150 continue;
4151
4152 /// If the register pressure needs to be considered for VF,
4153 /// don't consider the VF as valid if it exceeds the number
4154 /// of registers for the target.
4155 if (CM.shouldConsiderRegPressureForVF(VF) &&
4156 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
4157 continue;
4158
4159 InstructionCost C = CM.expectedCost(VF);
4160
4161 // Add on other costs that are modelled in VPlan, but not in the legacy
4162 // cost model.
4163 VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
4164 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4165 assert(VectorRegion && "Expected to have a vector region!");
4166 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4167 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4168 for (VPRecipeBase &R : *VPBB) {
4169 auto *VPI = dyn_cast<VPInstruction>(&R);
4170 if (!VPI)
4171 continue;
4172 switch (VPI->getOpcode()) {
4173 // Selects are only modelled in the legacy cost model for safe
4174 // divisors.
4175 case Instruction::Select: {
4176 VPValue *VPV = VPI->getVPSingleValue();
4177 if (VPV->getNumUsers() == 1) {
4178 if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
4179 switch (WR->getOpcode()) {
4180 case Instruction::UDiv:
4181 case Instruction::SDiv:
4182 case Instruction::URem:
4183 case Instruction::SRem:
4184 continue;
4185 default:
4186 break;
4187 }
4188 }
4189 }
4190 C += VPI->cost(VF, CostCtx);
4191 break;
4192 }
4194 unsigned Multiplier =
4195 cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
4196 ->getZExtValue();
4197 C += VPI->cost(VF * Multiplier, CostCtx);
4198 break;
4199 }
4201 C += VPI->cost(VF, CostCtx);
4202 break;
4203 default:
4204 break;
4205 }
4206 }
4207 }
4208
4209 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4210 unsigned Width =
4211 estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
4212 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4213 << " costs: " << (Candidate.Cost / Width));
4214 if (VF.isScalable())
4215 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4216 << CM.getVScaleForTuning().value_or(1) << ")");
4217 LLVM_DEBUG(dbgs() << ".\n");
4218
4219 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4220 LLVM_DEBUG(
4221 dbgs()
4222 << "LV: Not considering vector loop of width " << VF
4223 << " because it will not generate any vector instructions.\n");
4224 continue;
4225 }
4226
4227 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4228 LLVM_DEBUG(
4229 dbgs()
4230 << "LV: Not considering vector loop of width " << VF
4231 << " because it would cause replicated blocks to be generated,"
4232 << " which isn't allowed when optimizing for size.\n");
4233 continue;
4234 }
4235
4236 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4237 ChosenFactor = Candidate;
4238 }
4239 }
4240
4241 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4243 "There are conditional stores.",
4244 "store that is conditionally executed prevents vectorization",
4245 "ConditionalStore", ORE, OrigLoop);
4246 ChosenFactor = ScalarCost;
4247 }
4248
4249 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4250 !isMoreProfitable(ChosenFactor, ScalarCost,
4251 !CM.foldTailByMasking())) dbgs()
4252 << "LV: Vectorization seems to be not beneficial, "
4253 << "but was forced by a user.\n");
4254 return ChosenFactor;
4255}
4256#endif
4257
4258bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4259 ElementCount VF) const {
4260 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4261 // reductions need special handling and are currently unsupported.
4262 if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4263 if (!Legal->isReductionVariable(&Phi))
4264 return Legal->isFixedOrderRecurrence(&Phi);
4265 RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
4266 return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
4267 }))
4268 return false;
4269
4270 // Phis with uses outside of the loop require special handling and are
4271 // currently unsupported.
4272 for (const auto &Entry : Legal->getInductionVars()) {
4273 // Look for uses of the value of the induction at the last iteration.
4274 Value *PostInc =
4275 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4276 for (User *U : PostInc->users())
4277 if (!OrigLoop->contains(cast<Instruction>(U)))
4278 return false;
4279 // Look for uses of penultimate value of the induction.
4280 for (User *U : Entry.first->users())
4281 if (!OrigLoop->contains(cast<Instruction>(U)))
4282 return false;
4283 }
4284
4285 // Epilogue vectorization code has not been audited to ensure it handles
4286 // non-latch exits properly. It may be fine, but it needs to be audited and
4287 // tested.
4288 // TODO: Add support for loops with an early exit.
4289 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4290 return false;
4291
4292 return true;
4293}
4294
4296 const ElementCount VF, const unsigned IC) const {
4297 // FIXME: We need a much better cost-model to take different parameters such
4298 // as register pressure, code size increase and cost of extra branches into
4299 // account. For now we apply a very crude heuristic and only consider loops
4300 // with vectorization factors larger than a certain value.
4301
4302 // Allow the target to opt out entirely.
4303 if (!TTI.preferEpilogueVectorization())
4304 return false;
4305
4306 // We also consider epilogue vectorization unprofitable for targets that don't
4307 // consider interleaving beneficial (e.g. MVE).
4308 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4309 return false;
4310
4311 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4312 // VFs when deciding profitability.
4313 // See related "TODO: extend to support scalable VFs." in
4314 // selectEpilogueVectorizationFactor.
4315 unsigned Multiplier = VF.isFixed() ? IC : 1;
4316 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4317 ? EpilogueVectorizationMinVF
4318 : TTI.getEpilogueVectorizationMinVF();
4319 return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
4320 MinVFThreshold;
4321}
4322
4323 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4324 const ElementCount MainLoopVF, unsigned IC) {
4325 VectorizationFactor Result = VectorizationFactor::Disabled();
4326 if (!EnableEpilogueVectorization) {
4327 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4328 return Result;
4329 }
4330
4331 if (!CM.isScalarEpilogueAllowed()) {
4332 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4333 "epilogue is allowed.\n");
4334 return Result;
4335 }
4336
4337 // Not really a cost consideration, but check for unsupported cases here to
4338 // simplify the logic.
4339 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4340 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4341 "is not a supported candidate.\n");
4342 return Result;
4343 }
4344
4346 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4348 if (hasPlanWithVF(ForcedEC))
4349 return {ForcedEC, 0, 0};
4350
4351 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4352 "viable.\n");
4353 return Result;
4354 }
4355
4356 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4357 LLVM_DEBUG(
4358 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4359 return Result;
4360 }
4361
4362 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4363 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4364 "this loop\n");
4365 return Result;
4366 }
4367
4368 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4369 // the main loop handles 8 lanes per iteration. We could still benefit from
4370 // vectorizing the epilogue loop with VF=4.
4371 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4372 estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
4373
4374 ScalarEvolution &SE = *PSE.getSE();
4375 Type *TCType = Legal->getWidestInductionType();
4376 const SCEV *RemainingIterations = nullptr;
4377 unsigned MaxTripCount = 0;
4378 const SCEV *TC =
4379 vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
4380 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4381 RemainingIterations =
4382 SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
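// For illustration (hypothetical counts): a main loop with VF = 8 and IC = 2
// consumes 16 iterations per step, so a trip count of 100 leaves
// 100 % 16 = 4 remaining iterations for the epilogue loop.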
4383
4384 // No iterations left to process in the epilogue.
4385 if (RemainingIterations->isZero())
4386 return Result;
4387
4388 if (MainLoopVF.isFixed()) {
4389 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4390 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4391 SE.getConstant(TCType, MaxTripCount))) {
4392 MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4393 }
4394 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4395 << MaxTripCount << "\n");
4396 }
4397
4398 for (auto &NextVF : ProfitableVFs) {
4399 // Skip candidate VFs without a corresponding VPlan.
4400 if (!hasPlanWithVF(NextVF.Width))
4401 continue;
4402
4403 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4404 // vectors) or > the VF of the main loop (fixed vectors).
4405 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4406 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4407 (NextVF.Width.isScalable() &&
4408 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4409 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4410 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4411 continue;
4412
4413 // If NextVF is greater than the number of remaining iterations, the
4414 // epilogue loop would be dead. Skip such factors.
4415 if (RemainingIterations && !NextVF.Width.isScalable()) {
4416 if (SE.isKnownPredicate(
4417 CmpInst::ICMP_UGT,
4418 SE.getConstant(TCType, NextVF.Width.getFixedValue()),
4419 RemainingIterations))
4420 continue;
4421 }
4422
4423 if (Result.Width.isScalar() ||
4424 isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
4425 /*IsEpilogue*/ true))
4426 Result = NextVF;
4427 }
4428
4429 if (Result != VectorizationFactor::Disabled())
4430 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4431 << Result.Width << "\n");
4432 return Result;
4433}
4434
4435std::pair<unsigned, unsigned>
4437 unsigned MinWidth = -1U;
4438 unsigned MaxWidth = 8;
4439 const DataLayout &DL = TheFunction->getDataLayout();
4440 // For in-loop reductions, no element types are added to ElementTypesInLoop
4441 // if there are no loads/stores in the loop. In this case, check through the
4442 // reduction variables to determine the maximum width.
4443 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4444 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4445 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4446 // When finding the min width used by the recurrence we need to account
4447 // for casts on the input operands of the recurrence.
4448 MinWidth = std::min(
4449 MinWidth,
4450 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4452 MaxWidth = std::max(MaxWidth,
4454 }
4455 } else {
4456 for (Type *T : ElementTypesInLoop) {
4457 MinWidth = std::min<unsigned>(
4458 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4459 MaxWidth = std::max<unsigned>(
4460 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4461 }
4462 }
4463 return {MinWidth, MaxWidth};
4464}
4465
4467 ElementTypesInLoop.clear();
4468 // For each block.
4469 for (BasicBlock *BB : TheLoop->blocks()) {
4470 // For each instruction in the loop.
4471 for (Instruction &I : BB->instructionsWithoutDebug()) {
4472 Type *T = I.getType();
4473
4474 // Skip ignored values.
4475 if (ValuesToIgnore.count(&I))
4476 continue;
4477
4478 // Only examine Loads, Stores and PHINodes.
4479 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4480 continue;
4481
4482 // Examine PHI nodes that are reduction variables. Update the type to
4483 // account for the recurrence type.
4484 if (auto *PN = dyn_cast<PHINode>(&I)) {
4485 if (!Legal->isReductionVariable(PN))
4486 continue;
4487 const RecurrenceDescriptor &RdxDesc =
4488 Legal->getRecurrenceDescriptor(PN);
4490 TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
4491 RdxDesc.getRecurrenceType()))
4492 continue;
4493 T = RdxDesc.getRecurrenceType();
4494 }
4495
4496 // Examine the stored values.
4497 if (auto *ST = dyn_cast<StoreInst>(&I))
4498 T = ST->getValueOperand()->getType();
4499
4500 assert(T->isSized() &&
4501 "Expected the load/store/recurrence type to be sized");
4502
4503 ElementTypesInLoop.insert(T);
4504 }
4505 }
4506}
4507
4508unsigned
4510 InstructionCost LoopCost) {
4511 // -- The interleave heuristics --
4512 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4513 // There are many micro-architectural considerations that we can't predict
4514 // at this level. For example, frontend pressure (on decode or fetch) due to
4515 // code size, or the number and capabilities of the execution ports.
4516 //
4517 // We use the following heuristics to select the interleave count:
4518 // 1. If the code has reductions, then we interleave to break the cross
4519 // iteration dependency.
4520 // 2. If the loop is really small, then we interleave to reduce the loop
4521 // overhead.
4522 // 3. We don't interleave if we think that we will spill registers to memory
4523 // due to the increased register pressure.
4524
4525 if (!CM.isScalarEpilogueAllowed())
4526 return 1;
4527
4530 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4531 "Unroll factor forced to be 1.\n");
4532 return 1;
4533 }
4534
4535 // We used the distance for the interleave count.
4536 if (!Legal->isSafeForAnyVectorWidth())
4537 return 1;
4538
4539 // We don't attempt to perform interleaving for loops with uncountable early
4540 // exits because the VPInstruction::AnyOf code cannot currently handle
4541 // multiple parts.
4542 if (Plan.hasEarlyExit())
4543 return 1;
4544
4545 const bool HasReductions =
4548
4549 // If we did not calculate the cost for VF (because the user selected the VF)
4550 // then we calculate the cost of VF here.
4551 if (LoopCost == 0) {
4552 if (VF.isScalar())
4553 LoopCost = CM.expectedCost(VF);
4554 else
4555 LoopCost = cost(Plan, VF);
4556 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4557
4558 // Loop body is free and there is no need for interleaving.
4559 if (LoopCost == 0)
4560 return 1;
4561 }
4562
4563 VPRegisterUsage R =
4564 calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
4565 // We divide by these constants so assume that we have at least one
4566 // instruction that uses at least one register.
4567 for (auto &Pair : R.MaxLocalUsers) {
4568 Pair.second = std::max(Pair.second, 1U);
4569 }
4570
4571 // We calculate the interleave count using the following formula.
4572 // Subtract the number of loop invariants from the number of available
4573 // registers. These registers are used by all of the interleaved instances.
4574 // Next, divide the remaining registers by the number of registers that is
4575 // required by the loop, in order to estimate how many parallel instances
4576 // fit without causing spills. All of this is rounded down if necessary to be
4577 // a power of two. We want power of two interleave count to simplify any
4578 // addressing operations or alignment considerations.
4579 // We also want power of two interleave counts to ensure that the induction
4580 // variable of the vector loop wraps to zero, when tail is folded by masking;
4581 // this currently happens when OptForSize, in which case IC is set to 1 above.
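// Worked example (hypothetical register pressure): with 32 registers in a
// class, 2 of them tied up by loop-invariant values and at most 10 values
// live at once, the estimate is bit_floor((32 - 2) / 10) = 2 interleaved
// copies.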
4582 unsigned IC = UINT_MAX;
4583
4584 for (const auto &Pair : R.MaxLocalUsers) {
4585 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4586 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4587 << " registers of "
4588 << TTI.getRegisterClassName(Pair.first)
4589 << " register class\n");
4590 if (VF.isScalar()) {
4591 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4592 TargetNumRegisters = ForceTargetNumScalarRegs;
4593 } else {
4594 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4595 TargetNumRegisters = ForceTargetNumVectorRegs;
4596 }
4597 unsigned MaxLocalUsers = Pair.second;
4598 unsigned LoopInvariantRegs = 0;
4599 if (R.LoopInvariantRegs.contains(Pair.first))
4600 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4601
4602 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4603 MaxLocalUsers);
4604 // Don't count the induction variable as interleaved.
4605 if (EnableIndVarRegisterHeur) {
4606 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4607 std::max(1U, (MaxLocalUsers - 1)));
4608 }
4609
4610 IC = std::min(IC, TmpIC);
4611 }
4612
4613 // Clamp the interleave ranges to reasonable counts.
4614 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4615
4616 // Check if the user has overridden the max.
4617 if (VF.isScalar()) {
4618 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4619 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4620 } else {
4621 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4622 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4623 }
4624
4625 // Try to get the exact trip count, or an estimate based on profiling data or
4626 // ConstantMax from PSE, failing that.
4627 auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop);
4628
4629 // For fixed length VFs treat a scalable trip count as unknown.
4630 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
4631 // Re-evaluate trip counts and VFs to be in the same numerical space.
4632 unsigned AvailableTC =
4633 estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
4634 unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());
4635
4636 // At least one iteration must be scalar when this constraint holds, so the
4637 // maximum number of iterations available for interleaving is one less.
4638 if (CM.requiresScalarEpilogue(VF.isVector()))
4639 --AvailableTC;
4640
4641 unsigned InterleaveCountLB = bit_floor(std::max(
4642 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4643
4644 if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
4645 // If the best known trip count is exact, we select between two
4646 // prospective ICs, where
4647 //
4648 // 1) the aggressive IC is capped by the trip count divided by VF
4649 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4650 //
4651 // The final IC is selected in a way that the epilogue loop trip count is
4652 // minimized while maximizing the IC itself, so that we either run the
4653 // vector loop at least once if it generates a small epilogue loop, or
4654 // else we run the vector loop at least twice.
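// As a sketch with made-up numbers: if AvailableTC = 20, EstimatedVF = 4 and
// MaxInterleaveCount = 8, then InterleaveCountLB = bit_floor(min(20 / 8, 8)) = 2
// and InterleaveCountUB = bit_floor(min(20 / 4, 8)) = 4. Both leave a scalar
// tail of 20 % 16 == 20 % 8 == 4 iterations, so the larger count (4) is kept.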
4655
4656 unsigned InterleaveCountUB = bit_floor(std::max(
4657 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4658 MaxInterleaveCount = InterleaveCountLB;
4659
4660 if (InterleaveCountUB != InterleaveCountLB) {
4661 unsigned TailTripCountUB =
4662 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4663 unsigned TailTripCountLB =
4664 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4665 // If both produce the same scalar tail, maximize the IC to do the same
4666 // work in fewer vector loop iterations.
4667 if (TailTripCountUB == TailTripCountLB)
4668 MaxInterleaveCount = InterleaveCountUB;
4669 }
4670 } else {
4671 // If the trip count is only an estimated compile-time constant, cap the
4672 // IC by the trip count divided by VF * 2, so that the vector loop runs at
4673 // least twice; this makes interleaving seem profitable when there is an
4674 // epilogue loop present. Since the exact trip count is not known, we
4675 // choose to be conservative in our IC estimate.
4676 MaxInterleaveCount = InterleaveCountLB;
4677 }
4678 }
4679
4680 assert(MaxInterleaveCount > 0 &&
4681 "Maximum interleave count must be greater than 0");
4682
4683 // Clamp the calculated IC to be between 1 and the max interleave count
4684 // that the target and trip count allow.
4685 if (IC > MaxInterleaveCount)
4686 IC = MaxInterleaveCount;
4687 else
4688 // Make sure IC is greater than 0.
4689 IC = std::max(1u, IC);
4690
4691 assert(IC > 0 && "Interleave count must be greater than 0.");
4692
4693 // Interleave if we vectorized this loop and there is a reduction that could
4694 // benefit from interleaving.
4695 if (VF.isVector() && HasReductions) {
4696 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4697 return IC;
4698 }
4699
4700 // For any scalar loop that requires either runtime checks or predication,
4701 // we are better off leaving this to the unroller. Note that if we've
4702 // already vectorized the loop, we will have done the runtime check, so
4703 // interleaving won't require further checks.
4704 bool ScalarInterleavingRequiresPredication =
4705 (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
4706 return Legal->blockNeedsPredication(BB);
4707 }));
4708 bool ScalarInterleavingRequiresRuntimePointerCheck =
4709 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4710
4711 // We want to interleave small loops in order to reduce the loop overhead and
4712 // potentially expose ILP opportunities.
4713 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4714 << "LV: IC is " << IC << '\n'
4715 << "LV: VF is " << VF << '\n');
4716 const bool AggressivelyInterleaveReductions =
4717 TTI.enableAggressiveInterleaving(HasReductions);
4718 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4719 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4720 // We assume that the loop overhead cost is 1, use the cost model to
4721 // estimate the cost of the loop body, and interleave until the loop
4722 // overhead is about 5% of the cost of the loop.
4723 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
4724 SmallLoopCost / LoopCost.getValue()));
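// For example (illustrative values only): if SmallLoopCost is 20 and the
// computed LoopCost is 6, then SmallIC = min(IC, bit_floor(20 / 6)) =
// min(IC, 2), i.e. the body is only interleaved until its cost dwarfs the
// assumed per-iteration overhead of 1.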
4725
4726 // Interleave until store/load ports (estimated by max interleave count) are
4727 // saturated.
4728 unsigned NumStores = 0;
4729 unsigned NumLoads = 0;
4732 for (VPRecipeBase &R : *VPBB) {
4734 NumLoads++;
4735 continue;
4736 }
4738 NumStores++;
4739 continue;
4740 }
4741
4742 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4743 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
4744 NumStores += StoreOps;
4745 else
4746 NumLoads += InterleaveR->getNumDefinedValues();
4747 continue;
4748 }
4749 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4750 NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
4751 NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
4752 continue;
4753 }
4754 if (isa<VPHistogramRecipe>(&R)) {
4755 NumLoads++;
4756 NumStores++;
4757 continue;
4758 }
4759 }
4760 }
4761 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4762 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
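// For instance, with IC = 8, two store recipes and four load recipes in the
// plan, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 4 = 2 (illustrative numbers);
// the larger of the two may later be preferred to saturate the memory ports.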
4763
4764 // There is little point in interleaving for reductions containing selects
4765 // and compares when VF=1 since it may just create more overhead than it's
4766 // worth for loops with small trip counts. This is because we still have to
4767 // do the final reduction after the loop.
4768 bool HasSelectCmpReductions =
4769 HasReductions &&
4771 [](VPRecipeBase &R) {
4772 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4773 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
4774 RedR->getRecurrenceKind()) ||
4775 RecurrenceDescriptor::isFindIVRecurrenceKind(
4776 RedR->getRecurrenceKind()));
4777 });
4778 if (HasSelectCmpReductions) {
4779 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4780 return 1;
4781 }
4782
4783 // If we have a scalar reduction (vector reductions are already dealt with
4784 // by this point), we can increase the critical path length if the loop
4785 // we're interleaving is inside another loop. For tree-wise reductions,
4786 // set the limit to 2, and for ordered reductions it's best to disable
4787 // interleaving entirely.
4788 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
4789 bool HasOrderedReductions =
4791 [](VPRecipeBase &R) {
4792 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4793
4794 return RedR && RedR->isOrdered();
4795 });
4796 if (HasOrderedReductions) {
4797 LLVM_DEBUG(
4798 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4799 return 1;
4800 }
4801
4802 unsigned F = MaxNestedScalarReductionIC;
4803 SmallIC = std::min(SmallIC, F);
4804 StoresIC = std::min(StoresIC, F);
4805 LoadsIC = std::min(LoadsIC, F);
4806 }
4807
4809 std::max(StoresIC, LoadsIC) > SmallIC) {
4810 LLVM_DEBUG(
4811 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4812 return std::max(StoresIC, LoadsIC);
4813 }
4814
4815 // If there are scalar reductions and TTI has enabled aggressive
4816 // interleaving for reductions, we will interleave to expose ILP.
4817 if (VF.isScalar() && AggressivelyInterleaveReductions) {
4818 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4819 // Interleave no less than SmallIC but not as aggressively as the normal
4820 // IC, to handle the rare situation when resources are too limited.
4821 return std::max(IC / 2, SmallIC);
4822 }
4823
4824 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4825 return SmallIC;
4826 }
4827
4828 // Interleave if this is a large loop (small loops are already dealt with by
4829 // this point) that could benefit from interleaving.
4830 if (AggressivelyInterleaveReductions) {
4831 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4832 return IC;
4833 }
4834
4835 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4836 return 1;
4837}
4838
4839bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4840 ElementCount VF) {
4841 // TODO: The cost model for emulated masked load/store is completely
4842 // broken. This hack guides the cost model to use an artificially
4843 // high value to practically disable vectorization with such
4844 // operations, except where the previously deployed legality hack allowed
4845 // using very low cost values. This avoids regressions coming simply
4846 // from moving the "masked load/store" check from legality to the cost model.
4847 // Masked load/gather emulation was previously never allowed.
4848 // Only a limited amount of masked store/scatter emulation was allowed.
4850 "Expecting a scalar emulated instruction");
4851 return isa<LoadInst>(I) ||
4852 (isa<StoreInst>(I) &&
4853 NumPredStores > NumberOfStoresToPredicate);
4854}
4855
4857 assert(VF.isVector() && "Expected VF >= 2");
4858
4859 // If we've already collected the instructions to scalarize or the predicated
4860 // BBs after vectorization, there's nothing to do. Collection may already have
4861 // occurred if we have a user-selected VF and are now computing the expected
4862 // cost for interleaving.
4863 if (InstsToScalarize.contains(VF) ||
4864 PredicatedBBsAfterVectorization.contains(VF))
4865 return;
4866
4867 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
4868 // not profitable to scalarize any instructions, the presence of VF in the
4869 // map will indicate that we've analyzed it already.
4870 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
4871
4872 // Find all the instructions that are scalar with predication in the loop and
4873 // determine if it would be better to not if-convert the blocks they are in.
4874 // If so, we also record the instructions to scalarize.
4875 for (BasicBlock *BB : TheLoop->blocks()) {
4877 continue;
4878 for (Instruction &I : *BB)
4879 if (isScalarWithPredication(&I, VF)) {
4880 ScalarCostsTy ScalarCosts;
4881 // Do not apply discount logic for:
4882 // 1. Scalars after vectorization, as there will only be a single copy
4883 // of the instruction.
4884 // 2. Scalable VF, as that would lead to invalid scalarization costs.
4885 // 3. Emulated masked memrefs, if a hacked cost is needed.
4886 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
4887 !useEmulatedMaskMemRefHack(&I, VF) &&
4888 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
4889 for (const auto &[I, IC] : ScalarCosts)
4890 ScalarCostsVF.insert({I, IC});
4891 // Check if we decided to scalarize a call. If so, update the widening
4892 // decision of the call to CM_Scalarize with the computed scalar cost.
4893 for (const auto &[I, Cost] : ScalarCosts) {
4894 auto *CI = dyn_cast<CallInst>(I);
4895 if (!CI || !CallWideningDecisions.contains({CI, VF}))
4896 continue;
4897 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
4898 CallWideningDecisions[{CI, VF}].Cost = Cost;
4899 }
4900 }
4901 // Remember that BB will remain after vectorization.
4902 PredicatedBBsAfterVectorization[VF].insert(BB);
4903 for (auto *Pred : predecessors(BB)) {
4904 if (Pred->getSingleSuccessor() == BB)
4905 PredicatedBBsAfterVectorization[VF].insert(Pred);
4906 }
4907 }
4908 }
4909}
4910
4911InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4912 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4913 assert(!isUniformAfterVectorization(PredInst, VF) &&
4914 "Instruction marked uniform-after-vectorization will be predicated");
4915
4916 // Initialize the discount to zero, meaning that the scalar version and the
4917 // vector version cost the same.
4918 InstructionCost Discount = 0;
4919
4920 // Holds instructions to analyze. The instructions we visit are mapped in
4921 // ScalarCosts. Those instructions are the ones that would be scalarized if
4922 // we find that the scalar version costs less.
4924
4925 // Returns true if the given instruction can be scalarized.
4926 auto CanBeScalarized = [&](Instruction *I) -> bool {
4927 // We only attempt to scalarize instructions forming a single-use chain
4928 // from the original predicated block that would otherwise be vectorized.
4929 // Although not strictly necessary, we give up on instructions we know will
4930 // already be scalar to avoid traversing chains that are unlikely to be
4931 // beneficial.
4932 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4934 return false;
4935
4936 // If the instruction is scalar with predication, it will be analyzed
4937 // separately. We ignore it within the context of PredInst.
4938 if (isScalarWithPredication(I, VF))
4939 return false;
4940
4941 // If any of the instruction's operands are uniform after vectorization,
4942 // the instruction cannot be scalarized. This prevents, for example, a
4943 // masked load from being scalarized.
4944 //
4945 // We assume we will only emit a value for lane zero of an instruction
4946 // marked uniform after vectorization, rather than VF identical values.
4947 // Thus, if we scalarize an instruction that uses a uniform, we would
4948 // create uses of values corresponding to the lanes we aren't emitting code
4949 // for. This behavior can be changed by allowing getScalarValue to clone
4950 // the lane zero values for uniforms rather than asserting.
4951 for (Use &U : I->operands())
4952 if (auto *J = dyn_cast<Instruction>(U.get()))
4953 if (isUniformAfterVectorization(J, VF))
4954 return false;
4955
4956 // Otherwise, we can scalarize the instruction.
4957 return true;
4958 };
4959
4960 // Compute the expected cost discount from scalarizing the entire expression
4961 // feeding the predicated instruction. We currently only consider expressions
4962 // that are single-use instruction chains.
4963 Worklist.push_back(PredInst);
4964 while (!Worklist.empty()) {
4965 Instruction *I = Worklist.pop_back_val();
4966
4967 // If we've already analyzed the instruction, there's nothing to do.
4968 if (ScalarCosts.contains(I))
4969 continue;
4970
4971 // Cannot scalarize fixed-order recurrence phis at the moment.
4972 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
4973 continue;
4974
4975 // Compute the cost of the vector instruction. Note that this cost already
4976 // includes the scalarization overhead of the predicated instruction.
4977 InstructionCost VectorCost = getInstructionCost(I, VF);
4978
4979 // Compute the cost of the scalarized instruction. This cost is the cost of
4980 // the instruction as if it wasn't if-converted and instead remained in the
4981 // predicated block. We will scale this cost by block probability after
4982 // computing the scalarization overhead.
4983 InstructionCost ScalarCost =
4985
4986 // Compute the scalarization overhead of needed insertelement instructions
4987 // and phi nodes.
4988 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
4989 Type *WideTy = toVectorizedTy(I->getType(), VF);
4990 for (Type *VectorTy : getContainedTypes(WideTy)) {
4991 ScalarCost += TTI.getScalarizationOverhead(
4993 /*Insert=*/true,
4994 /*Extract=*/false, CostKind);
4995 }
4996 ScalarCost +=
4997 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
4998 }
4999
5000 // Compute the scalarization overhead of needed extractelement
5001 // instructions. For each of the instruction's operands, if the operand can
5002 // be scalarized, add it to the worklist; otherwise, account for the
5003 // overhead.
5004 for (Use &U : I->operands())
5005 if (auto *J = dyn_cast<Instruction>(U.get())) {
5006 assert(canVectorizeTy(J->getType()) &&
5007 "Instruction has non-scalar type");
5008 if (CanBeScalarized(J))
5009 Worklist.push_back(J);
5010 else if (needsExtract(J, VF)) {
5011 Type *WideTy = toVectorizedTy(J->getType(), VF);
5012 for (Type *VectorTy : getContainedTypes(WideTy)) {
5013 ScalarCost += TTI.getScalarizationOverhead(
5014 cast<VectorType>(VectorTy),
5015 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5016 /*Extract*/ true, CostKind);
5017 }
5018 }
5019 }
5020
5021 // Scale the total scalar cost by block probability.
5022 ScalarCost /= getPredBlockCostDivisor(CostKind);
5023
5024 // Compute the discount. A non-negative discount means the vector version
5025 // of the instruction costs more, and scalarizing would be beneficial.
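// For example (hypothetical costs): if the widened form of an instruction
// costs 10 and its scalarized, probability-scaled form costs 6, this
// iteration adds 10 - 6 = 4 to the running discount, indicating that
// scalarizing the chain so far would be cheaper than vectorizing it.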
5026 Discount += VectorCost - ScalarCost;
5027 ScalarCosts[I] = ScalarCost;
5028 }
5029
5030 return Discount;
5031}
5032
5035
5036 // If the vector loop gets executed exactly once with the given VF, ignore the
5037 // costs of comparison and induction instructions, as they'll get simplified
5038 // away.
5039 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5040 auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
5041 if (TC == VF && !foldTailByMasking())
5043 ValuesToIgnoreForVF);
5044
5045 // For each block.
5046 for (BasicBlock *BB : TheLoop->blocks()) {
5047 InstructionCost BlockCost;
5048
5049 // For each instruction in the old loop.
5050 for (Instruction &I : BB->instructionsWithoutDebug()) {
5051 // Skip ignored values.
5052 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5053 (VF.isVector() && VecValuesToIgnore.count(&I)))
5054 continue;
5055
5057
5058 // Check if we should override the cost.
5059 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5061
5062 BlockCost += C;
5063 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5064 << VF << " For instruction: " << I << '\n');
5065 }
5066
5067 // If we are vectorizing a predicated block, it will have been
5068 // if-converted. This means that the block's instructions (aside from
5069 // stores and instructions that may divide by zero) will now be
5070 // unconditionally executed. For the scalar case, we may not always execute
5071 // the predicated block, if it is an if-else block. Thus, scale the block's
5072 // cost by the probability of executing it. blockNeedsPredication from
5073 // Legal is used so as to not include all blocks in tail folded loops.
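// As a rough illustration: if a predicated block's instructions sum to a
// scalar cost of 8 and the probability-based divisor were, say, 2, the block
// would contribute only 4 to the total, reflecting that it is not executed
// on every iteration (the divisor value here is illustrative, not a fixed
// constant of the cost model).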
5074 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5075 BlockCost /= getPredBlockCostDivisor(CostKind);
5076
5077 Cost += BlockCost;
5078 }
5079
5080 return Cost;
5081}
5082
5083/// Gets Address Access SCEV after verifying that the access pattern
5084 /// is loop invariant except for the induction variable dependence.
5085///
5086/// This SCEV can be sent to the Target in order to estimate the address
5087/// calculation cost.
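/// For illustration (hypothetical IR): a pointer such as
///   %p = getelementptr [256 x float], ptr %A, i64 %inv, i64 %iv
/// is accepted, since every index is either loop invariant (%inv) or an
/// induction variable (%iv); a GEP with a loop-varying index that is not an
/// induction variable makes this helper return nullptr.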
5089 Value *Ptr,
5092 const Loop *TheLoop) {
5093
5094 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5095 if (!Gep)
5096 return nullptr;
5097
5098 // We are looking for a gep with all loop invariant indices except for one
5099 // which should be an induction variable.
5100 auto *SE = PSE.getSE();
5101 unsigned NumOperands = Gep->getNumOperands();
5102 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5103 Value *Opd = Gep->getOperand(Idx);
5104 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5105 !Legal->isInductionVariable(Opd))
5106 return nullptr;
5107 }
5108
5109 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5110 return PSE.getSCEV(Ptr);
5111}
5112
5114LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5115 ElementCount VF) {
5116 assert(VF.isVector() &&
5117 "Scalarization cost of instruction implies vectorization.");
5118 if (VF.isScalable())
5120
5121 Type *ValTy = getLoadStoreType(I);
5122 auto *SE = PSE.getSE();
5123
5124 unsigned AS = getLoadStoreAddressSpace(I);
5126 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5127 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5128 // that it is being called from this specific place.
5129
5130 // Figure out whether the access is strided and get the stride value
5131 // if it's known at compile time.
5132 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5133
5134 // Get the cost of the scalar memory instruction and address computation.
5135 InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
5136 PtrTy, SE, PtrSCEV, CostKind);
5137
5138 // Don't pass *I here, since it is scalar but will actually be part of a
5139 // vectorized loop where the user of it is a vectorized instruction.
5140 const Align Alignment = getLoadStoreAlignment(I);
5141 Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
5142 ValTy->getScalarType(),
5143 Alignment, AS, CostKind);
5144
5145 // Get the overhead of the extractelement and insertelement instructions
5146 // we might create due to scalarization.
5147 Cost += getScalarizationOverhead(I, VF);
5148
5149 // If we have a predicated load/store, it will need extra i1 extracts and
5150 // conditional branches, but may not be executed for each vector lane. Scale
5151 // the cost by the probability of executing the predicated block.
5152 if (isPredicatedInst(I)) {
5154
5155 // Add the cost of an i1 extract and a branch
5156 auto *VecI1Ty =
5158 Cost += TTI.getScalarizationOverhead(
5159 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5160 /*Insert=*/false, /*Extract=*/true, CostKind);
5161 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5162
5163 if (useEmulatedMaskMemRefHack(I, VF))
5164 // Artificially setting to a high enough value to practically disable
5165 // vectorization with such operations.
5166 Cost = 3000000;
5167 }
5168
5169 return Cost;
5170}
5171
5173LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5174 ElementCount VF) {
5175 Type *ValTy = getLoadStoreType(I);
5176 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5178 unsigned AS = getLoadStoreAddressSpace(I);
5179 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5180
5181 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5182 "Stride should be 1 or -1 for consecutive memory access");
5183 const Align Alignment = getLoadStoreAlignment(I);
5185 if (Legal->isMaskRequired(I)) {
5186 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5187 CostKind);
5188 } else {
5189 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5190 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5191 CostKind, OpInfo, I);
5192 }
5193
5194 bool Reverse = ConsecutiveStride < 0;
5195 if (Reverse)
5196 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5197 VectorTy, {}, CostKind, 0);
5198 return Cost;
5199}
5200
5202LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5203 ElementCount VF) {
5204 assert(Legal->isUniformMemOp(*I, VF));
5205
5206 Type *ValTy = getLoadStoreType(I);
5208 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5209 const Align Alignment = getLoadStoreAlignment(I);
5210 unsigned AS = getLoadStoreAddressSpace(I);
5211 if (isa<LoadInst>(I)) {
5212 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5213 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5214 CostKind) +
5215 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
5216 VectorTy, {}, CostKind);
5217 }
5218 StoreInst *SI = cast<StoreInst>(I);
5219
5220 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5221 // TODO: We have existing tests that request the cost of extracting element
5222 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5223 // the actual generated code, which involves extracting the last element of
5224 // a scalable vector where the lane to extract is unknown at compile time.
5226 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5227 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5228 if (!IsLoopInvariantStoreValue)
5229 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
5230 VectorTy, CostKind, 0);
5231 return Cost;
5232}
5233
5235LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5236 ElementCount VF) {
5237 Type *ValTy = getLoadStoreType(I);
5238 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5239 const Align Alignment = getLoadStoreAlignment(I);
5241 Type *PtrTy = Ptr->getType();
5242
5243 if (!Legal->isUniform(Ptr, VF))
5244 PtrTy = toVectorTy(PtrTy, VF);
5245
5246 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5247 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5248 Legal->isMaskRequired(I), Alignment,
5249 CostKind, I);
5250}
5251
5253LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5254 ElementCount VF) {
5255 const auto *Group = getInterleavedAccessGroup(I);
5256 assert(Group && "Fail to get an interleaved access group.");
5257
5258 Instruction *InsertPos = Group->getInsertPos();
5259 Type *ValTy = getLoadStoreType(InsertPos);
5260 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5261 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5262
5263 unsigned InterleaveFactor = Group->getFactor();
5264 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5265
5266 // Holds the indices of existing members in the interleaved group.
5267 SmallVector<unsigned, 4> Indices;
5268 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5269 if (Group->getMember(IF))
5270 Indices.push_back(IF);
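// For example, a group with factor 4 whose only present members sit at
// positions 0 and 2 yields Indices = {0, 2}; because such a store group is
// not full, the cost query below is made with UseMaskForGaps set
// (illustrative case).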
5271
5272 // Calculate the cost of the whole interleaved group.
5273 bool UseMaskForGaps =
5274 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5275 (isa<StoreInst>(I) && !Group->isFull());
5276 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5277 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5278 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5279 UseMaskForGaps);
5280
5281 if (Group->isReverse()) {
5282 // TODO: Add support for reversed masked interleaved access.
5283 assert(!Legal->isMaskRequired(I) &&
5284 "Reverse masked interleaved access not supported.");
5285 Cost += Group->getNumMembers() *
5286 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5287 VectorTy, {}, CostKind, 0);
5288 }
5289 return Cost;
5290}
5291
5292std::optional<InstructionCost>
5294 ElementCount VF,
5295 Type *Ty) const {
5296 using namespace llvm::PatternMatch;
5297 // Early exit if there are no in-loop reductions.
5298 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5299 return std::nullopt;
5300 auto *VectorTy = cast<VectorType>(Ty);
5301
5302 // We are looking for one of the following patterns, finding the minimal acceptable cost:
5303 // reduce(mul(ext(A), ext(B))) or
5304 // reduce(mul(A, B)) or
5305 // reduce(ext(A)) or
5306 // reduce(A).
5307 // The basic idea is that we walk down the tree to do that, finding the root
5308 // reduction instruction in InLoopReductionImmediateChains. From there we find
5309 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5310 // of the components. If the reduction cost is lower, we return it for the
5311 // reduction instruction and 0 for the other instructions in the pattern. If
5312 // it is not, we return an invalid cost specifying that the original cost
5313 // method should be used.
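// As a hedged numerical sketch: if the fused reduce.add(mul(ext(A), ext(B)))
// cost returned by TTI were 4 while the separate ext, mul and reduction
// pieces summed to 10, the reduction instruction would be costed at 4 and
// the ext/mul feeding it at 0, so the pattern is not double-counted
// (numbers are illustrative only).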
5314 Instruction *RetI = I;
5315 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5316 if (!RetI->hasOneUser())
5317 return std::nullopt;
5318 RetI = RetI->user_back();
5319 }
5320
5321 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5322 RetI->user_back()->getOpcode() == Instruction::Add) {
5323 RetI = RetI->user_back();
5324 }
5325
5326 // Test if the found instruction is a reduction, and if not return an invalid
5327 // cost specifying the parent to use the original cost modelling.
5328 Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5329 if (!LastChain)
5330 return std::nullopt;
5331
5332 // Find the reduction this chain is a part of and calculate the basic cost of
5333 // the reduction on its own.
5334 Instruction *ReductionPhi = LastChain;
5335 while (!isa<PHINode>(ReductionPhi))
5336 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5337
5338 const RecurrenceDescriptor &RdxDesc =
5339 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5340
5341 InstructionCost BaseCost;
5342 RecurKind RK = RdxDesc.getRecurrenceKind();
5345 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5346 RdxDesc.getFastMathFlags(), CostKind);
5347 } else {
5348 BaseCost = TTI.getArithmeticReductionCost(
5349 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5350 }
5351
5352 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5353 // normal fmul instruction to the cost of the fadd reduction.
5354 if (RK == RecurKind::FMulAdd)
5355 BaseCost +=
5356 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5357
5358 // If we're using ordered reductions then we can just return the base cost
5359 // here, since getArithmeticReductionCost calculates the full ordered
5360 // reduction cost when FP reassociation is not allowed.
5361 if (useOrderedReductions(RdxDesc))
5362 return BaseCost;
5363
5364 // Get the operand that was not the reduction chain and match it to one of the
5365 // patterns, returning the better cost if it is found.
5366 Instruction *RedOp = RetI->getOperand(1) == LastChain
5369
5370 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5371
5372 Instruction *Op0, *Op1;
5373 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5374 match(RedOp,
5376 match(Op0, m_ZExtOrSExt(m_Value())) &&
5377 Op0->getOpcode() == Op1->getOpcode() &&
5378 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5379 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5380 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5381
5382 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5383 // Note that the extend opcodes need to all match, or if A==B they will have
5384 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5385 // which is equally fine.
5386 bool IsUnsigned = isa<ZExtInst>(Op0);
5387 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5388 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5389
5390 InstructionCost ExtCost =
5391 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5393 InstructionCost MulCost =
5394 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5395 InstructionCost Ext2Cost =
5396 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5398
5399 InstructionCost RedCost = TTI.getMulAccReductionCost(
5400 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5401 CostKind);
5402
5403 if (RedCost.isValid() &&
5404 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5405 return I == RetI ? RedCost : 0;
5406 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5407 !TheLoop->isLoopInvariant(RedOp)) {
5408 // Matched reduce(ext(A))
5409 bool IsUnsigned = isa<ZExtInst>(RedOp);
5410 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5411 InstructionCost RedCost = TTI.getExtendedReductionCost(
5412 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5413 RdxDesc.getFastMathFlags(), CostKind);
5414
5415 InstructionCost ExtCost =
5416 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5418 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5419 return I == RetI ? RedCost : 0;
5420 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5421 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5422 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5423 Op0->getOpcode() == Op1->getOpcode() &&
5424 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5425 bool IsUnsigned = isa<ZExtInst>(Op0);
5426 Type *Op0Ty = Op0->getOperand(0)->getType();
5427 Type *Op1Ty = Op1->getOperand(0)->getType();
5428 Type *LargestOpTy =
5429 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5430 : Op0Ty;
5431 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5432
5433 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5434 // different sizes. We take the largest type as the ext to reduce, and add
5435 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5436 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5437 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5439 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5440 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5442 InstructionCost MulCost =
5443 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5444
5445 InstructionCost RedCost = TTI.getMulAccReductionCost(
5446 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5447 CostKind);
5448 InstructionCost ExtraExtCost = 0;
5449 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5450 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5451 ExtraExtCost = TTI.getCastInstrCost(
5452 ExtraExtOp->getOpcode(), ExtType,
5453 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5455 }
5456
5457 if (RedCost.isValid() &&
5458 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5459 return I == RetI ? RedCost : 0;
5460 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5461 // Matched reduce.add(mul())
5462 InstructionCost MulCost =
5463 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5464
5465 InstructionCost RedCost = TTI.getMulAccReductionCost(
5466 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
5467 CostKind);
5468
5469 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5470 return I == RetI ? RedCost : 0;
5471 }
5472 }
5473
5474 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5475}
5476
5478LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5479 ElementCount VF) {
5480 // Calculate scalar cost only. Vectorization cost should be ready at this
5481 // moment.
5482 if (VF.isScalar()) {
5483 Type *ValTy = getLoadStoreType(I);
5485 const Align Alignment = getLoadStoreAlignment(I);
5486 unsigned AS = getLoadStoreAddressSpace(I);
5487
5488 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5489 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5490 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5491 OpInfo, I);
5492 }
5493 return getWideningCost(I, VF);
5494}
5495
5497LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5498 ElementCount VF) const {
5499
5500 // There is no mechanism yet to create a scalable scalarization loop,
5501 // so this is currently Invalid.
5502 if (VF.isScalable())
5504
5505 if (VF.isScalar())
5506 return 0;
5507
5509 Type *RetTy = toVectorizedTy(I->getType(), VF);
5510 if (!RetTy->isVoidTy() &&
5511 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5512
5513 for (Type *VectorTy : getContainedTypes(RetTy)) {
5514 Cost += TTI.getScalarizationOverhead(
5516 /*Insert=*/true,
5517 /*Extract=*/false, CostKind);
5518 }
5519 }
5520
5521 // Some targets keep addresses scalar.
5522 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5523 return Cost;
5524
5525 // Some targets support efficient element stores.
5526 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5527 return Cost;
5528
5529 // Collect operands to consider.
5530 CallInst *CI = dyn_cast<CallInst>(I);
5531 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5532
5533 // Skip operands that do not require extraction/scalarization and do not incur
5534 // any overhead.
5536 for (auto *V : filterExtractingOperands(Ops, VF))
5537 Tys.push_back(maybeVectorizeType(V->getType(), VF));
5538 return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind);
5539}
5540
5542 if (VF.isScalar())
5543 return;
5544 NumPredStores = 0;
5545 for (BasicBlock *BB : TheLoop->blocks()) {
5546 // For each instruction in the old loop.
5547 for (Instruction &I : *BB) {
5549 if (!Ptr)
5550 continue;
5551
5552 // TODO: We should generate better code and update the cost model for
5553 // predicated uniform stores. Today they are treated as any other
5554 // predicated store (see added test cases in
5555 // invariant-store-vectorization.ll).
5557 NumPredStores++;
5558
5559 if (Legal->isUniformMemOp(I, VF)) {
5560 auto IsLegalToScalarize = [&]() {
5561 if (!VF.isScalable())
5562 // Scalarization of fixed length vectors "just works".
5563 return true;
5564
5565 // We have dedicated lowering for unpredicated uniform loads and
5566 // stores. Note that even with tail folding we know that at least
5567 // one lane is active (i.e. generalized predication is not possible
5568 // here), and the logic below depends on this fact.
5569 if (!foldTailByMasking())
5570 return true;
5571
5572 // For scalable vectors, a uniform memop load is always
5573 // uniform-by-parts and we know how to scalarize that.
5574 if (isa<LoadInst>(I))
5575 return true;
5576
5577 // A uniform store isn't necessarily uniform-by-parts,
5578 // so we can't assume scalarization.
5579 auto &SI = cast<StoreInst>(I);
5580 return TheLoop->isLoopInvariant(SI.getValueOperand());
5581 };
5582
5583 const InstructionCost GatherScatterCost =
5585 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5586
5587 // Load: Scalar load + broadcast
5588 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5589 // FIXME: This cost is a significant under-estimate for tail folded
5590 // memory ops.
5591 const InstructionCost ScalarizationCost =
5592 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
5594
5595 // Choose the better solution for the current VF. Note that Invalid
5596 // costs compare as maximally large. If both are invalid, we get an
5597 // invalid cost, which signals a failure and a vectorization abort.
5598 if (GatherScatterCost < ScalarizationCost)
5599 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5600 else
5601 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5602 continue;
5603 }
5604
5605 // We assume that widening is the best solution when possible.
5606 if (memoryInstructionCanBeWidened(&I, VF)) {
5607 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5608 int ConsecutiveStride = Legal->isConsecutivePtr(
5610 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5611 "Expected consecutive stride.");
5612 InstWidening Decision =
5613 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5614 setWideningDecision(&I, VF, Decision, Cost);
5615 continue;
5616 }
5617
5618 // Choose between Interleaving, Gather/Scatter or Scalarization.
5620 unsigned NumAccesses = 1;
5621 if (isAccessInterleaved(&I)) {
5622 const auto *Group = getInterleavedAccessGroup(&I);
5623 assert(Group && "Fail to get an interleaved access group.");
5624
5625 // Make one decision for the whole group.
5626 if (getWideningDecision(&I, VF) != CM_Unknown)
5627 continue;
5628
5629 NumAccesses = Group->getNumMembers();
5631 InterleaveCost = getInterleaveGroupCost(&I, VF);
5632 }
5633
5634 InstructionCost GatherScatterCost =
5636 ? getGatherScatterCost(&I, VF) * NumAccesses
5638
5639 InstructionCost ScalarizationCost =
5640 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5641
5642 // Choose the better solution for the current VF,
5643 // record this decision, and use it during vectorization.
5645 InstWidening Decision;
5646 if (InterleaveCost <= GatherScatterCost &&
5647 InterleaveCost < ScalarizationCost) {
5648 Decision = CM_Interleave;
5649 Cost = InterleaveCost;
5650 } else if (GatherScatterCost < ScalarizationCost) {
5651 Decision = CM_GatherScatter;
5652 Cost = GatherScatterCost;
5653 } else {
5654 Decision = CM_Scalarize;
5655 Cost = ScalarizationCost;
5656 }
5657 // If the instruction belongs to an interleave group, the whole group
5658 // receives the same decision. The whole group receives the cost, but
5659 // the cost will actually be assigned to one instruction.
5660 if (const auto *Group = getInterleavedAccessGroup(&I))
5661 setWideningDecision(Group, VF, Decision, Cost);
5662 else
5663 setWideningDecision(&I, VF, Decision, Cost);
5664 }
5665 }
5666
5667 // Make sure that any load of an address and any other address computation
5668 // remains scalar unless there is gather/scatter support. This avoids
5669 // inevitable extracts into address registers, and also has the benefit of
5670 // activating LSR more, since that pass can't optimize vectorized
5671 // addresses.
5672 if (TTI.prefersVectorizedAddressing())
5673 return;
5674
5675 // Start with all scalar pointer uses.
5677 for (BasicBlock *BB : TheLoop->blocks())
5678 for (Instruction &I : *BB) {
5679 Instruction *PtrDef =
5681 if (PtrDef && TheLoop->contains(PtrDef) &&
5683 AddrDefs.insert(PtrDef);
5684 }
5685
5686 // Add all instructions used to generate the addresses.
5688 append_range(Worklist, AddrDefs);
5689 while (!Worklist.empty()) {
5690 Instruction *I = Worklist.pop_back_val();
5691 for (auto &Op : I->operands())
5692 if (auto *InstOp = dyn_cast<Instruction>(Op))
5693 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5694 AddrDefs.insert(InstOp).second)
5695 Worklist.push_back(InstOp);
5696 }
5697
5698 for (auto *I : AddrDefs) {
5699 if (isa<LoadInst>(I)) {
5700 // Setting the desired widening decision should ideally be handled by
5701 // the cost functions, but since this involves finding out whether the
5702 // loaded register is involved in an address computation, it is instead
5703 // changed here when we know this is the case.
5704 InstWidening Decision = getWideningDecision(I, VF);
5705 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5706 // Scalarize a widened load of address.
5708 I, VF, CM_Scalarize,
5709 (VF.getKnownMinValue() *
5710 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5711 else if (const auto *Group = getInterleavedAccessGroup(I)) {
5712 // Scalarize an interleave group of address loads.
5713 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5714 if (Instruction *Member = Group->getMember(I))
5716 Member, VF, CM_Scalarize,
5717 (VF.getKnownMinValue() *
5718 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
5719 }
5720 }
5721 } else {
5722 // Cannot scalarize fixed-order recurrence phis at the moment.
5723 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5724 continue;
5725
5726 // Make sure I gets scalarized and receives a cost estimate without
5727 // scalarization overhead.
5728 ForcedScalars[VF].insert(I);
5729 }
5730 }
5731}
5732
5734 assert(!VF.isScalar() &&
5735 "Trying to set a vectorization decision for a scalar VF");
5736
5737 auto ForcedScalar = ForcedScalars.find(VF);
5738 for (BasicBlock *BB : TheLoop->blocks()) {
5739 // For each instruction in the old loop.
5740 for (Instruction &I : *BB) {
5742
5743 if (!CI)
5744 continue;
5745
5749 Function *ScalarFunc = CI->getCalledFunction();
5750 Type *ScalarRetTy = CI->getType();
5751 SmallVector<Type *, 4> Tys, ScalarTys;
5752 for (auto &ArgOp : CI->args())
5753 ScalarTys.push_back(ArgOp->getType());
5754
5755 // Estimate cost of scalarized vector call. The source operands are
5756 // assumed to be vectors, so we need to extract individual elements from
5757 // there, execute VF scalar calls, and then gather the result into the
5758 // vector return value.
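// Illustrative example (made-up costs): with VF = 4, a scalar call cost of
// 10 and a packing/unpacking overhead of 6, the scalarized estimate below is
// 10 * 4 + 6 = 46.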
5759 if (VF.isFixed()) {
5760 InstructionCost ScalarCallCost =
5761 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
5762
5763 // Compute costs of unpacking argument values for the scalar calls and
5764 // packing the return values to a vector.
5765 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
5766 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5767 } else {
5768 // There is no point attempting to calculate the scalar cost for a
5769 // scalable VF as we know it will be Invalid.
5770 assert(!getScalarizationOverhead(CI, VF).isValid() &&
5771 "Unexpected valid cost for scalarizing scalable vectors");
5772 ScalarCost = InstructionCost::getInvalid();
5773 }
5774
5775 // Honor ForcedScalars and UniformAfterVectorization decisions.
5776 // TODO: For calls, it might still be more profitable to widen. Use
5777 // VPlan-based cost model to compare different options.
5778 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5779 ForcedScalar->second.contains(CI)) ||
5780 isUniformAfterVectorization(CI, VF))) {
5781 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
5782 Intrinsic::not_intrinsic, std::nullopt,
5783 ScalarCost);
5784 continue;
5785 }
5786
5787 bool MaskRequired = Legal->isMaskRequired(CI);
5788 // Compute corresponding vector type for return value and arguments.
5789 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
5790 for (Type *ScalarTy : ScalarTys)
5791 Tys.push_back(toVectorizedTy(ScalarTy, VF));
5792
5793 // An in-loop reduction using an fmuladd intrinsic is a special case;
5794 // we don't want the normal cost for that intrinsic.
5796 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
5799 std::nullopt, *RedCost);
5800 continue;
5801 }
5802
5803 // Find the cost of vectorizing the call, if we can find a suitable
5804 // vector variant of the function.
5805 VFInfo FuncInfo;
5806 Function *VecFunc = nullptr;
5807 // Search through any available variants for one we can use at this VF.
5808 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
5809 // Must match requested VF.
5810 if (Info.Shape.VF != VF)
5811 continue;
5812
5813 // Must take a mask argument if one is required
5814 if (MaskRequired && !Info.isMasked())
5815 continue;
5816
5817 // Check that all parameter kinds are supported
5818 bool ParamsOk = true;
5819 for (VFParameter Param : Info.Shape.Parameters) {
5820 switch (Param.ParamKind) {
5822 break;
5824 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5825 // Make sure the scalar parameter in the loop is invariant.
5826 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
5827 TheLoop))
5828 ParamsOk = false;
5829 break;
5830 }
5832 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5833 // Find the stride for the scalar parameter in this loop and see if
5834 // it matches the stride for the variant.
5835 // TODO: do we need to figure out the cost of an extract to get the
5836 // first lane? Or do we hope that it will be folded away?
5837 ScalarEvolution *SE = PSE.getSE();
5838 if (!match(SE->getSCEV(ScalarParam),
5840 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
5842 ParamsOk = false;
5843 break;
5844 }
5846 break;
5847 default:
5848 ParamsOk = false;
5849 break;
5850 }
5851 }
5852
5853 if (!ParamsOk)
5854 continue;
5855
5856 // Found a suitable candidate, stop here.
5857 VecFunc = CI->getModule()->getFunction(Info.VectorName);
5858 FuncInfo = Info;
5859 break;
5860 }
5861
5862 if (TLI && VecFunc && !CI->isNoBuiltin())
5863 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
5864
5865 // Find the cost of an intrinsic; some targets may have instructions that
5866 // perform the operation without needing an actual call.
5868 if (IID != Intrinsic::not_intrinsic)
5870
5871 InstructionCost Cost = ScalarCost;
5872 InstWidening Decision = CM_Scalarize;
5873
5874 if (VectorCost <= Cost) {
5875 Cost = VectorCost;
5876 Decision = CM_VectorCall;
5877 }
5878
5879 if (IntrinsicCost <= Cost) {
5881 Decision = CM_IntrinsicCall;
5882 }
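// Illustrative selection (hypothetical costs): with ScalarCost = 16,
// VectorCost = 10 and IntrinsicCost = 6, the vector-call check above first
// lowers Cost to 10, and the intrinsic check then wins, so the recorded
// decision is CM_IntrinsicCall.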
5883
5884 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
5886 }
5887 }
5888}
5889
5891 if (!Legal->isInvariant(Op))
5892 return false;
5893 // Consider Op invariant if neither it nor its operands are predicated
5894 // instructions in the loop; otherwise it is not trivially hoistable.
5895 auto *OpI = dyn_cast<Instruction>(Op);
5896 return !OpI || !TheLoop->contains(OpI) ||
5897 (!isPredicatedInst(OpI) &&
5898 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5899 all_of(OpI->operands(),
5900 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5901}
5902
5905 ElementCount VF) {
5906 // If we know that this instruction will remain uniform, check the cost of
5907 // the scalar version.
5909 VF = ElementCount::getFixed(1);
5910
5911 if (VF.isVector() && isProfitableToScalarize(I, VF))
5912 return InstsToScalarize[VF][I];
5913
5914 // Forced scalars do not have any scalarization overhead.
5915 auto ForcedScalar = ForcedScalars.find(VF);
5916 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
5917 auto InstSet = ForcedScalar->second;
5918 if (InstSet.count(I))
5920 VF.getKnownMinValue();
5921 }
5922
5923 Type *RetTy = I->getType();
5925 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5926 auto *SE = PSE.getSE();
5927
5928 Type *VectorTy;
5929 if (isScalarAfterVectorization(I, VF)) {
5930 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
5931 [this](Instruction *I, ElementCount VF) -> bool {
5932 if (VF.isScalar())
5933 return true;
5934
5935 auto Scalarized = InstsToScalarize.find(VF);
5936 assert(Scalarized != InstsToScalarize.end() &&
5937 "VF not yet analyzed for scalarization profitability");
5938 return !Scalarized->second.count(I) &&
5939 llvm::all_of(I->users(), [&](User *U) {
5940 auto *UI = cast<Instruction>(U);
5941 return !Scalarized->second.count(UI);
5942 });
5943 };
5944
5945 // With the exception of GEPs and PHIs, after scalarization there should
5946 // only be one copy of the instruction generated in the loop. This is
5947 // because the VF is either 1, or any instructions that need scalarizing
5948 // have already been dealt with by the time we get here. As a result,
5949 // it means we don't have to multiply the instruction cost by VF.
5950 assert(I->getOpcode() == Instruction::GetElementPtr ||
5951 I->getOpcode() == Instruction::PHI ||
5952 (I->getOpcode() == Instruction::BitCast &&
5953 I->getType()->isPointerTy()) ||
5954 HasSingleCopyAfterVectorization(I, VF));
5955 VectorTy = RetTy;
5956 } else
5957 VectorTy = toVectorizedTy(RetTy, VF);
5958
5959 if (VF.isVector() && VectorTy->isVectorTy() &&
5960 !TTI.getNumberOfParts(VectorTy))
5962
5963 // TODO: We need to estimate the cost of intrinsic calls.
5964 switch (I->getOpcode()) {
5965 case Instruction::GetElementPtr:
5966 // We mark this instruction as zero-cost because the cost of GEPs in
5967 // vectorized code depends on whether the corresponding memory instruction
5968 // is scalarized or not. Therefore, we handle GEPs with the memory
5969 // instruction cost.
5970 return 0;
5971 case Instruction::Br: {
5972 // In cases of scalarized and predicated instructions, there will be VF
5973 // predicated blocks in the vectorized loop. Each branch around these
5974 // blocks also requires an extract of its vector compare i1 element.
5975 // Note that the conditional branch from the loop latch will be replaced by
5976 // a single branch controlling the loop, so there is no extra overhead from
5977 // scalarization.
5978 bool ScalarPredicatedBB = false;
5980 if (VF.isVector() && BI->isConditional() &&
5981 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
5982 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
5983 BI->getParent() != TheLoop->getLoopLatch())
5984 ScalarPredicatedBB = true;
5985
5986 if (ScalarPredicatedBB) {
5987 // Not possible to scalarize a scalable vector with predicated instructions.
5988 if (VF.isScalable())
5990 // Return cost for branches around scalarized and predicated blocks.
5991 auto *VecI1Ty =
5993 return (
5994 TTI.getScalarizationOverhead(
5995 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5996 /*Insert*/ false, /*Extract*/ true, CostKind) +
5997 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
5998 }
5999
6000 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6001 // The back-edge branch will remain, as will all scalar branches.
6002 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6003
6004 // This branch will be eliminated by if-conversion.
6005 return 0;
6006 // Note: We currently assume zero cost for an unconditional branch inside
6007 // a predicated block since it will become a fall-through, although we
6008 // may decide in the future to call TTI for all branches.
6009 }
6010 case Instruction::Switch: {
6011 if (VF.isScalar())
6012 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6013 auto *Switch = cast<SwitchInst>(I);
6014 return Switch->getNumCases() *
6015 TTI.getCmpSelInstrCost(
6016 Instruction::ICmp,
6017 toVectorTy(Switch->getCondition()->getType(), VF),
6018 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6020 }
6021 case Instruction::PHI: {
6022 auto *Phi = cast<PHINode>(I);
6023
6024 // First-order recurrences are replaced by vector shuffles inside the loop.
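// For instance, with a fixed VF of 4 the splice mask built below is
// <3, 4, 5, 6>: the last lane of the previous vector followed by the first
// three lanes of the current one (a sketch; the exact lowering is left to
// the target's shuffle cost hook).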
6025 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6027 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6028 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6029 cast<VectorType>(VectorTy),
6030 cast<VectorType>(VectorTy), Mask, CostKind,
6031 VF.getKnownMinValue() - 1);
6032 }
6033
6034 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6035 // converted into select instructions. We require N - 1 selects per phi
6036 // node, where N is the number of incoming values.
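// E.g. a phi merging three incoming values is lowered to two vector selects
// (N - 1 with N = 3), which is the factor the cost below multiplies by.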
6037 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6038 Type *ResultTy = Phi->getType();
6039
6040 // All instructions in an Any-of reduction chain are narrowed to bool.
6041 // Check if that is the case for this phi node.
6042 auto *HeaderUser = cast_if_present<PHINode>(
6043 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6044 auto *Phi = dyn_cast<PHINode>(U);
6045 if (Phi && Phi->getParent() == TheLoop->getHeader())
6046 return Phi;
6047 return nullptr;
6048 }));
6049 if (HeaderUser) {
6050 auto &ReductionVars = Legal->getReductionVars();
6051 auto Iter = ReductionVars.find(HeaderUser);
6052 if (Iter != ReductionVars.end() &&
6054 Iter->second.getRecurrenceKind()))
6055 ResultTy = Type::getInt1Ty(Phi->getContext());
6056 }
6057 return (Phi->getNumIncomingValues() - 1) *
6058 TTI.getCmpSelInstrCost(
6059 Instruction::Select, toVectorTy(ResultTy, VF),
6060 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6062 }
6063
6064 // When tail folding with EVL, if the phi is part of an out-of-loop
6065 // reduction, it will be transformed into a wide vp_merge.
6066 if (VF.isVector() && foldTailWithEVL() &&
6067 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6069 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6070 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6071 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6072 }
6073
6074 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6075 }
6076 case Instruction::UDiv:
6077 case Instruction::SDiv:
6078 case Instruction::URem:
6079 case Instruction::SRem:
6080 if (VF.isVector() && isPredicatedInst(I)) {
6081 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6082 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6083 ScalarCost : SafeDivisorCost;
6084 }
6085 // We've proven all lanes safe to speculate, fall through.
6086 [[fallthrough]];
6087 case Instruction::Add:
6088 case Instruction::Sub: {
6089 auto Info = Legal->getHistogramInfo(I);
6090 if (Info && VF.isVector()) {
6091 const HistogramInfo *HGram = Info.value();
6092 // Assume that a non-constant update value (or a constant != 1) requires
6093 // a multiply, and add that into the cost.
6095 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6096 if (!RHS || RHS->getZExtValue() != 1)
6097 MulCost =
6098 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6099
6100 // Find the cost of the histogram operation itself.
6101 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6102 Type *ScalarTy = I->getType();
6103 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6104 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6105 Type::getVoidTy(I->getContext()),
6106 {PtrTy, ScalarTy, MaskTy});
6107
6108 // Add the costs together with the add/sub operation.
6109 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6110 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6111 }
6112 [[fallthrough]];
6113 }
6114 case Instruction::FAdd:
6115 case Instruction::FSub:
6116 case Instruction::Mul:
6117 case Instruction::FMul:
6118 case Instruction::FDiv:
6119 case Instruction::FRem:
6120 case Instruction::Shl:
6121 case Instruction::LShr:
6122 case Instruction::AShr:
6123 case Instruction::And:
6124 case Instruction::Or:
6125 case Instruction::Xor: {
6126 // If we're speculating on the stride being 1, the multiplication may
6127 // fold away. We can generalize this for all operations using the notion
6128 // of neutral elements. (TODO)
6129 if (I->getOpcode() == Instruction::Mul &&
6130 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6131 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6132 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6133 PSE.getSCEV(I->getOperand(1))->isOne())))
6134 return 0;
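// Illustrative example (hypothetical): if SCEV predicates pin a
// loop-invariant %stride to 1, an index computation like
//   %offset = mul i64 %i, %stride
// folds to %i once the loop is versioned, so the multiply is costed as free.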
6135
6136 // Detect reduction patterns
6137 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6138 return *RedCost;
6139
6140 // Certain instructions can be cheaper to vectorize if they have a constant
6141 // second vector operand. One example of this is shifts on x86.
6142 Value *Op2 = I->getOperand(1);
6143 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
6144 PSE.getSE()->isSCEVable(Op2->getType()) &&
6145 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6146 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6147 }
6148 auto Op2Info = TTI.getOperandInfo(Op2);
6149 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6150 Legal->isInvariant(Op2))
6151 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6152
6153 SmallVector<const Value *, 4> Operands(I->operand_values());
6154 return TTI.getArithmeticInstrCost(
6155 I->getOpcode(), VectorTy, CostKind,
6156 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6157 Op2Info, Operands, I, TLI);
6158 }
6159 case Instruction::FNeg: {
6160 return TTI.getArithmeticInstrCost(
6161 I->getOpcode(), VectorTy, CostKind,
6162 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6163 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6164 I->getOperand(0), I);
6165 }
6166 case Instruction::Select: {
6167 SelectInst *SI = cast<SelectInst>(I);
6168 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6169 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6170
6171 const Value *Op0, *Op1;
6172 using namespace llvm::PatternMatch;
6173 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6174 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6175 // select x, y, false --> x & y
6176 // select x, true, y --> x | y
6177 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6178 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6179 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6180 Op1->getType()->getScalarSizeInBits() == 1);
6181
6182 return TTI.getArithmeticInstrCost(
6183 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
6184 VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
6185 }
6186
6187 Type *CondTy = SI->getCondition()->getType();
6188 if (!ScalarCond)
6189 CondTy = VectorType::get(CondTy, VF);
6190
6191 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6192 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6193 Pred = Cmp->getPredicate();
6194 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6195 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6196 {TTI::OK_AnyValue, TTI::OP_None}, I);
6197 }
6198 case Instruction::ICmp:
6199 case Instruction::FCmp: {
6200 Type *ValTy = I->getOperand(0)->getType();
6201
6202 if (canTruncateToMinimalBitwidth(I, VF)) {
6203 [[maybe_unused]] Instruction *Op0AsInstruction =
6204 dyn_cast<Instruction>(I->getOperand(0));
6205 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6206 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6207 "if both the operand and the compare are marked for "
6208 "truncation, they must have the same bitwidth");
6209 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6210 }
6211
6212 VectorTy = toVectorTy(ValTy, VF);
6213 return TTI.getCmpSelInstrCost(
6214 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6215 cast<CmpInst>(I)->getPredicate(), CostKind,
6216 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6217 }
6218 case Instruction::Store:
6219 case Instruction::Load: {
6220 ElementCount Width = VF;
6221 if (Width.isVector()) {
6222 InstWidening Decision = getWideningDecision(I, Width);
6223 assert(Decision != CM_Unknown &&
6224 "CM decision should be taken at this point");
6227 if (Decision == CM_Scalarize)
6228 Width = ElementCount::getFixed(1);
6229 }
6230 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6231 return getMemoryInstructionCost(I, VF);
6232 }
6233 case Instruction::BitCast:
6234 if (I->getType()->isPointerTy())
6235 return 0;
6236 [[fallthrough]];
6237 case Instruction::ZExt:
6238 case Instruction::SExt:
6239 case Instruction::FPToUI:
6240 case Instruction::FPToSI:
6241 case Instruction::FPExt:
6242 case Instruction::PtrToInt:
6243 case Instruction::IntToPtr:
6244 case Instruction::SIToFP:
6245 case Instruction::UIToFP:
6246 case Instruction::Trunc:
6247 case Instruction::FPTrunc: {
6248 // Computes the CastContextHint from a Load/Store instruction.
6249 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6251 "Expected a load or a store!");
6252
6253 if (VF.isScalar() || !TheLoop->contains(I))
6254 return TTI::CastContextHint::Normal;
6255
6256 switch (getWideningDecision(I, VF)) {
6268 llvm_unreachable("Instr did not go through cost modelling?");
6271 llvm_unreachable_internal("Instr has invalid widening decision");
6272 }
6273
6274 llvm_unreachable("Unhandled case!");
6275 };
6276
6277 unsigned Opcode = I->getOpcode();
6278 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6279 // For Trunc, the context is the only user, which must be a StoreInst.
6280 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6281 if (I->hasOneUse())
6282 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6283 CCH = ComputeCCH(Store);
6284 }
6285 // For Z/Sext, the context is the operand, which must be a LoadInst.
6286 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6287 Opcode == Instruction::FPExt) {
6288 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6289 CCH = ComputeCCH(Load);
6290 }
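// Illustrative example (hypothetical IR): for
//   %v = load i16, ptr %p
//   %w = sext i16 %v to i32
// the extend's cast-context hint is taken from the load's widening decision;
// e.g. a reverse consecutive access would give a Reversed hint, which may
// change the cast cost reported by the target.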
6291
6292 // We optimize the truncation of induction variables having constant
6293 // integer steps. The cost of these truncations is the same as the scalar
6294 // operation.
6295 if (isOptimizableIVTruncate(I, VF)) {
6296 auto *Trunc = cast<TruncInst>(I);
6297 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6298 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6299 }
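// Illustrative example (hypothetical IR): for an induction %iv with a
// constant step, a use such as
//   %t = trunc i64 %iv to i32
// can be generated directly as a narrower induction, so only the scalar
// trunc cost is charged above.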
6300
6301 // Detect reduction patterns
6302 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6303 return *RedCost;
6304
6305 Type *SrcScalarTy = I->getOperand(0)->getType();
6306 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6307 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6308 SrcScalarTy =
6309 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6310 Type *SrcVecTy =
6311 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6312
6313 if (canTruncateToMinimalBitwidth(I, VF)) {
6314 // If the result type is <= the source type, there will be no extend
6315 // after truncating the users to the minimal required bitwidth.
6316 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6317 (I->getOpcode() == Instruction::ZExt ||
6318 I->getOpcode() == Instruction::SExt))
6319 return 0;
6320 }
6321
6322 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6323 }
6324 case Instruction::Call:
6325 return getVectorCallCost(cast<CallInst>(I), VF);
6326 case Instruction::ExtractValue:
6327 return TTI.getInstructionCost(I, CostKind);
6328 case Instruction::Alloca:
6329 // We cannot easily widen alloca to a scalable alloca, as
6330 // the result would need to be a vector of pointers.
6331 if (VF.isScalable())
6332 return InstructionCost::getInvalid();
6333 [[fallthrough]];
6334 default:
6335 // This opcode is unknown. Assume that it is the same as 'mul'.
6336 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6337 } // end of switch.
6338}
6339
6340void LoopVectorizationCostModel::collectValuesToIgnore() {
6341 // Ignore ephemeral values.
6342 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6343
6344 SmallVector<Value *, 4> DeadInterleavePointerOps;
6345 SmallVector<Value *, 4> DeadOps;
6346
6347 // If a scalar epilogue is required, users outside the loop won't use
6348 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6349 // that is the case.
6350 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6351 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6352 return RequiresScalarEpilogue &&
6353 !TheLoop->contains(cast<Instruction>(U)->getParent());
6354 };
6355
6356 LoopBlocksDFS DFS(TheLoop);
6357 DFS.perform(LI);
6358 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6359 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6360 for (Instruction &I : reverse(*BB)) {
6362 // Find all stores to invariant variables. Since they are going to be sunk
6363 // outside the loop, we do not need to calculate their cost.
6363 StoreInst *SI;
6364 if ((SI = dyn_cast<StoreInst>(&I)) &&
6365 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6366 ValuesToIgnore.insert(&I);
6367 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6368 SI->getValueOperand());
6369 }
6370
6371 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6372 continue;
6373
6374 // Add instructions that would be trivially dead and are only used by
6375 // values already ignored to DeadOps to seed worklist.
6376 if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6377 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6378 return VecValuesToIgnore.contains(U) ||
6379 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6380 }))
6381 DeadOps.push_back(&I);
6382
6383 // For interleave groups, we only create a pointer for the start of the
6384 // interleave group. Queue up addresses of group members except the insert
6385 // position for further processing.
6386 if (isAccessInterleaved(&I)) {
6387 auto *Group = getInterleavedAccessGroup(&I);
6388 if (Group->getInsertPos() == &I)
6389 continue;
6390 Value *PointerOp = getLoadStorePointerOperand(&I);
6391 DeadInterleavePointerOps.push_back(PointerOp);
6392 }
6393
6394 // Queue branches for analysis. They are dead if their successors only
6395 // contain dead instructions.
6396 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6397 if (Br->isConditional())
6398 DeadOps.push_back(&I);
6399 }
6400 }
6401
6402 // Mark ops feeding interleave group members as free, if they are only used
6403 // by other dead computations.
6404 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6405 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6406 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6407 Instruction *UI = cast<Instruction>(U);
6408 return !VecValuesToIgnore.contains(U) &&
6409 (!isAccessInterleaved(UI) ||
6410 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6411 }))
6412 continue;
6413 VecValuesToIgnore.insert(Op);
6414 append_range(DeadInterleavePointerOps, Op->operands());
6415 }
6416
6417 for (const auto &[_, Ops] : DeadInvariantStoreOps)
6418 llvm::append_range(DeadOps, drop_end(Ops));
6419
6420 // Mark ops that would be trivially dead and are only used by ignored
6421 // instructions as free.
6422 BasicBlock *Header = TheLoop->getHeader();
6423
6424 // Returns true if the block contains only dead instructions. Such blocks will
6425 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6426 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6427 auto IsEmptyBlock = [this](BasicBlock *BB) {
6428 return all_of(*BB, [this](Instruction &I) {
6429 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6430 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6431 });
6432 };
6433 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6434 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6435
6436 // Check if the branch should be considered dead.
6437 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6438 BasicBlock *ThenBB = Br->getSuccessor(0);
6439 BasicBlock *ElseBB = Br->getSuccessor(1);
6440 // Don't consider branches leaving the loop for simplification.
6441 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6442 continue;
6443 bool ThenEmpty = IsEmptyBlock(ThenBB);
6444 bool ElseEmpty = IsEmptyBlock(ElseBB);
6445 if ((ThenEmpty && ElseEmpty) ||
6446 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6447 ElseBB->phis().empty()) ||
6448 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6449 ThenBB->phis().empty())) {
6450 VecValuesToIgnore.insert(Br);
6451 DeadOps.push_back(Br->getCondition());
6452 }
6453 continue;
6454 }
6455
6456 // Skip any op that shouldn't be considered dead.
6457 if (!Op || !TheLoop->contains(Op) ||
6458 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6459 !wouldInstructionBeTriviallyDead(Op, TLI) ||
6460 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6461 return !VecValuesToIgnore.contains(U) &&
6462 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6463 }))
6464 continue;
6465
6466 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6467 // which applies to both scalar and vector versions. Otherwise it is only
6468 // dead in vector versions, so only add it to VecValuesToIgnore.
6469 if (all_of(Op->users(),
6470 [this](User *U) { return ValuesToIgnore.contains(U); }))
6471 ValuesToIgnore.insert(Op);
6472
6473 VecValuesToIgnore.insert(Op);
6474 append_range(DeadOps, Op->operands());
6475 }
6476
6477 // Ignore type-promoting instructions we identified during reduction
6478 // detection.
6479 for (const auto &Reduction : Legal->getReductionVars()) {
6480 const RecurrenceDescriptor &RedDes = Reduction.second;
6481 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6482 VecValuesToIgnore.insert_range(Casts);
6483 }
6484 // Ignore type-casting instructions we identified during induction
6485 // detection.
6486 for (const auto &Induction : Legal->getInductionVars()) {
6487 const InductionDescriptor &IndDes = Induction.second;
6488 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6489 VecValuesToIgnore.insert_range(Casts);
6490 }
6491}
6492
6493void LoopVectorizationCostModel::collectInLoopReductions() {
6494 // Avoid duplicating the work of finding in-loop reductions.
6495 if (!InLoopReductions.empty())
6496 return;
6497
6498 for (const auto &Reduction : Legal->getReductionVars()) {
6499 PHINode *Phi = Reduction.first;
6500 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6501
6502 // We don't collect reductions that are type promoted (yet).
6503 if (RdxDesc.getRecurrenceType() != Phi->getType())
6504 continue;
6505
6506 // If the target would prefer this reduction to happen "in-loop", then we
6507 // want to record it as such.
6508 RecurKind Kind = RdxDesc.getRecurrenceKind();
6509 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6510 !TTI.preferInLoopReduction(Kind, Phi->getType()))
6511 continue;
6512
6513 // Check that we can correctly put the reductions into the loop, by
6514 // finding the chain of operations that leads from the phi to the loop
6515 // exit value.
6516 SmallVector<Instruction *, 4> ReductionOperations =
6517 RdxDesc.getReductionOpChain(Phi, TheLoop);
6518 bool InLoop = !ReductionOperations.empty();
6519
6520 if (InLoop) {
6521 InLoopReductions.insert(Phi);
6522 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6523 Instruction *LastChain = Phi;
6524 for (auto *I : ReductionOperations) {
6525 InLoopReductionImmediateChains[I] = LastChain;
6526 LastChain = I;
6527 }
6528 }
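// Illustrative example (hypothetical IR): for a chain like
//   %red = phi i32 [ 0, %ph ], [ %add2, %latch ]
//   %add1 = add i32 %red, %a
//   %add2 = add i32 %add1, %b
// the map records %add1 -> %red and %add2 -> %add1, linking each reduction
// operation to its predecessor in the in-loop chain.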
6529 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6530 << " reduction for phi: " << *Phi << "\n");
6531 }
6532}
6533
6534// This function will select a scalable VF if the target supports scalable
6535// vectors and a fixed one otherwise.
6536// TODO: we could return a pair of values that specify the max VF and
6537// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6538// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6539// doesn't have a cost model that can choose which plan to execute if
6540// more than one is generated.
6541static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6542 LoopVectorizationCostModel &CM) {
6543 unsigned WidestType;
6544 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6545
6546 TargetTransformInfo::RegisterKind RegKind =
6547 TTI.enableScalableVectorization()
6548 ? TargetTransformInfo::RGK_ScalableVector
6549 : TargetTransformInfo::RGK_FixedWidthVector;
6550
6551 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6552 unsigned N = RegSize.getKnownMinValue() / WidestType;
6553 return ElementCount::get(N, RegSize.isScalable());
6554}
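// Illustrative example (hypothetical numbers): with 512-bit fixed-width
// registers and a widest element type of 32 bits this returns
// ElementCount::getFixed(16); with scalable registers of known minimum size
// 128 bits it would return ElementCount::getScalable(4).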
6555
6558 ElementCount VF = UserVF;
6559 // Outer loop handling: They may require CFG and instruction level
6560 // transformations before even evaluating whether vectorization is profitable.
6561 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6562 // the vectorization pipeline.
6563 if (!OrigLoop->isInnermost()) {
6564 // If the user doesn't provide a vectorization factor, determine a
6565 // reasonable one.
6566 if (UserVF.isZero()) {
6567 VF = determineVPlanVF(TTI, CM);
6568 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6569
6570 // Make sure we have a VF > 1 for stress testing.
6571 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6572 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6573 << "overriding computed VF.\n");
6574 VF = ElementCount::getFixed(4);
6575 }
6576 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6578 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6579 << "not supported by the target.\n");
6581 "Scalable vectorization requested but not supported by the target",
6582 "the scalable user-specified vectorization width for outer-loop "
6583 "vectorization cannot be used because the target does not support "
6584 "scalable vectors.",
6585 "ScalableVFUnfeasible", ORE, OrigLoop);
6587 }
6588 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6590 "VF needs to be a power of two");
6591 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6592 << "VF " << VF << " to build VPlans.\n");
6593 buildVPlans(VF, VF);
6594
6595 if (VPlans.empty())
6596 return VectorizationFactor::Disabled();
6597
6598 // For VPlan build stress testing, we bail out after VPlan construction.
6599 if (VPlanBuildStressTest)
6600 return VectorizationFactor::Disabled();
6601
6602 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6603 }
6604
6605 LLVM_DEBUG(
6606 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6607 "VPlan-native path.\n");
6609}
6610
6611void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6612 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6613 CM.collectValuesToIgnore();
6614 CM.collectElementTypesForWidening();
6615
6616 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6617 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6618 return;
6619
6620 // Invalidate interleave groups if all blocks of loop will be predicated.
6621 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6623 LLVM_DEBUG(
6624 dbgs()
6625 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6626 "which requires masked-interleaved support.\n");
6627 if (CM.InterleaveInfo.invalidateGroups())
6628 // Invalidating interleave groups also requires invalidating all decisions
6629 // based on them, which includes widening decisions and uniform and scalar
6630 // values.
6631 CM.invalidateCostModelingDecisions();
6632 }
6633
6634 if (CM.foldTailByMasking())
6635 Legal->prepareToFoldTailByMasking();
6636
6637 ElementCount MaxUserVF =
6638 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6639 if (UserVF) {
6640 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
6642 "UserVF ignored because it may be larger than the maximal safe VF",
6643 "InvalidUserVF", ORE, OrigLoop);
6644 } else {
6646 "VF needs to be a power of two");
6647 // Collect the instructions (and their associated costs) that will be more
6648 // profitable to scalarize.
6649 CM.collectInLoopReductions();
6650 if (CM.selectUserVectorizationFactor(UserVF)) {
6651 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6652 buildVPlansWithVPRecipes(UserVF, UserVF);
6654 return;
6655 }
6656 reportVectorizationInfo("UserVF ignored because of invalid costs.",
6657 "InvalidCost", ORE, OrigLoop);
6658 }
6659 }
6660
6661 // Collect the Vectorization Factor Candidates.
6662 SmallVector<ElementCount> VFCandidates;
6663 for (auto VF = ElementCount::getFixed(1);
6664 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6665 VFCandidates.push_back(VF);
6666 for (auto VF = ElementCount::getScalable(1);
6667 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6668 VFCandidates.push_back(VF);
6669
6670 CM.collectInLoopReductions();
6671 for (const auto &VF : VFCandidates) {
6672 // Collect Uniform and Scalar instructions after vectorization with VF.
6673 CM.collectNonVectorizedAndSetWideningDecisions(VF);
6674 }
6675
6676 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6677 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6678
6680}
6681
6683 ElementCount VF) const {
6684 InstructionCost Cost = CM.getInstructionCost(UI, VF);
6685 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6687 return Cost;
6688}
6689
6691 ElementCount VF) const {
6692 return CM.isUniformAfterVectorization(I, VF);
6693}
6694
6695bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6696 return CM.ValuesToIgnore.contains(UI) ||
6697 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6698 SkipCostComputation.contains(UI);
6699}
6700
6702LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6703 VPCostContext &CostCtx) const {
6705 // Cost modeling for inductions is inaccurate in the legacy cost model
6706 // compared to the recipes that are generated. To keep results matching here
6707 // initially during VPlan cost-model bring-up, directly use the induction costs
6708 // from the legacy cost model. Note that we do this as pre-processing; the VPlan may not have
6709 // any recipes associated with the original induction increment instruction
6710 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6711 // the cost of induction phis and increments (both that are represented by
6712 // recipes and those that are not), to avoid distinguishing between them here,
6713 // and skip all recipes that represent induction phis and increments (the
6714 // former case) later on, if they exist, to avoid counting them twice.
6715 // Similarly we pre-compute the cost of any optimized truncates.
6716 // TODO: Switch to more accurate costing based on VPlan.
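 // Illustrative example (hypothetical IR): for an induction
 //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
 //   %iv.next = add nuw i64 %iv, 1
 // both the phi and the increment (plus any optimizable trunc of %iv) are
 // costed here with the legacy model and added to SkipCostComputation so the
 // VPlan-based costing below does not count them a second time.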
6717 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6718 Instruction *IVInc = cast<Instruction>(
6719 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6720 SmallVector<Instruction *> IVInsts = {IVInc};
6721 for (unsigned I = 0; I != IVInsts.size(); I++) {
6722 for (Value *Op : IVInsts[I]->operands()) {
6723 auto *OpI = dyn_cast<Instruction>(Op);
6724 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
6725 continue;
6726 IVInsts.push_back(OpI);
6727 }
6728 }
6729 IVInsts.push_back(IV);
6730 for (User *U : IV->users()) {
6731 auto *CI = cast<Instruction>(U);
6732 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6733 continue;
6734 IVInsts.push_back(CI);
6735 }
6736
6737 // If the vector loop gets executed exactly once with the given VF, ignore
6738 // the costs of comparison and induction instructions, as they'll get
6739 // simplified away.
6740 // TODO: Remove this code after stepping away from the legacy cost model and
6741 // adding code to simplify VPlans before calculating their costs.
6742 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6743 if (TC == VF && !CM.foldTailByMasking())
6744 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
6745 CostCtx.SkipCostComputation);
6746
6747 for (Instruction *IVInst : IVInsts) {
6748 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
6749 continue;
6750 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6751 LLVM_DEBUG({
6752 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6753 << ": induction instruction " << *IVInst << "\n";
6754 });
6755 Cost += InductionCost;
6756 CostCtx.SkipCostComputation.insert(IVInst);
6757 }
6758 }
6759
6760 /// Compute the cost of all exiting conditions of the loop using the legacy
6761 /// cost model. This is to match the legacy behavior, which adds the cost of
6762 /// all exit conditions. Note that this over-estimates the cost, as there will
6763 /// be a single condition to control the vector loop.
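 // Illustrative example (hypothetical): a loop with two exiting blocks, say a
 // latch compare plus an early-exit compare, has both compares (and any
 // instructions feeding only them) costed here, even though the vector loop
 // ends up controlled by a single back-edge condition.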
6764 SmallVector<BasicBlock *, 4> Exiting;
6765 CM.TheLoop->getExitingBlocks(Exiting);
6766 SetVector<Instruction *> ExitInstrs;
6767 // Collect all exit conditions.
6768 for (BasicBlock *EB : Exiting) {
6769 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
6770 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
6771 continue;
6772 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
6773 ExitInstrs.insert(CondI);
6774 }
6775 }
6776 // Compute the cost of all instructions only feeding the exit conditions.
6777 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6778 Instruction *CondI = ExitInstrs[I];
6779 if (!OrigLoop->contains(CondI) ||
6780 !CostCtx.SkipCostComputation.insert(CondI).second)
6781 continue;
6782 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
6783 LLVM_DEBUG({
6784 dbgs() << "Cost of " << CondICost << " for VF " << VF
6785 << ": exit condition instruction " << *CondI << "\n";
6786 });
6787 Cost += CondICost;
6788 for (Value *Op : CondI->operands()) {
6789 auto *OpI = dyn_cast<Instruction>(Op);
6790 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
6791 any_of(OpI->users(), [&ExitInstrs, this](User *U) {
6792 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
6793 !ExitInstrs.contains(cast<Instruction>(U));
6794 }))
6795 continue;
6796 ExitInstrs.insert(OpI);
6797 }
6798 }
6799
6800 // Pre-compute the costs for branches except for the backedge, as the number
6801 // of replicate regions in a VPlan may not directly match the number of
6802 // branches, which would lead to different decisions.
6803 // TODO: Compute cost of branches for each replicate region in the VPlan,
6804 // which is more accurate than the legacy cost model.
6805 for (BasicBlock *BB : OrigLoop->blocks()) {
6806 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
6807 continue;
6808 CostCtx.SkipCostComputation.insert(BB->getTerminator());
6809 if (BB == OrigLoop->getLoopLatch())
6810 continue;
6811 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
6812 Cost += BranchCost;
6813 }
6814
6815 // Pre-compute costs for instructions that are forced-scalar or profitable to
6816 // scalarize. Their costs will be computed separately in the legacy cost
6817 // model.
6818 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
6819 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
6820 continue;
6821 CostCtx.SkipCostComputation.insert(ForcedScalar);
6822 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
6823 LLVM_DEBUG({
6824 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
6825 << ": forced scalar " << *ForcedScalar << "\n";
6826 });
6827 Cost += ForcedCost;
6828 }
6829 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
6830 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
6831 continue;
6832 CostCtx.SkipCostComputation.insert(Scalarized);
6833 LLVM_DEBUG({
6834 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
6835 << ": profitable to scalarize " << *Scalarized << "\n";
6836 });
6837 Cost += ScalarCost;
6838 }
6839
6840 return Cost;
6841}
6842
6843InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6844 ElementCount VF) const {
6845 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
6846 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
6847
6848 // Now compute and add the VPlan-based cost.
6849 Cost += Plan.cost(VF, CostCtx);
6850#ifndef NDEBUG
6851 unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
6852 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
6853 << " (Estimated cost per lane: ");
6854 if (Cost.isValid()) {
6855 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
6856 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
6857 } else /* No point dividing an invalid cost - it will still be invalid */
6858 LLVM_DEBUG(dbgs() << "Invalid");
6859 LLVM_DEBUG(dbgs() << ")\n");
6860#endif
6861 return Cost;
6862}
6863
6864#ifndef NDEBUG
6865/// Return true if the original loop \p TheLoop contains any instructions that do
6866/// not have corresponding recipes in \p Plan and are not marked to be ignored
6867/// in \p CostCtx. This means the VPlan contains simplification that the legacy
6868/// cost-model did not account for.
6870 VPCostContext &CostCtx,
6871 Loop *TheLoop,
6872 ElementCount VF) {
6873 // First collect all instructions for the recipes in Plan.
6874 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
6875 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
6876 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
6877 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
6878 return &WidenMem->getIngredient();
6879 return nullptr;
6880 };
6881
6882 DenseSet<Instruction *> SeenInstrs;
6883 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
6885 for (VPRecipeBase &R : *VPBB) {
6886 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
6887 auto *IG = IR->getInterleaveGroup();
6888 unsigned NumMembers = IG->getNumMembers();
6889 for (unsigned I = 0; I != NumMembers; ++I) {
6890 if (Instruction *M = IG->getMember(I))
6891 SeenInstrs.insert(M);
6892 }
6893 continue;
6894 }
6895 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
6896 // cost model won't cost them, whereas the legacy cost model will.
6897 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
6898 if (none_of(FOR->users(), [](VPUser *U) {
6899 auto *VPI = dyn_cast<VPInstruction>(U);
6900 return VPI && VPI->getOpcode() ==
6901 VPInstruction::FirstOrderRecurrenceSplice;
6902 }))
6903 return true;
6904 }
6905 // The VPlan-based cost model is more accurate for partial reduction and
6906 // comparing against the legacy cost isn't desirable.
6908 return true;
6909
6910 /// If a VPlan transform folded a recipe to one producing a single-scalar,
6911 /// but the original instruction wasn't uniform-after-vectorization in the
6912 /// legacy cost model, the legacy cost overestimates the actual cost.
6913 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
6914 if (RepR->isSingleScalar() &&
6916 RepR->getUnderlyingInstr(), VF))
6917 return true;
6918 }
6919 if (Instruction *UI = GetInstructionForCost(&R)) {
6920 // If we adjusted the predicate of the recipe, the cost in the legacy
6921 // cost model may be different.
6922 using namespace VPlanPatternMatch;
6923 CmpPredicate Pred;
6924 if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
6925 cast<VPRecipeWithIRFlags>(R).getPredicate() !=
6926 cast<CmpInst>(UI)->getPredicate())
6927 return true;
6928 SeenInstrs.insert(UI);
6929 }
6930 }
6931 }
6932
6933 // Return true if the loop contains any instructions that are not also part of
6934 // the VPlan or are skipped for VPlan-based cost computations. This indicates
6935 // that the VPlan contains extra simplifications.
6936 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
6937 TheLoop](BasicBlock *BB) {
6938 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
6939 // Skip induction phis when checking for simplifications, as they may not
6940 // be lowered directly to a corresponding PHI recipe.
6941 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
6942 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
6943 return false;
6944 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
6945 });
6946 });
6947}
6948#endif
6949
6950VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
6951 if (VPlans.empty())
6952 return VectorizationFactor::Disabled();
6953 // If there is a single VPlan with a single VF, return it directly.
6954 VPlan &FirstPlan = *VPlans[0];
6955 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
6956 return {*FirstPlan.vectorFactors().begin(), 0, 0};
6957
6958 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
6959 << (CM.CostKind == TTI::TCK_RecipThroughput
6960 ? "Reciprocal Throughput\n"
6961 : CM.CostKind == TTI::TCK_Latency
6962 ? "Instruction Latency\n"
6963 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
6964 : CM.CostKind == TTI::TCK_SizeAndLatency
6965 ? "Code Size and Latency\n"
6966 : "Unknown\n"));
6967
6968 ElementCount ScalarVF = ElementCount::getFixed(1);
6969 assert(hasPlanWithVF(ScalarVF) &&
6970 "More than a single plan/VF w/o any plan having scalar VF");
6971
6972 // TODO: Compute scalar cost using VPlan-based cost model.
6973 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
6974 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
6975 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
6976 VectorizationFactor BestFactor = ScalarFactor;
6977
6978 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
6979 if (ForceVectorization) {
6980 // Ignore scalar width, because the user explicitly wants vectorization.
6981 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
6982 // evaluation.
6983 BestFactor.Cost = InstructionCost::getMax();
6984 }
6985
6986 for (auto &P : VPlans) {
6987 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
6988 P->vectorFactors().end());
6989
6991 if (any_of(VFs, [this](ElementCount VF) {
6992 return CM.shouldConsiderRegPressureForVF(VF);
6993 }))
6994 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
6995
6996 for (unsigned I = 0; I < VFs.size(); I++) {
6997 ElementCount VF = VFs[I];
6998 if (VF.isScalar())
6999 continue;
7000 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7001 LLVM_DEBUG(
7002 dbgs()
7003 << "LV: Not considering vector loop of width " << VF
7004 << " because it will not generate any vector instructions.\n");
7005 continue;
7006 }
7007 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7008 LLVM_DEBUG(
7009 dbgs()
7010 << "LV: Not considering vector loop of width " << VF
7011 << " because it would cause replicated blocks to be generated,"
7012 << " which isn't allowed when optimizing for size.\n");
7013 continue;
7014 }
7015
7016 InstructionCost Cost = cost(*P, VF);
7017 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7018
7019 if (CM.shouldConsiderRegPressureForVF(VF) &&
7020 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
7021 LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7022 << VF << " because it uses too many registers\n");
7023 continue;
7024 }
7025
7026 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7027 BestFactor = CurrentFactor;
7028
7029 // If profitable add it to ProfitableVF list.
7030 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
7031 ProfitableVFs.push_back(CurrentFactor);
7032 }
7033 }
7034
7035#ifndef NDEBUG
7036 // Select the optimal vectorization factor according to the legacy cost-model.
7037 // This is now only used to verify the decisions by the new VPlan-based
7038 // cost-model and will be retired once the VPlan-based cost-model is
7039 // stabilized.
7040 VectorizationFactor LegacyVF = selectVectorizationFactor();
7041 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7042
7043 // Pre-compute the cost and use it to check if BestPlan contains any
7044 // simplifications not accounted for in the legacy cost model. If that's the
7045 // case, don't trigger the assertion, as the extra simplifications may cause a
7046 // different VF to be picked by the VPlan-based cost model.
7047 VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
7048 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7049 // Verify that the VPlan-based and legacy cost models agree, except for VPlans
7050 // with early exits and plans with additional VPlan simplifications. The
7051 // legacy cost model doesn't properly model costs for such loops.
7052 assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7054 CostCtx, OrigLoop,
7055 BestFactor.Width) ||
7057 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7058 " VPlan cost model and legacy cost model disagreed");
7059 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7060 "when vectorizing, the scalar cost must be computed.");
7061#endif
7062
7063 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7064 return BestFactor;
7065}
7066
7068 using namespace VPlanPatternMatch;
7070 "RdxResult must be ComputeFindIVResult");
7071 VPValue *StartVPV = RdxResult->getOperand(1);
7072 match(StartVPV, m_Freeze(m_VPValue(StartVPV)));
7073 return StartVPV->getLiveInIRValue();
7074}
7075
7076// If \p EpiResumePhiR is the resume VPPhi for a reduction when vectorizing
7077// the epilogue loop, fix the reduction's scalar PHI node by adding the
7078// incoming value from the main vector loop.
7080 VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
7081 // Get the VPInstruction computing the reduction result in the middle block.
7082 // The first operand may not be from the middle block if it is not connected
7083 // to the scalar preheader. In that case, there's nothing to fix.
7084 VPValue *Incoming = EpiResumePhiR->getOperand(0);
7087 auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
7088 if (!EpiRedResult ||
7089 (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
7090 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
7091 EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
7092 return;
7093
7094 auto *EpiRedHeaderPhi =
7095 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7096 RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
7097 Value *MainResumeValue;
7098 if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
7099 assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7100 VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7101 "unexpected start recipe");
7102 MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
7103 } else
7104 MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7106 [[maybe_unused]] Value *StartV =
7107 EpiRedResult->getOperand(1)->getLiveInIRValue();
7108 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7109 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7110 "AnyOf expected to start with ICMP_NE");
7111 assert(Cmp->getOperand(1) == StartV &&
7112 "AnyOf expected to start by comparing main resume value to original "
7113 "start value");
7114 MainResumeValue = Cmp->getOperand(0);
7116 Value *StartV = getStartValueFromReductionResult(EpiRedResult);
7117 Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
7118 using namespace llvm::PatternMatch;
7119 Value *Cmp, *OrigResumeV, *CmpOp;
7120 [[maybe_unused]] bool IsExpectedPattern =
7121 match(MainResumeValue,
7122 m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
7123 m_Value(OrigResumeV))) &&
7125 m_Value(CmpOp))) &&
7126 ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
7127 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7128 MainResumeValue = OrigResumeV;
7129 }
7130 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7131
7132 // When fixing reductions in the epilogue loop we should already have
7133 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7134 // over the incoming values correctly.
7135 EpiResumePhi.setIncomingValueForBlock(
7136 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7137}
7138
7140 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7141 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7142 assert(BestVPlan.hasVF(BestVF) &&
7143 "Trying to execute plan with unsupported VF");
7144 assert(BestVPlan.hasUF(BestUF) &&
7145 "Trying to execute plan with unsupported UF");
7146 if (BestVPlan.hasEarlyExit())
7147 ++LoopsEarlyExitVectorized;
7148 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7149 // cost model is complete for better cost estimates.
7154 bool HasBranchWeights =
7155 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
7156 if (HasBranchWeights) {
7157 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7159 BestVPlan, BestVF, VScale);
7160 }
7161
7162 // Checks are the same for all VPlans, added to BestVPlan only for
7163 // compactness.
7164 attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
7165
7166 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7167 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
7168
7169 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7173 BestVPlan, BestVF,
7174 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
7175 VPlanTransforms::cse(BestVPlan);
7177
7179 // Regions are dissolved after optimizing for VF and UF, which completely
7180 // removes unneeded loop regions first.
7182 // Canonicalize EVL loops after regions are dissolved.
7186 BestVPlan, VectorPH, CM.foldTailByMasking(),
7187 CM.requiresScalarEpilogue(BestVF.isVector()));
7188 VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF);
7190
7191 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7192 // making any changes to the CFG.
7193 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7194 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
7195 if (!ILV.getTripCount())
7196 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7197 else
7198 assert(VectorizingEpilogue && "should only re-use the existing trip "
7199 "count during epilogue vectorization");
7200
7201 // Perform the actual loop transformation.
7202 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7203 OrigLoop->getParentLoop(),
7204 Legal->getWidestInductionType());
7205
7206#ifdef EXPENSIVE_CHECKS
7207 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7208#endif
7209
7210 // 1. Set up the skeleton for vectorization, including vector pre-header and
7211 // middle block. The vector loop is created during VPlan execution.
7212 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7214 State.CFG.PrevBB->getSingleSuccessor());
7216
7217 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7218 "final VPlan is invalid");
7219
7220 // After vectorization, the exit blocks of the original loop will have
7221 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7222 // looked through single-entry phis.
7223 ScalarEvolution &SE = *PSE.getSE();
7224 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7225 if (!Exit->hasPredecessors())
7226 continue;
7227 for (VPRecipeBase &PhiR : Exit->phis())
7229 OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction()));
7230 }
7231 // Forget the original loop and block dispositions.
7232 SE.forgetLoop(OrigLoop);
7234
7236
7237 //===------------------------------------------------===//
7238 //
7239 // Notice: any optimization or new instruction that goes
7240 // into the code below should also be implemented in
7241 // the cost-model.
7242 //
7243 //===------------------------------------------------===//
7244
7245 BestVPlan.execute(&State);
7246
7247 // 2.6. Maintain Loop Hints
7248 // Keep all loop hints from the original loop on the vector loop (we'll
7249 // replace the vectorizer-specific hints below).
7250 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
7251 // Add metadata to disable runtime unrolling a scalar loop when there
7252 // are no runtime checks about strides and memory. A scalar loop that is
7253 // rarely used is not worth unrolling.
7254 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7256 HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
7257 : nullptr,
7258 HeaderVPBB, VectorizingEpilogue,
7259 estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
7260 DisableRuntimeUnroll);
7261
7262 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7263 // predication, updating analyses.
7264 ILV.fixVectorizedLoop(State);
7265
7267
7268 return ExpandedSCEVs;
7269}
7270
7271//===--------------------------------------------------------------------===//
7272// EpilogueVectorizerMainLoop
7273//===--------------------------------------------------------------------===//
7274
7275/// This function is partially responsible for generating the control flow
7276/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7278 BasicBlock *ScalarPH = createScalarPreheader("");
7279 BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
7280
7281 // Generate the code to check the minimum iteration count of the vector
7282 // epilogue (see below).
7283 EPI.EpilogueIterationCountCheck =
7284 emitIterationCountCheck(VectorPH, ScalarPH, true);
7285 EPI.EpilogueIterationCountCheck->setName("iter.check");
7286
7287 VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
7288 ->getSuccessor(1);
7289 // Generate the iteration count check for the main loop, *after* the check
7290 // for the epilogue loop, so that the path-length is shorter for the case
7291 // that goes directly through the vector epilogue. The longer-path length for
7292 // the main loop is compensated for by the gain from vectorizing the larger
7293 // trip count. Note: the branch will get updated later on when we vectorize
7294 // the epilogue.
7295 EPI.MainLoopIterationCountCheck =
7296 emitIterationCountCheck(VectorPH, ScalarPH, false);
7297
7298 return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
7299 ->getSuccessor(1);
7300}
7301
7303 LLVM_DEBUG({
7304 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7305 << "Main Loop VF:" << EPI.MainLoopVF
7306 << ", Main Loop UF:" << EPI.MainLoopUF
7307 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7308 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7309 });
7310}
7311
7314 dbgs() << "intermediate fn:\n"
7315 << *OrigLoop->getHeader()->getParent() << "\n";
7316 });
7317}
7318
7320 BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
7321 assert(Bypass && "Expected valid bypass basic block.");
7324 Value *CheckMinIters = createIterationCountCheck(
7325 VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
7326 ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
7327
7328 BasicBlock *const TCCheckBlock = VectorPH;
7329 if (!ForEpilogue)
7330 TCCheckBlock->setName("vector.main.loop.iter.check");
7331
7332 // Create new preheader for vector loop.
7333 VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7334 static_cast<DominatorTree *>(nullptr), LI, nullptr,
7335 "vector.ph");
7336 if (ForEpilogue) {
7337 // Save the trip count so we don't have to regenerate it in the
7338 // vec.epilog.iter.check. This is safe to do because the trip count
7339 // generated here dominates the vector epilog iter check.
7340 EPI.TripCount = Count;
7341 } else {
7343 }
7344
7345 BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
7346 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7347 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7348 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7349
7350 // When vectorizing the main loop, its trip-count check is placed in a new
7351 // block, whereas the overall trip-count check is placed in the VPlan entry
7352 // block. When vectorizing the epilogue loop, its trip-count check is placed
7353 // in the VPlan entry block.
7354 if (!ForEpilogue)
7355 introduceCheckBlockInVPlan(TCCheckBlock);
7356 return TCCheckBlock;
7357}
7358
7359//===--------------------------------------------------------------------===//
7360// EpilogueVectorizerEpilogueLoop
7361//===--------------------------------------------------------------------===//
7362
7363/// This function is partially responsible for generating the control flow
7364/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7366 BasicBlock *ScalarPH = createScalarPreheader("vec.epilog.");
7367 BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
7368 // Now, compare the remaining count and if there aren't enough iterations to
7369 // execute the vectorized epilogue, skip to the scalar part.
7370 VectorPH->setName("vec.epilog.ph");
7371 BasicBlock *VecEpilogueIterationCountCheck =
7372 SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr,
7373 "vec.epilog.iter.check", true);
7375
7376 emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH,
7377 VecEpilogueIterationCountCheck);
7378 AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7379
7380 // Adjust the control flow taking the state info from the main loop
7381 // vectorization into account.
7382 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7383 "expected this to be saved from the previous pass.");
7384 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7385 VecEpilogueIterationCountCheck, VectorPH);
7386
7387 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7388 VecEpilogueIterationCountCheck, ScalarPH);
7389
7390 // Adjust the terminators of runtime check blocks and phis using them.
7391 BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
7392 BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
7393 if (SCEVCheckBlock)
7394 SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
7395 VecEpilogueIterationCountCheck, ScalarPH);
7396 if (MemCheckBlock)
7397 MemCheckBlock->getTerminator()->replaceUsesOfWith(
7398 VecEpilogueIterationCountCheck, ScalarPH);
7399
7400 DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck);
7401
7402 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7403 // reductions which merge control-flow from the latch block and the middle
7404 // block. Update the incoming values here and move the Phi into the preheader.
7405 SmallVector<PHINode *, 4> PhisInBlock(
7406 llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
7407
7408 for (PHINode *Phi : PhisInBlock) {
7409 Phi->moveBefore(VectorPH->getFirstNonPHIIt());
7410 Phi->replaceIncomingBlockWith(
7411 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7412 VecEpilogueIterationCountCheck);
7413
7414 // If the phi doesn't have an incoming value from the
7415 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7416 // value and also those from other check blocks. This is needed for
7417 // reduction phis only.
7418 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7419 return EPI.EpilogueIterationCountCheck == IncB;
7420 }))
7421 continue;
7422 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7423 if (SCEVCheckBlock)
7424 Phi->removeIncomingValue(SCEVCheckBlock);
7425 if (MemCheckBlock)
7426 Phi->removeIncomingValue(MemCheckBlock);
7427 }
7428
7429 return VectorPH;
7430}
7431
7432BasicBlock *
7434 BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) {
7435
7436 assert(EPI.TripCount &&
7437 "Expected trip count to have been saved in the first pass.");
7438 Value *TC = EPI.TripCount;
7439 IRBuilder<> Builder(Insert->getTerminator());
7440 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7441
7442 // Generate code to check if the loop's trip count is less than VF * UF of the
7443 // vector epilogue loop.
7444 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7447
7448 Value *CheckMinIters =
7449 Builder.CreateICmp(P, Count,
7450 createStepForVF(Builder, Count->getType(),
7451 EPI.EpilogueVF, EPI.EpilogueUF),
7452 "min.epilog.iters.check");
7453
7454 BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
7455 auto VScale = Cost->getVScaleForTuning();
7456 unsigned MainLoopStep =
7457 estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
7458 unsigned EpilogueLoopStep =
7459 estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
7460 // We assume the remaining `Count` is equally distributed in
7461 // [0, MainLoopStep)
7462 // So the probability for `Count < EpilogueLoopStep` should be
7463 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7464 // TODO: Improve the estimate by taking the estimated trip count into
7465 // consideration.
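 // Illustrative example (hypothetical values): MainLoopVF=8 with UF=2 gives
 // MainLoopStep=16 and EpilogueVF=4 with UF=1 gives EpilogueLoopStep=4, so
 // the estimated probability of skipping the epilogue is 4/16 and the branch
 // weights below become {4, 12}.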
7466 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7467 const uint32_t Weights[] = {EstimatedSkipCount,
7468 MainLoopStep - EstimatedSkipCount};
7469 setBranchWeights(BI, Weights, /*IsExpected=*/false);
7470 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7471
7472 // A new entry block has been created for the epilogue VPlan. Hook it in, as
7473 // otherwise we would try to modify the entry to the main vector loop.
7474 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
7475 VPBasicBlock *OldEntry = Plan.getEntry();
7476 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7477 Plan.setEntry(NewEntry);
7478 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7479
7480 return Insert;
7481}
7482
7484 LLVM_DEBUG({
7485 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7486 << "Epilogue Loop VF:" << EPI.EpilogueVF
7487 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7488 });
7489}
7490
7493 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7494 });
7495}
7496
7498VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7499 VFRange &Range) {
7501 "Must be called with either a load or store");
7502
7503 auto WillWiden = [&](ElementCount VF) -> bool {
7505 CM.getWideningDecision(I, VF);
7507 "CM decision should be taken at this point.");
7509 return true;
7510 if (CM.isScalarAfterVectorization(I, VF) ||
7511 CM.isProfitableToScalarize(I, VF))
7512 return false;
7514 };
7515
7517 return nullptr;
7518
7519 VPValue *Mask = nullptr;
7520 if (Legal->isMaskRequired(I))
7521 Mask = getBlockInMask(Builder.getInsertBlock());
7522
7523 // Determine if the pointer operand of the access is either consecutive or
7524 // reverse consecutive.
7526 CM.getWideningDecision(I, Range.Start);
7528 bool Consecutive =
7530
7532 if (Consecutive) {
7534 Ptr->getUnderlyingValue()->stripPointerCasts());
7535 VPSingleDefRecipe *VectorPtr;
7536 if (Reverse) {
7537 // When folding the tail, we may compute an address that we don't compute in
7538 // the original scalar loop, and it may not be inbounds. Drop Inbounds in that
7539 // case.
7540 GEPNoWrapFlags Flags =
7541 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
7543 : GEPNoWrapFlags::inBounds();
7544 VectorPtr =
7546 /*Stride*/ -1, Flags, I->getDebugLoc());
7547 } else {
7548 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7549 GEP ? GEP->getNoWrapFlags()
7551 I->getDebugLoc());
7552 }
7553 Builder.insert(VectorPtr);
7554 Ptr = VectorPtr;
7555 }
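 // Illustrative note (sketch of the intent): for a reverse consecutive
 // access the recipe above yields a pointer to the chunk's last lane,
 // conceptually &p[i - (VF - 1)] when the scalar loop reads p[i] walking
 // downwards; inbounds is dropped under tail folding because masked-off
 // lanes may form addresses the scalar loop never computes.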
7556 if (LoadInst *Load = dyn_cast<LoadInst>(I))
7557 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7558 VPIRMetadata(*Load, LVer), I->getDebugLoc());
7559
7560 StoreInst *Store = cast<StoreInst>(I);
7561 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7562 Reverse, VPIRMetadata(*Store, LVer),
7563 I->getDebugLoc());
7564}
7565
7566/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7567/// insert a recipe to expand the step for the induction recipe.
7570 VPValue *Start, const InductionDescriptor &IndDesc,
7571 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7572 assert(IndDesc.getStartValue() ==
7573 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7574 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7575 "step must be loop invariant");
7576
7577 VPValue *Step =
7579 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
7580 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7581 IndDesc, TruncI,
7582 TruncI->getDebugLoc());
7583 }
7584 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7585 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7586 IndDesc, Phi->getDebugLoc());
7587}
7588
7589VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7591
7592 // Check if this is an integer or fp induction. If so, build the recipe that
7593 // produces its scalar and vector values.
7594 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7595 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
7596 *PSE.getSE(), *OrigLoop);
7597
7598 // Check if this is pointer induction. If so, build the recipe for it.
7599 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7600 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
7601 return new VPWidenPointerInductionRecipe(
7602 Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
7604 [&](ElementCount VF) {
7605 return CM.isScalarAfterVectorization(Phi, VF);
7606 },
7607 Range),
7608 Phi->getDebugLoc());
7609 }
7610 return nullptr;
7611}
7612
7613VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7615 // Optimize the special case where the source is a constant integer
7616 // induction variable. Notice that we can only optimize the 'trunc' case
7617 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7618 // (c) other casts depend on pointer size.
7619
7620 // Determine whether \p K is a truncation based on an induction variable that
7621 // can be optimized.
7622 auto IsOptimizableIVTruncate =
7623 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7624 return [=](ElementCount VF) -> bool {
7625 return CM.isOptimizableIVTruncate(K, VF);
7626 };
7627 };
7628
7630 IsOptimizableIVTruncate(I), Range)) {
7631
7632 auto *Phi = cast<PHINode>(I->getOperand(0));
7633 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
7634 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
7635 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
7636 *OrigLoop);
7637 }
7638 return nullptr;
7639}
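// Shorthand example of the truncation case handled above:
//
//   for (long i = 0; i < n; ++i)
//     a[i] = (int)i;   // trunc of the i64 induction to i32
//
// Rather than widening the i64 induction and truncating every vector value,
// the trunc is folded into the induction recipe so the vector IV is produced
// directly in the narrower type.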
7640
7641VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
7643 VFRange &Range) {
7645 [this, CI](ElementCount VF) {
7646 return CM.isScalarWithPredication(CI, VF);
7647 },
7648 Range);
7649
7650 if (IsPredicated)
7651 return nullptr;
7652
7654 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7655 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7656 ID == Intrinsic::pseudoprobe ||
7657 ID == Intrinsic::experimental_noalias_scope_decl))
7658 return nullptr;
7659
7661
7662 // Is it beneficial to perform intrinsic call compared to lib call?
7663 bool ShouldUseVectorIntrinsic =
7665 [&](ElementCount VF) -> bool {
7666 return CM.getCallWideningDecision(CI, VF).Kind ==
7668 },
7669 Range);
7670 if (ShouldUseVectorIntrinsic)
7671 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
7672 CI->getDebugLoc());
7673
7674 Function *Variant = nullptr;
7675 std::optional<unsigned> MaskPos;
7676   // Is it better to call a vectorized version of the function than to scalarize
7677 // the call?
7678 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7679 [&](ElementCount VF) -> bool {
7680 // The following case may be scalarized depending on the VF.
7681 // The flag shows whether we can use a usual Call for vectorized
7682 // version of the instruction.
7683
7684 // If we've found a variant at a previous VF, then stop looking. A
7685 // vectorized variant of a function expects input in a certain shape
7686 // -- basically the number of input registers, the number of lanes
7687 // per register, and whether there's a mask required.
7688 // We store a pointer to the variant in the VPWidenCallRecipe, so
7689 // once we have an appropriate variant it's only valid for that VF.
7690 // This will force a different vplan to be generated for each VF that
7691 // finds a valid variant.
7692 if (Variant)
7693 return false;
7694 LoopVectorizationCostModel::CallWideningDecision Decision =
7695 CM.getCallWideningDecision(CI, VF);
7697 Variant = Decision.Variant;
7698 MaskPos = Decision.MaskPos;
7699 return true;
7700 }
7701
7702 return false;
7703 },
7704 Range);
7705 if (ShouldUseVectorCall) {
7706 if (MaskPos.has_value()) {
7707 // We have 2 cases that would require a mask:
7708 // 1) The block needs to be predicated, either due to a conditional
7709 // in the scalar loop or use of an active lane mask with
7710 // tail-folding, and we use the appropriate mask for the block.
7711 // 2) No mask is required for the block, but the only available
7712 // vector variant at this VF requires a mask, so we synthesize an
7713 // all-true mask.
7714 VPValue *Mask = nullptr;
7715 if (Legal->isMaskRequired(CI))
7716 Mask = getBlockInMask(Builder.getInsertBlock());
7717 else
7718 Mask = Plan.getOrAddLiveIn(
7720
7721 Ops.insert(Ops.begin() + *MaskPos, Mask);
7722 }
7723
7724 Ops.push_back(Operands.back());
7725 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
7726 }
7727
7728 return nullptr;
7729}
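// A rough sketch of the decision above: for a call recognized as a vector
// intrinsic (e.g. llvm.fabs), the cost model may prefer the intrinsic form
// (llvm.fabs.v4f32 at VF=4), in which case a VPWidenIntrinsicRecipe is built.
// If instead a vector variant of the function is cheaper at some VF, that
// variant (and its mask position, if it requires a mask) is recorded and a
// VPWidenCallRecipe is built; since the variant is only valid for that VF,
// the range is clamped accordingly.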
7730
7731bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7733 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7734 // Instruction should be widened, unless it is scalar after vectorization,
7735 // scalarization is profitable or it is predicated.
7736 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7737 return CM.isScalarAfterVectorization(I, VF) ||
7738 CM.isProfitableToScalarize(I, VF) ||
7739 CM.isScalarWithPredication(I, VF);
7740 };
7742 Range);
7743}
7744
7745VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
7747 switch (I->getOpcode()) {
7748 default:
7749 return nullptr;
7750 case Instruction::SDiv:
7751 case Instruction::UDiv:
7752 case Instruction::SRem:
7753 case Instruction::URem: {
7754 // If not provably safe, use a select to form a safe divisor before widening the
7755 // div/rem operation itself. Otherwise fall through to general handling below.
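    // For example (illustrative shorthand): in a tail-folded loop
    //
    //   for (i = 0; i < n; ++i)
    //     q[i] = a[i] / b[i];
    //
    // inactive lanes may hold a zero or poison divisor, so the widened
    // operation becomes a[i] / select(mask, b[i], 1); inactive lanes divide
    // by 1 and cannot trap, and their results are never used.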
7756 if (CM.isPredicatedInst(I)) {
7758 VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
7759 VPValue *One =
7760 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
7761 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
7762 Ops[1] = SafeRHS;
7763 return new VPWidenRecipe(*I, Ops);
7764 }
7765 [[fallthrough]];
7766 }
7767 case Instruction::Add:
7768 case Instruction::And:
7769 case Instruction::AShr:
7770 case Instruction::FAdd:
7771 case Instruction::FCmp:
7772 case Instruction::FDiv:
7773 case Instruction::FMul:
7774 case Instruction::FNeg:
7775 case Instruction::FRem:
7776 case Instruction::FSub:
7777 case Instruction::ICmp:
7778 case Instruction::LShr:
7779 case Instruction::Mul:
7780 case Instruction::Or:
7781 case Instruction::Select:
7782 case Instruction::Shl:
7783 case Instruction::Sub:
7784 case Instruction::Xor:
7785 case Instruction::Freeze: {
7787 if (Instruction::isBinaryOp(I->getOpcode())) {
7788 // The legacy cost model uses SCEV to check if some of the operands are
7789 // constants. To match the legacy cost model's behavior, use SCEV to try
7790 // to replace operands with constants.
7791 ScalarEvolution &SE = *PSE.getSE();
7792 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
7793 if (!Op->isLiveIn())
7794 return Op;
7795 Value *V = Op->getUnderlyingValue();
7796 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
7797 return Op;
7798 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
7799 if (!C)
7800 return Op;
7801 return Plan.getOrAddLiveIn(C->getValue());
7802 };
7803 // For Mul, the legacy cost model checks both operands.
7804 if (I->getOpcode() == Instruction::Mul)
7805 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
7806 // For other binops, the legacy cost model only checks the second operand.
7807 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
7808 }
7809 return new VPWidenRecipe(*I, NewOps);
7810 }
7811 case Instruction::ExtractValue: {
7813 Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
7814 auto *EVI = cast<ExtractValueInst>(I);
7815 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7816 unsigned Idx = EVI->getIndices()[0];
7817 NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
7818 return new VPWidenRecipe(*I, NewOps);
7819 }
7820 };
7821}
7822
7824VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7826 // FIXME: Support other operations.
7827 unsigned Opcode = HI->Update->getOpcode();
7828 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7829 "Histogram update operation must be an Add or Sub");
7830
7832 // Bucket address.
7833 HGramOps.push_back(Operands[1]);
7834 // Increment value.
7835 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
7836
7837 // In case of predicated execution (due to tail-folding, or conditional
7838 // execution, or both), pass the relevant mask.
7839 if (Legal->isMaskRequired(HI->Store))
7840 HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
7841
7842 return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
7843}
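// Illustrative example of the histogram pattern handled above:
//
//   for (i = 0; i < n; ++i)
//     buckets[indices[i]] += 1;
//
// Operands[1] carries the bucket address computation and the update's second
// operand (the constant 1 here) is the increment; if the store needs a mask,
// it is appended so only active lanes update their buckets.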
7844
7847 VFRange &Range) {
7849 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7850 Range);
7851
7852 bool IsPredicated = CM.isPredicatedInst(I);
7853
7854 // Even if the instruction is not marked as uniform, there are certain
7855 // intrinsic calls that can be effectively treated as such, so we check for
7856 // them here. Conservatively, we only do this for scalable vectors, since
7857 // for fixed-width VFs we can always fall back on full scalarization.
7858 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
7859 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
7860 case Intrinsic::assume:
7861 case Intrinsic::lifetime_start:
7862 case Intrinsic::lifetime_end:
7863       // For scalable vectors, if one of the operands is variant then we still
7864       // want to mark the call as uniform, which will generate one instruction for just
7865 // the first lane of the vector. We can't scalarize the call in the same
7866 // way as for fixed-width vectors because we don't know how many lanes
7867 // there are.
7868 //
7869 // The reasons for doing it this way for scalable vectors are:
7870 // 1. For the assume intrinsic generating the instruction for the first
7871       //    lane is still better than not generating any at all. For
7872 // example, the input may be a splat across all lanes.
7873 // 2. For the lifetime start/end intrinsics the pointer operand only
7874 // does anything useful when the input comes from a stack object,
7875 // which suggests it should always be uniform. For non-stack objects
7876 // the effect is to poison the object, which still allows us to
7877 // remove the call.
7878 IsUniform = true;
7879 break;
7880 default:
7881 break;
7882 }
7883 }
7884 VPValue *BlockInMask = nullptr;
7885 if (!IsPredicated) {
7886     // First handle the case where Instr is not predicated.
7887 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7888 } else {
7889 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7890 // Instructions marked for predication are replicated and a mask operand is
7891 // added initially. Masked replicate recipes will later be placed under an
7892 // if-then construct to prevent side-effects. Generate recipes to compute
7893 // the block mask for this region.
7894 BlockInMask = getBlockInMask(Builder.getInsertBlock());
7895 }
7896
7897 // Note that there is some custom logic to mark some intrinsics as uniform
7898 // manually above for scalable vectors, which this assert needs to account for
7899 // as well.
7900 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
7901 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
7902 "Should not predicate a uniform recipe");
7903 auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
7904 VPIRMetadata(*I, LVer));
7905 return Recipe;
7906}
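// As a simplified example of the predicated replication path: for
//
//   for (i = 0; i < n; ++i)
//     if (c[i])
//       a[b[i]] += 1;   // not widenable here, scalarized per lane
//
// the update is replicated with the block mask as an extra operand, and a
// later transform sinks each masked replicate into an if-then region so the
// scalar instance only runs for active lanes.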
7907
7908/// Find all possible partial reductions in the loop and track all of those that
7909/// are valid so recipes can be formed later.
7911 // Find all possible partial reductions.
7913 PartialReductionChains;
7914 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
7915 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
7916 PartialReductionChains);
7917 }
7918
7919 // A partial reduction is invalid if any of its extends are used by
7920 // something that isn't another partial reduction. This is because the
7921 // extends are intended to be lowered along with the reduction itself.
7922
7923 // Build up a set of partial reduction ops for efficient use checking.
7924 SmallPtrSet<User *, 4> PartialReductionOps;
7925 for (const auto &[PartialRdx, _] : PartialReductionChains)
7926 PartialReductionOps.insert(PartialRdx.ExtendUser);
7927
7928 auto ExtendIsOnlyUsedByPartialReductions =
7929 [&PartialReductionOps](Instruction *Extend) {
7930 return all_of(Extend->users(), [&](const User *U) {
7931 return PartialReductionOps.contains(U);
7932 });
7933 };
7934
7935 // Check if each use of a chain's two extends is a partial reduction
7936 // and only add those that don't have non-partial reduction users.
7937 for (auto Pair : PartialReductionChains) {
7938 PartialReductionChain Chain = Pair.first;
7939 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
7940 (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
7941 ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
7942 }
7943}
7944
7945bool VPRecipeBuilder::getScaledReductions(
7946 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
7947 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
7948 if (!CM.TheLoop->contains(RdxExitInstr))
7949 return false;
7950
7951 auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
7952 if (!Update)
7953 return false;
7954
7955 Value *Op = Update->getOperand(0);
7956 Value *PhiOp = Update->getOperand(1);
7957 if (Op == PHI)
7958 std::swap(Op, PhiOp);
7959
7960 // Try and get a scaled reduction from the first non-phi operand.
7961 // If one is found, we use the discovered reduction instruction in
7962 // place of the accumulator for costing.
7963 if (auto *OpInst = dyn_cast<Instruction>(Op)) {
7964 if (getScaledReductions(PHI, OpInst, Range, Chains)) {
7965 PHI = Chains.rbegin()->first.Reduction;
7966
7967 Op = Update->getOperand(0);
7968 PhiOp = Update->getOperand(1);
7969 if (Op == PHI)
7970 std::swap(Op, PhiOp);
7971 }
7972 }
7973 if (PhiOp != PHI)
7974 return false;
7975
7976 using namespace llvm::PatternMatch;
7977
7978 // If the update is a binary operator, check both of its operands to see if
7979 // they are extends. Otherwise, see if the update comes directly from an
7980 // extend.
7981 Instruction *Exts[2] = {nullptr};
7982 BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Op);
7983 std::optional<unsigned> BinOpc;
7984 Type *ExtOpTypes[2] = {nullptr};
7985
7986 auto CollectExtInfo = [this, &Exts,
7987 &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
7988 unsigned I = 0;
7989 for (Value *OpI : Ops) {
7990 Value *ExtOp;
7991 if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
7992 return false;
7993 Exts[I] = cast<Instruction>(OpI);
7994
7995 // TODO: We should be able to support live-ins.
7996 if (!CM.TheLoop->contains(Exts[I]))
7997 return false;
7998
7999 ExtOpTypes[I] = ExtOp->getType();
8000 I++;
8001 }
8002 return true;
8003 };
8004
8005 if (ExtendUser) {
8006 if (!ExtendUser->hasOneUse())
8007 return false;
8008
8009     // Use the side-effect of match to replace ExtendUser only if the pattern
8010     // matches; we don't care at this point whether it actually matched.
8011 match(ExtendUser, m_Neg(m_BinOp(ExtendUser)));
8012
8013 SmallVector<Value *> Ops(ExtendUser->operands());
8014 if (!CollectExtInfo(Ops))
8015 return false;
8016
8017 BinOpc = std::make_optional(ExtendUser->getOpcode());
8018 } else if (match(Update, m_Add(m_Value(), m_Value()))) {
8019 // We already know the operands for Update are Op and PhiOp.
8021 if (!CollectExtInfo(Ops))
8022 return false;
8023
8024 ExtendUser = Update;
8025 BinOpc = std::nullopt;
8026 } else
8027 return false;
8028
8032 Exts[1] ? TTI::getPartialReductionExtendKind(Exts[1]) : TTI::PR_None;
8033 PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);
8034
8035 TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
8036 TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
8037 if (!PHISize.hasKnownScalarFactor(ASize))
8038 return false;
8039 unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize);
8040
8042 [&](ElementCount VF) {
8043 InstructionCost Cost = TTI->getPartialReductionCost(
8044 Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1],
8045 PHI->getType(), VF, OpAExtend, OpBExtend, BinOpc, CM.CostKind);
8046 return Cost.isValid();
8047 },
8048 Range)) {
8049 Chains.emplace_back(Chain, TargetScaleFactor);
8050 return true;
8051 }
8052
8053 return false;
8054}
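// A typical chain recognized above (illustrative): a dot product such as
//
//   int sum = 0;                       // i32 accumulator
//   for (i = 0; i < n; ++i)
//     sum += (int)a[i] * (int)b[i];    // a, b are i8, sign-extended to i32
//
// Here Exts[0]/Exts[1] are the two extends, ExtendUser is the mul, and the
// scale factor is 32/8 = 4, i.e. four narrow inputs feed each accumulator
// lane, provided the target reports a valid partial reduction cost for the
// pattern.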
8055
8057 VFRange &Range) {
8058 // First, check for specific widening recipes that deal with inductions, Phi
8059 // nodes, calls and memory operations.
8060 VPRecipeBase *Recipe;
8061 Instruction *Instr = R->getUnderlyingInstr();
8062 SmallVector<VPValue *, 4> Operands(R->operands());
8063 if (auto *PhiR = dyn_cast<VPPhi>(R)) {
8064 VPBasicBlock *Parent = PhiR->getParent();
8065 [[maybe_unused]] VPRegionBlock *LoopRegionOf =
8066 Parent->getEnclosingLoopRegion();
8067 assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
8068 "Non-header phis should have been handled during predication");
8069 auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
8070 assert(Operands.size() == 2 && "Must have 2 operands for header phis");
8071 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8072 return Recipe;
8073
8074 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8075 assert((Legal->isReductionVariable(Phi) ||
8076 Legal->isFixedOrderRecurrence(Phi)) &&
8077 "can only widen reductions and fixed-order recurrences here");
8078 VPValue *StartV = Operands[0];
8079 if (Legal->isReductionVariable(Phi)) {
8080 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi);
8081 assert(RdxDesc.getRecurrenceStartValue() ==
8082 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8083
8084 // If the PHI is used by a partial reduction, set the scale factor.
8085 unsigned ScaleFactor =
8086 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8087 PhiRecipe = new VPReductionPHIRecipe(
8088 Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
8089 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8090 } else {
8091 // TODO: Currently fixed-order recurrences are modeled as chains of
8092 // first-order recurrences. If there are no users of the intermediate
8093 // recurrences in the chain, the fixed order recurrence should be modeled
8094 // directly, enabling more efficient codegen.
8095 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8096 }
8097 // Add backedge value.
8098 PhiRecipe->addOperand(Operands[1]);
8099 return PhiRecipe;
8100 }
8101 assert(!R->isPhi() && "only VPPhi nodes expected at this point");
8102
8103 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8104 cast<TruncInst>(Instr), Operands, Range)))
8105 return Recipe;
8106
8107 // All widen recipes below deal only with VF > 1.
8109 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8110 return nullptr;
8111
8112 if (auto *CI = dyn_cast<CallInst>(Instr))
8113 return tryToWidenCall(CI, Operands, Range);
8114
8115 if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8116 if (auto HistInfo = Legal->getHistogramInfo(SI))
8117 return tryToWidenHistogram(*HistInfo, Operands);
8118
8119 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8120 return tryToWidenMemory(Instr, Operands, Range);
8121
8122 if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
8123 return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
8124
8125 if (!shouldWiden(Instr, Range))
8126 return nullptr;
8127
8128 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8129 return new VPWidenGEPRecipe(GEP, Operands);
8130
8131 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8132 return new VPWidenSelectRecipe(*SI, Operands);
8133 }
8134
8135 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8136 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8137 *CI);
8138 }
8139
8140 return tryToWiden(Instr, Operands);
8141}
8142
8146 unsigned ScaleFactor) {
8147 assert(Operands.size() == 2 &&
8148 "Unexpected number of operands for partial reduction");
8149
8150 VPValue *BinOp = Operands[0];
8152 VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8153 if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8154 isa<VPPartialReductionRecipe>(BinOpRecipe))
8155 std::swap(BinOp, Accumulator);
8156
8157 unsigned ReductionOpcode = Reduction->getOpcode();
8158 if (ReductionOpcode == Instruction::Sub) {
8159 auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
8161 Ops.push_back(Plan.getOrAddLiveIn(Zero));
8162 Ops.push_back(BinOp);
8163 BinOp = new VPWidenRecipe(*Reduction, Ops);
8164 Builder.insert(BinOp->getDefiningRecipe());
8165 ReductionOpcode = Instruction::Add;
8166 }
8167
8168 VPValue *Cond = nullptr;
8169 if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
8170 assert((ReductionOpcode == Instruction::Add ||
8171 ReductionOpcode == Instruction::Sub) &&
8172 "Expected an ADD or SUB operation for predicated partial "
8173 "reductions (because the neutral element in the mask is zero)!");
8174 Cond = getBlockInMask(Builder.getInsertBlock());
8175 VPValue *Zero =
8176 Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
8177 BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
8178 }
8179 return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
8180 ScaleFactor, Reduction);
8181}
8182
8183void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8184 ElementCount MaxVF) {
8185 if (ElementCount::isKnownGT(MinVF, MaxVF))
8186 return;
8187
8188 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8189
8190 const LoopAccessInfo *LAI = Legal->getLAI();
8192 OrigLoop, LI, DT, PSE.getSE());
8193 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
8195 // Only use noalias metadata when using memory checks guaranteeing no
8196 // overlap across all iterations.
8197 LVer.prepareNoAliasMetadata();
8198 }
8199
8200 // Create initial base VPlan0, to serve as common starting point for all
8201 // candidates built later for specific VF ranges.
8202 auto VPlan0 = VPlanTransforms::buildVPlan0(
8203 OrigLoop, *LI, Legal->getWidestInductionType(),
8204 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
8205
8206 auto MaxVFTimes2 = MaxVF * 2;
8207 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8208 VFRange SubRange = {VF, MaxVFTimes2};
8209 if (auto Plan = tryToBuildVPlanWithVPRecipes(
8210 std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
8211 bool HasScalarVF = Plan->hasScalarVFOnly();
8212 // Now optimize the initial VPlan.
8213 if (!HasScalarVF)
8215 *Plan, CM.getMinimalBitwidths());
8217 // TODO: try to put it close to addActiveLaneMask().
8218 if (CM.foldTailWithEVL() && !HasScalarVF)
8220 *Plan, CM.getMaxSafeElements());
8221 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8222 VPlans.push_back(std::move(Plan));
8223 }
8224 VF = SubRange.End;
8225 }
8226}
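// Note on the VF loop above (informal): each candidate VPlan covers a
// sub-range [SubRange.Start, SubRange.End) of VFs that share the same
// decisions; getDecisionAndClampRange shrinks SubRange.End whenever a
// decision differs. E.g. with MinVF=2 and MaxVF=8, one plan may end up
// covering {2, 4} and a second one {8}, with the loop resuming from each
// clamped end point.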
8227
8228/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
8229/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
8230/// the end value of the induction.
8232 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8233 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8234 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8235 // Truncated wide inductions resume from the last lane of their vector value
8236 // in the last vector iteration which is handled elsewhere.
8237 if (WideIntOrFp && WideIntOrFp->getTruncInst())
8238 return nullptr;
8239
8240 VPValue *Start = WideIV->getStartValue();
8241 VPValue *Step = WideIV->getStepValue();
8243 VPValue *EndValue = VectorTC;
8244 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8245 EndValue = VectorPHBuilder.createDerivedIV(
8246 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
8247 Start, VectorTC, Step);
8248 }
8249
8250 // EndValue is derived from the vector trip count (which has the same type as
8251 // the widest induction) and thus may be wider than the induction here.
8252 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
8253 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
8254 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
8255 ScalarTypeOfWideIV,
8256 WideIV->getDebugLoc());
8257 }
8258
8259 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
8260 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
8261 return ResumePhiRecipe;
8262}
8263
8264/// Create resume phis in the scalar preheader for first-order recurrences,
8265/// reductions and inductions, and update the VPIRInstructions wrapping the
8266/// original phis in the scalar header. End values for inductions are added to
8267/// \p IVEndValues.
8268static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
8269 DenseMap<VPValue *, VPValue *> &IVEndValues) {
8270 VPTypeAnalysis TypeInfo(Plan);
8271 auto *ScalarPH = Plan.getScalarPreheader();
8272 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
8273 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8274 VPBuilder VectorPHBuilder(
8275 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
8276 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8277 VPBuilder ScalarPHBuilder(ScalarPH);
8278 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
8279 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
8280
8281 // TODO: Extract final value from induction recipe initially, optimize to
8282 // pre-computed end value together in optimizeInductionExitUsers.
8283 auto *VectorPhiR =
8284 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
8285 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
8287 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
8288 &Plan.getVectorTripCount())) {
8289 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
8290 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
8291 ScalarPhiIRI->addOperand(ResumePhi);
8292 continue;
8293 }
8294 // TODO: Also handle truncated inductions here. Computing end-values
8295 // separately should be done as VPlan-to-VPlan optimization, after
8296 // legalizing all resume values to use the last lane from the loop.
8297 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
8298 "should only skip truncated wide inductions");
8299 continue;
8300 }
8301
8302 // The backedge value provides the value to resume coming out of a loop,
8303 // which for FORs is a vector whose last element needs to be extracted. The
8304 // start value provides the value if the loop is bypassed.
8305 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
8306 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
8307 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
8308 "Cannot handle loops with uncountable early exits");
8309 if (IsFOR)
8310 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
8311 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
8312 "vector.recur.extract");
8313 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
8314 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
8315 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
8316 ScalarPhiIRI->addOperand(ResumePhiR);
8317 }
8318}
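// Rough numeric example of the resume values created above (assuming no tail
// folding): with a trip count of 10 and VF=4 the vector loop covers 8 scalar
// iterations, so the canonical IV resumes in the scalar epilogue at 8
// (bc.resume.val). For a non-canonical induction such as j = 5 + 3*i, the end
// value is derived from the vector trip count instead: 5 + 3*8 = 29.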
8319
8320/// Handle users in the original exit block for first-order recurrences. The
8321/// penultimate value of a recurrence is fed to its LCSSA phi users in the
8322/// original exit block via the VPIRInstruction that wraps the corresponding
8323/// LCSSA phi.
8325 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8326 auto *ScalarPHVPBB = Plan.getScalarPreheader();
8327 auto *MiddleVPBB = Plan.getMiddleBlock();
8328 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8329 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8330
8331 auto IsScalableOne = [](ElementCount VF) -> bool {
8332 return VF == ElementCount::getScalable(1);
8333 };
8334
8335 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8336 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8337 if (!FOR)
8338 continue;
8339
8340 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
8341 "Cannot handle loops with uncountable early exits");
8342
8343 // This is the second phase of vectorizing first-order recurrences, creating
8344     // extracts for users outside the loop. An overview of the transformation is
8345 // described below. Suppose we have the following loop with some use after
8346 // the loop of the last a[i-1],
8347 //
8348 // for (int i = 0; i < n; ++i) {
8349 // t = a[i - 1];
8350 // b[i] = a[i] - t;
8351 // }
8352 // use t;
8353 //
8354 // There is a first-order recurrence on "a". For this loop, the shorthand
8355 // scalar IR looks like:
8356 //
8357 // scalar.ph:
8358 // s.init = a[-1]
8359 // br scalar.body
8360 //
8361 // scalar.body:
8362 // i = phi [0, scalar.ph], [i+1, scalar.body]
8363 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
8364 // s2 = a[i]
8365 // b[i] = s2 - s1
8366 // br cond, scalar.body, exit.block
8367 //
8368 // exit.block:
8369 // use = lcssa.phi [s1, scalar.body]
8370 //
8371     // In this example, s1 is a recurrence because its value depends on the
8372 // previous iteration. In the first phase of vectorization, we created a
8373 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
8374 // for users in the scalar preheader and exit block.
8375 //
8376 // vector.ph:
8377 // v_init = vector(..., ..., ..., a[-1])
8378 // br vector.body
8379 //
8380 // vector.body
8381 // i = phi [0, vector.ph], [i+4, vector.body]
8382 // v1 = phi [v_init, vector.ph], [v2, vector.body]
8383 // v2 = a[i, i+1, i+2, i+3]
8384 // b[i] = v2 - v1
8385 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
8386 // b[i, i+1, i+2, i+3] = v2 - v1
8387 // br cond, vector.body, middle.block
8388 //
8389 // middle.block:
8390 // vector.recur.extract.for.phi = v2(2)
8391 // vector.recur.extract = v2(3)
8392 // br cond, scalar.ph, exit.block
8393 //
8394 // scalar.ph:
8395 // scalar.recur.init = phi [vector.recur.extract, middle.block],
8396 // [s.init, otherwise]
8397 // br scalar.body
8398 //
8399 // scalar.body:
8400 // i = phi [0, scalar.ph], [i+1, scalar.body]
8401 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
8402 // s2 = a[i]
8403 // b[i] = s2 - s1
8404 // br cond, scalar.body, exit.block
8405 //
8406 // exit.block:
8407 // lo = lcssa.phi [s1, scalar.body],
8408 // [vector.recur.extract.for.phi, middle.block]
8409 //
8410 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
8411 // Extract the penultimate value of the recurrence and use it as operand for
8412 // the VPIRInstruction modeling the phi.
8413 for (VPUser *U : FOR->users()) {
8414 using namespace llvm::VPlanPatternMatch;
8415 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
8416 continue;
8417 // For VF vscale x 1, if vscale = 1, we are unable to extract the
8418 // penultimate value of the recurrence. Instead we rely on the existing
8419 // extract of the last element from the result of
8420 // VPInstruction::FirstOrderRecurrenceSplice.
8421 // TODO: Consider vscale_range info and UF.
8423 Range))
8424 return;
8425 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
8426 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
8427 {}, "vector.recur.extract.for.phi");
8428 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
8429 }
8430 }
8431}
8432
8433VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8434 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8435
8436 using namespace llvm::VPlanPatternMatch;
8437 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8438
8439 // ---------------------------------------------------------------------------
8440 // Build initial VPlan: Scan the body of the loop in a topological order to
8441 // visit each basic block after having visited its predecessor basic blocks.
8442 // ---------------------------------------------------------------------------
8443
8444 bool RequiresScalarEpilogueCheck =
8446 [this](ElementCount VF) {
8447 return !CM.requiresScalarEpilogue(VF.isVector());
8448 },
8449 Range);
8450 VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
8451 VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
8452 CM.foldTailByMasking());
8453
8455
8456   // Don't use getDecisionAndClampRange here, because we don't know the UF,
8457   // so it is better to be conservative here rather than to split the range
8458   // up into different VPlans.
8459 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8460 bool IVUpdateMayOverflow = false;
8461 for (ElementCount VF : Range)
8462 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8463
8464 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8465 // Use NUW for the induction increment if we proved that it won't overflow in
8466   // the vector loop or when not folding the tail. In the latter case, we know
8467 // that the canonical induction increment will not overflow as the vector trip
8468 // count is >= increment and a multiple of the increment.
8469 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8470 if (!HasNUW) {
8471 auto *IVInc = Plan->getVectorLoopRegion()
8472 ->getExitingBasicBlock()
8473 ->getTerminator()
8474 ->getOperand(0);
8475 assert(match(IVInc, m_VPInstruction<Instruction::Add>(
8476 m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
8477 "Did not find the canonical IV increment");
8478 cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
8479 }
8480
8481 // ---------------------------------------------------------------------------
8482 // Pre-construction: record ingredients whose recipes we'll need to further
8483 // process after constructing the initial VPlan.
8484 // ---------------------------------------------------------------------------
8485
8486 // For each interleave group which is relevant for this (possibly trimmed)
8487 // Range, add it to the set of groups to be later applied to the VPlan and add
8488 // placeholders for its members' Recipes which we'll be replacing with a
8489 // single VPInterleaveRecipe.
8490 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8491 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8492 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8493 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8495 // For scalable vectors, the interleave factors must be <= 8 since we
8496 // require the (de)interleaveN intrinsics instead of shufflevectors.
8497 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8498 "Unsupported interleave factor for scalable vectors");
8499 return Result;
8500 };
8501 if (!getDecisionAndClampRange(ApplyIG, Range))
8502 continue;
8503 InterleaveGroups.insert(IG);
8504 }
8505
8506 // ---------------------------------------------------------------------------
8507 // Predicate and linearize the top-level loop region.
8508 // ---------------------------------------------------------------------------
8509 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8510 *Plan, CM.foldTailByMasking());
8511
8512 // ---------------------------------------------------------------------------
8513 // Construct wide recipes and apply predication for original scalar
8514 // VPInstructions in the loop.
8515 // ---------------------------------------------------------------------------
8516 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8517 Builder, BlockMaskCache, LVer);
8518 RecipeBuilder.collectScaledReductions(Range);
8519
8520 // Scan the body of the loop in a topological order to visit each basic block
8521 // after having visited its predecessor basic blocks.
8522 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8523 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8524 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8525 HeaderVPBB);
8526
8527 auto *MiddleVPBB = Plan->getMiddleBlock();
8528 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8529 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8530 // temporarily to update created block masks.
8531 DenseMap<VPValue *, VPValue *> Old2New;
8532 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
8533 // Convert input VPInstructions to widened recipes.
8534 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
8535 auto *SingleDef = cast<VPSingleDefRecipe>(&R);
8536 auto *UnderlyingValue = SingleDef->getUnderlyingValue();
8537 // Skip recipes that do not need transforming, including canonical IV,
8538 // wide canonical IV and VPInstructions without underlying values. The
8539 // latter are added above for masking.
8540 // FIXME: Migrate code relying on the underlying instruction from VPlan0
8541 // to construct recipes below to not use the underlying instruction.
8543 &R) ||
8544 (isa<VPInstruction>(&R) && !UnderlyingValue))
8545 continue;
8546
8547 // FIXME: VPlan0, which models a copy of the original scalar loop, should
8548 // not use VPWidenPHIRecipe to model the phis.
8550 UnderlyingValue && "unsupported recipe");
8551
8552 // TODO: Gradually replace uses of underlying instruction by analyses on
8553 // VPlan.
8554 Instruction *Instr = cast<Instruction>(UnderlyingValue);
8555 Builder.setInsertPoint(SingleDef);
8556
8557 // The stores with invariant address inside the loop will be deleted, and
8558 // in the exit block, a uniform store recipe will be created for the final
8559 // invariant store of the reduction.
8560 StoreInst *SI;
8561 if ((SI = dyn_cast<StoreInst>(Instr)) &&
8562 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8563 // Only create recipe for the final invariant store of the reduction.
8564 if (Legal->isInvariantStoreOfReduction(SI)) {
8565 auto *Recipe =
8566 new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
8567 nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
8568 Recipe->insertBefore(*MiddleVPBB, MBIP);
8569 }
8570 R.eraseFromParent();
8571 continue;
8572 }
8573
8574 VPRecipeBase *Recipe =
8575 RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
8576 if (!Recipe)
8577 Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range);
8578
8579 RecipeBuilder.setRecipe(Instr, Recipe);
8580 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
8581 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8582 // moved to the phi section in the header.
8583 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8584 } else {
8585 Builder.insert(Recipe);
8586 }
8587 if (Recipe->getNumDefinedValues() == 1) {
8588 SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
8589 Old2New[SingleDef] = Recipe->getVPSingleValue();
8590 } else {
8591 assert(Recipe->getNumDefinedValues() == 0 &&
8592 "Unexpected multidef recipe");
8593 R.eraseFromParent();
8594 }
8595 }
8596 }
8597
8598 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8599 // TODO: Include the masks as operands in the predicated VPlan directly
8600 // to remove the need to keep a map of masks beyond the predication
8601 // transform.
8602 RecipeBuilder.updateBlockMaskCache(Old2New);
8603 for (VPValue *Old : Old2New.keys())
8604 Old->getDefiningRecipe()->eraseFromParent();
8605
8606 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8607 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8608 "entry block must be set to a VPRegionBlock having a non-empty entry "
8609 "VPBasicBlock");
8610
8611 // Update wide induction increments to use the same step as the corresponding
8612 // wide induction. This enables detecting induction increments directly in
8613 // VPlan and removes redundant splats.
8614 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
8615 auto *IVInc = cast<Instruction>(
8616 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8617 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
8618 continue;
8619 VPWidenInductionRecipe *WideIV =
8620 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
8621 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
8622 R->setOperand(1, WideIV->getStepValue());
8623 }
8624
8626 DenseMap<VPValue *, VPValue *> IVEndValues;
8627 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
8628
8629 // ---------------------------------------------------------------------------
8630 // Transform initial VPlan: Apply previously taken decisions, in order, to
8631 // bring the VPlan to its final state.
8632 // ---------------------------------------------------------------------------
8633
8634 // Adjust the recipes for any inloop reductions.
8635 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8636
8637 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8638 // NaNs if possible, bail out otherwise.
8640 *Plan))
8641 return nullptr;
8642
8643 // Transform recipes to abstract recipes if it is legal and beneficial and
8644 // clamp the range for better cost estimation.
8645 // TODO: Enable following transform when the EVL-version of extended-reduction
8646 // and mulacc-reduction are implemented.
8647 if (!CM.foldTailWithEVL()) {
8648 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
8650 CostCtx, Range);
8651 }
8652
8653 for (ElementCount VF : Range)
8654 Plan->addVF(VF);
8655 Plan->setName("Initial VPlan");
8656
8657 // Interleave memory: for each Interleave Group we marked earlier as relevant
8658 // for this VPlan, replace the Recipes widening its memory instructions with a
8659 // single VPInterleaveRecipe at its insertion point.
8661 InterleaveGroups, RecipeBuilder,
8662 CM.isScalarEpilogueAllowed());
8663
8664 // Replace VPValues for known constant strides.
8666 Legal->getLAI()->getSymbolicStrides());
8667
8668 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8669 return Legal->blockNeedsPredication(BB);
8670 };
8672 BlockNeedsPredication);
8673
8674 // Sink users of fixed-order recurrence past the recipe defining the previous
8675 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8677 *Plan, Builder))
8678 return nullptr;
8679
8680 if (useActiveLaneMask(Style)) {
8681 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8682 // TailFoldingStyle is visible there.
8683 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8684 bool WithoutRuntimeCheck =
8686 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8687 WithoutRuntimeCheck);
8688 }
8689 VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues, *PSE.getSE());
8690
8691 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8692 return Plan;
8693}
8694
8695VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
8696   // Outer loop handling: outer loops may require CFG and instruction level
8697 // transformations before even evaluating whether vectorization is profitable.
8698 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8699 // the vectorization pipeline.
8700 assert(!OrigLoop->isInnermost());
8701 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8702
8703 auto Plan = VPlanTransforms::buildVPlan0(
8704 OrigLoop, *LI, Legal->getWidestInductionType(),
8705 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
8707 /*HasUncountableExit*/ false);
8708 VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
8709 /*TailFolded*/ false);
8710
8712
8713 for (ElementCount VF : Range)
8714 Plan->addVF(VF);
8715
8717 Plan,
8718 [this](PHINode *P) {
8719 return Legal->getIntOrFpInductionDescriptor(P);
8720 },
8721 *TLI))
8722 return nullptr;
8723
8724 // Collect mapping of IR header phis to header phi recipes, to be used in
8725 // addScalarResumePhis.
8726 DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
8727 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8728 Builder, BlockMaskCache, nullptr /*LVer*/);
8729 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8731 continue;
8732 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
8733 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
8734 }
8735 DenseMap<VPValue *, VPValue *> IVEndValues;
8736 // TODO: IVEndValues are not used yet in the native path, to optimize exit
8737 // values.
8738 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
8739
8740 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8741 return Plan;
8742}
8743
8744// Adjust the recipes for reductions. For in-loop reductions the chain of
8745// instructions leading from the loop exit instr to the phi needs to be converted
8746// to reductions, with one operand being vector and the other being the scalar
8747// reduction chain. For other reductions, a select is introduced between the phi
8748// and users outside the vector region when folding the tail.
8749//
8750// A ComputeReductionResult recipe is added to the middle block, also for
8751// in-loop reductions which compute their result in-loop, because generating
8752// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8753//
8754// Adjust AnyOf reductions; replace the reduction phi for the selected value
8755// with a boolean reduction phi node to check if the condition is true in any
8756// iteration. The final value is selected by the final ComputeReductionResult.
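// As a shorthand example of the AnyOf adjustment described above:
//
//   bool found = false;
//   for (i = 0; i < n; ++i)
//     if (a[i] == key)
//       found = true;
//
// The select feeding the reduction phi is replaced by an OR of the phi with
// the (possibly negated) compare, so the loop reduces a vector of i1 values;
// ComputeAnyOfResult in the middle block then picks the selected value based
// on whether any lane was true.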
8757void LoopVectorizationPlanner::adjustRecipesForReductions(
8758 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8759 using namespace VPlanPatternMatch;
8760 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8761 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8762 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8764
8765 for (VPRecipeBase &R : Header->phis()) {
8766 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8767 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8768 continue;
8769
8770 RecurKind Kind = PhiR->getRecurrenceKind();
8771 assert(
8774 "AnyOf and FindIV reductions are not allowed for in-loop reductions");
8775
8776 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8777 SetVector<VPSingleDefRecipe *> Worklist;
8778 Worklist.insert(PhiR);
8779 for (unsigned I = 0; I != Worklist.size(); ++I) {
8780 VPSingleDefRecipe *Cur = Worklist[I];
8781 for (VPUser *U : Cur->users()) {
8782 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
8783 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
8784 assert((UserRecipe->getParent() == MiddleVPBB ||
8785 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
8786 "U must be either in the loop region, the middle block or the "
8787 "scalar preheader.");
8788 continue;
8789 }
8790 Worklist.insert(UserRecipe);
8791 }
8792 }
8793
8794 // Visit operation "Links" along the reduction chain top-down starting from
8795 // the phi until LoopExitValue. We keep track of the previous item
8796 // (PreviousLink) to tell which of the two operands of a Link will remain
8797 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8798 // the select instructions. Blend recipes of in-loop reduction phi's will
8799     // the select instruction. Blend recipes of in-loop reduction phis will
8800 // condition directly.
8801 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8802 for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) {
8803 if (auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink)) {
8804 assert(Blend->getNumIncomingValues() == 2 &&
8805 "Blend must have 2 incoming values");
8806 if (Blend->getIncomingValue(0) == PhiR) {
8807 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8808 } else {
8809 assert(Blend->getIncomingValue(1) == PhiR &&
8810 "PhiR must be an operand of the blend");
8811 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8812 }
8813 continue;
8814 }
8815
8816 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8817
8818 // Index of the first operand which holds a non-mask vector operand.
8819 unsigned IndexOfFirstOperand;
8820 // Recognize a call to the llvm.fmuladd intrinsic.
8821 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8822 VPValue *VecOp;
8823 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8824 if (IsFMulAdd) {
8825 assert(
8827 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8828 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8829 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
8830 CurrentLink->getOperand(2) == PreviousLink &&
8831 "expected a call where the previous link is the added operand");
8832
8833 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8834 // need to create an fmul recipe (multiplying the first two operands of
8835 // the fmuladd together) to use as the vector operand for the fadd
8836 // reduction.
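        // That is, conceptually (per vector iteration, illustrative only):
        //   acc = call fast @llvm.fmuladd(a, b, acc)
        // becomes
        //   t   = fmul fast a, b
        //   acc = reduce.fadd(acc, t)  // the VPReductionRecipe created below
        // with the original fast-math flags kept on the new fmul.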
8837 VPInstruction *FMulRecipe = new VPInstruction(
8838 Instruction::FMul,
8839 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8840 CurrentLinkI->getFastMathFlags());
8841 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8842 VecOp = FMulRecipe;
8843 } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
8844 CurrentLinkI->getOpcode() == Instruction::Sub) {
8845 Type *PhiTy = PhiR->getUnderlyingValue()->getType();
8846 auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
8847 VPWidenRecipe *Sub = new VPWidenRecipe(
8848 Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
8849 VPIRMetadata(), CurrentLinkI->getDebugLoc());
8850 Sub->setUnderlyingValue(CurrentLinkI);
8851 LinkVPBB->insert(Sub, CurrentLink->getIterator());
8852 VecOp = Sub;
8853 } else {
8855 if (isa<VPWidenRecipe>(CurrentLink)) {
8856 assert(isa<CmpInst>(CurrentLinkI) &&
8857 "need to have the compare of the select");
8858 continue;
8859 }
8860 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8861 "must be a select recipe");
8862 IndexOfFirstOperand = 1;
8863 } else {
8864 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8865 "Expected to replace a VPWidenSC");
8866 IndexOfFirstOperand = 0;
8867 }
8868 // Note that for non-commutable operands (cmp-selects), the semantics of
8869 // the cmp-select are captured in the recurrence kind.
8870 unsigned VecOpId =
8871 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8872 ? IndexOfFirstOperand + 1
8873 : IndexOfFirstOperand;
8874 VecOp = CurrentLink->getOperand(VecOpId);
8875 assert(VecOp != PreviousLink &&
8876 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8877 (VecOpId - IndexOfFirstOperand)) ==
8878 PreviousLink &&
8879 "PreviousLink must be the operand other than VecOp");
8880 }
8881
8882 VPValue *CondOp = nullptr;
8883 if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent()))
8884 CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());
8885
8886 // TODO: Retrieve FMFs from recipes directly.
8887 RecurrenceDescriptor RdxDesc = Legal->getRecurrenceDescriptor(
8888 cast<PHINode>(PhiR->getUnderlyingInstr()));
8889 // Non-FP RdxDescs will have all fast math flags set, so clear them.
8890 FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
8891 ? RdxDesc.getFastMathFlags()
8892 : FastMathFlags();
8893 auto *RedRecipe = new VPReductionRecipe(
8894 Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
8895 PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
8896 // Append the recipe to the end of the VPBasicBlock because we need to
8897       // ensure that it comes after all of its inputs, including CondOp.
8898 // Delete CurrentLink as it will be invalid if its operand is replaced
8899 // with a reduction defined at the bottom of the block in the next link.
8900 if (LinkVPBB->getNumSuccessors() == 0)
8901 RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end())));
8902 else
8903 LinkVPBB->appendRecipe(RedRecipe);
8904
8905 CurrentLink->replaceAllUsesWith(RedRecipe);
8906 ToDelete.push_back(CurrentLink);
8907 PreviousLink = RedRecipe;
8908 }
8909 }
8910 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8911 Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
8912 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8913 for (VPRecipeBase &R :
8914 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8915 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8916 if (!PhiR)
8917 continue;
8918
8919 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
8921 Type *PhiTy = PhiR->getUnderlyingValue()->getType();
8922 // If tail is folded by masking, introduce selects between the phi
8923 // and the users outside the vector region of each reduction, at the
8924 // beginning of the dedicated latch block.
8925 auto *OrigExitingVPV = PhiR->getBackedgeValue();
8926 auto *NewExitingVPV = PhiR->getBackedgeValue();
8927 // Don't output selects for partial reductions because they have an output
8928 // with fewer lanes than the VF. So the operands of the select would have
8929 // different numbers of lanes. Partial reductions mask the input instead.
8930 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
8931 !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
8932 VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
8933 std::optional<FastMathFlags> FMFs =
8934 PhiTy->isFloatingPointTy()
8935 ? std::make_optional(RdxDesc.getFastMathFlags())
8936 : std::nullopt;
8937 NewExitingVPV =
8938 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
8939 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
8940 return isa<VPInstruction>(&U) &&
8941 (cast<VPInstruction>(&U)->getOpcode() ==
8943 cast<VPInstruction>(&U)->getOpcode() ==
8945 cast<VPInstruction>(&U)->getOpcode() ==
8947 });
8948 if (CM.usePredicatedReductionSelect())
8949 PhiR->setOperand(1, NewExitingVPV);
8950 }
8951
8952     // We want code in the middle block to appear to execute at the location of
8953 // the scalar loop's latch terminator because: (a) it is all compiler
8954 // generated, (b) these instructions are always executed after evaluating
8955 // the latch conditional branch, and (c) other passes may add new
8956 // predecessors which terminate on this line. This is the easiest way to
8957 // ensure we don't accidentally cause an extra step back into the loop while
8958 // debugging.
8959 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
8960
8961 // TODO: At the moment ComputeReductionResult also drives creation of the
8962 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
8963 // even for in-loop reductions, until the reduction resume value handling is
8964 // also modeled in VPlan.
8965 VPInstruction *FinalReductionResult;
8966 VPBuilder::InsertPointGuard Guard(Builder);
8967 Builder.setInsertPoint(MiddleVPBB, IP);
8968 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
8970 VPValue *Start = PhiR->getStartValue();
8971 VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
8972 FinalReductionResult =
8973 Builder.createNaryOp(VPInstruction::ComputeFindIVResult,
8974 {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
8975 } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8976 VPValue *Start = PhiR->getStartValue();
8977 FinalReductionResult =
8978 Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
8979 {PhiR, Start, NewExitingVPV}, ExitDL);
8980 } else {
8981 VPIRFlags Flags =
8983 ? VPIRFlags(RdxDesc.getFastMathFlags())
8984 : VPIRFlags();
8985 FinalReductionResult =
8986 Builder.createNaryOp(VPInstruction::ComputeReductionResult,
8987 {PhiR, NewExitingVPV}, Flags, ExitDL);
8988 }
8989 // If the vector reduction can be performed in a smaller type, we truncate
8990 // then extend the loop exit value to enable InstCombine to evaluate the
8991 // entire expression in the smaller type.
8992 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
8994 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
8996 "Unexpected truncated min-max recurrence!");
8997 Type *RdxTy = RdxDesc.getRecurrenceType();
8998 auto *Trunc =
8999 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9000 Instruction::CastOps ExtendOpc =
9001 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
9002 auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
9003 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9004 Extnd->insertAfter(Trunc);
9005 if (PhiR->getOperand(1) == NewExitingVPV)
9006 PhiR->setOperand(1, Extnd->getVPSingleValue());
9007
9008 // Update ComputeReductionResult with the truncated exiting value and
9009 // extend its result.
9010 FinalReductionResult->setOperand(1, Trunc);
9011 FinalReductionResult =
9012 Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
9013 }
9014
9015 // Update all users outside the vector region. Also replace redundant
9016 // ExtractLastElement.
9017 for (auto *U : to_vector(OrigExitingVPV->users())) {
9018 auto *Parent = cast<VPRecipeBase>(U)->getParent();
9019 if (FinalReductionResult == U || Parent->getParent())
9020 continue;
9021 U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
9023 cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
9024 }
9025
9026 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9027 // with a boolean reduction phi node to check if the condition is true in
9028 // any iteration. The final value is selected by the final
9029 // ComputeReductionResult.
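  // Illustrative example (added; not from the original source): a scalar
  // any-of pattern such as
  //   red = (a[i] > 42) ? 3 : red;
  // keeps only an i1 reduction phi after this adjustment:
  //   any = any | (a[i] > 42);     // phi starts at false
  // and the final any-of result computation picks the selected value (3) if
  // the condition was ever true, or the original start value otherwise.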
9030 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
9031 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9032 return isa<VPWidenSelectRecipe>(U) ||
9033 (isa<VPReplicateRecipe>(U) &&
9034 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9035 Instruction::Select);
9036 }));
9037 VPValue *Cmp = Select->getOperand(0);
9038 // If the compare is checking the reduction PHI node, adjust it to check
9039 // the start value.
9040 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
9041 CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
9042 Builder.setInsertPoint(Select);
9043
9044 // If the true value of the select is the reduction phi, the new value is
9045 // selected if the negated condition is true in any iteration.
9046 if (Select->getOperand(1) == PhiR)
9047 Cmp = Builder.createNot(Cmp);
9048 VPValue *Or = Builder.createOr(PhiR, Cmp);
9049 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9050 // Delete Select now that it has invalid types.
9051 ToDelete.push_back(Select);
9052
9053 // Convert the reduction phi to operate on bools.
9054 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9055 OrigLoop->getHeader()->getContext())));
9056 continue;
9057 }
9058
9060 RdxDesc.getRecurrenceKind())) {
9061 // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
9062 // the sentinel value after generating the ResumePhi recipe, which uses
9063 // the original start value.
9064 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9065 }
9066 RecurKind RK = RdxDesc.getRecurrenceKind();
9070 VPBuilder PHBuilder(Plan->getVectorPreheader());
9071 VPValue *Iden = Plan->getOrAddLiveIn(
9072 getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
9073 // If the PHI is used by a partial reduction, set the scale factor.
9074 unsigned ScaleFactor =
9075 RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
9076 .value_or(1);
9077 Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
9078 auto *ScaleFactorVPV =
9079 Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
9080 VPValue *StartV = PHBuilder.createNaryOp(
9082 {PhiR->getStartValue(), Iden, ScaleFactorVPV},
9083 PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
9084 : FastMathFlags());
9085 PhiR->setOperand(0, StartV);
9086 }
9087 }
9088 for (VPRecipeBase *R : ToDelete)
9089 R->eraseFromParent();
9090
9092}
9093
9094void LoopVectorizationPlanner::attachRuntimeChecks(
9095 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9096 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
9097 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
9098 assert((!CM.OptForSize ||
9099 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
9100 "Cannot SCEV check stride or overflow when optimizing for size");
9101 VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
9102 HasBranchWeights);
9103 }
9104 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
9105 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
9106 // VPlan-native path does not do any analysis for runtime checks
9107 // currently.
9108 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
9109 "Runtime checks are not supported for outer loops yet");
9110
9111 if (CM.OptForSize) {
9112 assert(
9113 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
9114 "Cannot emit memory checks when optimizing for size, unless forced "
9115 "to vectorize.");
9116 ORE->emit([&]() {
9117 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
9118 OrigLoop->getStartLoc(),
9119 OrigLoop->getHeader())
9120 << "Code-size may be reduced by not forcing "
9121 "vectorization, or by source-code modifications "
9122 "eliminating the need for runtime checks "
9123 "(e.g., adding 'restrict').";
9124 });
9125 }
9126 VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
9127 HasBranchWeights);
9128 }
9129}
9130
9132 VPlan &Plan, ElementCount VF, unsigned UF,
9133 ElementCount MinProfitableTripCount) const {
9134 // vscale is not necessarily a power-of-2, which means we cannot guarantee
9135 // an overflow to zero when updating induction variables and so an
9136 // additional overflow check is required before entering the vector loop.
9137 bool IsIndvarOverflowCheckNeededForVF =
9138 VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
9139 !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
9140 CM.getTailFoldingStyle() !=
9142 const uint32_t *BranchWeights =
9143 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
9145 : nullptr;
9147 Plan, VF, UF, MinProfitableTripCount,
9148 CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
9149 IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeights,
9150 OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
9151 *PSE.getSE());
9152}
9153
9155 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9156
9157 // Fast-math-flags propagate from the original induction instruction.
9158 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9159 if (FPBinOp)
9160 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9161
9162 Value *Step = State.get(getStepValue(), VPLane(0));
9163 Value *Index = State.get(getOperand(1), VPLane(0));
9164 Value *DerivedIV = emitTransformedIndex(
9165 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9167 DerivedIV->setName(Name);
9168 State.set(this, DerivedIV, VPLane(0));
9169}
9170
9171// Determine how to lower the scalar epilogue, which depends on 1) optimising
9172// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9173// predication, and 4) a TTI hook that analyses whether the loop is suitable
9174// for predication.
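// Added summary of the decision order below (simplified, for clarity; not
// part of the original source):
//   1) optimizing for size                -> do not allow a scalar epilogue
//   2) -prefer-predicate-over-epilogue=   -> obey the command-line option
//   3) vectorize.predicate.enable hint    -> prefer predication (tail folding)
//   4) TTI->preferPredicateOverEpilogue() -> prefer predication (tail folding)
//   otherwise                             -> CM_ScalarEpilogueAllowed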
9179 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9180 // don't look at hints or options, and don't request a scalar epilogue.
9181 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9182 // LoopAccessInfo (due to code dependency and not being able to reliably get
9183 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9184 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9185 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9186 // back to the old way and vectorize with versioning when forced. See D81345.)
9187 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9191
9192 // 2) If set, obey the directives
9193 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9201 };
9202 }
9203
9204 // 3) If set, obey the hints
9205 switch (Hints.getPredicate()) {
9210 };
9211
9212 // 4) If the TTI hook indicates this is profitable, request predication.
9213 TailFoldingInfo TFI(TLI, &LVL, IAI);
9214 if (TTI->preferPredicateOverEpilogue(&TFI))
9216
9218}
9219
9220// Process the loop in the VPlan-native vectorization path. This path builds
9221 // VPlan upfront in the vectorization pipeline, which allows applying
9222// VPlan-to-VPlan transformations from the very beginning without modifying the
9223// input LLVM IR.
9230 LoopVectorizationRequirements &Requirements) {
9231
9233 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9234 return false;
9235 }
9236 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9237 Function *F = L->getHeader()->getParent();
9238 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9239
9241 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9242
9243 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9244 &Hints, IAI, PSI, BFI);
9245 // Use the planner for outer loop vectorization.
9246 // TODO: CM is not used at this point inside the planner. Turn CM into an
9247 // optional argument if we don't need it in the future.
9248 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9249 ORE);
9250
9251 // Get user vectorization factor.
9252 ElementCount UserVF = Hints.getWidth();
9253
9255
9256 // Plan how to best vectorize, return the best VF and its cost.
9257 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9258
9259 // If we are stress testing VPlan builds, do not attempt to generate vector
9260 // code. Masked vector code generation support will follow soon.
9261 // Also, do not attempt to vectorize if no vector code will be produced.
9263 return false;
9264
9265 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9266
9267 {
9268 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9269 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
9270 BFI, PSI, Checks, BestPlan);
9271 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9272 << L->getHeader()->getParent()->getName() << "\"\n");
9273 LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
9275
9276 LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
9277 }
9278
9279 reportVectorization(ORE, L, VF, 1);
9280
9281 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9282 return true;
9283}
9284
9285// Emit a remark if there are stores to floats that required a floating point
9286 // extension. If the vectorized loop was generated with such conversions, there
9287// will be a performance penalty from the conversion overhead and the change in
9288// the vector width.
9291 for (BasicBlock *BB : L->getBlocks()) {
9292 for (Instruction &Inst : *BB) {
9293 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9294 if (S->getValueOperand()->getType()->isFloatTy())
9295 Worklist.push_back(S);
9296 }
9297 }
9298 }
9299
9300 // Traverse the floating point stores upwards, searching for floating point
9301 // conversions.
9304 while (!Worklist.empty()) {
9305 auto *I = Worklist.pop_back_val();
9306 if (!L->contains(I))
9307 continue;
9308 if (!Visited.insert(I).second)
9309 continue;
9310
9311 // Emit a remark if the floating point store required a floating
9312 // point conversion.
9313 // TODO: More work could be done to identify the root cause such as a
9314 // constant or a function return type and point the user to it.
9315 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9316 ORE->emit([&]() {
9317 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9318 I->getDebugLoc(), L->getHeader())
9319 << "floating point conversion changes vector width. "
9320 << "Mixed floating point precision requires an up/down "
9321 << "cast that will negatively impact performance.";
9322 });
9323
9324 for (Use &Op : I->operands())
9325 if (auto *OpI = dyn_cast<Instruction>(Op))
9326 Worklist.push_back(OpI);
9327 }
9328}
9329
9330/// For loops with uncountable early exits, find the cost of doing work when
9331/// exiting the loop early, such as calculating the final exit values of
9332/// variables used outside the loop.
9333/// TODO: This is currently overly pessimistic because the loop may not take
9334/// the early exit, but better to keep this conservative for now. In future,
9335/// it might be possible to relax this by using branch probabilities.
9337 VPlan &Plan, ElementCount VF) {
9338 InstructionCost Cost = 0;
9339 for (auto *ExitVPBB : Plan.getExitBlocks()) {
9340 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9341 // If the predecessor is not the middle.block, then it must be the
9342 // vector.early.exit block, which may contain work to calculate the exit
9343 // values of variables used outside the loop.
9344 if (PredVPBB != Plan.getMiddleBlock()) {
9345 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9346 << PredVPBB->getName() << ":\n");
9347 Cost += PredVPBB->cost(VF, CostCtx);
9348 }
9349 }
9350 }
9351 return Cost;
9352}
9353
9354/// This function determines whether or not it's still profitable to vectorize
9355/// the loop given the extra work we have to do outside of the loop:
9356/// 1. Perform the runtime checks before entering the loop to ensure it's safe
9357/// to vectorize.
9358/// 2. In the case of loops with uncountable early exits, we may have to do
9359/// extra work when exiting the loop early, such as calculating the final
9360/// exit values of variables used outside the loop.
9361static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9362 VectorizationFactor &VF, Loop *L,
9364 VPCostContext &CostCtx, VPlan &Plan,
9366 std::optional<unsigned> VScale) {
9367 InstructionCost TotalCost = Checks.getCost();
9368 if (!TotalCost.isValid())
9369 return false;
9370
9371 // Add on the cost of any work required in the vector early exit block, if
9372 // one exists.
9373 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
9374
9375 // When interleaving only, the scalar and vector cost will be equal, which in turn
9376 // would lead to a divide by 0. Fall back to hard threshold.
9377 if (VF.Width.isScalar()) {
9378 // TODO: Should we rename VectorizeMemoryCheckThreshold?
9379 if (TotalCost > VectorizeMemoryCheckThreshold) {
9380 LLVM_DEBUG(
9381 dbgs()
9382 << "LV: Interleaving only is not profitable due to runtime checks\n");
9383 return false;
9384 }
9385 return true;
9386 }
9387
9388 // The scalar cost should only be 0 when vectorizing with a user specified
9389 // VF/IC. In those cases, runtime checks should always be generated.
9390 uint64_t ScalarC = VF.ScalarCost.getValue();
9391 if (ScalarC == 0)
9392 return true;
9393
9394 // First, compute the minimum iteration count required so that the vector
9395 // loop outperforms the scalar loop.
9396 // The total cost of the scalar loop is
9397 // ScalarC * TC
9398 // where
9399 // * TC is the actual trip count of the loop.
9400 // * ScalarC is the cost of a single scalar iteration.
9401 //
9402 // The total cost of the vector loop is
9403 // RtC + VecC * (TC / VF) + EpiC
9404 // where
9405 // * RtC is the cost of the generated runtime checks plus the cost of
9406 // performing any additional work in the vector.early.exit block for loops
9407 // with uncountable early exits.
9408 // * VecC is the cost of a single vector iteration.
9409 // * TC is the actual trip count of the loop
9410 // * VF is the vectorization factor
9411 // * EpiC is the cost of the generated epilogue, including the cost
9412 // of the remaining scalar operations.
9413 //
9414 // Vectorization is profitable once the total vector cost is less than the
9415 // total scalar cost:
9416 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9417 //
9418 // Now we can compute the minimum required trip count TC as
9419 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9420 //
9421 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9422 // the divisions below are performed on integers and rounded up (divideCeil),
9423 // hence we get an upper estimate of the TC.
9424 unsigned IntVF = estimateElementCount(VF.Width, VScale);
9425 uint64_t RtC = TotalCost.getValue();
9426 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9427 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9428
9429 // Second, compute a minimum iteration count so that the cost of the
9430 // runtime checks is only a fraction of the total scalar loop cost. This
9431 // adds a loop-dependent bound on the overhead incurred if the runtime
9432 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9433 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9434 // cost, compute
9435 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9436 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
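  // Worked example with assumed costs (added for clarity): with ScalarC = 4,
  // VF (IntVF) = 4, VecC = 10 and RtC = 28:
  //   MinTC1 = ceil(28 * 4 / (4 * 4 - 10)) = ceil(112 / 6) = 19
  //   MinTC2 = ceil(28 * 10 / 4)           = 70
  // so the runtime checks only pay off for trip counts of at least 70
  // (rounded up below to a multiple of VF, i.e. 72, if a scalar epilogue is
  // allowed).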
9437
9438 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9439 // epilogue is allowed, choose the next closest multiple of VF. This should
9440 // partly compensate for ignoring the epilogue cost.
9441 uint64_t MinTC = std::max(MinTC1, MinTC2);
9442 if (SEL == CM_ScalarEpilogueAllowed)
9443 MinTC = alignTo(MinTC, IntVF);
9445
9446 LLVM_DEBUG(
9447 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9448 << VF.MinProfitableTripCount << "\n");
9449
9450 // Skip vectorization if the expected trip count is less than the minimum
9451 // required trip count.
9452 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9453 if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
9454 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9455 "trip count < minimum profitable VF ("
9456 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9457 << ")\n");
9458
9459 return false;
9460 }
9461 }
9462 return true;
9463}
9464
9466 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9468 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9470
9471/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9472/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9473/// don't have a corresponding wide induction in \p EpiPlan.
9474static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9475 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9476 // will need their resume-values computed in the main vector loop. Others
9477 // can be removed from the main VPlan.
9478 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9479 for (VPRecipeBase &R :
9482 continue;
9483 EpiWidenedPhis.insert(
9484 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
9485 }
9486 for (VPRecipeBase &R :
9487 make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
9488 auto *VPIRInst = cast<VPIRPhi>(&R);
9489 if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
9490 continue;
9491 // There is no corresponding wide induction in the epilogue plan that would
9492 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9493 // together with the corresponding ResumePhi. The resume values for the
9494 // scalar loop will be created during execution of EpiPlan.
9495 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
9496 VPIRInst->eraseFromParent();
9497 ResumePhi->eraseFromParent();
9498 }
9500
9501 using namespace VPlanPatternMatch;
9502 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9503 // introduce multiple uses of undef/poison. If the reduction start value may
9504 // be undef or poison it needs to be frozen and the frozen start has to be
9505 // used when computing the reduction result. We also need to use the frozen
9506 // value in the resume phi generated by the main vector loop, as this is also
9507 // used to compute the reduction result after the epilogue vector loop.
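  // Added note (reasoning only; not from the original source): every use of a
  // poison value may be interpreted independently, so without the freeze the
  // resume phi and the reduction-result computation could observe different
  // values for the same "start". Freezing pins a single concrete value that
  // all of these uses share.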
9508 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9509 bool UpdateResumePhis) {
9510 VPBuilder Builder(Plan.getEntry());
9511 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9512 auto *VPI = dyn_cast<VPInstruction>(&R);
9513 if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9514 continue;
9515 VPValue *OrigStart = VPI->getOperand(1);
9517 continue;
9518 VPInstruction *Freeze =
9519 Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
9520 VPI->setOperand(1, Freeze);
9521 if (UpdateResumePhis)
9522 OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
9523 return Freeze != &U && isa<VPPhi>(&U);
9524 });
9525 }
9526 };
9527 AddFreezeForFindLastIVReductions(MainPlan, true);
9528 AddFreezeForFindLastIVReductions(EpiPlan, false);
9529
9530 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9531 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9532 // If there is a suitable resume value for the canonical induction in the
9533 // scalar (which will become vector) epilogue loop, use it and move it to the
9534 // beginning of the scalar preheader. Otherwise create it below.
9535 auto ResumePhiIter =
9536 find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
9537 return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
9538 m_SpecificInt(0)));
9539 });
9540 VPPhi *ResumePhi = nullptr;
9541 if (ResumePhiIter == MainScalarPH->phis().end()) {
9542 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9543 ResumePhi = ScalarPHBuilder.createScalarPhi(
9544 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
9545 "vec.epilog.resume.val");
9546 } else {
9547 ResumePhi = cast<VPPhi>(&*ResumePhiIter);
9548 if (MainScalarPH->begin() == MainScalarPH->end())
9549 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->end());
9550 else if (&*MainScalarPH->begin() != ResumePhi)
9551 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
9552 }
9553 // Add a user to make sure the resume phi won't get removed.
9554 VPBuilder(MainScalarPH)
9556}
9557
9558/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9559/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
9560static void
9562 const SCEV2ValueTy &ExpandedSCEVs,
9564 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9565 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9566 Header->setName("vec.epilog.vector.body");
9567
9569 // Ensure that the start values for all header phi recipes are updated before
9570 // vectorizing the epilogue loop.
9571 for (VPRecipeBase &R : Header->phis()) {
9572 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
9573 // When vectorizing the epilogue loop, the canonical induction start
9574 // value needs to be changed from zero to the value after the main
9575 // vector loop. Find the resume value created during execution of the main
9576 // VPlan. It must be the first phi in the loop preheader.
9577 // FIXME: Improve modeling for canonical IV start values in the epilogue
9578 // loop.
9579 using namespace llvm::PatternMatch;
9580 PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
9581 for (Value *Inc : EPResumeVal->incoming_values()) {
9582 if (match(Inc, m_SpecificInt(0)))
9583 continue;
9584 assert(!EPI.VectorTripCount &&
9585 "Must only have a single non-zero incoming value");
9586 EPI.VectorTripCount = Inc;
9587 }
9588 // If we didn't find a non-zero vector trip count, all incoming values
9589 // must be zero, which also means the vector trip count is zero. Pick the
9590 // first zero as vector trip count.
9591 // TODO: We should not choose VF * UF so the main vector loop is known to
9592 // be dead.
9593 if (!EPI.VectorTripCount) {
9594 assert(
9595 EPResumeVal->getNumIncomingValues() > 0 &&
9596 all_of(EPResumeVal->incoming_values(),
9597 [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
9598 "all incoming values must be 0");
9599 EPI.VectorTripCount = EPResumeVal->getOperand(0);
9600 }
9601 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
9602 assert(all_of(IV->users(),
9603 [](const VPUser *U) {
9604 return isa<VPScalarIVStepsRecipe>(U) ||
9605 isa<VPDerivedIVRecipe>(U) ||
9606 cast<VPRecipeBase>(U)->isScalarCast() ||
9607 cast<VPInstruction>(U)->getOpcode() ==
9608 Instruction::Add;
9609 }) &&
9610 "the canonical IV should only be used by its increment or "
9611 "ScalarIVSteps when resetting the start value");
9612 IV->setOperand(0, VPV);
9613 continue;
9614 }
9615
9616 Value *ResumeV = nullptr;
9617 // TODO: Move setting of resume values to prepareToExecute.
9618 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
9619 auto *RdxResult =
9620 cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
9621 auto *VPI = dyn_cast<VPInstruction>(U);
9622 return VPI &&
9623 (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9624 VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9625 VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9626 }));
9627 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
9628 ->getIncomingValueForBlock(L->getLoopPreheader());
9629 RecurKind RK = ReductionPhi->getRecurrenceKind();
9631 Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
9632 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9633 // start value; compare the final value from the main vector loop
9634 // to the start value.
9635 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
9636 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9637 ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
9639 Value *StartV = getStartValueFromReductionResult(RdxResult);
9640 ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
9642
9643 // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9644 // an adjustment to the resume value. The resume value is adjusted to
9645 // the sentinel value when the final value from the main vector loop
9646 // equals the start value. This ensures correctness when the start value
9647 // might not be less than the minimum value of a monotonically
9648 // increasing induction variable.
9649 BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
9650 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9651 Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
9652 Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
9653 ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
9654 } else {
9655 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9656 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9657 if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9658 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9659 "unexpected start value");
9660 VPI->setOperand(0, StartVal);
9661 continue;
9662 }
9663 }
9664 } else {
9665 // Retrieve the induction resume values for wide inductions from
9666 // their original phi nodes in the scalar loop.
9667 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
9668 // Hook up to the PHINode generated by a ResumePhi recipe of main
9669 // loop VPlan, which feeds the scalar loop.
9670 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
9671 }
9672 assert(ResumeV && "Must have a resume value");
9673 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9674 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
9675 }
9676
9677 // For some VPValues in the epilogue plan we must re-use the generated IR
9678 // values from the main plan. Replace them with live-in VPValues.
9679 // TODO: This is a workaround needed for epilogue vectorization and it
9680 // should be removed once induction resume value creation is done
9681 // directly in VPlan.
9682 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
9683 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9684 // epilogue plan. This ensures all users use the same frozen value.
9685 auto *VPI = dyn_cast<VPInstruction>(&R);
9686 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9687 VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
9688 ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
9689 continue;
9690 }
9691
9692 // Re-use the trip count and steps expanded for the main loop, as
9693 // skeleton creation needs them as values that dominate both the scalar
9694 // and vector epilogue loops.
9695 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
9696 if (!ExpandR)
9697 continue;
9698 VPValue *ExpandedVal =
9699 Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
9700 ExpandR->replaceAllUsesWith(ExpandedVal);
9701 if (Plan.getTripCount() == ExpandR)
9702 Plan.resetTripCount(ExpandedVal);
9703 ExpandR->eraseFromParent();
9704 }
9705}
9706
9707// Generate bypass values from the additional bypass block. Note that when the
9708 // vectorized epilogue is skipped due to the iteration count check, the
9709// resume value for the induction variable comes from the trip count of the
9710// main vector loop, passed as the second argument.
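// Illustrative example (added; names assumed): for a secondary induction
//   for (i = 0, p = q; i < n; ++i, p += 4)
// with a main vector loop covering MainVectorTripCount iterations, the bypass
// end value computed below is q + MainVectorTripCount * 4, i.e. the value p
// has after the iterations already executed by the main vector loop.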
9712 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9713 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9714 Instruction *OldInduction) {
9715 Value *Step = getExpandedStep(II, ExpandedSCEVs);
9716 // For the primary induction the additional bypass end value is known.
9717 // Otherwise it is computed.
9718 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9719 if (OrigPhi != OldInduction) {
9720 auto *BinOp = II.getInductionBinOp();
9721 // Fast-math-flags propagate from the original induction instruction.
9723 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9724
9725 // Compute the end value for the additional bypass.
9726 EndValueFromAdditionalBypass =
9727 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
9728 II.getStartValue(), Step, II.getKind(), BinOp);
9729 EndValueFromAdditionalBypass->setName("ind.end");
9730 }
9731 return EndValueFromAdditionalBypass;
9732}
9733
9735 VPlan &BestEpiPlan,
9737 const SCEV2ValueTy &ExpandedSCEVs,
9738 Value *MainVectorTripCount) {
9739 // Fix reduction resume values from the additional bypass block.
9740 BasicBlock *PH = L->getLoopPreheader();
9741 for (auto *Pred : predecessors(PH)) {
9742 for (PHINode &Phi : PH->phis()) {
9743 if (Phi.getBasicBlockIndex(Pred) != -1)
9744 continue;
9745 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
9746 }
9747 }
9748 auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
9749 if (ScalarPH->hasPredecessors()) {
9750 // If ScalarPH has predecessors, we may need to update its reduction
9751 // resume values.
9752 for (const auto &[R, IRPhi] :
9753 zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
9755 BypassBlock);
9756 }
9757 }
9758
9759 // Fix induction resume values from the additional bypass block.
9760 IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
9761 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
9762 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
9764 IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
9765 LVL.getPrimaryInduction());
9766 // TODO: Directly add as extra operand to the VPResumePHI recipe.
9767 Inc->setIncomingValueForBlock(BypassBlock, V);
9768 }
9769}
9770
9772 assert((EnableVPlanNativePath || L->isInnermost()) &&
9773 "VPlan-native path is not enabled. Only process inner loops.");
9774
9775 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9776 << L->getHeader()->getParent()->getName() << "' from "
9777 << L->getLocStr() << "\n");
9778
9779 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9780
9781 LLVM_DEBUG(
9782 dbgs() << "LV: Loop hints:"
9783 << " force="
9785 ? "disabled"
9787 ? "enabled"
9788 : "?"))
9789 << " width=" << Hints.getWidth()
9790 << " interleave=" << Hints.getInterleave() << "\n");
9791
9792 // Function containing loop
9793 Function *F = L->getHeader()->getParent();
9794
9795 // Looking at the diagnostic output is the only way to determine if a loop
9796 // was vectorized (other than looking at the IR or machine code), so it
9797 // is important to generate an optimization remark for each loop. Most of
9798 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9799 // generated as OptimizationRemark and OptimizationRemarkMissed are
9800 // less verbose, reporting vectorized loops and unvectorized loops that may
9801 // benefit from vectorization, respectively.
9802
9803 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9804 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9805 return false;
9806 }
9807
9808 PredicatedScalarEvolution PSE(*SE, *L);
9809
9810 // Check if it is legal to vectorize the loop.
9811 LoopVectorizationRequirements Requirements;
9812 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9813 &Requirements, &Hints, DB, AC, BFI, PSI);
9815 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9816 Hints.emitRemarkWithHints();
9817 return false;
9818 }
9819
9821 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9822 "early exit is not enabled",
9823 "UncountableEarlyExitLoopsDisabled", ORE, L);
9824 return false;
9825 }
9826
9827 if (!LVL.getPotentiallyFaultingLoads().empty()) {
9828 reportVectorizationFailure("Auto-vectorization of loops with potentially "
9829 "faulting load is not supported",
9830 "PotentiallyFaultingLoadsNotSupported", ORE, L);
9831 return false;
9832 }
9833
9834 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9835 // here. They may require CFG and instruction level transformations before
9836 // even evaluating whether vectorization is profitable. Since we cannot modify
9837 // the incoming IR, we need to build VPlan upfront in the vectorization
9838 // pipeline.
9839 if (!L->isInnermost())
9840 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9841 ORE, BFI, PSI, Hints, Requirements);
9842
9843 assert(L->isInnermost() && "Inner loop expected.");
9844
9845 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9846 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9847
9848 // If an override option has been passed in for interleaved accesses, use it.
9849 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9850 UseInterleaved = EnableInterleavedMemAccesses;
9851
9852 // Analyze interleaved memory accesses.
9853 if (UseInterleaved)
9855
9856 if (LVL.hasUncountableEarlyExit()) {
9857 BasicBlock *LoopLatch = L->getLoopLatch();
9858 if (IAI.requiresScalarEpilogue() ||
9860 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9861 reportVectorizationFailure("Auto-vectorization of early exit loops "
9862 "requiring a scalar epilogue is unsupported",
9863 "UncountableEarlyExitUnsupported", ORE, L);
9864 return false;
9865 }
9866 }
9867
9868 // Check the function attributes and profiles to find out if this function
9869 // should be optimized for size.
9871 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9872
9873 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9874 // count by optimizing for size, to minimize overheads.
9875 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9876 if (ExpectedTC && ExpectedTC->isFixed() &&
9877 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9878 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9879 << "This loop is worth vectorizing only if no scalar "
9880 << "iteration overheads are incurred.");
9882 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9883 else {
9884 LLVM_DEBUG(dbgs() << "\n");
9885 // Predicated tail-folded loops are efficient even when the loop
9886 // iteration count is low. However, setting the epilogue policy to
9887 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9888 // with runtime checks. It's more effective to let
9889 // `isOutsideLoopWorkProfitable` determine if vectorization is
9890 // beneficial for the loop.
9893 }
9894 }
9895
9896 // Check the function attributes to see if implicit floats or vectors are
9897 // allowed.
9898 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9900 "Can't vectorize when the NoImplicitFloat attribute is used",
9901 "loop not vectorized due to NoImplicitFloat attribute",
9902 "NoImplicitFloat", ORE, L);
9903 Hints.emitRemarkWithHints();
9904 return false;
9905 }
9906
9907 // Check if the target supports potentially unsafe FP vectorization.
9908 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9909 // for the target we're vectorizing for, to make sure none of the
9910 // additional fp-math flags can help.
9911 if (Hints.isPotentiallyUnsafe() &&
9912 TTI->isFPVectorizationPotentiallyUnsafe()) {
9914 "Potentially unsafe FP op prevents vectorization",
9915 "loop not vectorized due to unsafe FP support.",
9916 "UnsafeFP", ORE, L);
9917 Hints.emitRemarkWithHints();
9918 return false;
9919 }
9920
9921 bool AllowOrderedReductions;
9922 // If the flag is set, use that instead and override the TTI behaviour.
9923 if (ForceOrderedReductions.getNumOccurrences() > 0)
9924 AllowOrderedReductions = ForceOrderedReductions;
9925 else
9926 AllowOrderedReductions = TTI->enableOrderedReductions();
9927 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9928 ORE->emit([&]() {
9929 auto *ExactFPMathInst = Requirements.getExactFPInst();
9930 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9931 ExactFPMathInst->getDebugLoc(),
9932 ExactFPMathInst->getParent())
9933 << "loop not vectorized: cannot prove it is safe to reorder "
9934 "floating-point operations";
9935 });
9936 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9937 "reorder floating-point operations\n");
9938 Hints.emitRemarkWithHints();
9939 return false;
9940 }
9941
9942 // Use the cost model.
9943 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9944 F, &Hints, IAI, PSI, BFI);
9945 // Use the planner for vectorization.
9946 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9947 ORE);
9948
9949 // Get user vectorization factor and interleave count.
9950 ElementCount UserVF = Hints.getWidth();
9951 unsigned UserIC = Hints.getInterleave();
9952
9953 // Plan how to best vectorize.
9954 LVP.plan(UserVF, UserIC);
9956 unsigned IC = 1;
9957
9958 if (ORE->allowExtraAnalysis(LV_NAME))
9960
9961 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9962 if (LVP.hasPlanWithVF(VF.Width)) {
9963 // Select the interleave count.
9964 IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
9965
9966 unsigned SelectedIC = std::max(IC, UserIC);
9967 // Optimistically generate runtime checks if they are needed. Drop them if
9968 // they turn out to not be profitable.
9969 if (VF.Width.isVector() || SelectedIC > 1) {
9970 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9971
9972 // Bail out early if either the SCEV or memory runtime checks are known to
9973 // fail. In that case, the vector loop would never execute.
9974 using namespace llvm::PatternMatch;
9975 if (Checks.getSCEVChecks().first &&
9976 match(Checks.getSCEVChecks().first, m_One()))
9977 return false;
9978 if (Checks.getMemRuntimeChecks().first &&
9979 match(Checks.getMemRuntimeChecks().first, m_One()))
9980 return false;
9981 }
9982
9983 // Check if it is profitable to vectorize with runtime checks.
9984 bool ForceVectorization =
9986 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
9987 CM.CostKind);
9988 if (!ForceVectorization &&
9989 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9990 LVP.getPlanFor(VF.Width), SEL,
9991 CM.getVScaleForTuning())) {
9992 ORE->emit([&]() {
9994 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9995 L->getHeader())
9996 << "loop not vectorized: cannot prove it is safe to reorder "
9997 "memory operations";
9998 });
9999 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10000 Hints.emitRemarkWithHints();
10001 return false;
10002 }
10003 }
10004
10005 // Identify the diagnostic messages that should be produced.
10006 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10007 bool VectorizeLoop = true, InterleaveLoop = true;
10008 if (VF.Width.isScalar()) {
10009 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10010 VecDiagMsg = {
10011 "VectorizationNotBeneficial",
10012 "the cost-model indicates that vectorization is not beneficial"};
10013 VectorizeLoop = false;
10014 }
10015
10016 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10017 // Tell the user interleaving was avoided up-front, despite being explicitly
10018 // requested.
10019 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10020 "interleaving should be avoided up front\n");
10021 IntDiagMsg = {"InterleavingAvoided",
10022 "Ignoring UserIC, because interleaving was avoided up front"};
10023 InterleaveLoop = false;
10024 } else if (IC == 1 && UserIC <= 1) {
10025 // Tell the user interleaving is not beneficial.
10026 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10027 IntDiagMsg = {
10028 "InterleavingNotBeneficial",
10029 "the cost-model indicates that interleaving is not beneficial"};
10030 InterleaveLoop = false;
10031 if (UserIC == 1) {
10032 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10033 IntDiagMsg.second +=
10034 " and is explicitly disabled or interleave count is set to 1";
10035 }
10036 } else if (IC > 1 && UserIC == 1) {
10037 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10038 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
10039 "disabled.\n");
10040 IntDiagMsg = {"InterleavingBeneficialButDisabled",
10041 "the cost-model indicates that interleaving is beneficial "
10042 "but is explicitly disabled or interleave count is set to 1"};
10043 InterleaveLoop = false;
10044 }
10045
10046 // If there is a histogram in the loop, do not just interleave without
10047 // vectorizing. The order of operations will be incorrect without the
10048 // histogram intrinsics, which are only used for recipes with VF > 1.
10049 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10050 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10051 << "to histogram operations.\n");
10052 IntDiagMsg = {
10053 "HistogramPreventsScalarInterleaving",
10054 "Unable to interleave without vectorization due to constraints on "
10055 "the order of histogram operations"};
10056 InterleaveLoop = false;
10057 }
10058
10059 // Override IC if user provided an interleave count.
10060 IC = UserIC > 0 ? UserIC : IC;
10061
10062 // Emit diagnostic messages, if any.
10063 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10064 if (!VectorizeLoop && !InterleaveLoop) {
10065 // Do not vectorize or interleave the loop.
10066 ORE->emit([&]() {
10067 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10068 L->getStartLoc(), L->getHeader())
10069 << VecDiagMsg.second;
10070 });
10071 ORE->emit([&]() {
10072 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10073 L->getStartLoc(), L->getHeader())
10074 << IntDiagMsg.second;
10075 });
10076 return false;
10077 }
10078
10079 if (!VectorizeLoop && InterleaveLoop) {
10080 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10081 ORE->emit([&]() {
10082 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10083 L->getStartLoc(), L->getHeader())
10084 << VecDiagMsg.second;
10085 });
10086 } else if (VectorizeLoop && !InterleaveLoop) {
10087 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10088 << ") in " << L->getLocStr() << '\n');
10089 ORE->emit([&]() {
10090 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10091 L->getStartLoc(), L->getHeader())
10092 << IntDiagMsg.second;
10093 });
10094 } else if (VectorizeLoop && InterleaveLoop) {
10095 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10096 << ") in " << L->getLocStr() << '\n');
10097 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10098 }
10099
10100 // Report the vectorization decision.
10101 if (VF.Width.isScalar()) {
10102 using namespace ore;
10103 assert(IC > 1);
10104 ORE->emit([&]() {
10105 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10106 L->getHeader())
10107 << "interleaved loop (interleaved count: "
10108 << NV("InterleaveCount", IC) << ")";
10109 });
10110 } else {
10111 // Report the vectorization decision.
10112 reportVectorization(ORE, L, VF, IC);
10113 }
10114 if (ORE->allowExtraAnalysis(LV_NAME))
10116
10117 // If we decided that it is *legal* to interleave or vectorize the loop, then
10118 // do it.
10119
10120 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10121 // Consider vectorizing the epilogue too if it's profitable.
10122 VectorizationFactor EpilogueVF =
10124 if (EpilogueVF.Width.isVector()) {
10125 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10126
10127 // The first pass vectorizes the main loop and creates a scalar epilogue
10128 // to be vectorized by executing the plan (potentially with a different
10129 // factor) again shortly afterwards.
10130 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10131 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10132 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10133 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10134 BestEpiPlan);
10135 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI,
10136 PSI, Checks, *BestMainPlan);
10137 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10138 *BestMainPlan, MainILV, DT, false);
10139 ++LoopsVectorized;
10140
10141 // Second pass vectorizes the epilogue and adjusts the control flow
10142 // edges from the first pass.
10143 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
10144 BFI, PSI, Checks, BestEpiPlan);
10145 EpilogILV.setTripCount(MainILV.getTripCount());
10146 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10147
10148 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
10149 true);
10150
10152 BestEpiPlan, LVL, ExpandedSCEVs,
10153 EPI.VectorTripCount);
10154 ++LoopsEpilogueVectorized;
10155 } else {
10156 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
10157 Checks, BestPlan);
10158 // TODO: Move to general VPlan pipeline once epilogue loops are also
10159 // supported.
10162 IC, PSE);
10163 LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
10165
10166 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10167 ++LoopsVectorized;
10168 }
10169
10170 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10171 "DT not preserved correctly");
10172 assert(!verifyFunction(*F, &dbgs()));
10173
10174 return true;
10175}
10176
10178
10179 // Don't attempt if
10180 // 1. the target claims to have no vector registers, and
10181 // 2. interleaving won't help ILP.
10182 //
10183 // The second condition is necessary because, even if the target has no
10184 // vector registers, loop vectorization may still enable scalar
10185 // interleaving.
10186 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10187 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10188 return LoopVectorizeResult(false, false);
10189
10190 bool Changed = false, CFGChanged = false;
10191
10192 // The vectorizer requires loops to be in simplified form.
10193 // Since simplification may add new inner loops, it has to run before the
10194 // legality and profitability checks. This means running the loop vectorizer
10195 // will simplify all loops, regardless of whether anything ends up being
10196 // vectorized.
10197 for (const auto &L : *LI)
10198 Changed |= CFGChanged |=
10199 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10200
10201 // Build up a worklist of inner-loops to vectorize. This is necessary as
10202 // the act of vectorizing or partially unrolling a loop creates new loops
10203 // and can invalidate iterators across the loops.
10204 SmallVector<Loop *, 8> Worklist;
10205
10206 for (Loop *L : *LI)
10207 collectSupportedLoops(*L, LI, ORE, Worklist);
10208
10209 LoopsAnalyzed += Worklist.size();
10210
10211 // Now walk the identified inner loops.
10212 while (!Worklist.empty()) {
10213 Loop *L = Worklist.pop_back_val();
10214
10215 // For the inner loops we actually process, form LCSSA to simplify the
10216 // transform.
10217 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10218
10219 Changed |= CFGChanged |= processLoop(L);
10220
10221 if (Changed) {
10222 LAIs->clear();
10223
10224#ifndef NDEBUG
10225 if (VerifySCEV)
10226 SE->verify();
10227#endif
10228 }
10229 }
10230
10231 // Process each loop nest in the function.
10232 return LoopVectorizeResult(Changed, CFGChanged);
10233}
10234
10237 LI = &AM.getResult<LoopAnalysis>(F);
10238 // There are no loops in the function. Return before computing other
10239 // expensive analyses.
10240 if (LI->empty())
10241 return PreservedAnalyses::all();
10250
10251 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10252 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10253 BFI = nullptr;
10254 if (PSI && PSI->hasProfileSummary())
10256 LoopVectorizeResult Result = runImpl(F);
10257 if (!Result.MadeAnyChange)
10258 return PreservedAnalyses::all();
10260
10261 if (isAssignmentTrackingEnabled(*F.getParent())) {
10262 for (auto &BB : F)
10264 }
10265
10266 PA.preserve<LoopAnalysis>();
10270
10271 if (Result.MadeCFGChange) {
10272 // Making CFG changes likely means a loop got vectorized. Indicate that
10273 // extra simplification passes should be run.
10274 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10275 // be run if runtime checks have been added.
10278 } else {
10280 }
10281 return PA;
10282}
10283
10285 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10286 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10287 OS, MapClassName2PassName);
10288
10289 OS << '<';
10290 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10291 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10292 OS << '>';
10293}
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static Type * maybeVectorizeType(Type *Ty, ElementCount VF)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L)
A version of ScalarEvolution::getSmallConstantTripCount that returns an ElementCount to include loops...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static VPIRBasicBlock * replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static Value * createInductionAdditionalBypassValues(PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount, Instruction *OldInduction)
static void fixReductionScalarResumeWhenVectorizingEpilog(VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock)
static Value * getStartValueFromReductionResult(VPInstruction *RdxResult)
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static unsigned estimateElementCount(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the ElementCount at runtime.
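A rough sketch of the intent (an assumption, not the exact implementation): a scalable VF such as <vscale x 4> with an assumed vscale of 2 corresponds to about 8 elements per vector iteration, while a fixed VF already is the element count.
#include "llvm/Support/TypeSize.h"
#include <optional>
using namespace llvm;

// Illustrative estimate: scale the known minimum lane count by the assumed
// vscale for scalable VFs; fixed VFs are returned unchanged.
static unsigned roughElementCount(ElementCount VF, std::optional<unsigned> VScale) {
  unsigned EC = VF.getKnownMinValue();
  if (VF.isScalable())
    EC *= VScale.value_or(1);
  return EC;
}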
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop, ElementCount VF)
Return true if the original loop TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecipe for Phi.
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
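One plausible notion of "irregular" (an assumption here, not necessarily the helper's exact definition) is a type whose in-memory store size differs from its nominal bit width, e.g. i1 or i48, so widened copies would touch padding bits.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Plausible sketch, assumed rather than verified against the current helper.
static bool looksIrregular(Type *Ty, const DataLayout &DL) {
  return DL.getTypeStoreSizeInBits(Ty) != Ty->getPrimitiveSizeInBits();
}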
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
static bool useActiveLaneMask(TailFoldingStyle Style)
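A hedged sketch of what such a predicate plausibly checks: any tail-folding style that materializes an active.lane.mask, whether for data only or for data and control flow. The mapping below is an assumption; consult the definition for the authoritative set.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Assumed mapping of styles to "uses an active lane mask"; illustrative only.
static bool usesLaneMask(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::Data ||
         Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}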
static bool hasReplicatorRegion(VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform common subexpression elimination (CSE) of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first-order recurrences in the original exit block.
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, LoopVectorizationLegality &LVL, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
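For orientation, a hedged example of emitting such an analysis remark through the OptimizationRemarkEmitter; the remark name, message, and helper name are made up for illustration.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
using namespace llvm;

// Illustrative only: report why vectorization was not performed for TheLoop.
static void reportNotBeneficial(OptimizationRemarkEmitter *ORE,
                                const char *PassName, Loop *TheLoop) {
  ORE->emit(OptimizationRemarkAnalysis(PassName, "NotBeneficial",
                                       TheLoop->getStartLoc(),
                                       TheLoop->getHeader())
            << "the cost-model indicates that vectorization is not beneficial");
}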
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
This file contains some templates that are useful if you are working with the STL at all.
#define OP(OPC)
Definition Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:119
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:77
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
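A tiny usage example of the APInt queries listed here (values chosen arbitrarily).
#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintExample() {
  APInt Mask = APInt::getAllOnes(8);                 // 0xFF: all 8 bits set
  unsigned Active = APInt(32, 255).getActiveBits();  // 8 significant bits
  uint64_t Val = Mask.getZExtValue();                // 255 as a plain uint64_t
  (void)Active;
  (void)Val;
}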
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
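A hedged sketch of querying the minimum vscale from a function's vscale_range attribute; falling back to 1 when the attribute is absent is a choice made for this illustration.
#include "llvm/IR/Function.h"
using namespace llvm;

// Returns the attribute's minimum vscale, or 1 when no range is attached.
static unsigned minVScaleOrOne(const Function &F) {
  if (!F.hasFnAttribute(Attribute::VScaleRange))
    return 1;
  return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMin();
}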
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:791
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getTemporary()
Definition DebugLoc.h:161
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:187
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:156
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition DenseMap.h:267
Implements a dense probed hash-table based set.
Definition DenseSet.h:261
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
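A small usage example of the ElementCount constructors and predicates above.
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void elementCountExample() {
  ElementCount Fixed4 = ElementCount::getFixed(4);    // exactly 4 lanes
  ElementCount Scal2 = ElementCount::getScalable(2);  // vscale x 2 lanes
  bool V = Fixed4.isVector();                         // true: more than one lane
  bool S = ElementCount::getFixed(1).isScalar();      // true: exactly one lane
  (void)Scal2;
  (void)V;
  (void)S;
}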
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
BasicBlock * emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
Value * createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF, unsigned UF) const
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the main loop strategy (i....
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitablity analysis.
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
friend class LoopVectorizationPlanner
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition Type.cpp:343
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
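For intuition, a hand-written C++ example (not taken from the source) of a factor-2 interleave group: the two loads share stride 2 and are adjacent, so they can be widened into one wide load plus shuffles.
// Hypothetical source loop: a[2*i] is member 0 and a[2*i+1] is member 1 of a
// factor-2 interleave group.
void sumEvenOdd(const int *a, int n, int *even, int *odd) {
  for (int i = 0; i < n; ++i) {
    *even += a[2 * i];      // member 0
    *odd  += a[2 * i + 1];  // member 1
  }
}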
Drive the analysis of interleaved memory accesses in the loop.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:570
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool OptForSize
Whether this loop should be optimized for size based on function attribute or profile information.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind)
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
bool shouldConsiderRegPressureForVF(ElementCount VF)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
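Conceptually (a hand-written illustration, not the pass's output), folding the tail by masking processes the final partial vector iteration under a lane mask instead of falling back to a scalar remainder loop.
// VF is an assumed fixed vectorization factor; the inner if models the lane mask.
void addMasked(const int *a, const int *b, int *c, int n) {
  const int VF = 4;
  for (int i = 0; i < n; i += VF)
    for (int j = 0; j < VF; ++j)
      if (i + j < n)  // active-lane predicate folds the tail
        c[i + j] = a[i + j] + b[i + j];
}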
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool usePredicatedReductionSelect() const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for both cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW
The highest VF possible for this loop, without using MaxBandwidth.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
MapVector< PHINode *, InductionDescriptor > InductionList
InductionList saves induction variables and maps them to the induction descriptor.
const SmallPtrSetImpl< const Instruction * > & getPotentiallyFaultingLoads() const
Returns potentially faulting loads.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns a list of all known histogram operations in the loop.
const LoopAccessInfo * getLAI() const
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition VPlan.cpp:1615
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition VPlan.cpp:1599
void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, VPBasicBlock *HeaderVPBB, bool VectorizingEpilogue, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll)
Update loop metadata and profile info for both the scalar remainder loop and VectorLoop,...
Definition VPlan.cpp:1666
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1580
void printPlans(raw_ostream &O)
Definition VPlan.cpp:1746
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check to Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
void emitRemarkWithHints() const
Dumps all the hint information.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:644
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:115
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:230
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
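A hedged sketch (block and value names are hypothetical) of wiring a scalar resume phi with the PHINode API above: it receives the vector trip count when control arrives from the middle block and zero when the vector loop was bypassed.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *buildResumePhi(Type *Ty, BasicBlock *ScalarPH,
                               BasicBlock *MiddleBlock, BasicBlock *BypassBlock,
                               Value *VectorTripCount) {
  // PHIs must come first in the block, so insert at the beginning.
  PHINode *Resume = PHINode::Create(Ty, /*NumReservedValues=*/2,
                                    "bc.resume.val", ScalarPH->begin());
  Resume->addIncoming(VectorTripCount, MiddleBlock);
  Resume->addIncoming(Constant::getNullValue(Ty), BypassBlock);
  return Resume;
}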
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static LLVM_ABI bool isFloatingPointRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is a floating point kind.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
Value * getSentinelValue() const
Returns the sentinel value for FindFirstIV & FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
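As a concrete illustration (hand-written, not taken from the source), the kinds of loop-carried values these queries classify: an integer add reduction and an integer max (min/max) reduction.
// sum is an add recurrence; mx is a max recurrence. Assumes n >= 1.
int reduceSumAndMax(const int *a, int n, int *maxOut) {
  int sum = 0;
  int mx = a[0];
  for (int i = 0; i < n; ++i) {
    sum += a[i];
    mx = a[i] > mx ? a[i] : mx;
  }
  *maxOut = mx;
  return sum;
}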
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
void eraseDeadInstructions(Value *Root)
Remove inserted instructions that are dead, e.g.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:104
void insert_range(Range &&R)
Definition SetVector.h:193
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:279
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:168
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:356
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
iterator_range< op_iterator > op_range
Definition User.h:281
Value * getOperand(unsigned i) const
Definition User.h:232
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3750
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3825
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3777
iterator end()
Definition VPlan.h:3787
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3785
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3838
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:246
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:635
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:3816
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
void setName(const Twine &newName)
Definition VPlan.h:166
size_t getNumSuccessors() const
Definition VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:217
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:238
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:176
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition VPlanUtils.h:203
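These VPBlockUtils helpers are how a VPlan's CFG gets rewired. A hedged sketch, with VectorPH, LoopRegion and CheckVPBB standing in for existing blocks:

  // Splice a new check block onto the edge from the preheader into the loop.
  VPBlockUtils::insertOnEdge(VectorPH, LoopRegion, CheckVPBB);
  // Or connect two blocks directly, keeping both edge lists in sync:
  // VPBlockUtils::connectBlocks(VectorPH, CheckVPBB);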
VPlan-based builder utility analogous to IRBuilder.
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
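VPBuilder mirrors IRBuilder at the VPlan level. A minimal sketch, assuming VPBB is a VPBasicBlock, A and B are existing VPValues, and Ctx is the LLVMContext (all placeholders):

  VPBuilder Builder(VPBB);                     // insert at the end of VPBB
  VPValue *Sum = Builder.createNaryOp(Instruction::Add, {A, B});
  VPValue *Narrow = Builder.createScalarCast(
      Instruction::Trunc, Sum, Type::getInt32Ty(Ctx), DebugLoc());
  (void)Narrow;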
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:422
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:395
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition VPlan.h:3627
VPValue * getStartValue() const
Definition VPlan.h:3626
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1963
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2011
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2000
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:1678
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3903
Helper to manage IR metadata for recipes.
Definition VPlan.h:934
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:975
@ ComputeAnyOfResult
Compute the final result of an AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1008
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1055
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1046
unsigned getOpcode() const
Definition VPlan.h:1116
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2562
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
A recipe for forming partial reductions.
Definition VPlan.h:2739
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1287
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:394
VPBasicBlock * getParent()
Definition VPlan.h:415
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:482
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreateWidenRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for R if one can be created within the given VF Range.
VPValue * getBlockInMask(VPBasicBlock *VPBB) const
Returns the entry mask for block VPBB or null if the mask is all-true.
VPValue * getVPValueOrAddLiveIn(Value *V)
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPReplicateRecipe * handleReplication(Instruction *I, ArrayRef< VPValue * > Operands, VFRange &Range)
Build a VPReplicateRecipe for I using Operands.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands, unsigned ScaleFactor)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
A recipe for handling reduction phis.
Definition VPlan.h:2317
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition VPlan.h:2377
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2371
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:3938
const VPBlockBase * getEntry() const
Definition VPlan.h:3974
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2842
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:521
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:586
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:197
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:241
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:236
void addOperand(VPValue *Operand)
Definition VPlanValue.h:230
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:174
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1416
user_iterator user_begin()
Definition VPlanValue.h:130
unsigned getNumUsers() const
Definition VPlanValue.h:113
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1420
user_range users()
Definition VPlanValue.h:134
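The VPValue def-use API above mirrors the IR-level Value API. A hedged sketch replacing one value with another and cleaning up the dead definition (OldV and NewV are placeholders):

  OldV->replaceAllUsesWith(NewV);
  // With no users left, the defining recipe (if any) can be removed.
  if (OldV->getNumUsers() == 0)
    if (VPRecipeBase *DefR = OldV->getDefiningRecipe())
      DefR->eraseFromParent();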
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1829
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1479
A recipe for handling GEP instructions.
Definition VPlan.h:1765
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2028
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2056
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2073
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2103
A common base class for widening memory operations.
Definition VPlan.h:3119
A recipe for widened phis.
Definition VPlan.h:2239
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1436
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4041
bool hasVF(ElementCount VF) const
Definition VPlan.h:4250
VPBasicBlock * getEntry()
Definition VPlan.h:4140
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4230
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4233
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4202
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4257
bool hasUF(unsigned UF) const
Definition VPlan.h:4268
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4192
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1050
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition VPlan.h:4413
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition VPlan.cpp:1032
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4216
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4165
LLVM_ABI_FOR_TEST VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
Definition VPlan.cpp:1265
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4292
bool hasScalarVFOnly() const
Definition VPlan.h:4261
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4183
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition VPlan.cpp:968
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition VPlan.h:4346
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4188
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4145
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1192
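The VPlan accessors above are the entry points a transform typically needs. A minimal sketch, assuming Plan is a fully-built VPlan:

  if (Plan.hasVF(ElementCount::getFixed(4))) {
    VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); // vector loop
    VPCanonicalIVPHIRecipe *CanIV = Plan.getCanonicalIV();  // canonical IV phi
    VPValue *TC = Plan.getTripCount();                      // original trip count
    (void)LoopRegion; (void)CanIV; (void)TC;
  }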
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
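DenseSet::insert reports whether the element was newly inserted, which keeps "visit each value once" loops short. A minimal sketch with a placeholder Value* V:

  DenseSet<const Value *> Seen;
  if (Seen.insert(V).second) {
    // V is seen for the first time; process it here.
  }
  bool AlreadyKnown = Seen.contains(V);
  (void)AlreadyKnown;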
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
constexpr bool isNonZero() const
Definition TypeSize.h:156
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr bool isZero() const
Definition TypeSize.h:154
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:237
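These polynomial-quantity helpers let ElementCount and TypeSize be compared and scaled without assuming a concrete vscale. A hedged sketch:

  ElementCount VF = ElementCount::getScalable(4);  // <vscale x 4>
  if (VF.isScalable() &&
      ElementCount::isKnownGE(VF, ElementCount::getFixed(4)))
    VF = VF.divideCoefficientBy(2);                // now <vscale x 2>
  unsigned MinLanes = VF.getKnownMinValue();       // 2, ignoring vscale
  (void)MinLanes;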
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
IteratorT end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:189
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
OneOps_match< OpTy, Instruction::Freeze > m_Freeze(const OpTy &Op)
Matches FreezeInst.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
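The m_* combinators above compose into declarative IR pattern matches. A minimal sketch (V is a placeholder Value*), recognising "add X, (mul Y, 1)" where the multiply has a single use:

  using namespace llvm::PatternMatch;
  Value *X = nullptr, *Y = nullptr;
  if (match(V, m_Add(m_Value(X), m_OneUse(m_Mul(m_Value(Y), m_One()))))) {
    // X and Y are now bound to the matched operands.
  }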
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t > m_scev_Mul(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
match_combine_or< AllRecipe_match< Instruction::ZExt, Op0_t >, AllRecipe_match< Instruction::SExt, Op0_t > > m_ZExtOrSExt(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
LLVM_ABI void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:843
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:689
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1685
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate=false)
Verify invariants for general VPlans.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:646
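make_early_inc_range advances the iterator before the loop body runs, so the current element can be erased without invalidating the traversal. A hedged sketch over a placeholder BasicBlock *BB:

  for (Instruction &I : make_early_inc_range(*BB))
    if (I.use_empty() && wouldInstructionBeTriviallyDead(&I))
      I.eraseFromParent();   // safe: the iterator already moved on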
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
LLVM_ABI bool VerifySCEV
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
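vp_depth_first_shallow and vp_depth_first_deep drive most whole-plan walks. A hedged sketch visiting the blocks of the vector loop region (Plan is a placeholder VPlan):

  for (VPBlockBase *VPB :
       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))
    if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB))
      for (VPRecipeBase &R : *VPBB)
        (void)R; // e.g. pattern-match or cost individual recipes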
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:420
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI)
Return true if the control flow in RPOTraversal is irreducible.
Definition CFG.h:149
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1652
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
LLVM_ABI cl::opt< bool > EnableLoopVectorization
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1789
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:126
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:337
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
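divideCeil is the usual way to round an iteration count up to whole vector iterations. A one-line sketch with placeholder TC (scalar trip count) and a fixed VF:

  uint64_t VectorIters = divideCeil(TC, VF.getFixedValue());
  (void)VectorIters;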
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ AddChainWithSubs
A chain of adds and subs.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
ArrayRef(const T &OneElt) -> ArrayRef< T >
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:363
cl::opt< bool > EnableVPlanNativePath
Definition VPlan.cpp:56
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
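The getLoadStore* helpers above avoid an explicit LoadInst/StoreInst case split. A minimal sketch, assuming I is known to be a load or store instruction:

  Type *AccessTy = getLoadStoreType(I);
  const Value *Ptr = getLoadStorePointerOperand(I);
  Align Alignment = getLoadStoreAlignment(I);
  unsigned AddrSpace = getLoadStoreAddressSpace(I);
  (void)AccessTy; (void)Ptr; (void)Alignment; (void)AddrSpace;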
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool pred_empty(const BasicBlock *BB)
Definition CFG.h:119
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind)
A helper function that returns the divisor applied to the cost of a predicated block.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:591
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:280
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:77
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:836
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:465
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
TargetTransformInfo * TTI
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:70
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
ElementCount End
Struct to hold various analysis needed for cost computations.
LoopVectorizationCostModel & CM
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const
Return true if I is considered uniform-after-vectorization in the legacy cost model for VF.
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2282
A struct that represents some properties of the register usage of a loop.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
A recipe for widening select instructions.
Definition VPlan.h:1719
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE)
Create a base VPlan0, serving as the common starting point for all later candidates.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static LLVM_ABI_FOR_TEST void handleEarlyExits(VPlan &Plan, bool HasUncountableExit)
Update Plan to account for all early exits.
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE)
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static DenseMap< VPBasicBlock *, VPValue * > introduceMasksAndLinearize(VPlan &Plan, bool FoldTail)
Predicate and linearize the control-flow in the only loop region of Plan.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace each VPReplicateRecipe outside of any replicate region in Plan with VF single-scalar recipes.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool RequiresScalarEpilogueCheck, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static LLVM_ABI bool HoistRuntimeChecks