1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
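//
// As a rough illustration (a conceptual sketch, not the exact IR this pass
// emits), with VF = 4 a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
// is rewritten so that each wide iteration processes four elements at once,
// with the remainder handled by a scalar epilogue or a predicated
// (tail-folded) vector body:
//   for (int i = 0; i + 4 <= n; i += 4)
//     A[i..i+3] = B[i..i+3] + <42, 42, 42, 42>;   // one wide iteration
//   for (; i < n; ++i)                            // scalar epilogue
//     A[i] = B[i] + 42;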
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is an ongoing development effort to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cstdint>
150#include <functional>
151#include <iterator>
152#include <limits>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159using namespace SCEVPatternMatch;
160
161#define LV_NAME "loop-vectorize"
162#define DEBUG_TYPE LV_NAME
163
164#ifndef NDEBUG
165const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166#endif
167
168/// @{
169/// Metadata attribute names
170const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
172 "llvm.loop.vectorize.followup_vectorized";
174 "llvm.loop.vectorize.followup_epilogue";
175/// @}
176
177STATISTIC(LoopsVectorized, "Number of loops vectorized");
178STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
179STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
181
183 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
184 cl::desc("Enable vectorization of epilogue loops."));
185
187 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
188 cl::desc("When epilogue vectorization is enabled, and a value greater than "
189 "1 is specified, forces the given VF for all applicable epilogue "
190 "loops."));
191
193 "epilogue-vectorization-minimum-VF", cl::Hidden,
194 cl::desc("Only loops with vectorization factor equal to or larger than "
195 "the specified value are considered for epilogue vectorization."));
196
197/// Loops with a known constant trip count below this number are vectorized only
198/// if no scalar iteration overheads are incurred.
200 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
201 cl::desc("Loops with a constant trip count that is smaller than this "
202 "value are vectorized only if no scalar iteration overheads "
203 "are incurred."));
204
206 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
207 cl::desc("The maximum allowed number of runtime memory checks"));
208
209// The option prefer-predicate-over-epilogue indicates that an epilogue is
210// undesired, that predication is preferred, and lists the available choices.
211// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
212// vector body and predicate the instructions accordingly. If tail-folding
213// fails, there are different fallback strategies depending on these values:
215 enum Option {
219 };
220} // namespace PreferPredicateTy
221
223 "prefer-predicate-over-epilogue",
226 cl::desc("Tail-folding and predication preferences over creating a scalar "
227 "epilogue loop."),
229 "scalar-epilogue",
230 "Don't tail-predicate loops, create scalar epilogue"),
232 "predicate-else-scalar-epilogue",
233 "prefer tail-folding, create scalar epilogue if tail "
234 "folding fails."),
236 "predicate-dont-vectorize",
237 "prefers tail-folding, don't attempt vectorization if "
238 "tail-folding fails.")));
239
241 "force-tail-folding-style", cl::desc("Force the tail folding style"),
242 cl::init(TailFoldingStyle::None),
244 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
246 TailFoldingStyle::Data, "data",
247 "Create lane mask for data only, using active.lane.mask intrinsic"),
248 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
249 "data-without-lane-mask",
250 "Create lane mask with compare/stepvector"),
251 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
252 "Create lane mask using active.lane.mask intrinsic, and use "
253 "it for both data and control flow"),
254 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
255 "data-and-control-without-rt-check",
256 "Similar to data-and-control, but remove the runtime check"),
257 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
258 "Use predicated EVL instructions for tail folding. If EVL "
259 "is unsupported, fallback to data-without-lane-mask.")));
260
262 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
263 cl::desc("Maximize bandwidth when selecting vectorization factor which "
264 "will be determined by the smallest type in loop."));
265
267 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
269
270/// An interleave-group may need masking if it resides in a block that needs
271/// predication, or in order to mask away gaps.
273 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
274 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
275
277 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of scalar registers."));
279
281 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's number of vector registers."));
283
285 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
286 cl::desc("A flag that overrides the target's max interleave factor for "
287 "scalar loops."));
288
290 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
291 cl::desc("A flag that overrides the target's max interleave factor for "
292 "vectorized loops."));
293
295 "force-target-instruction-cost", cl::init(0), cl::Hidden,
296 cl::desc("A flag that overrides the target's expected cost for "
297 "an instruction to a single constant value. Mostly "
298 "useful for getting consistent testing."));
299
301 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
302 cl::desc(
303 "Pretend that scalable vectors are supported, even if the target does "
304 "not support them. This flag should only be used for testing."));
305
307 "small-loop-cost", cl::init(20), cl::Hidden,
308 cl::desc(
309 "The cost of a loop that is considered 'small' by the interleaver."));
310
312 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
313 cl::desc("Enable the use of the block frequency analysis to access PGO "
314 "heuristics minimizing code growth in cold regions and being more "
315 "aggressive in hot regions."));
316
317// Runtime interleave loops for load/store throughput.
319 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
320 cl::desc(
321 "Enable runtime interleaving until load/store ports are saturated"));
322
323/// The number of stores in a loop that are allowed to need predication.
325 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
326 cl::desc("Max number of stores to be predicated behind an if."));
327
329 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
330 cl::desc("Count the induction variable only once when interleaving"));
331
333 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
334 cl::desc("Enable if predication of stores during vectorization."));
335
337 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
338 cl::desc("The maximum interleave count to use when interleaving a scalar "
339 "reduction in a nested loop."));
340
341static cl::opt<bool>
342 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
344 cl::desc("Prefer in-loop vector reductions, "
345 "overriding the targets preference."));
346
348 "force-ordered-reductions", cl::init(false), cl::Hidden,
349 cl::desc("Enable the vectorisation of loops with in-order (strict) "
350 "FP reductions"));
351
353 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
354 cl::desc(
355 "Prefer predicating a reduction operation over an after loop select."));
356
358 "enable-vplan-native-path", cl::Hidden,
359 cl::desc("Enable VPlan-native vectorization path with "
360 "support for outer loop vectorization."));
361
363 llvm::VerifyEachVPlan("vplan-verify-each",
364#ifdef EXPENSIVE_CHECKS
365 cl::init(true),
366#else
367 cl::init(false),
368#endif
370 cl::desc("Verfiy VPlans after VPlan transforms."));
371
372// This flag enables the stress testing of the VPlan H-CFG construction in the
373// VPlan-native vectorization path. It must be used in conjunction with
374// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
375// verification of the H-CFGs built.
377 "vplan-build-stress-test", cl::init(false), cl::Hidden,
378 cl::desc(
379 "Build VPlan for every supported loop nest in the function and bail "
380 "out right after the build (stress test the VPlan H-CFG construction "
381 "in the VPlan-native vectorization path)."));
382
384 "interleave-loops", cl::init(true), cl::Hidden,
385 cl::desc("Enable loop interleaving in Loop vectorization passes"));
387 "vectorize-loops", cl::init(true), cl::Hidden,
388 cl::desc("Run the Loop vectorization passes"));
389
391 "force-widen-divrem-via-safe-divisor", cl::Hidden,
392 cl::desc(
393 "Override cost based safe divisor widening for div/rem instructions"));
394
396 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398 cl::desc("Try wider VFs if they enable the use of vector variants"));
399
401 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
402 cl::desc(
403 "Enable vectorization of early exit loops with uncountable exits."));
404
405// Likelihood of bypassing the vectorized loop because there are zero trips left
406// after the prologue. See `emitIterationCountCheck`.
407static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
408
409/// A helper function that returns true if the given type is irregular. The
410/// type is irregular if its allocated size doesn't equal the store size of an
411/// element of the corresponding vector type.
412static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
413 // Determine if an array of N elements of type Ty is "bitcast compatible"
414 // with a <N x Ty> vector.
415 // This is only true if there is no padding between the array elements.
416 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
417}
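// For example, with a typical data layout i1 is irregular while i32 is not:
//   DL.getTypeSizeInBits(i1)  == 1,  DL.getTypeAllocSizeInBits(i1)  == 8   -> irregular
//   DL.getTypeSizeInBits(i32) == 32, DL.getTypeAllocSizeInBits(i32) == 32  -> regular
// An array of irregular elements is therefore not bitcast-compatible with the
// corresponding vector type.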
418
419/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
420/// ElementCount to include loops whose trip count is a function of vscale.
422 const Loop *L) {
423 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
424 return ElementCount::getFixed(ExpectedTC);
425
426 const SCEV *BTC = SE->getBackedgeTakenCount(L);
427 if (isa<SCEVCouldNotCompute>(BTC))
428 return ElementCount::getFixed(0);
429
430 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
431 if (isa<SCEVVScale>(ExitCount))
433
434 const APInt *Scale;
435 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
436 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
437 if (Scale->getActiveBits() <= 32)
439
440 return ElementCount::getFixed(0);
441}
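// For instance, a loop whose backedge-taken count is (8 * vscale - 1) has a
// trip count of 8 * vscale; provided the multiply does not wrap, the helper
// above reports it as a scalable count of 8 elements, while a plain constant
// trip count of 100 is reported as ElementCount::getFixed(100).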
442
443/// Returns "best known" trip count, which is either a valid positive trip count
444/// or std::nullopt when an estimate cannot be made (including when the trip
445/// count would overflow), for the specified loop \p L as defined by the
446/// following procedure:
447/// 1) Returns exact trip count if it is known.
448/// 2) Returns expected trip count according to profile data if any.
449/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
450/// 4) Returns std::nullopt if all of the above failed.
451static std::optional<ElementCount>
453 bool CanUseConstantMax = true) {
454 // Check if exact trip count is known.
455 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
456 return ExpectedTC;
457
458 // Check if there is an expected trip count available from profile data.
460 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
461 return ElementCount::getFixed(*EstimatedTC);
462
463 if (!CanUseConstantMax)
464 return std::nullopt;
465
466 // Check if upper bound estimate is known.
467 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
468 return ElementCount::getFixed(ExpectedTC);
469
470 return std::nullopt;
471}
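// As a concrete example of the priority order above: an exact trip count of
// 100 wins over everything; failing that, a profile-based estimate (say,
// roughly 1000 iterations) is used; and only then a known constant upper
// bound, and only when CanUseConstantMax is true.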
472
473namespace {
474// Forward declare GeneratedRTChecks.
475class GeneratedRTChecks;
476
477using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
478} // namespace
479
480namespace llvm {
481
483
484/// InnerLoopVectorizer vectorizes loops which contain only one basic
485/// block to a specified vectorization factor (VF).
486/// This class performs the widening of scalars into vectors, or multiple
487/// scalars. This class also implements the following features:
488/// * It inserts an epilogue loop for handling loops that don't have iteration
489/// counts that are known to be a multiple of the vectorization factor.
490/// * It handles the code generation for reduction variables.
491/// * Scalarization (implementation using scalars) of un-vectorizable
492/// instructions.
493/// InnerLoopVectorizer does not perform any vectorization-legality
494/// checks, and relies on the caller to check for the different legality
495/// aspects. The InnerLoopVectorizer relies on the
496/// LoopVectorizationLegality class to provide information about the induction
497/// and reduction variables that were found to a given vectorization factor.
499public:
503 ElementCount VecWidth, unsigned UnrollFactor,
505 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
506 VPlan &Plan)
507 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
508 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
511 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
512
513 virtual ~InnerLoopVectorizer() = default;
514
515 /// Creates a basic block for the scalar preheader. Both
516 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop override
517 /// the method to create additional blocks and checks needed for epilogue
518 /// vectorization.
520
521 /// Fix the vectorized code, taking care of header phi's, and more.
523
524 /// Fix the non-induction PHIs in \p Plan.
526
527 /// Returns the original loop trip count.
528 Value *getTripCount() const { return TripCount; }
529
530 /// Used to set the trip count after ILV's construction and after the
531 /// preheader block has been executed. Note that this always holds the trip
532 /// count of the original loop for both main loop and epilogue vectorization.
533 void setTripCount(Value *TC) { TripCount = TC; }
534
535protected:
537
538 /// Create and return a new IR basic block for the scalar preheader whose name
539 /// is prefixed with \p Prefix.
541
542 /// Allow subclasses to override and print debug traces before/after vplan
543 /// execution, when trace information is requested.
544 virtual void printDebugTracesAtStart() {}
545 virtual void printDebugTracesAtEnd() {}
546
547 /// The original loop.
549
550 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
551 /// dynamic knowledge to simplify SCEV expressions and converts them to a
552 /// more usable form.
554
555 /// Loop Info.
557
558 /// Dominator Tree.
560
561 /// Target Transform Info.
563
564 /// Assumption Cache.
566
567 /// The vectorization SIMD factor to use. Each vector will have this many
568 /// vector elements.
570
571 /// The vectorization unroll factor to use. Each scalar is vectorized to this
572 /// many different vector instructions.
573 unsigned UF;
574
575 /// The builder that we use
577
578 // --- Vectorization state ---
579
580 /// The vector-loop preheader.
582
583 /// Trip count of the original loop.
584 Value *TripCount = nullptr;
585
586 /// The profitability analysis.
588
589 /// BFI and PSI are used to check for profile guided size optimizations.
592
593 /// Structure to hold information about generated runtime checks, responsible
594 /// for cleaning the checks, if vectorization turns out unprofitable.
595 GeneratedRTChecks &RTChecks;
596
598
599 /// The vector preheader block of \p Plan, used as target for check blocks
600 /// introduced during skeleton creation.
602};
603
604/// Encapsulate information regarding vectorization of a loop and its epilogue.
605/// This information is meant to be updated and used across two stages of
606/// epilogue vectorization.
609 unsigned MainLoopUF = 0;
611 unsigned EpilogueUF = 0;
614 Value *TripCount = nullptr;
617
619 ElementCount EVF, unsigned EUF,
621 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
623 assert(EUF == 1 &&
624 "A high UF for the epilogue loop is likely not beneficial.");
625 }
626};
627
628/// An extension of the inner loop vectorizer that creates a skeleton for a
629/// vectorized loop that has its epilogue (residual) also vectorized.
630/// The idea is to run the VPlan on a given loop twice, first to set up the
631/// skeleton and vectorize the main loop, and then to complete the skeleton
632/// from the first step and vectorize the epilogue. This is achieved by
633/// deriving two concrete strategy classes from this base class and invoking
634/// them in succession from the loop vectorizer planner.
636public:
642 GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
643 ElementCount MinProfitableTripCount, unsigned UnrollFactor)
644 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
645 UnrollFactor, CM, BFI, PSI, Checks, Plan),
647
648 /// Holds and updates state information required to vectorize the main loop
649 /// and its epilogue in two separate passes. This setup helps us avoid
650 /// regenerating and recomputing runtime safety checks. It also helps us to
651 /// shorten the iteration-count-check path length for the cases where the
652 /// iteration count of the loop is so small that the main vector loop is
653 /// completely skipped.
655
656protected:
658};
659
660/// A specialized derived class of inner loop vectorizer that performs
661/// vectorization of *main* loops in the process of vectorizing loops and their
662/// epilogues.
664public:
672 GeneratedRTChecks &Check, VPlan &Plan)
674 BFI, PSI, Check, Plan, EPI.MainLoopVF,
675 EPI.MainLoopVF, EPI.MainLoopUF) {}
676 /// Implements the interface for creating a vectorized skeleton using the
677 /// *main loop* strategy (i.e., the first pass of VPlan execution).
679
680protected:
681 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
682 /// vector preheader and its predecessor, also connecting the new block to the
683 /// scalar preheader.
684 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
685
686 // Create a check to see if the main vector loop should be executed
688
689 /// Emits an iteration count bypass check once for the main loop (when \p
690 /// ForEpilogue is false) and once for the epilogue loop (when \p
691 /// ForEpilogue is true).
692 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
693 void printDebugTracesAtStart() override;
694 void printDebugTracesAtEnd() override;
695};
696
697// A specialized derived class of inner loop vectorizer that performs
698// vectorization of *epilogue* loops in the process of vectorizing loops and
699// their epilogues.
701 /// The additional bypass block which conditionally skips over the epilogue
702 /// loop after executing the main loop. Needed to resume inductions and
703 /// reductions during epilogue vectorization.
704 BasicBlock *AdditionalBypassBlock = nullptr;
705
706public:
712 GeneratedRTChecks &Checks, VPlan &Plan)
714 BFI, PSI, Checks, Plan, EPI.EpilogueVF,
715 EPI.EpilogueVF, EPI.EpilogueUF) {
717 }
718 /// Implements the interface for creating a vectorized skeleton using the
719 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
721
722 /// Return the additional bypass block which targets the scalar loop by
723 /// skipping the epilogue loop after completing the main loop.
724 BasicBlock *getAdditionalBypassBlock() const {
725 assert(AdditionalBypassBlock &&
726 "Trying to access AdditionalBypassBlock but it has not been set");
727 return AdditionalBypassBlock;
728 }
729
730protected:
731 /// Emits an iteration count bypass check after the main vector loop has
732 /// finished to see if there are any iterations left to execute by either
733 /// the vector epilogue or the scalar epilogue.
734 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
735 BasicBlock *Bypass,
736 BasicBlock *Insert);
737 void printDebugTracesAtStart() override;
738 void printDebugTracesAtEnd() override;
739};
740} // end namespace llvm
741
742/// Look for a meaningful debug location on the instruction or its operands.
744 if (!I)
745 return DebugLoc::getUnknown();
746
747 DebugLoc Empty;
748 if (I->getDebugLoc() != Empty)
749 return I->getDebugLoc();
750
751 for (Use &Op : I->operands()) {
752 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
753 if (OpInst->getDebugLoc() != Empty)
754 return OpInst->getDebugLoc();
755 }
756
757 return I->getDebugLoc();
758}
759
760/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
761/// is passed, the message relates to that particular instruction.
762#ifndef NDEBUG
763static void debugVectorizationMessage(const StringRef Prefix,
764 const StringRef DebugMsg,
765 Instruction *I) {
766 dbgs() << "LV: " << Prefix << DebugMsg;
767 if (I != nullptr)
768 dbgs() << " " << *I;
769 else
770 dbgs() << '.';
771 dbgs() << '\n';
772}
773#endif
774
775/// Create an analysis remark that explains why vectorization failed
776///
777/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
778/// RemarkName is the identifier for the remark. If \p I is passed it is an
779/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
780/// the location of the remark. If \p DL is passed, use it as debug location for
781/// the remark. \return the remark object that can be streamed to.
783createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
784 Instruction *I, DebugLoc DL = {}) {
785 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
786 // If debug location is attached to the instruction, use it. Otherwise if DL
787 // was not provided, use the loop's.
788 if (I && I->getDebugLoc())
789 DL = I->getDebugLoc();
790 else if (!DL)
791 DL = TheLoop->getStartLoc();
792
793 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
794}
795
796namespace llvm {
797
798/// Return a value for Step multiplied by VF.
800 int64_t Step) {
801 assert(Ty->isIntegerTy() && "Expected an integer step");
802 ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
803 assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
804 if (VF.isScalable() && isPowerOf2_64(Step)) {
805 return B.CreateShl(
806 B.CreateVScale(Ty),
807 ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true);
808 }
809 return B.CreateElementCount(Ty, VFxStep);
810}
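// As a sketch of the IR produced above for a scalable VF: with
// VF = <vscale x 4> and Step = 2 (an i64 step type is assumed), the
// power-of-two path emits a shift instead of a multiply:
//   %vscale = call i64 @llvm.vscale.i64()
//   %step   = shl nuw i64 %vscale, 3   ; vscale * (4 * 2)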
811
812/// Return the runtime value for VF.
814 return B.CreateElementCount(Ty, VF);
815}
816
818 const StringRef OREMsg, const StringRef ORETag,
819 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
820 Instruction *I) {
821 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
822 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
823 ORE->emit(
824 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
825 << "loop not vectorized: " << OREMsg);
826}
827
828/// Reports an informative message: print \p Msg for debugging purposes as well
829/// as an optimization remark. Uses either \p I as location of the remark, or
830/// otherwise \p TheLoop. If \p DL is passed, use it as the debug location for
831/// the remark.
832static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
834 Loop *TheLoop, Instruction *I = nullptr,
835 DebugLoc DL = {}) {
837 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
838 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
839 I, DL)
840 << Msg);
841}
842
843/// Report successful vectorization of the loop. In case an outer loop is
844/// vectorized, prepend "outer" to the vectorization remark.
846 VectorizationFactor VF, unsigned IC) {
848 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
849 nullptr));
850 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
851 ORE->emit([&]() {
852 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
853 TheLoop->getHeader())
854 << "vectorized " << LoopType << "loop (vectorization width: "
855 << ore::NV("VectorizationFactor", VF.Width)
856 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
857 });
858}
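// The emitted remark reads roughly as follows (an illustrative rendering, not
// verbatim compiler output):
//   remark: vectorized loop (vectorization width: 4, interleaved count: 2)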
859
860} // end namespace llvm
861
862namespace llvm {
863
864// Loop vectorization cost-model hints how the scalar epilogue loop should be
865// lowered.
867
868 // The default: allowing scalar epilogues.
870
871 // Vectorization with OptForSize: don't allow epilogues.
873
874 // A special case of vectorisation with OptForSize: loops with a very small
875 // trip count are considered for vectorization under OptForSize, thereby
876 // making sure the cost of their loop body is dominant, free of runtime
877 // guards and scalar iteration overheads.
879
880 // Loop hint predicate indicating an epilogue is undesired.
882
883 // Directive indicating we must either tail fold or not vectorize
886
887/// LoopVectorizationCostModel - estimates the expected speedups due to
888/// vectorization.
889/// In many cases vectorization is not profitable. This can happen because of
890/// a number of reasons. In this class we mainly attempt to predict the
891/// expected speedup/slowdowns due to the supported instruction set. We use the
892/// TargetTransformInfo to query the different backends for the cost of
893/// different operations.
896
897public:
908 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
909 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
910 Hints(Hints), InterleaveInfo(IAI) {
912 initializeVScaleForTuning();
914 // Query this against the original loop and save it here because the profile
915 // of the original loop header may change as the transformation happens.
916 OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
918 }
919
920 /// \return An upper bound for the vectorization factors (both fixed and
921 /// scalable). If the factors are 0, vectorization and interleaving should be
922 /// avoided up front.
923 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
924
925 /// \return True if runtime checks are required for vectorization, and false
926 /// otherwise.
928
929 /// Setup cost-based decisions for user vectorization factor.
930 /// \return true if the UserVF is a feasible VF to be chosen.
933 return expectedCost(UserVF).isValid();
934 }
935
936 /// \return True if maximizing vector bandwidth is enabled by the target or
937 /// user options, for the given register kind.
939
940 /// \return True if register pressure should be calculated for the given VF.
942
943 /// \return The size (in bits) of the smallest and widest types in the code
944 /// that needs to be vectorized. We ignore values that remain scalar such as
945 /// 64 bit loop indices.
946 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
947
948 /// Memory access instruction may be vectorized in more than one way.
949 /// Form of instruction after vectorization depends on cost.
950 /// This function takes cost-based decisions for Load/Store instructions
951 /// and collects them in a map. This decisions map is used for building
952 /// the lists of loop-uniform and loop-scalar instructions.
953 /// The calculated cost is saved with widening decision in order to
954 /// avoid redundant calculations.
956
957 /// A call may be vectorized in different ways depending on whether we have
958 /// vectorized variants available and whether the target supports masking.
959 /// This function analyzes all calls in the function at the supplied VF,
960 /// makes a decision based on the costs of available options, and stores that
961 /// decision in a map for use in planning and plan execution.
963
964 /// Collect values we want to ignore in the cost model.
966
967 /// Collect all element types in the loop for which widening is needed.
969
970 /// Split reductions into those that happen in the loop, and those that happen
971 /// outside. In-loop reductions are collected into InLoopReductions.
973
974 /// Returns true if we should use strict in-order reductions for the given
975 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
976 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
977 /// of FP operations.
978 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
979 return !Hints->allowReordering() && RdxDesc.isOrdered();
980 }
981
982 /// \returns The smallest bitwidth each instruction can be represented with.
983 /// The vector equivalents of these instructions should be truncated to this
984 /// type.
986 return MinBWs;
987 }
988
989 /// \returns True if it is more profitable to scalarize instruction \p I for
990 /// vectorization factor \p VF.
992 assert(VF.isVector() &&
993 "Profitable to scalarize relevant only for VF > 1.");
994 assert(
995 TheLoop->isInnermost() &&
996 "cost-model should not be used for outer loops (in VPlan-native path)");
997
998 auto Scalars = InstsToScalarize.find(VF);
999 assert(Scalars != InstsToScalarize.end() &&
1000 "VF not yet analyzed for scalarization profitability");
1001 return Scalars->second.contains(I);
1002 }
1003
1004 /// Returns true if \p I is known to be uniform after vectorization.
1006 assert(
1007 TheLoop->isInnermost() &&
1008 "cost-model should not be used for outer loops (in VPlan-native path)");
1009 // Pseudo probes need to be duplicated for each unrolled iteration and
1010 // vector lane so that the profiled loop trip count can be accurately
1011 // accumulated instead of being undercounted.
1012 if (isa<PseudoProbeInst>(I))
1013 return false;
1014
1015 if (VF.isScalar())
1016 return true;
1017
1018 auto UniformsPerVF = Uniforms.find(VF);
1019 assert(UniformsPerVF != Uniforms.end() &&
1020 "VF not yet analyzed for uniformity");
1021 return UniformsPerVF->second.count(I);
1022 }
1023
1024 /// Returns true if \p I is known to be scalar after vectorization.
1026 assert(
1027 TheLoop->isInnermost() &&
1028 "cost-model should not be used for outer loops (in VPlan-native path)");
1029 if (VF.isScalar())
1030 return true;
1031
1032 auto ScalarsPerVF = Scalars.find(VF);
1033 assert(ScalarsPerVF != Scalars.end() &&
1034 "Scalar values are not calculated for VF");
1035 return ScalarsPerVF->second.count(I);
1036 }
1037
1038 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1039 /// for vectorization factor \p VF.
1041 return VF.isVector() && MinBWs.contains(I) &&
1042 !isProfitableToScalarize(I, VF) &&
1044 }
1045
1046 /// Decision that was taken during cost calculation for memory instruction.
1049 CM_Widen, // For consecutive accesses with stride +1.
1050 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1057
1058 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1059 /// instruction \p I and vector width \p VF.
1062 assert(VF.isVector() && "Expected VF >=2");
1063 WideningDecisions[{I, VF}] = {W, Cost};
1064 }
1065
1066 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1067 /// interleaving group \p Grp and vector width \p VF.
1071 assert(VF.isVector() && "Expected VF >=2");
1072 /// Broadcast this decision to all instructions inside the group.
1073 /// When interleaving, the cost will only be assigned to one instruction, the
1074 /// insert position. For other cases, add the appropriate fraction of the
1075 /// total cost to each instruction. This ensures accurate costs are used,
1076 /// even if the insert position instruction is not used.
1077 InstructionCost InsertPosCost = Cost;
1078 InstructionCost OtherMemberCost = 0;
1079 if (W != CM_Interleave)
1080 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1082 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1083 if (auto *I = Grp->getMember(Idx)) {
1084 if (Grp->getInsertPos() == I)
1085 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1086 else
1087 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1088 }
1089 }
1090 }
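  // For example (a cost-model sketch): a fully widened interleave group with
  // 4 members and a total cost of 8 records a cost of 2 on every member,
  // whereas a CM_Interleave decision records the full cost of 8 on the
  // insert-position member and 0 on the others.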
1091
1092 /// Return the cost model decision for the given instruction \p I and vector
1093 /// width \p VF. Return CM_Unknown if this instruction did not pass
1094 /// through the cost modeling.
1096 assert(VF.isVector() && "Expected VF to be a vector VF");
1097 assert(
1098 TheLoop->isInnermost() &&
1099 "cost-model should not be used for outer loops (in VPlan-native path)");
1100
1101 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1102 auto Itr = WideningDecisions.find(InstOnVF);
1103 if (Itr == WideningDecisions.end())
1104 return CM_Unknown;
1105 return Itr->second.first;
1106 }
1107
1108 /// Return the vectorization cost for the given instruction \p I and vector
1109 /// width \p VF.
1111 assert(VF.isVector() && "Expected VF >=2");
1112 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1113 assert(WideningDecisions.contains(InstOnVF) &&
1114 "The cost is not calculated");
1115 return WideningDecisions[InstOnVF].second;
1116 }
1117
1122 std::optional<unsigned> MaskPos;
1124 };
1125
1127 Function *Variant, Intrinsic::ID IID,
1128 std::optional<unsigned> MaskPos,
1130 assert(!VF.isScalar() && "Expected vector VF");
1131 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1132 }
1133
1135 ElementCount VF) const {
1136 assert(!VF.isScalar() && "Expected vector VF");
1137 auto I = CallWideningDecisions.find({CI, VF});
1138 if (I == CallWideningDecisions.end())
1139 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
1140 return I->second;
1141 }
1142
1143 /// Return True if instruction \p I is an optimizable truncate whose operand
1144 /// is an induction variable. Such a truncate will be removed by adding a new
1145 /// induction variable with the destination type.
1147 // If the instruction is not a truncate, return false.
1148 auto *Trunc = dyn_cast<TruncInst>(I);
1149 if (!Trunc)
1150 return false;
1151
1152 // Get the source and destination types of the truncate.
1153 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1154 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1155
1156 // If the truncate is free for the given types, return false. Replacing a
1157 // free truncate with an induction variable would add an induction variable
1158 // update instruction to each iteration of the loop. We exclude from this
1159 // check the primary induction variable since it will need an update
1160 // instruction regardless.
1161 Value *Op = Trunc->getOperand(0);
1162 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1163 return false;
1164
1165 // If the truncated value is not an induction variable, return false.
1166 return Legal->isInductionPhi(Op);
1167 }
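  // For instance (assuming %iv is a non-primary i64 induction and the
  // i64->i32 truncate is not free on the target), a pattern such as
  //   %t = trunc i64 %iv to i32
  // is optimizable: the truncate can be replaced by a new i32 induction
  // variable instead of truncating on every iteration.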
1168
1169 /// Collects the instructions to scalarize for each predicated instruction in
1170 /// the loop.
1172
1173 /// Collect values that will not be widened, including Uniforms, Scalars, and
1174 /// Instructions to Scalarize for the given \p VF.
1175 /// The sets depend on CM decision for Load/Store instructions
1176 /// that may be vectorized as interleave, gather-scatter or scalarized.
1177 /// Also make a decision on what to do about call instructions in the loop
1178 /// at that VF -- scalarize, call a known vector routine, or call a
1179 /// vector intrinsic.
1181 // Do the analysis once.
1182 if (VF.isScalar() || Uniforms.contains(VF))
1183 return;
1185 collectLoopUniforms(VF);
1187 collectLoopScalars(VF);
1189 }
1190
1191 /// Returns true if the target machine supports masked store operation
1192 /// for the given \p DataType and kind of access to \p Ptr.
1193 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1194 unsigned AddressSpace) const {
1195 return Legal->isConsecutivePtr(DataType, Ptr) &&
1196 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1197 }
1198
1199 /// Returns true if the target machine supports masked load operation
1200 /// for the given \p DataType and kind of access to \p Ptr.
1201 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1202 unsigned AddressSpace) const {
1203 return Legal->isConsecutivePtr(DataType, Ptr) &&
1204 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1205 }
1206
1207 /// Returns true if the target machine can represent \p V as a masked gather
1208 /// or scatter operation.
1210 bool LI = isa<LoadInst>(V);
1211 bool SI = isa<StoreInst>(V);
1212 if (!LI && !SI)
1213 return false;
1214 auto *Ty = getLoadStoreType(V);
1216 if (VF.isVector())
1217 Ty = VectorType::get(Ty, VF);
1218 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1219 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1220 }
1221
1222 /// Returns true if the target machine supports all of the reduction
1223 /// variables found for the given VF.
1225 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1226 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1227 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1228 }));
1229 }
1230
1231 /// Given costs for both strategies, return true if the scalar predication
1232 /// lowering should be used for div/rem. This incorporates an override
1233 /// option so it is not simply a cost comparison.
1235 InstructionCost SafeDivisorCost) const {
1236 switch (ForceSafeDivisor) {
1237 case cl::BOU_UNSET:
1238 return ScalarCost < SafeDivisorCost;
1239 case cl::BOU_TRUE:
1240 return false;
1241 case cl::BOU_FALSE:
1242 return true;
1243 }
1244 llvm_unreachable("impossible case value");
1245 }
1246
1247 /// Returns true if \p I is an instruction which requires predication and
1248 /// for which our chosen predication strategy is scalarization (i.e. we
1249 /// don't have an alternate strategy such as masking available).
1250 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1252
1253 /// Returns true if \p I is an instruction that needs to be predicated
1254 /// at runtime. The result is independent of the predication mechanism.
1255 /// Superset of instructions that return true for isScalarWithPredication.
1256 bool isPredicatedInst(Instruction *I) const;
1257
1258 /// Return the costs for our two available strategies for lowering a
1259 /// div/rem operation which requires speculating at least one lane.
1260 /// First result is for scalarization (will be invalid for scalable
1261 /// vectors); second is for the safe-divisor strategy.
1262 std::pair<InstructionCost, InstructionCost>
1264 ElementCount VF) const;
1265
1266 /// Returns true if \p I is a memory instruction with consecutive memory
1267 /// access that can be widened.
1269
1270 /// Returns true if \p I is a memory instruction in an interleaved-group
1271 /// of memory accesses that can be vectorized with wide vector loads/stores
1272 /// and shuffles.
1274
1275 /// Check if \p Instr belongs to any interleaved access group.
1277 return InterleaveInfo.isInterleaved(Instr);
1278 }
1279
1280 /// Get the interleaved access group that \p Instr belongs to.
1283 return InterleaveInfo.getInterleaveGroup(Instr);
1284 }
1285
1286 /// Returns true if we're required to use a scalar epilogue for at least
1287 /// the final iteration of the original loop.
1288 bool requiresScalarEpilogue(bool IsVectorizing) const {
1289 if (!isScalarEpilogueAllowed()) {
1290 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1291 return false;
1292 }
1293 // If we might exit from anywhere but the latch and early exit vectorization
1294 // is disabled, we must run the exiting iteration in scalar form.
1297 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1298 "from latch block\n");
1299 return true;
1300 }
1301 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1302 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1303 "interleaved group requires scalar epilogue\n");
1304 return true;
1305 }
1306 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1307 return false;
1308 }
1309
1310 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1311 /// loop hint annotation.
1313 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1314 }
1315
1316 /// Returns the TailFoldingStyle that is best for the current loop.
1317 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1318 if (!ChosenTailFoldingStyle)
1320 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1321 : ChosenTailFoldingStyle->second;
1322 }
1323
1324 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1325 /// overflow or not.
1326 /// \param IsScalableVF true if scalable vector factors enabled.
1327 /// \param UserIC User specific interleave count.
1328 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1329 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1330 if (!Legal->canFoldTailByMasking()) {
1331 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1332 return;
1333 }
1334
1335 // Default to TTI preference, but allow command line override.
1336 ChosenTailFoldingStyle = {
1337 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1338 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1339 if (ForceTailFoldingStyle.getNumOccurrences())
1340 ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1341 ForceTailFoldingStyle.getValue()};
1342
1343 if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
1344 ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
1345 return;
1346 // Override EVL styles if needed.
1347 // FIXME: Investigate opportunity for fixed vector factor.
1348 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1350 if (EVLIsLegal)
1351 return;
1352 // If for some reason EVL mode is unsupported, fall back to a scalar epilogue
1353 // if it's allowed, or DataWithoutLaneMask otherwise.
1354 if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
1355 ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
1356 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1357 else
1358 ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1360
1361 LLVM_DEBUG(
1362 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1363 "not try to generate VP Intrinsics "
1364 << (UserIC > 1
1365 ? "since interleave count specified is greater than 1.\n"
1366 : "due to non-interleaving reasons.\n"));
1367 }
1368
1369 /// Returns true if all loop blocks should be masked to fold tail loop.
1370 bool foldTailByMasking() const {
1371 // TODO: check if it is possible to check for None style independent of
1372 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1374 }
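  // For example, with VF = 4 and a trip count of 10, folding the tail by
  // masking executes 3 masked vector iterations (lanes beyond the trip count
  // are disabled by the mask) instead of 2 vector iterations plus a
  // 2-iteration scalar epilogue.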
1375
1376 /// Return maximum safe number of elements to be processed per vector
1377 /// iteration, which do not prevent store-load forwarding and are safe with
1378 /// regard to the memory dependencies. Required for EVL-based VPlans to
1379 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1380 /// MaxSafeElements).
1381 /// TODO: need to consider adjusting cost model to use this value as a
1382 /// vectorization factor for EVL-based vectorization.
1383 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1384
1385 /// Returns true if the instructions in this block require predication
1386 /// for any reason, e.g. because tail folding now requires a predicate
1387 /// or because the block in the original loop was predicated.
1390 }
1391
1392 /// Returns true if VP intrinsics with explicit vector length support should
1393 /// be generated in the tail folded loop.
1394 bool foldTailWithEVL() const {
1396 }
1397
1398 /// Returns true if the Phi is part of an inloop reduction.
1399 bool isInLoopReduction(PHINode *Phi) const {
1400 return InLoopReductions.contains(Phi);
1401 }
1402
1403 /// Returns true if the predicated reduction select should be used to set the
1404 /// incoming value for the reduction phi.
1406 // Force to use predicated reduction select since the EVL of the
1407 // second-to-last iteration might not be VF*UF.
1408 if (foldTailWithEVL())
1409 return true;
1412 }
1413
1414 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1415 /// with factor VF. Return the cost of the instruction, including
1416 /// scalarization overhead if it's needed.
1418
1419 /// Estimate cost of a call instruction CI if it were vectorized with factor
1420 /// VF. Return the cost of the instruction, including scalarization overhead
1421 /// if it's needed.
1423
1424 /// Invalidates decisions already taken by the cost model.
1426 WideningDecisions.clear();
1427 CallWideningDecisions.clear();
1428 Uniforms.clear();
1429 Scalars.clear();
1430 }
1431
1432 /// Returns the expected execution cost. The unit of the cost does
1433 /// not matter because we use the 'cost' units to compare different
1434 /// vector widths. The cost that is returned is *not* normalized by
1435 /// the factor width.
1437
1438 bool hasPredStores() const { return NumPredStores > 0; }
1439
1440 /// Returns true if epilogue vectorization is considered profitable, and
1441 /// false otherwise.
1442 /// \p VF is the vectorization factor chosen for the original loop.
1443 /// \p Multiplier is an additional scaling factor applied to VF before
1444 /// comparing to EpilogueVectorizationMinVF.
1446 const unsigned IC) const;
1447
1448 /// Returns the execution time cost of an instruction for a given vector
1449 /// width. Vector width of one means scalar.
1451
1452 /// Return the cost of instructions in an inloop reduction pattern, if I is
1453 /// part of that pattern.
1454 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1455 ElementCount VF,
1456 Type *VectorTy) const;
1457
1458 /// Returns true if \p Op should be considered invariant and if it is
1459 /// trivially hoistable.
1461
1462 /// Return the value of vscale used for tuning the cost model.
1463 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1464
1465private:
1466 unsigned NumPredStores = 0;
1467
1468 /// Used to store the value of vscale used for tuning the cost model. It is
1469 /// initialized during object construction.
1470 std::optional<unsigned> VScaleForTuning;
1471
1472 /// Initializes the value of vscale used for tuning the cost model. If
1473 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1474 /// return the value returned by the corresponding TTI method.
1475 void initializeVScaleForTuning() {
1476 const Function *Fn = TheLoop->getHeader()->getParent();
1477 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1478 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1479 auto Min = Attr.getVScaleRangeMin();
1480 auto Max = Attr.getVScaleRangeMax();
1481 if (Max && Min == Max) {
1482 VScaleForTuning = Max;
1483 return;
1484 }
1485 }
1486
1487 VScaleForTuning = TTI.getVScaleForTuning();
1488 }
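  // For example, a function carrying vscale_range(2,2) tunes the cost model
  // with vscale = 2; with vscale_range(1,16) the bounds disagree, so the
  // value falls back to TTI.getVScaleForTuning().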
1489
1490 /// \return An upper bound for the vectorization factors for both
1491 /// fixed and scalable vectorization, where the minimum-known number of
1492 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1493 /// disabled or unsupported, then the scalable part will be equal to
1494 /// ElementCount::getScalable(0).
1495 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1496 ElementCount UserVF,
1497 bool FoldTailByMasking);
1498
1499 /// If \p VF > MaxTripcount, clamps it to the next lower VF that is <=
1500 /// MaxTripCount.
1501 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
1502 bool FoldTailByMasking) const;
1503
1504 /// \return the maximized element count based on the targets vector
1505 /// registers and the loop trip-count, but limited to a maximum safe VF.
1506 /// This is a helper function of computeFeasibleMaxVF.
1507 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1508 unsigned SmallestType,
1509 unsigned WidestType,
1510 ElementCount MaxSafeVF,
1511 bool FoldTailByMasking);
1512
1513 /// Checks if scalable vectorization is supported and enabled. Caches the
1514 /// result to avoid repeated debug dumps for repeated queries.
1515 bool isScalableVectorizationAllowed();
1516
1517 /// \return the maximum legal scalable VF, based on the safe max number
1518 /// of elements.
1519 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1520
1521 /// Calculate vectorization cost of memory instruction \p I.
1522 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1523
1524 /// The cost computation for scalarized memory instruction.
1525 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1526
1527 /// The cost computation for interleaving group of memory instructions.
1528 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1529
1530 /// The cost computation for Gather/Scatter instruction.
1531 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1532
1533 /// The cost computation for widening instruction \p I with consecutive
1534 /// memory access.
1535 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1536
1537 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1538 /// Load: scalar load + broadcast.
1539 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1540 /// element)
1541 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1542
1543 /// Estimate the overhead of scalarizing an instruction. This is a
1544 /// convenience wrapper for the type-based getScalarizationOverhead API.
1545 InstructionCost getScalarizationOverhead(Instruction *I,
1546 ElementCount VF) const;
1547
1548 /// Returns true if an artificially high cost for emulated masked memrefs
1549 /// should be used.
1550 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1551
1552 /// Map of scalar integer values to the smallest bitwidth they can be legally
1553 /// represented as. The vector equivalents of these values should be truncated
1554 /// to this type.
1556
1557 /// A type representing the costs for instructions if they were to be
1558 /// scalarized rather than vectorized. The entries are Instruction-Cost
1559 /// pairs.
1560 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1561
1562 /// A set containing all BasicBlocks that are known to be present after
1563 /// vectorization as a predicated block.
1565 PredicatedBBsAfterVectorization;
1566
1567 /// Records whether it is allowed to have the original scalar loop execute at
1568 /// least once. This may be needed as a fallback loop in case runtime
1569 /// aliasing/dependence checks fail, or to handle the tail/remainder
1570 /// iterations when the trip count is unknown or doesn't divide by the VF,
1571 /// or as a peel-loop to handle gaps in interleave-groups.
1572 /// Under optsize and when the trip count is very small we don't allow any
1573 /// iterations to execute in the scalar loop.
1574 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1575
1576 /// Controls the finally chosen tail-folding style. The first element is used
1577 /// if the IV update may overflow, the second if it does not.
1578 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1579 ChosenTailFoldingStyle;
1580
1581 /// true if scalable vectorization is supported and enabled.
1582 std::optional<bool> IsScalableVectorizationAllowed;
1583
1584 /// Maximum safe number of elements to be processed per vector iteration,
1585 /// which do not prevent store-load forwarding and are safe with regard to the
1586 /// memory dependencies. Required for EVL-based vectorization, where this
1587 /// value is used as the upper bound of the safe AVL.
1588 std::optional<unsigned> MaxSafeElements;
1589
1590 /// A map holding scalar costs for different vectorization factors. The
1591 /// presence of a cost for an instruction in the mapping indicates that the
1592 /// instruction will be scalarized when vectorizing with the associated
1593 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1595
1596 /// Holds the instructions known to be uniform after vectorization.
1597 /// The data is collected per VF.
1599
1600 /// Holds the instructions known to be scalar after vectorization.
1601 /// The data is collected per VF.
1603
1604 /// Holds the instructions (address computations) that are forced to be
1605 /// scalarized.
1607
1608 /// PHINodes of the reductions that should be expanded in-loop.
1609 SmallPtrSet<PHINode *, 4> InLoopReductions;
1610
1611 /// A Map of inloop reduction operations and their immediate chain operand.
1612 /// FIXME: This can be removed once reductions can be costed correctly in
1613 /// VPlan. This was added to allow quick lookup of the inloop operations.
1614 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1615
1616 /// Returns the expected difference in cost from scalarizing the expression
1617 /// feeding a predicated instruction \p PredInst. The instructions to
1618 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1619 /// non-negative return value implies the expression will be scalarized.
1620 /// Currently, only single-use chains are considered for scalarization.
1621 InstructionCost computePredInstDiscount(Instruction *PredInst,
1622 ScalarCostsTy &ScalarCosts,
1623 ElementCount VF);
1624
1625 /// Collect the instructions that are uniform after vectorization. An
1626 /// instruction is uniform if we represent it with a single scalar value in
1627 /// the vectorized loop corresponding to each vector iteration. Examples of
1628 /// uniform instructions include pointer operands of consecutive or
1629 /// interleaved memory accesses. Note that although uniformity implies an
1630 /// instruction will be scalar, the reverse is not true. In general, a
1631 /// scalarized instruction will be represented by VF scalar values in the
1632 /// vectorized loop, each corresponding to an iteration of the original
1633 /// scalar loop.
1634 void collectLoopUniforms(ElementCount VF);
1635
1636 /// Collect the instructions that are scalar after vectorization. An
1637 /// instruction is scalar if it is known to be uniform or will be scalarized
1638 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1639 /// to the list if they are used by a load/store instruction that is marked as
1640 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1641 /// VF values in the vectorized loop, each corresponding to an iteration of
1642 /// the original scalar loop.
1643 void collectLoopScalars(ElementCount VF);
1644
1645 /// Keeps cost model vectorization decision and cost for instructions.
1646 /// Right now it is used for memory instructions only.
1648 std::pair<InstWidening, InstructionCost>>;
1649
1650 DecisionList WideningDecisions;
1651
1652 using CallDecisionList =
1653 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1654
1655 CallDecisionList CallWideningDecisions;
1656
1657 /// Returns true if \p V is expected to be vectorized and it needs to be
1658 /// extracted.
1659 bool needsExtract(Value *V, ElementCount VF) const {
1660 Instruction *I = dyn_cast<Instruction>(V);
1661 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1664 (isa<CallInst>(I) &&
1665 getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
1666 return false;
1667
1668 // Assume we can vectorize V (and hence we need extraction) if the
1669 // scalars are not computed yet. This can happen, because it is called
1670 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1671 // the scalars are collected. That should be a safe assumption in most
1672 // cases, because we check if the operands have vectorizable types
1673 // beforehand in LoopVectorizationLegality.
1674 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1675 };
1676
1677 /// Returns a range containing only operands needing to be extracted.
1678 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1679 ElementCount VF) const {
1680
1681 SmallPtrSet<const Value *, 4> UniqueOperands;
1683 for (Value *Op : Ops) {
1684 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1685 !needsExtract(Op, VF))
1686 continue;
1687 Res.push_back(Op);
1688 }
1689 return Res;
1690 }
1691
1692public:
1693 /// The loop that we evaluate.
1695
1696 /// Predicated scalar evolution analysis.
1698
1699 /// Loop Info analysis.
1701
1702 /// Vectorization legality.
1704
1705 /// Vector target information.
1707
1708 /// Target Library Info.
1710
1711 /// Demanded bits analysis.
1713
1714 /// Assumption cache.
1716
1717 /// Interface to emit optimization remarks.
1719
1721
1722 /// Loop Vectorize Hint.
1724
1725 /// The interleaved access information contains groups of interleaved accesses
1726 /// that have the same stride and are close to each other.
1728
1729 /// Values to ignore in the cost model.
1731
1732 /// Values to ignore in the cost model when VF > 1.
1734
1735 /// All element types found in the loop.
1737
1738 /// The kind of cost that we are calculating.
1740
1741 /// Whether this loop should be optimized for size based on function attribute
1742 /// or profile information.
1744
1745 /// The highest VF possible for this loop, without using MaxBandwidth.
1747};
1748} // end namespace llvm
1749
1750namespace {
1751/// Helper struct to manage generating runtime checks for vectorization.
1752///
1753 /// The runtime checks are created up-front in temporary blocks, un-linked from
1754 /// the existing IR, to allow estimating their cost more accurately. After
1755 /// deciding to vectorize, the checks are moved back. If deciding not to
1756 /// vectorize, the temporary blocks are completely removed.
1757class GeneratedRTChecks {
1758 /// Basic block which contains the generated SCEV checks, if any.
1759 BasicBlock *SCEVCheckBlock = nullptr;
1760
1761 /// The value representing the result of the generated SCEV checks. If it is
1762 /// nullptr no SCEV checks have been generated.
1763 Value *SCEVCheckCond = nullptr;
1764
1765 /// Basic block which contains the generated memory runtime checks, if any.
1766 BasicBlock *MemCheckBlock = nullptr;
1767
1768 /// The value representing the result of the generated memory runtime checks.
1769 /// If it is nullptr no memory runtime checks have been generated.
1770 Value *MemRuntimeCheckCond = nullptr;
1771
1772 DominatorTree *DT;
1773 LoopInfo *LI;
1775
1776 SCEVExpander SCEVExp;
1777 SCEVExpander MemCheckExp;
1778
1779 bool CostTooHigh = false;
1780
1781 Loop *OuterLoop = nullptr;
1782
1784
1785 /// The kind of cost that we are calculating.
1787
1788public:
1789 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1792 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1793 MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
1794 CostKind(CostKind) {}
1795
1796 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1797 /// accurately estimate the cost of the runtime checks. The blocks are
1798 /// un-linked from the IR and are added back during vector code generation. If
1799 /// there is no vector code generation, the check blocks are removed
1800 /// completely.
1801 void create(Loop *L, const LoopAccessInfo &LAI,
1802 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1803
1804 // Hard cutoff to limit compile-time increase in case a very large number of
1805 // runtime checks needs to be generated.
1806 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1807 // profile info.
1808 CostTooHigh =
1810 if (CostTooHigh)
1811 return;
1812
1813 BasicBlock *LoopHeader = L->getHeader();
1814 BasicBlock *Preheader = L->getLoopPreheader();
1815
1816 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1817 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1818 // may be used by SCEVExpander. The blocks will be un-linked from their
1819 // predecessors and removed from LI & DT at the end of the function.
1820 if (!UnionPred.isAlwaysTrue()) {
1821 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1822 nullptr, "vector.scevcheck");
1823
1824 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1825 &UnionPred, SCEVCheckBlock->getTerminator());
1826 if (isa<Constant>(SCEVCheckCond)) {
1827 // Clean up directly after expanding the predicate to a constant, to
1828 // avoid further expansions re-using anything left over from SCEVExp.
1829 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1830 SCEVCleaner.cleanup();
1831 }
1832 }
1833
1834 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1835 if (RtPtrChecking.Need) {
1836 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1837 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1838 "vector.memcheck");
1839
1840 auto DiffChecks = RtPtrChecking.getDiffChecks();
1841 if (DiffChecks) {
1842 Value *RuntimeVF = nullptr;
1843 MemRuntimeCheckCond = addDiffRuntimeChecks(
1844 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1845 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1846 if (!RuntimeVF)
1847 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1848 return RuntimeVF;
1849 },
1850 IC);
1851 } else {
1852 MemRuntimeCheckCond = addRuntimeChecks(
1853 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1855 }
1856 assert(MemRuntimeCheckCond &&
1857 "no RT checks generated although RtPtrChecking "
1858 "claimed checks are required");
1859 }
1860
1861 if (!MemCheckBlock && !SCEVCheckBlock)
1862 return;
1863
1864 // Unhook the temporary block with the checks, update various places
1865 // accordingly.
1866 if (SCEVCheckBlock)
1867 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1868 if (MemCheckBlock)
1869 MemCheckBlock->replaceAllUsesWith(Preheader);
1870
1871 if (SCEVCheckBlock) {
1872 SCEVCheckBlock->getTerminator()->moveBefore(
1873 Preheader->getTerminator()->getIterator());
1874 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1875 UI->setDebugLoc(DebugLoc::getTemporary());
1876 Preheader->getTerminator()->eraseFromParent();
1877 }
1878 if (MemCheckBlock) {
1879 MemCheckBlock->getTerminator()->moveBefore(
1880 Preheader->getTerminator()->getIterator());
1881 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1882 UI->setDebugLoc(DebugLoc::getTemporary());
1883 Preheader->getTerminator()->eraseFromParent();
1884 }
1885
1886 DT->changeImmediateDominator(LoopHeader, Preheader);
1887 if (MemCheckBlock) {
1888 DT->eraseNode(MemCheckBlock);
1889 LI->removeBlock(MemCheckBlock);
1890 }
1891 if (SCEVCheckBlock) {
1892 DT->eraseNode(SCEVCheckBlock);
1893 LI->removeBlock(SCEVCheckBlock);
1894 }
1895
1896 // Outer loop is used as part of the later cost calculations.
1897 OuterLoop = L->getParentLoop();
1898 }
1899
1901 if (SCEVCheckBlock || MemCheckBlock)
1902 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1903
1904 if (CostTooHigh) {
1906 Cost.setInvalid();
1907 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1908 return Cost;
1909 }
1910
1911 InstructionCost RTCheckCost = 0;
1912 if (SCEVCheckBlock)
1913 for (Instruction &I : *SCEVCheckBlock) {
1914 if (SCEVCheckBlock->getTerminator() == &I)
1915 continue;
1917 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1918 RTCheckCost += C;
1919 }
1920 if (MemCheckBlock) {
1921 InstructionCost MemCheckCost = 0;
1922 for (Instruction &I : *MemCheckBlock) {
1923 if (MemCheckBlock->getTerminator() == &I)
1924 continue;
1926 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1927 MemCheckCost += C;
1928 }
1929
1930 // If the runtime memory checks are being created inside an outer loop
1931 // we should find out if these checks are outer loop invariant. If so,
1932 // the checks will likely be hoisted out and so the effective cost will
1933 // reduce according to the outer loop trip count.
1934 if (OuterLoop) {
1935 ScalarEvolution *SE = MemCheckExp.getSE();
1936 // TODO: If profitable, we could refine this further by analysing every
1937 // individual memory check, since there could be a mixture of loop
1938 // variant and invariant checks that mean the final condition is
1939 // variant.
1940 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1941 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1942 // It seems reasonable to assume that we can reduce the effective
1943 // cost of the checks even when we know nothing about the trip
1944 // count. Assume that the outer loop executes at least twice.
1945 unsigned BestTripCount = 2;
1946
1947 // Get the best known TC estimate.
1948 if (auto EstimatedTC = getSmallBestKnownTC(
1949 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1950 if (EstimatedTC->isFixed())
1951 BestTripCount = EstimatedTC->getFixedValue();
1952
1953 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1954
1955 // Let's ensure the cost is always at least 1.
1956 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
1958
1959 if (BestTripCount > 1)
1961 << "We expect runtime memory checks to be hoisted "
1962 << "out of the outer loop. Cost reduced from "
1963 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1964
1965 MemCheckCost = NewMemCheckCost;
1966 }
1967 }
1968
1969 RTCheckCost += MemCheckCost;
1970 }
1971
1972 if (SCEVCheckBlock || MemCheckBlock)
1973 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1974 << "\n");
1975
1976 return RTCheckCost;
1977 }
1978
1979 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1980 /// unused.
1981 ~GeneratedRTChecks() {
1982 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1983 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1984 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
1985 bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
1986 if (SCEVChecksUsed)
1987 SCEVCleaner.markResultUsed();
1988
1989 if (MemChecksUsed) {
1990 MemCheckCleaner.markResultUsed();
1991 } else {
1992 auto &SE = *MemCheckExp.getSE();
1993 // Memory runtime check generation creates compares that use expanded
1994 // values. Remove them before running the SCEVExpanderCleaners.
1995 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1996 if (MemCheckExp.isInsertedInstruction(&I))
1997 continue;
1998 SE.forgetValue(&I);
1999 I.eraseFromParent();
2000 }
2001 }
2002 MemCheckCleaner.cleanup();
2003 SCEVCleaner.cleanup();
2004
2005 if (!SCEVChecksUsed)
2006 SCEVCheckBlock->eraseFromParent();
2007 if (!MemChecksUsed)
2008 MemCheckBlock->eraseFromParent();
2009 }
2010
2011 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2012 /// outside VPlan.
2013 std::pair<Value *, BasicBlock *> getSCEVChecks() {
2014 using namespace llvm::PatternMatch;
2015 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2016 return {nullptr, nullptr};
2017
2018 return {SCEVCheckCond, SCEVCheckBlock};
2019 }
2020
2021 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2022 /// outside VPlan.
2023 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
2024 using namespace llvm::PatternMatch;
2025 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
2026 return {nullptr, nullptr};
2027 return {MemRuntimeCheckCond, MemCheckBlock};
2028 }
2029
2030 /// Return true if any runtime checks have been added.
2031 bool hasChecks() const {
2032 using namespace llvm::PatternMatch;
2033 return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) ||
2034 MemRuntimeCheckCond;
2035 }
2036};
2037} // namespace
2038
2040 return Style == TailFoldingStyle::Data ||
2043}
2044
2046 return Style == TailFoldingStyle::DataAndControlFlow ||
2048}
2049
2050// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2051// vectorization. The loop needs to be annotated with #pragma omp simd
2052 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2053// vector length information is not provided, vectorization is not considered
2054// explicit. Interleave hints are not allowed either. These limitations will be
2055// relaxed in the future.
2056 // Please note that we are currently forced to abuse the pragma 'clang
2057// vectorize' semantics. This pragma provides *auto-vectorization hints*
2058// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2059// provides *explicit vectorization hints* (LV can bypass legal checks and
2060// assume that vectorization is legal). However, both hints are implemented
2061// using the same metadata (llvm.loop.vectorize, processed by
2062// LoopVectorizeHints). This will be fixed in the future when the native IR
2063// representation for pragma 'omp simd' is introduced.
2064static bool isExplicitVecOuterLoop(Loop *OuterLp,
2066 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2067 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2068
2069 // Only outer loops with an explicit vectorization hint are supported.
2070 // Unannotated outer loops are ignored.
2072 return false;
2073
2074 Function *Fn = OuterLp->getHeader()->getParent();
2075 if (!Hints.allowVectorization(Fn, OuterLp,
2076 true /*VectorizeOnlyWhenForced*/)) {
2077 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2078 return false;
2079 }
2080
2081 if (Hints.getInterleave() > 1) {
2082 // TODO: Interleave support is future work.
2083 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2084 "outer loops.\n");
2085 Hints.emitRemarkWithHints();
2086 return false;
2087 }
2088
2089 return true;
2090}
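// For illustration, the kind of source-level annotation the check above looks
// for on an outer loop (a sketch; the concrete values are arbitrary):
//
//   #pragma omp simd simdlen(4)
//   // or: #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
//
// Both forms end up as llvm.loop.vectorize metadata; without an explicit
// vector length, or with an interleave hint, the outer loop is rejected here.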
2091
2095 // Collect inner loops and outer loops without irreducible control flow. For
2096 // now, only collect outer loops that have explicit vectorization hints. If we
2097 // are stress testing the VPlan H-CFG construction, we collect the outermost
2098 // loop of every loop nest.
2099 if (L.isInnermost() || VPlanBuildStressTest ||
2101 LoopBlocksRPO RPOT(&L);
2102 RPOT.perform(LI);
2103 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2104 V.push_back(&L);
2105 // TODO: Collect inner loops inside marked outer loops in case
2106 // vectorization fails for the outer loop. Do not invoke
2107 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2108 // already known to be reducible. We can use an inherited attribute for
2109 // that.
2110 return;
2111 }
2112 }
2113 for (Loop *InnerL : L)
2114 collectSupportedLoops(*InnerL, LI, ORE, V);
2115}
2116
2117//===----------------------------------------------------------------------===//
2118// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2119// LoopVectorizationCostModel and LoopVectorizationPlanner.
2120//===----------------------------------------------------------------------===//
2121
2122/// Compute the transformed value of Index at offset StartValue using step
2123/// StepValue.
2124/// For integer induction, returns StartValue + Index * StepValue.
2125/// For pointer induction, returns StartValue[Index * StepValue].
2126/// FIXME: The newly created binary instructions should contain nsw/nuw
2127/// flags, which can be found from the original scalar operations.
2128static Value *
2130 Value *Step,
2132 const BinaryOperator *InductionBinOp) {
2133 using namespace llvm::PatternMatch;
2134 Type *StepTy = Step->getType();
2135 Value *CastedIndex = StepTy->isIntegerTy()
2136 ? B.CreateSExtOrTrunc(Index, StepTy)
2137 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2138 if (CastedIndex != Index) {
2139 CastedIndex->setName(CastedIndex->getName() + ".cast");
2140 Index = CastedIndex;
2141 }
2142
2143 // Note: the IR at this point is broken. We cannot use SE to create any new
2144 // SCEV and then expand it, hoping that SCEV's simplification will give us
2145 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2146 // lead to various SCEV crashes. So all we can do is to use builder and rely
2147 // on InstCombine for future simplifications. Here we handle some trivial
2148 // cases only.
2149 auto CreateAdd = [&B](Value *X, Value *Y) {
2150 assert(X->getType() == Y->getType() && "Types don't match!");
2151 if (match(X, m_ZeroInt()))
2152 return Y;
2153 if (match(Y, m_ZeroInt()))
2154 return X;
2155 return B.CreateAdd(X, Y);
2156 };
2157
2158 // We allow X to be a vector type, in which case Y will potentially be
2159 // splatted into a vector with the same element count.
2160 auto CreateMul = [&B](Value *X, Value *Y) {
2161 assert(X->getType()->getScalarType() == Y->getType() &&
2162 "Types don't match!");
2163 if (match(X, m_One()))
2164 return Y;
2165 if (match(Y, m_One()))
2166 return X;
2167 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2168 if (XVTy && !isa<VectorType>(Y->getType()))
2169 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2170 return B.CreateMul(X, Y);
2171 };
2172
2173 switch (InductionKind) {
2175 assert(!isa<VectorType>(Index->getType()) &&
2176 "Vector indices not supported for integer inductions yet");
2177 assert(Index->getType() == StartValue->getType() &&
2178 "Index type does not match StartValue type");
2179 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2180 return B.CreateSub(StartValue, Index);
2181 auto *Offset = CreateMul(Index, Step);
2182 return CreateAdd(StartValue, Offset);
2183 }
2185 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2187 assert(!isa<VectorType>(Index->getType()) &&
2188 "Vector indices not supported for FP inductions yet");
2189 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2190 assert(InductionBinOp &&
2191 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2192 InductionBinOp->getOpcode() == Instruction::FSub) &&
2193 "Original bin op should be defined for FP induction");
2194
2195 Value *MulExp = B.CreateFMul(Step, Index);
2196 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2197 "induction");
2198 }
2200 return nullptr;
2201 }
2202 llvm_unreachable("invalid enum");
2203}
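// For illustration, a worked example of the transformation above: for an
// integer induction with StartValue %start, Step 4 and Index %idx, the emitted
// code is conceptually %start + (%idx * 4); for a pointer induction it is a
// ptradd of %start by (%idx * Step); and for an FP induction with an fadd
// binop it is fadd(%start, fmul(Step, %idx)).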
2204
2205static std::optional<unsigned> getMaxVScale(const Function &F,
2206 const TargetTransformInfo &TTI) {
2207 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2208 return MaxVScale;
2209
2210 if (F.hasFnAttribute(Attribute::VScaleRange))
2211 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2212
2213 return std::nullopt;
2214}
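// For illustration: if the target reports no maximum vscale but the function
// carries a vscale_range(1,16) attribute, the helper above returns 16; with
// neither source of information it returns std::nullopt and callers must stay
// conservative.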
2215
2216 /// For the given VF and UF and maximum trip count computed for the loop,
2217 /// return whether the runtime check for overflow of the induction variable in
2218 /// the vectorized loop is known to always evaluate to false and can therefore
2219 /// be removed.
2222 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2223 // Always be conservative if we don't know the exact unroll factor.
2224 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2225
2226 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2227 APInt MaxUIntTripCount = IdxTy->getMask();
2228
2229 // We know the runtime overflow check is known false iff the (max) trip-count
2230 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2231 // the vector loop induction variable.
2232 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2233 uint64_t MaxVF = VF.getKnownMinValue();
2234 if (VF.isScalable()) {
2235 std::optional<unsigned> MaxVScale =
2236 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2237 if (!MaxVScale)
2238 return false;
2239 MaxVF *= *MaxVScale;
2240 }
2241
2242 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2243 }
2244
2245 return false;
2246}
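// For illustration, a worked example: with an i32 widest induction type, a
// known maximum trip count of 1000, VF = 4 and UF = 2, the code above checks
// whether (2^32 - 1) - 1000 > 4 * 2, which holds, so the overflow check is
// known false and can be removed. For scalable VFs the known-minimum VF is
// first scaled by the maximum vscale, and with an unknown maximum trip count
// the function conservatively returns false.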
2247
2248// Return whether we allow using masked interleave-groups (for dealing with
2249// strided loads/stores that reside in predicated blocks, or for dealing
2250// with gaps).
2252 // If an override option has been passed in for interleaved accesses, use it.
2255
2257}
2258
2260 BasicBlock *CheckIRBB) {
2261 // Note: The block with the minimum trip-count check is already connected
2262 // during earlier VPlan construction.
2263 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2265 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2266 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
2267 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2268 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
2269 PreVectorPH = CheckVPIRBB;
2270 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2271 PreVectorPH->swapSuccessors();
2272
2273 // We just connected a new block to the scalar preheader. Update all
2274 // VPPhis by adding an incoming value for it, replicating the last value.
2275 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
2276 for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
2277 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
2278 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
2279 "must have incoming values for all operands");
2280 R.addOperand(R.getOperand(NumPredecessors - 2));
2281 }
2282}
2283
2284Value *
2286 unsigned UF) const {
2287 // Generate code to check if the loop's trip count is less than VF * UF, or
2288 // equal to it in case a scalar epilogue is required; this implies that the
2289 // vector trip count is zero. This check also covers the case where adding one
2290 // to the backedge-taken count overflowed leading to an incorrect trip count
2291 // of zero. In this case we will also jump to the scalar loop.
2292 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2294
2295 // Reuse existing vector loop preheader for TC checks.
2296 // Note that new preheader block is generated for vector loop.
2297 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2299 TCCheckBlock->getContext(),
2300 InstSimplifyFolder(TCCheckBlock->getDataLayout()));
2301 Builder.SetInsertPoint(TCCheckBlock->getTerminator());
2302
2303 // If tail is to be folded, vector loop takes care of all iterations.
2304 Value *Count = getTripCount();
2305 Type *CountTy = Count->getType();
2306 Value *CheckMinIters = Builder.getFalse();
2307 auto CreateStep = [&]() -> Value * {
2308 // Create step with max(minimum profitable trip count, UF * VF).
2310 return createStepForVF(Builder, CountTy, VF, UF);
2311
2312 Value *MinProfTC =
2314 if (!VF.isScalable())
2315 return MinProfTC;
2317 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2318 };
2319
2320 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2321 if (Style == TailFoldingStyle::None) {
2322 Value *Step = CreateStep();
2323 ScalarEvolution &SE = *PSE.getSE();
2324 // TODO: Emit unconditional branch to vector preheader instead of
2325 // conditional branch with known condition.
2326 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2327 // Check if the trip count is < the step.
2328 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2329 // TODO: Ensure step is at most the trip count when determining max VF and
2330 // UF, w/o tail folding.
2331 CheckMinIters = Builder.getTrue();
2333 TripCountSCEV, SE.getSCEV(Step))) {
2334 // Generate the minimum iteration check only if we cannot prove the
2335 // check is known to be true, or known to be false.
2336 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2337 } // else step known to be < trip count, use CheckMinIters preset to false.
2338 } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
2341 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2342 // an overflow to zero when updating induction variables and so an
2343 // additional overflow check is required before entering the vector loop.
2344
2345 // Get the maximum unsigned value for the type.
2346 Value *MaxUIntTripCount =
2347 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2348 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2349
2350 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2351 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2352 }
2353 return CheckMinIters;
2354}
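// For illustration, a sketch of what the code above emits for VF = 4, UF = 2
// with no tail folding and no required scalar epilogue (names are
// illustrative):
//
//   %min.iters.check = icmp ult i64 %trip.count, 8
//
// The caller branches to the scalar loop when the check is true. With a
// required scalar epilogue the predicate becomes ule, and for scalable VFs the
// step may be umax(minimum profitable trip count, VF * UF) computed from
// vscale.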
2355
2356/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2357/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2358/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2359/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2361 BasicBlock *IRBB) {
2362 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2363 auto IP = IRVPBB->begin();
2364 for (auto &R : make_early_inc_range(VPBB->phis()))
2365 R.moveBefore(*IRVPBB, IP);
2366
2367 for (auto &R :
2369 R.moveBefore(*IRVPBB, IRVPBB->end());
2370
2371 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2372 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2373 return IRVPBB;
2374}
2375
2378 assert(LoopVectorPreHeader && "Invalid loop structure");
2380 Cost->requiresScalarEpilogue(VF.isVector())) &&
2381 "loops not exiting via the latch without required epilogue?");
2382
2383 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2384 // wrapping the newly created scalar preheader here at the moment, because the
2385 // Plan's scalar preheader may be unreachable at this point. Instead it is
2386 // replaced in executePlan.
2388 DT, LI, nullptr, Twine(Prefix) + "scalar.ph");
2389}
2390
2391/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2392/// expansion results.
2394 const SCEV2ValueTy &ExpandedSCEVs) {
2395 const SCEV *Step = ID.getStep();
2396 if (auto *C = dyn_cast<SCEVConstant>(Step))
2397 return C->getValue();
2398 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2399 return U->getValue();
2400 Value *V = ExpandedSCEVs.lookup(Step);
2401 assert(V && "SCEV must be expanded at this point");
2402 return V;
2403}
2404
2405 /// Knowing that loop \p L executes a single vector iteration, add to \p
2406 /// InstsToIgnore the instructions that will get simplified away and thus
2407 /// should not have any cost.
2410 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2411 auto *Cmp = L->getLatchCmpInst();
2412 if (Cmp)
2413 InstsToIgnore.insert(Cmp);
2414 for (const auto &KV : IL) {
2415 // Extract the key by hand so that it can be used in the lambda below. Note
2416 // that captured structured bindings are a C++20 extension.
2417 const PHINode *IV = KV.first;
2418
2419 // Get next iteration value of the induction variable.
2420 Instruction *IVInst =
2421 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2422 if (all_of(IVInst->users(),
2423 [&](const User *U) { return U == IV || U == Cmp; }))
2424 InstsToIgnore.insert(IVInst);
2425 }
2426}
2427
2429 // Create a new IR basic block for the scalar preheader.
2430 BasicBlock *ScalarPH = createScalarPreheader("");
2431 return ScalarPH->getSinglePredecessor();
2432}
2433
2434namespace {
2435
2436struct CSEDenseMapInfo {
2437 static bool canHandle(const Instruction *I) {
2438 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2439 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2440 }
2441
2442 static inline Instruction *getEmptyKey() {
2444 }
2445
2446 static inline Instruction *getTombstoneKey() {
2448 }
2449
2450 static unsigned getHashValue(const Instruction *I) {
2451 assert(canHandle(I) && "Unknown instruction!");
2452 return hash_combine(I->getOpcode(),
2453 hash_combine_range(I->operand_values()));
2454 }
2455
2456 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2457 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2458 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2459 return LHS == RHS;
2460 return LHS->isIdenticalTo(RHS);
2461 }
2462};
2463
2464} // end anonymous namespace
2465
2466 /// Perform CSE of induction variable instructions.
2467static void cse(BasicBlock *BB) {
2468 // Perform simple cse.
2470 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2471 if (!CSEDenseMapInfo::canHandle(&In))
2472 continue;
2473
2474 // Check if we can replace this instruction with any of the
2475 // visited instructions.
2476 if (Instruction *V = CSEMap.lookup(&In)) {
2477 In.replaceAllUsesWith(V);
2478 In.eraseFromParent();
2479 continue;
2480 }
2481
2482 CSEMap[&In] = &In;
2483 }
2484}
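// For illustration, an example of what the CSE above removes: if code
// generation produced two identical GEPs,
//
//   %gep1 = getelementptr inbounds i32, ptr %base, i64 %off
//   %gep2 = getelementptr inbounds i32, ptr %base, i64 %off
//
// the second is replaced by the first and erased. Only the instruction kinds
// accepted by CSEDenseMapInfo::canHandle participate.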
2485
2486/// This function attempts to return a value that represents the ElementCount
2487/// at runtime. For fixed-width VFs we know this precisely at compile
2488/// time, but for scalable VFs we calculate it based on an estimate of the
2489/// vscale value.
2491 std::optional<unsigned> VScale) {
2492 unsigned EstimatedVF = VF.getKnownMinValue();
2493 if (VF.isScalable())
2494 if (VScale)
2495 EstimatedVF *= *VScale;
2496 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2497 return EstimatedVF;
2498}
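// For illustration, a worked example: for a fixed VF of 8 the function above
// returns 8. For a scalable VF of "vscale x 4" with an estimated vscale of 2
// it returns 4 * 2 = 8; without a vscale estimate it falls back to the known
// minimum, 4.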
2499
2502 ElementCount VF) const {
2503 // We only need to calculate a cost if the VF is scalar; for actual vectors
2504 // we should already have a pre-calculated cost at each VF.
2505 if (!VF.isScalar())
2506 return getCallWideningDecision(CI, VF).Cost;
2507
2508 Type *RetTy = CI->getType();
2510 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2511 return *RedCost;
2512
2514 for (auto &ArgOp : CI->args())
2515 Tys.push_back(ArgOp->getType());
2516
2517 InstructionCost ScalarCallCost =
2519
2520 // If this is an intrinsic we may have a lower cost for it.
2523 return std::min(ScalarCallCost, IntrinsicCost);
2524 }
2525 return ScalarCallCost;
2526}
2527
2529 if (VF.isScalar() || !canVectorizeTy(Ty))
2530 return Ty;
2531 return toVectorizedTy(Ty, VF);
2532}
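// For illustration: with VF = 4, an i32 parameter type becomes <4 x i32>,
// while a type that cannot be vectorized (or any type at a scalar VF) is
// returned unchanged. This lets the intrinsic cost query below be phrased in
// terms of the widened signature.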
2533
2536 ElementCount VF) const {
2538 assert(ID && "Expected intrinsic call!");
2539 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2540 FastMathFlags FMF;
2541 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2542 FMF = FPMO->getFastMathFlags();
2543
2546 SmallVector<Type *> ParamTys;
2547 std::transform(FTy->param_begin(), FTy->param_end(),
2548 std::back_inserter(ParamTys),
2549 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2550
2551 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2552 dyn_cast<IntrinsicInst>(CI),
2554 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2555}
2556
2558 // Fix widened non-induction PHIs by setting up the PHI operands.
2559 fixNonInductionPHIs(State);
2560
2561 // Don't apply optimizations below when no (vector) loop remains, as they all
2562 // require one at the moment.
2563 VPBasicBlock *HeaderVPBB =
2564 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2565 if (!HeaderVPBB)
2566 return;
2567
2568 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2569
2570 // Remove redundant induction instructions.
2571 cse(HeaderBB);
2572
2573 // Set/update profile weights for the vector and remainder loops as original
2574 // loop iterations are now distributed among them. Note that original loop
2575 // becomes the scalar remainder loop after vectorization.
2576 //
2577 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2578 // end up with a slightly less precise result, but that should be OK since
2579 // the profile is not inherently precise anyway. Note also that a possible
2580 // bypass of the vector code caused by legality checks is ignored, assigning
2581 // all the weight to the vector loop, optimistically.
2582 //
2583 // For scalable vectorization we can't know at compile time how many
2584 // iterations of the loop are handled in one vector iteration, so instead
2585 // use the value of vscale used for tuning.
2586 Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2587 unsigned EstimatedVFxUF =
2588 estimateElementCount(VF * UF, Cost->getVScaleForTuning());
2589 setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
2590}
2591
2593 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2594 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
2595 for (VPRecipeBase &P : VPBB->phis()) {
2596 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
2597 if (!VPPhi)
2598 continue;
2599 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2600 // Make sure the builder has a valid insert point.
2601 Builder.SetInsertPoint(NewPhi);
2602 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2603 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2604 }
2605 }
2606}
2607
2608void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2609 // We should not collect Scalars more than once per VF. Right now, this
2610 // function is called from collectUniformsAndScalars(), which already does
2611 // this check. Collecting Scalars for VF=1 does not make any sense.
2612 assert(VF.isVector() && !Scalars.contains(VF) &&
2613 "This function should not be visited twice for the same VF");
2614
2615 // This avoids any chances of creating a REPLICATE recipe during planning
2616 // since that would result in generation of scalarized code during execution,
2617 // which is not supported for scalable vectors.
2618 if (VF.isScalable()) {
2619 Scalars[VF].insert_range(Uniforms[VF]);
2620 return;
2621 }
2622
2624
2625 // These sets are used to seed the analysis with pointers used by memory
2626 // accesses that will remain scalar.
2628 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2629 auto *Latch = TheLoop->getLoopLatch();
2630
2631 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2632 // The pointer operands of loads and stores will be scalar as long as the
2633 // memory access is not a gather or scatter operation. The value operand of a
2634 // store will remain scalar if the store is scalarized.
2635 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2636 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2637 assert(WideningDecision != CM_Unknown &&
2638 "Widening decision should be ready at this moment");
2639 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2640 if (Ptr == Store->getValueOperand())
2641 return WideningDecision == CM_Scalarize;
2642 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2643 "Ptr is neither a value or pointer operand");
2644 return WideningDecision != CM_GatherScatter;
2645 };
2646
2647 // A helper that returns true if the given value is a getelementptr
2648 // instruction contained in the loop.
2649 auto IsLoopVaryingGEP = [&](Value *V) {
2650 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2651 };
2652
2653 // A helper that evaluates a memory access's use of a pointer. If the use will
2654 // be a scalar use and the pointer is only used by memory accesses, we place
2655 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2656 // PossibleNonScalarPtrs.
2657 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2658 // We only care about bitcast and getelementptr instructions contained in
2659 // the loop.
2660 if (!IsLoopVaryingGEP(Ptr))
2661 return;
2662
2663 // If the pointer has already been identified as scalar (e.g., if it was
2664 // also identified as uniform), there's nothing to do.
2665 auto *I = cast<Instruction>(Ptr);
2666 if (Worklist.count(I))
2667 return;
2668
2669 // If the use of the pointer will be a scalar use, and all users of the
2670 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2671 // place the pointer in PossibleNonScalarPtrs.
2672 if (IsScalarUse(MemAccess, Ptr) &&
2673 all_of(I->users(), IsaPred<LoadInst, StoreInst>))
2674 ScalarPtrs.insert(I);
2675 else
2676 PossibleNonScalarPtrs.insert(I);
2677 };
2678
2679 // We seed the scalars analysis with two classes of instructions: (1)
2680 // instructions marked uniform-after-vectorization and (2) bitcast,
2681 // getelementptr and (pointer) phi instructions used by memory accesses
2682 // requiring a scalar use.
2683 //
2684 // (1) Add to the worklist all instructions that have been identified as
2685 // uniform-after-vectorization.
2686 Worklist.insert_range(Uniforms[VF]);
2687
2688 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2689 // memory accesses requiring a scalar use. The pointer operands of loads and
2690 // stores will be scalar unless the operation is a gather or scatter.
2691 // The value operand of a store will remain scalar if the store is scalarized.
2692 for (auto *BB : TheLoop->blocks())
2693 for (auto &I : *BB) {
2694 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2695 EvaluatePtrUse(Load, Load->getPointerOperand());
2696 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2697 EvaluatePtrUse(Store, Store->getPointerOperand());
2698 EvaluatePtrUse(Store, Store->getValueOperand());
2699 }
2700 }
2701 for (auto *I : ScalarPtrs)
2702 if (!PossibleNonScalarPtrs.count(I)) {
2703 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2704 Worklist.insert(I);
2705 }
2706
2707 // Insert the forced scalars.
2708 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2709 // induction variable when the PHI user is scalarized.
2710 auto ForcedScalar = ForcedScalars.find(VF);
2711 if (ForcedScalar != ForcedScalars.end())
2712 for (auto *I : ForcedScalar->second) {
2713 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2714 Worklist.insert(I);
2715 }
2716
2717 // Expand the worklist by looking through any bitcasts and getelementptr
2718 // instructions we've already identified as scalar. This is similar to the
2719 // expansion step in collectLoopUniforms(); however, here we're only
2720 // expanding to include additional bitcasts and getelementptr instructions.
2721 unsigned Idx = 0;
2722 while (Idx != Worklist.size()) {
2723 Instruction *Dst = Worklist[Idx++];
2724 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2725 continue;
2726 auto *Src = cast<Instruction>(Dst->getOperand(0));
2727 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2728 auto *J = cast<Instruction>(U);
2729 return !TheLoop->contains(J) || Worklist.count(J) ||
2730 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2731 IsScalarUse(J, Src));
2732 })) {
2733 Worklist.insert(Src);
2734 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2735 }
2736 }
2737
2738 // An induction variable will remain scalar if all users of the induction
2739 // variable and induction variable update remain scalar.
2740 for (const auto &Induction : Legal->getInductionVars()) {
2741 auto *Ind = Induction.first;
2742 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2743
2744 // If tail-folding is applied, the primary induction variable will be used
2745 // to feed a vector compare.
2746 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2747 continue;
2748
2749 // Returns true if \p Indvar is a pointer induction that is used directly by
2750 // load/store instruction \p I.
2751 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2752 Instruction *I) {
2753 return Induction.second.getKind() ==
2755 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
2756 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2757 };
2758
2759 // Determine if all users of the induction variable are scalar after
2760 // vectorization.
2761 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2762 auto *I = cast<Instruction>(U);
2763 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2764 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2765 });
2766 if (!ScalarInd)
2767 continue;
2768
2769 // If the induction variable update is a fixed-order recurrence, neither the
2770 // induction variable nor its update should be marked scalar after
2771 // vectorization.
2772 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2773 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2774 continue;
2775
2776 // Determine if all users of the induction variable update instruction are
2777 // scalar after vectorization.
2778 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2779 auto *I = cast<Instruction>(U);
2780 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2781 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2782 });
2783 if (!ScalarIndUpdate)
2784 continue;
2785
2786 // The induction variable and its update instruction will remain scalar.
2787 Worklist.insert(Ind);
2788 Worklist.insert(IndUpdate);
2789 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2790 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2791 << "\n");
2792 }
2793
2794 Scalars[VF].insert_range(Worklist);
2795}
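// For illustration, an example of the analysis above (a sketch): in a loop
// that stores through
//
//   %gep = getelementptr inbounds i32, ptr %A, i64 %iv
//   store i32 %val, ptr %gep
//
// where the store is widened as a consecutive access, %gep is only used as the
// pointer operand of the store and is therefore added to Scalars[VF]; the
// induction %iv and its update also remain scalar if all of their users do.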
2796
2798 Instruction *I, ElementCount VF) const {
2799 if (!isPredicatedInst(I))
2800 return false;
2801
2802 // Do we have a non-scalar lowering for this predicated
2803 // instruction? No - it is scalar with predication.
2804 switch(I->getOpcode()) {
2805 default:
2806 return true;
2807 case Instruction::Call:
2808 if (VF.isScalar())
2809 return true;
2810 return getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize;
2811 case Instruction::Load:
2812 case Instruction::Store: {
2814 auto *Ty = getLoadStoreType(I);
2815 unsigned AS = getLoadStoreAddressSpace(I);
2816 Type *VTy = Ty;
2817 if (VF.isVector())
2818 VTy = VectorType::get(Ty, VF);
2819 const Align Alignment = getLoadStoreAlignment(I);
2820 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
2821 TTI.isLegalMaskedGather(VTy, Alignment))
2822 : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
2823 TTI.isLegalMaskedScatter(VTy, Alignment));
2824 }
2825 case Instruction::UDiv:
2826 case Instruction::SDiv:
2827 case Instruction::SRem:
2828 case Instruction::URem: {
2829 // We have the option to use the safe-divisor idiom to avoid predication.
2830 // The cost based decision here will always select safe-divisor for
2831 // scalable vectors as scalarization isn't legal.
2832 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2833 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2834 }
2835 }
2836}
2837
2838// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2840 // TODO: We can use the loop-preheader as context point here and get
2841 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2843 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
2844 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
2845 return false;
2846
2847 // If the instruction was executed conditionally in the original scalar loop,
2848 // predication is needed with a mask whose lanes are all possibly inactive.
2849 if (Legal->blockNeedsPredication(I->getParent()))
2850 return true;
2851
2852 // If we're not folding the tail by masking, predication is unnecessary.
2853 if (!foldTailByMasking())
2854 return false;
2855
2856 // All that remain are instructions with side-effects originally executed in
2857 // the loop unconditionally, but now execute under a tail-fold mask (only)
2858 // having at least one active lane (the first). If the side-effects of the
2859 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2860 // - it will cause the same side-effects as when masked.
2861 switch(I->getOpcode()) {
2862 default:
2864 "instruction should have been considered by earlier checks");
2865 case Instruction::Call:
2866 // Side-effects of a Call are assumed to be non-invariant, needing a
2867 // (fold-tail) mask.
2869 "should have returned earlier for calls not needing a mask");
2870 return true;
2871 case Instruction::Load:
2872 // If the address is loop invariant no predication is needed.
2874 case Instruction::Store: {
2875 // For stores, we need to prove speculation safety (which follows from the
2876 // same argument as loads) and must also prove that the value being stored
2877 // is correct. The easiest form of the latter is to require that all values
2878 // stored are the same.
2880 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
2881 }
2882 case Instruction::UDiv:
2883 case Instruction::SDiv:
2884 case Instruction::SRem:
2885 case Instruction::URem:
2886 // If the divisor is loop-invariant no predication is needed.
2887 return !Legal->isInvariant(I->getOperand(1));
2888 }
2889}
2890
2891std::pair<InstructionCost, InstructionCost>
2893 ElementCount VF) const {
2894 assert(I->getOpcode() == Instruction::UDiv ||
2895 I->getOpcode() == Instruction::SDiv ||
2896 I->getOpcode() == Instruction::SRem ||
2897 I->getOpcode() == Instruction::URem);
2899
2900 // Scalarization isn't legal for scalable vector types
2901 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2902 if (!VF.isScalable()) {
2903 // Get the scalarization cost and scale this amount by the probability of
2904 // executing the predicated block. If the instruction is not predicated,
2905 // we fall through to the next case.
2906 ScalarizationCost = 0;
2907
2908 // These instructions have a non-void type, so account for the phi nodes
2909 // that we will create. This cost is likely to be zero. The phi node
2910 // cost, if any, should be scaled by the block probability because it
2911 // models a copy at the end of each predicated block.
2912 ScalarizationCost +=
2913 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
2914
2915 // The cost of the non-predicated instruction.
2916 ScalarizationCost +=
2917 VF.getFixedValue() *
2918 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
2919
2920 // The cost of insertelement and extractelement instructions needed for
2921 // scalarization.
2922 ScalarizationCost += getScalarizationOverhead(I, VF);
2923
2924 // Scale the cost by the probability of executing the predicated blocks.
2925 // This assumes the predicated block for each vector lane is equally
2926 // likely.
2927 ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
2928 }
2929 InstructionCost SafeDivisorCost = 0;
2930
2931 auto *VecTy = toVectorTy(I->getType(), VF);
2932
2933 // The cost of the select guard to ensure all lanes are well defined
2934 // after we speculate above any internal control flow.
2935 SafeDivisorCost +=
2936 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
2937 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
2939
2940 // Certain instructions can be cheaper to vectorize if they have a constant
2941 // second vector operand. One example of this is shifts on x86.
2942 Value *Op2 = I->getOperand(1);
2943 auto Op2Info = TTI.getOperandInfo(Op2);
2944 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
2945 Legal->isInvariant(Op2))
2947
2948 SmallVector<const Value *, 4> Operands(I->operand_values());
2949 SafeDivisorCost += TTI.getArithmeticInstrCost(
2950 I->getOpcode(), VecTy, CostKind,
2951 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2952 Op2Info, Operands, I);
2953 return {ScalarizationCost, SafeDivisorCost};
2954}
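// For illustration, a sketch of the two costs computed above for a fixed VF:
//
//   ScalarizationCost = (VF * (PHI cost + scalar div/rem cost)
//                        + insert/extract overhead) / predicated-block divisor
//   SafeDivisorCost   = cost of the select guarding the divisor
//                        + cost of the vector div/rem
//
// For scalable VFs ScalarizationCost remains invalid, so the safe-divisor
// lowering is always chosen there.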
2955
2957 Instruction *I, ElementCount VF) const {
2958 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2960 "Decision should not be set yet.");
2961 auto *Group = getInterleavedAccessGroup(I);
2962 assert(Group && "Must have a group.");
2963 unsigned InterleaveFactor = Group->getFactor();
2964
2965 // If the instruction's allocated size doesn't equal its type size, it
2966 // requires padding and will be scalarized.
2967 auto &DL = I->getDataLayout();
2968 auto *ScalarTy = getLoadStoreType(I);
2969 if (hasIrregularType(ScalarTy, DL))
2970 return false;
2971
2972 // For scalable vectors, the interleave factors must be <= 8 since we require
2973 // the (de)interleaveN intrinsics instead of shufflevectors.
2974 if (VF.isScalable() && InterleaveFactor > 8)
2975 return false;
2976
2977 // If the group involves a non-integral pointer, we may not be able to
2978 // losslessly cast all values to a common type.
2979 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
2980 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
2981 Instruction *Member = Group->getMember(Idx);
2982 if (!Member)
2983 continue;
2984 auto *MemberTy = getLoadStoreType(Member);
2985 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
2986 // Don't coerce non-integral pointers to integers or vice versa.
2987 if (MemberNI != ScalarNI)
2988 // TODO: Consider adding special nullptr value case here
2989 return false;
2990 if (MemberNI && ScalarNI &&
2991 ScalarTy->getPointerAddressSpace() !=
2992 MemberTy->getPointerAddressSpace())
2993 return false;
2994 }
2995
2996 // Check if masking is required.
2997 // A Group may need masking for one of two reasons: it resides in a block that
2998 // needs predication, or it was decided to use masking to deal with gaps
2999 // (either a gap at the end of a load-access that may result in a speculative
3000 // load, or any gaps in a store-access).
3001 bool PredicatedAccessRequiresMasking =
3002 blockNeedsPredicationForAnyReason(I->getParent()) &&
3004 bool LoadAccessWithGapsRequiresEpilogMasking =
3005 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3007 bool StoreAccessWithGapsRequiresMasking =
3008 isa<StoreInst>(I) && !Group->isFull();
3009 if (!PredicatedAccessRequiresMasking &&
3010 !LoadAccessWithGapsRequiresEpilogMasking &&
3011 !StoreAccessWithGapsRequiresMasking)
3012 return true;
3013
3014 // If masked interleaving is required, we expect that the user/target had
3015 // enabled it, because otherwise it either wouldn't have been created or
3016 // it should have been invalidated by the CostModel.
3018 "Masked interleave-groups for predicated accesses are not enabled.");
3019
3020 if (Group->isReverse())
3021 return false;
3022
3023 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
3024 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
3025 StoreAccessWithGapsRequiresMasking;
3026 if (VF.isScalable() && NeedsMaskForGaps)
3027 return false;
3028
3029 auto *Ty = getLoadStoreType(I);
3030 const Align Alignment = getLoadStoreAlignment(I);
3031 unsigned AS = getLoadStoreAddressSpace(I);
3032 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
3033 : TTI.isLegalMaskedStore(Ty, Alignment, AS);
3034}
3035
3037 Instruction *I, ElementCount VF) {
3038 // Get and ensure we have a valid memory instruction.
3039 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3040
3042 auto *ScalarTy = getLoadStoreType(I);
3043
3044 // First of all, in order to be widened, the pointer must be consecutive.
3045 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3046 return false;
3047
3048 // If the instruction is a store located in a predicated block, it will be
3049 // scalarized.
3050 if (isScalarWithPredication(I, VF))
3051 return false;
3052
3053 // If the instruction's allocated size doesn't equal its type size, it
3054 // requires padding and will be scalarized.
3055 auto &DL = I->getDataLayout();
3056 if (hasIrregularType(ScalarTy, DL))
3057 return false;
3058
3059 return true;
3060}
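// For illustration: a load of A[i] with a unit-stride pointer in an
// unpredicated block can be widened into a single vector load, whereas a
// store that would be scalarized due to predication, a non-consecutive access
// such as A[2*i], or an access whose allocated size differs from its type
// size (e.g. an oddly sized integer like i48 on typical data layouts) is not
// widened by this path.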
3061
3062void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3063 // We should not collect Uniforms more than once per VF. Right now,
3064 // this function is called from collectUniformsAndScalars(), which
3065 // already does this check. Collecting Uniforms for VF=1 does not make any
3066 // sense.
3067
3068 assert(VF.isVector() && !Uniforms.contains(VF) &&
3069 "This function should not be visited twice for the same VF");
3070
3071 // Initialize the entry for this VF so that, even if no uniform values are
3072 // found, we won't analyze it again: Uniforms.count(VF) will return 1.
3073 Uniforms[VF].clear();
3074
3075 // Now we know that the loop is vectorizable!
3076 // Collect instructions inside the loop that will remain uniform after
3077 // vectorization.
3078
3079 // Global values, params and instructions outside of current loop are out of
3080 // scope.
3081 auto IsOutOfScope = [&](Value *V) -> bool {
3082 Instruction *I = dyn_cast<Instruction>(V);
3083 return (!I || !TheLoop->contains(I));
3084 };
3085
3086 // Worklist containing uniform instructions demanding lane 0.
3087 SetVector<Instruction *> Worklist;
3088
3089 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3090 // that require predication must not be considered uniform after
3091 // vectorization, because that would create an erroneous replicating region
3092 // where only a single instance out of VF should be formed.
3093 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3094 if (IsOutOfScope(I)) {
3095 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3096 << *I << "\n");
3097 return;
3098 }
3099 if (isPredicatedInst(I)) {
3100 LLVM_DEBUG(
3101 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3102 << "\n");
3103 return;
3104 }
3105 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3106 Worklist.insert(I);
3107 };
3108
3109 // Start with the conditional branches exiting the loop. If the branch
3110 // condition is an instruction contained in the loop that is only used by the
3111 // branch, it is uniform. Note conditions from uncountable early exits are not
3112 // uniform.
3114 TheLoop->getExitingBlocks(Exiting);
3115 for (BasicBlock *E : Exiting) {
3117 continue;
3118 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3119 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3120 AddToWorklistIfAllowed(Cmp);
3121 }
3122
3123 auto PrevVF = VF.divideCoefficientBy(2);
3124 // Return true if all lanes perform the same memory operation, and we can
3125 // thus choose to execute only one.
3126 auto IsUniformMemOpUse = [&](Instruction *I) {
3127 // If the value was already known to not be uniform for the previous
3128 // (smaller VF), it cannot be uniform for the larger VF.
3129 if (PrevVF.isVector()) {
3130 auto Iter = Uniforms.find(PrevVF);
3131 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3132 return false;
3133 }
3134 if (!Legal->isUniformMemOp(*I, VF))
3135 return false;
3136 if (isa<LoadInst>(I))
3137 // Loading the same address always produces the same result - at least
3138 // assuming aliasing and ordering which have already been checked.
3139 return true;
3140 // Storing the same value on every iteration.
3141 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3142 };
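// For illustration only (not from the original source), assuming a
// hypothetical loop:
//   for (int i = 0; i < n; ++i)
//     a[i] = *q;   // q is loop-invariant
// the load of *q is a uniform memory op under the check above: every lane
// reads the same address, so one scalar load per vector iteration suffices.
// A store of a loop-invariant value to an invariant address is the analogous
// store case.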
3143
3144 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3145 InstWidening WideningDecision = getWideningDecision(I, VF);
3146 assert(WideningDecision != CM_Unknown &&
3147 "Widening decision should be ready at this moment");
3148
3149 if (IsUniformMemOpUse(I))
3150 return true;
3151
3152 return (WideningDecision == CM_Widen ||
3153 WideningDecision == CM_Widen_Reverse ||
3154 WideningDecision == CM_Interleave);
3155 };
3156
3157 // Returns true if Ptr is the pointer operand of a memory access instruction
3158 // I, where I is known to not require scalarization, and the pointer is not
3159 // also stored.
3160 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3161 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3162 return false;
3163 return getLoadStorePointerOperand(I) == Ptr &&
3164 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3165 };
3166
3167 // Holds a list of values which are known to have at least one uniform use.
3168 // Note that there may be other uses which aren't uniform. A "uniform use"
3169 // here is something which only demands lane 0 of the unrolled iterations;
3170 // it does not imply that all lanes produce the same value (e.g. this is not
3171 // the usual meaning of uniform)
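// For illustration only: the address feeding a consecutive widened load has a
// "uniform use" in this sense - only its lane-0 value is needed to build the
// wide pointer - even though the per-lane address values differ, so it is not
// uniform in the classic all-lanes-equal sense.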
3172 SetVector<Value *> HasUniformUse;
3173
3174 // Scan the loop for instructions which are either a) known to have only
3175 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3176 for (auto *BB : TheLoop->blocks())
3177 for (auto &I : *BB) {
3178 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3179 switch (II->getIntrinsicID()) {
3180 case Intrinsic::sideeffect:
3181 case Intrinsic::experimental_noalias_scope_decl:
3182 case Intrinsic::assume:
3183 case Intrinsic::lifetime_start:
3184 case Intrinsic::lifetime_end:
3185 if (TheLoop->hasLoopInvariantOperands(&I))
3186 AddToWorklistIfAllowed(&I);
3187 break;
3188 default:
3189 break;
3190 }
3191 }
3192
3193 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3194 if (IsOutOfScope(EVI->getAggregateOperand())) {
3195 AddToWorklistIfAllowed(EVI);
3196 continue;
3197 }
3198 // Only ExtractValue instructions where the aggregate value comes from a
3199 // call are allowed to be non-uniform.
3200 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3201 "Expected aggregate value to be call return value");
3202 }
3203
3204 // If there's no pointer operand, there's nothing to do.
3205 auto *Ptr = getLoadStorePointerOperand(&I);
3206 if (!Ptr)
3207 continue;
3208
3209 if (IsUniformMemOpUse(&I))
3210 AddToWorklistIfAllowed(&I);
3211
3212 if (IsVectorizedMemAccessUse(&I, Ptr))
3213 HasUniformUse.insert(Ptr);
3214 }
3215
3216 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3217 // demanding) users. Since loops are assumed to be in LCSSA form, this
3218 // disallows uses outside the loop as well.
3219 for (auto *V : HasUniformUse) {
3220 if (IsOutOfScope(V))
3221 continue;
3222 auto *I = cast<Instruction>(V);
3223 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3224 auto *UI = cast<Instruction>(U);
3225 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3226 });
3227 if (UsersAreMemAccesses)
3228 AddToWorklistIfAllowed(I);
3229 }
3230
3231 // Expand Worklist in topological order: whenever a new instruction
3232 // is added, its users should already be inside Worklist. This ensures that
3233 // a uniform instruction will only be used by uniform instructions.
3234 unsigned Idx = 0;
3235 while (Idx != Worklist.size()) {
3236 Instruction *I = Worklist[Idx++];
3237
3238 for (auto *OV : I->operand_values()) {
3239 // isOutOfScope operands cannot be uniform instructions.
3240 if (IsOutOfScope(OV))
3241 continue;
3242 // First-order recurrence phis should typically be considered
3243 // non-uniform.
3244 auto *OP = dyn_cast<PHINode>(OV);
3245 if (OP && Legal->isFixedOrderRecurrence(OP))
3246 continue;
3247 // If all the users of the operand are uniform, then add the
3248 // operand into the uniform worklist.
3249 auto *OI = cast<Instruction>(OV);
3250 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3251 auto *J = cast<Instruction>(U);
3252 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3253 }))
3254 AddToWorklistIfAllowed(OI);
3255 }
3256 }
3257
3258 // For an instruction to be added into Worklist above, all its users inside
3259 // the loop should also be in Worklist. However, this condition cannot be
3260 // true for phi nodes that form a cyclic dependence. We must process phi
3261 // nodes separately. An induction variable will remain uniform if all users
3262 // of the induction variable and induction variable update remain uniform.
3263 // The code below handles both pointer and non-pointer induction variables.
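// For illustration only, assuming a hypothetical loop:
//   for (int i = 0; i < n; ++i)
//     a[i] = x;    // x is loop-invariant
// the induction i is used only by the address computation of the consecutive
// store (a vectorized memory access use needing just lane 0) and by its own
// update, so both i and i.next can remain uniform after vectorization.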
3264 BasicBlock *Latch = TheLoop->getLoopLatch();
3265 for (const auto &Induction : Legal->getInductionVars()) {
3266 auto *Ind = Induction.first;
3267 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3268
3269 // Determine if all users of the induction variable are uniform after
3270 // vectorization.
3271 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3272 auto *I = cast<Instruction>(U);
3273 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3274 IsVectorizedMemAccessUse(I, Ind);
3275 });
3276 if (!UniformInd)
3277 continue;
3278
3279 // Determine if all users of the induction variable update instruction are
3280 // uniform after vectorization.
3281 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3282 auto *I = cast<Instruction>(U);
3283 return I == Ind || Worklist.count(I) ||
3284 IsVectorizedMemAccessUse(I, IndUpdate);
3285 });
3286 if (!UniformIndUpdate)
3287 continue;
3288
3289 // The induction variable and its update instruction will remain uniform.
3290 AddToWorklistIfAllowed(Ind);
3291 AddToWorklistIfAllowed(IndUpdate);
3292 }
3293
3294 Uniforms[VF].insert_range(Worklist);
3295}
3296
3297 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3298 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3299
3300 if (Legal->getRuntimePointerChecking()->Need) {
3301 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3302 "runtime pointer checks needed. Enable vectorization of this "
3303 "loop with '#pragma clang loop vectorize(enable)' when "
3304 "compiling with -Os/-Oz",
3305 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3306 return true;
3307 }
3308
3309 if (!PSE.getPredicate().isAlwaysTrue()) {
3310 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3311 "runtime SCEV checks needed. Enable vectorization of this "
3312 "loop with '#pragma clang loop vectorize(enable)' when "
3313 "compiling with -Os/-Oz",
3314 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3315 return true;
3316 }
3317
3318 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3319 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3320 reportVectorizationFailure("Runtime stride check for small trip count",
3321 "runtime stride == 1 checks needed. Enable vectorization of "
3322 "this loop without such check by compiling with -Os/-Oz",
3323 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3324 return true;
3325 }
3326
3327 return false;
3328}
3329
3330bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3331 if (IsScalableVectorizationAllowed)
3332 return *IsScalableVectorizationAllowed;
3333
3334 IsScalableVectorizationAllowed = false;
3336 return false;
3337
3339 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3340 "ScalableVectorizationDisabled", ORE, TheLoop);
3341 return false;
3342 }
3343
3344 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3345
3346 auto MaxScalableVF = ElementCount::getScalable(
3347 std::numeric_limits<ElementCount::ScalarTy>::max());
3348
3349 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3350 // FIXME: While for scalable vectors this is currently sufficient, this should
3351 // be replaced by a more detailed mechanism that filters out specific VFs,
3352 // instead of invalidating vectorization for a whole set of VFs based on the
3353 // MaxVF.
3354
3355 // Disable scalable vectorization if the loop contains unsupported reductions.
3356 if (!canVectorizeReductions(MaxScalableVF)) {
3358 "Scalable vectorization not supported for the reduction "
3359 "operations found in this loop.",
3360 "ScalableVFUnfeasible", ORE, TheLoop);
3361 return false;
3362 }
3363
3364 // Disable scalable vectorization if the loop contains any instructions
3365 // with element types not supported for scalable vectors.
3366 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3367 return !Ty->isVoidTy() &&
3368 !TTI.isElementTypeLegalForScalableVector(Ty);
3369 })) {
3370 reportVectorizationInfo("Scalable vectorization is not supported "
3371 "for all element types found in this loop.",
3372 "ScalableVFUnfeasible", ORE, TheLoop);
3373 return false;
3374 }
3375
3377 reportVectorizationInfo("The target does not provide maximum vscale value "
3378 "for safe distance analysis.",
3379 "ScalableVFUnfeasible", ORE, TheLoop);
3380 return false;
3381 }
3382
3383 IsScalableVectorizationAllowed = true;
3384 return true;
3385}
3386
3388LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3389 if (!isScalableVectorizationAllowed())
3390 return ElementCount::getScalable(0);
3391
3392 auto MaxScalableVF = ElementCount::getScalable(
3393 std::numeric_limits<ElementCount::ScalarTy>::max());
3395 return MaxScalableVF;
3396
3397 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3398 // Limit MaxScalableVF by the maximum safe dependence distance.
3399 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3400
3401 if (!MaxScalableVF)
3403 "Max legal vector width too small, scalable vectorization "
3404 "unfeasible.",
3405 "ScalableVFUnfeasible", ORE, TheLoop);
3406
3407 return MaxScalableVF;
3408}
3409
3410FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3411 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3413 unsigned SmallestType, WidestType;
3414 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3415
3416 // Get the maximum safe dependence distance in bits computed by LAA.
3417 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3418 // the memory access that is most restrictive (involved in the smallest
3419 // dependence distance).
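// For illustration only, with hypothetical numbers: if the smallest dependence
// distance allows 256 bits and WidestType is 32 bits, the bound is 256 / 32 =
// 8 elements; rounded down to a power of two this yields MaxSafeFixedVF = 8,
// and the scalable bound below is derived from the same element count.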
3420 unsigned MaxSafeElementsPowerOf2 =
3424 MaxSafeElementsPowerOf2 =
3425 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
3426 }
3427 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
3428 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
3429
3431 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3432
3433 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3434 << ".\n");
3435 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3436 << ".\n");
3437
3438 // First analyze the UserVF, fall back if the UserVF should be ignored.
3439 if (UserVF) {
3440 auto MaxSafeUserVF =
3441 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3442
3443 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3444 // If `VF=vscale x N` is safe, then so is `VF=N`
3445 if (UserVF.isScalable())
3446 return FixedScalableVFPair(
3447 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3448
3449 return UserVF;
3450 }
3451
3452 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3453
3454 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3455 // is better to ignore the hint and let the compiler choose a suitable VF.
3456 if (!UserVF.isScalable()) {
3457 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3458 << " is unsafe, clamping to max safe VF="
3459 << MaxSafeFixedVF << ".\n");
3460 ORE->emit([&]() {
3461 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3463 TheLoop->getHeader())
3464 << "User-specified vectorization factor "
3465 << ore::NV("UserVectorizationFactor", UserVF)
3466 << " is unsafe, clamping to maximum safe vectorization factor "
3467 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3468 });
3469 return MaxSafeFixedVF;
3470 }
3471
3473 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3474 << " is ignored because scalable vectors are not "
3475 "available.\n");
3476 ORE->emit([&]() {
3477 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3479 TheLoop->getHeader())
3480 << "User-specified vectorization factor "
3481 << ore::NV("UserVectorizationFactor", UserVF)
3482 << " is ignored because the target does not support scalable "
3483 "vectors. The compiler will pick a more suitable value.";
3484 });
3485 } else {
3486 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3487 << " is unsafe. Ignoring scalable UserVF.\n");
3488 ORE->emit([&]() {
3489 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3491 TheLoop->getHeader())
3492 << "User-specified vectorization factor "
3493 << ore::NV("UserVectorizationFactor", UserVF)
3494 << " is unsafe. Ignoring the hint to let the compiler pick a "
3495 "more suitable value.";
3496 });
3497 }
3498 }
3499
3500 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3501 << " / " << WidestType << " bits.\n");
3502
3505 if (auto MaxVF =
3506 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3507 MaxSafeFixedVF, FoldTailByMasking))
3508 Result.FixedVF = MaxVF;
3509
3510 if (auto MaxVF =
3511 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3512 MaxSafeScalableVF, FoldTailByMasking))
3513 if (MaxVF.isScalable()) {
3514 Result.ScalableVF = MaxVF;
3515 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3516 << "\n");
3517 }
3518
3519 return Result;
3520}
3521
3525 // TODO: It may be useful to do since it's still likely to be dynamically
3526 // uniform if the target can skip.
3528 "Not inserting runtime ptr check for divergent target",
3529 "runtime pointer checks needed. Not enabled for divergent target",
3530 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3532 }
3533
3534 ScalarEvolution *SE = PSE.getSE();
3536 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3537 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3538 if (TC != ElementCount::getFixed(MaxTC))
3539 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3540 if (TC.isScalar()) {
3541 reportVectorizationFailure("Single iteration (non) loop",
3542 "loop trip count is one, irrelevant for vectorization",
3543 "SingleIterationLoop", ORE, TheLoop);
3545 }
3546
3547 // If BTC matches the widest induction type and is -1 then the trip count
3548 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3549 // to vectorize.
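// For illustration only: with an i8 backedge-taken count of 255 (all-ones,
// i.e. -1) matching the widest induction type, TC = BTC + 1 wraps to 0, so
// the vector trip count would also be 0; the check below bails out instead.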
3550 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3551 if (!isa<SCEVCouldNotCompute>(BTC) &&
3552 BTC->getType()->getScalarSizeInBits() >=
3555 SE->getMinusOne(BTC->getType()))) {
3557 "Trip count computation wrapped",
3558 "backedge-taken count is -1, loop trip count wrapped to 0",
3559 "TripCountWrapped", ORE, TheLoop);
3561 }
3562
3563 switch (ScalarEpilogueStatus) {
3565 return computeFeasibleMaxVF(MaxTC, UserVF, false);
3567 [[fallthrough]];
3569 LLVM_DEBUG(
3570 dbgs() << "LV: vector predicate hint/switch found.\n"
3571 << "LV: Not allowing scalar epilogue, creating predicated "
3572 << "vector loop.\n");
3573 break;
3575 // fallthrough as a special case of OptForSize
3577 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3578 LLVM_DEBUG(
3579 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3580 else
3581 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3582 << "count.\n");
3583
3584 // Bail if runtime checks are required, which are not good when optimising
3585 // for size.
3588
3589 break;
3590 }
3591
3592 // Now try the tail folding
3593
3594 // Invalidate interleave groups that require an epilogue if we can't mask
3595 // the interleave-group.
3597 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3598 "No decisions should have been taken at this point");
3599 // Note: There is no need to invalidate any cost modeling decisions here, as
3600 // none were taken so far.
3602 }
3603
3604 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
3605
3606 // Avoid tail folding if the trip count is known to be a multiple of any VF
3607 // we choose.
3608 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3609 MaxFactors.FixedVF.getFixedValue();
3610 if (MaxFactors.ScalableVF) {
3611 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3612 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3613 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3614 *MaxPowerOf2RuntimeVF,
3615 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3616 } else
3617 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3618 }
3619
3620 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3621 // Return false if the loop is neither a single-latch-exit loop nor an
3622 // early-exit loop as tail-folding is not supported in that case.
3625 return false;
3626 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3627 ScalarEvolution *SE = PSE.getSE();
3628 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3629 // with uncountable exits. For countable loops, the symbolic maximum must
3630 // remain identical to the known back-edge taken count.
3631 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3633 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3634 "Invalid loop count");
3635 const SCEV *ExitCount = SE->getAddExpr(
3636 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3637 const SCEV *Rem = SE->getURemExpr(
3638 SE->applyLoopGuards(ExitCount, TheLoop),
3639 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3640 return Rem->isZero();
3641 };
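// For illustration only, with hypothetical values: TC = 1000, MaxVF = 8 and
// UserIC = 2 give MaxVFtimesIC = 16 and 1000 urem 16 = 8 != 0, so a tail
// would remain; with TC = 1024 the remainder is 0 and no scalar epilogue or
// tail folding is needed.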
3642
3643 if (MaxPowerOf2RuntimeVF > 0u) {
3644 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3645 "MaxFixedVF must be a power of 2");
3646 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3647 // Accept MaxFixedVF if we do not have a tail.
3648 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3649 return MaxFactors;
3650 }
3651 }
3652
3653 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3654 if (ExpectedTC && ExpectedTC->isFixed() &&
3655 ExpectedTC->getFixedValue() <=
3657 if (MaxPowerOf2RuntimeVF > 0u) {
3658 // If we have a low trip count, and the fixed-width VF is known to divide
3659 // the trip count but the scalable factor does not, use the fixed-width
3660 // factor in preference to allow the generation of a non-predicated loop.
3661 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3662 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3663 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3664 "remain for any chosen VF.\n");
3665 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3666 return MaxFactors;
3667 }
3668 }
3669
3671 "The trip count is below the minial threshold value.",
3672 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3673 ORE, TheLoop);
3675 }
3676
3677 // If we don't know the precise trip count, or if the trip count that we
3678 // found modulo the vectorization factor is not zero, try to fold the tail
3679 // by masking.
3680 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3681 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3682 setTailFoldingStyles(ContainsScalableVF, UserIC);
3683 if (foldTailByMasking()) {
3685 LLVM_DEBUG(
3686 dbgs()
3687 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3688 "try to generate VP Intrinsics with scalable vector "
3689 "factors only.\n");
3690 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
3691 // for now.
3692 // TODO: extend it for fixed vectors, if required.
3693 assert(ContainsScalableVF && "Expected scalable vector factor.");
3694
3695 MaxFactors.FixedVF = ElementCount::getFixed(1);
3696 }
3697 return MaxFactors;
3698 }
3699
3700 // If there was a tail-folding hint/switch, but we can't fold the tail by
3701 // masking, fallback to a vectorization with a scalar epilogue.
3702 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3703 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3704 "scalar epilogue instead.\n");
3705 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3706 return MaxFactors;
3707 }
3708
3709 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3710 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3712 }
3713
3714 if (TC.isZero()) {
3716 "unable to calculate the loop count due to complex control flow",
3717 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3719 }
3720
3722 "Cannot optimize for size and vectorize at the same time.",
3723 "cannot optimize for size and vectorize at the same time. "
3724 "Enable vectorization of this loop with '#pragma clang loop "
3725 "vectorize(enable)' when compiling with -Os/-Oz",
3726 "NoTailLoopWithOptForSize", ORE, TheLoop);
3728}
3729
3731 ElementCount VF) {
3732 if (!useMaxBandwidth(VF.isScalable()
3735 return false;
3736 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3740}
3741
3748}
3749
3750ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3751 ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
3752 unsigned EstimatedVF = VF.getKnownMinValue();
3753 if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3754 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3755 auto Min = Attr.getVScaleRangeMin();
3756 EstimatedVF *= Min;
3757 }
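// For illustration only: with VF = vscale x 4 and vscale_range(2, 16), the
// minimum vscale of 2 gives EstimatedVF = 8 lanes as a conservative estimate.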
3758
3759 // When a scalar epilogue is required, at least one iteration of the scalar
3760 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3761 // max VF that results in a dead vector loop.
3762 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3763 MaxTripCount -= 1;
3764
3765 if (MaxTripCount && MaxTripCount <= EstimatedVF &&
3766 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3767 // If upper bound loop trip count (TC) is known at compile time there is no
3768 // point in choosing VF greater than TC (as done in the loop below). Select
3769 // maximum power of two which doesn't exceed TC. If VF is
3770 // scalable, we only fall back on a fixed VF when the TC is less than or
3771 // equal to the known number of lanes.
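// For illustration only, with hypothetical numbers: MaxTripCount = 20 and
// EstimatedVF = 32 clamp the VF to bit_floor(20) = 16, the largest power of
// two not exceeding the trip count.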
3772 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
3773 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3774 "exceeding the constant trip count: "
3775 << ClampedUpperTripCount << "\n");
3776 return ElementCount::get(ClampedUpperTripCount,
3777 FoldTailByMasking ? VF.isScalable() : false);
3778 }
3779 return VF;
3780}
3781
3782ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3783 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3784 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3785 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3786 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3787 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3789
3790 // Convenience function to return the minimum of two ElementCounts.
3791 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3792 assert((LHS.isScalable() == RHS.isScalable()) &&
3793 "Scalable flags must match");
3794 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3795 };
3796
3797 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3798 // Note that both WidestRegister and WidestType may not be powers of 2.
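// For illustration only, with hypothetical numbers: WidestRegister = 128 bits
// and WidestType = 32 bits give bit_floor(128 / 32) = 4 lanes; a
// non-power-of-two quotient such as 6 would be rounded down to 4.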
3799 auto MaxVectorElementCount = ElementCount::get(
3800 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
3801 ComputeScalableMaxVF);
3802 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3803 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3804 << (MaxVectorElementCount * WidestType) << " bits.\n");
3805
3806 if (!MaxVectorElementCount) {
3807 LLVM_DEBUG(dbgs() << "LV: The target has no "
3808 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3809 << " vector registers.\n");
3810 return ElementCount::getFixed(1);
3811 }
3812
3813 ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount,
3814 MaxTripCount, FoldTailByMasking);
3815 // If the MaxVF was already clamped, there's no point in trying to pick a
3816 // larger one.
3817 if (MaxVF != MaxVectorElementCount)
3818 return MaxVF;
3819
3821 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3823
3824 if (MaxVF.isScalable())
3826 else
3828
3829 if (useMaxBandwidth(RegKind)) {
3830 auto MaxVectorElementCountMaxBW = ElementCount::get(
3831 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
3832 ComputeScalableMaxVF);
3833 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3834
3835 if (ElementCount MinVF =
3836 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
3837 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
3838 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3839 << ") with target's minimum: " << MinVF << '\n');
3840 MaxVF = MinVF;
3841 }
3842 }
3843
3844 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking);
3845
3846 if (MaxVectorElementCount != MaxVF) {
3847 // Invalidate any widening decisions we might have made, in case the loop
3848 // requires prediction (decided later), but we have already made some
3849 // load/store widening decisions.
3851 }
3852 }
3853 return MaxVF;
3854}
3855
3856bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3857 const VectorizationFactor &B,
3858 const unsigned MaxTripCount,
3859 bool HasTail) const {
3860 InstructionCost CostA = A.Cost;
3861 InstructionCost CostB = B.Cost;
3862
3863 // Improve estimate for the vector width if it is scalable.
3864 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3865 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3866 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3867 if (A.Width.isScalable())
3868 EstimatedWidthA *= *VScale;
3869 if (B.Width.isScalable())
3870 EstimatedWidthB *= *VScale;
3871 }
3872
3873 // When optimizing for size, choose whichever is smallest, which will be the
3874 // one with the smallest cost for the whole loop. On a tie pick the larger
3875 // vector width, on the assumption that throughput will be greater.
3876 if (CM.CostKind == TTI::TCK_CodeSize)
3877 return CostA < CostB ||
3878 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3879
3880 // Assume vscale may be larger than 1 (or the value being tuned for),
3881 // so that scalable vectorization is slightly favorable over fixed-width
3882 // vectorization.
3883 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
3884 A.Width.isScalable() && !B.Width.isScalable();
3885
3886 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3887 const InstructionCost &RHS) {
3888 return PreferScalable ? LHS <= RHS : LHS < RHS;
3889 };
3890
3891 // To avoid the need for FP division:
3892 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3893 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
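// For illustration only, with hypothetical costs: CostA = 8 at width 4
// (2.0 per lane) vs. CostB = 6 at width 2 (3.0 per lane); comparing
// 8 * 2 = 16 < 6 * 4 = 24 selects A without any floating-point division.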
3894 if (!MaxTripCount)
3895 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3896
3897 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3898 InstructionCost VectorCost,
3899 InstructionCost ScalarCost) {
3900 // If the trip count is a known (possibly small) constant, the trip count
3901 // will be rounded up to an integer number of iterations under
3902 // FoldTailByMasking. The total cost in that case will be
3903 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3904 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3905 // some extra overheads, but for the purpose of comparing the costs of
3906 // different VFs we can use this to compare the total loop-body cost
3907 // expected after vectorization.
3908 if (HasTail)
3909 return VectorCost * (MaxTripCount / VF) +
3910 ScalarCost * (MaxTripCount % VF);
3911 return VectorCost * divideCeil(MaxTripCount, VF);
3912 };
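// For illustration only, with hypothetical values MaxTripCount = 10, VF = 4,
// VectorCost = 20 and ScalarCost = 5: with a scalar tail the estimate is
// 20 * (10 / 4) + 5 * (10 % 4) = 40 + 10 = 50, while with tail folding it is
// 20 * ceil(10 / 4) = 60.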
3913
3914 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3915 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3916 return CmpFn(RTCostA, RTCostB);
3917}
3918
3919bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3920 const VectorizationFactor &B,
3921 bool HasTail) const {
3922 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3923 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
3924 HasTail);
3925}
3926
3929 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3930 SmallVector<RecipeVFPair> InvalidCosts;
3931 for (const auto &Plan : VPlans) {
3932 for (ElementCount VF : Plan->vectorFactors()) {
3933 // The VPlan-based cost model is designed for computing vector cost.
3934 // Querying the VPlan-based cost model with a scalar VF will cause some
3935 // errors because we expect the VF to be a vector for most of the widen
3936 // recipes.
3937 if (VF.isScalar())
3938 continue;
3939
3940 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
3941 precomputeCosts(*Plan, VF, CostCtx);
3942 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
3943 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3944 for (auto &R : *VPBB) {
3945 if (!R.cost(VF, CostCtx).isValid())
3946 InvalidCosts.emplace_back(&R, VF);
3947 }
3948 }
3949 }
3950 }
3951 if (InvalidCosts.empty())
3952 return;
3953
3954 // Emit a report of VFs with invalid costs in the loop.
3955
3956 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
3958 unsigned I = 0;
3959 for (auto &Pair : InvalidCosts)
3960 if (Numbering.try_emplace(Pair.first, I).second)
3961 ++I;
3962
3963 // Sort the list, first on recipe(number) then on VF.
3964 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
3965 unsigned NA = Numbering[A.first];
3966 unsigned NB = Numbering[B.first];
3967 if (NA != NB)
3968 return NA < NB;
3969 return ElementCount::isKnownLT(A.second, B.second);
3970 });
3971
3972 // For a list of ordered recipe-VF pairs:
3973 // [(load, VF1), (load, VF2), (store, VF1)]
3974 // group the recipes together to emit separate remarks for:
3975 // load (VF1, VF2)
3976 // store (VF1)
3977 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
3978 auto Subset = ArrayRef<RecipeVFPair>();
3979 do {
3980 if (Subset.empty())
3981 Subset = Tail.take_front(1);
3982
3983 VPRecipeBase *R = Subset.front().first;
3984
3985 unsigned Opcode =
3988 [](const auto *R) { return Instruction::PHI; })
3989 .Case<VPWidenSelectRecipe>(
3990 [](const auto *R) { return Instruction::Select; })
3991 .Case<VPWidenStoreRecipe>(
3992 [](const auto *R) { return Instruction::Store; })
3993 .Case<VPWidenLoadRecipe>(
3994 [](const auto *R) { return Instruction::Load; })
3995 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
3996 [](const auto *R) { return Instruction::Call; })
3999 [](const auto *R) { return R->getOpcode(); })
4000 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4001 return R->getStoredValues().empty() ? Instruction::Load
4002 : Instruction::Store;
4003 });
4004
4005 // If the next recipe is different, or if there are no other pairs,
4006 // emit a remark for the collated subset. e.g.
4007 // [(load, VF1), (load, VF2))]
4008 // to emit:
4009 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4010 if (Subset == Tail || Tail[Subset.size()].first != R) {
4011 std::string OutString;
4012 raw_string_ostream OS(OutString);
4013 assert(!Subset.empty() && "Unexpected empty range");
4014 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4015 for (const auto &Pair : Subset)
4016 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4017 OS << "):";
4018 if (Opcode == Instruction::Call) {
4019 StringRef Name = "";
4020 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4021 Name = Int->getIntrinsicName();
4022 } else {
4023 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4024 Function *CalledFn =
4025 WidenCall ? WidenCall->getCalledScalarFunction()
4026 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4027 ->getLiveInIRValue());
4028 Name = CalledFn->getName();
4029 }
4030 OS << " call to " << Name;
4031 } else
4032 OS << " " << Instruction::getOpcodeName(Opcode);
4033 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4034 R->getDebugLoc());
4035 Tail = Tail.drop_front(Subset.size());
4036 Subset = {};
4037 } else
4038 // Grow the subset by one element
4039 Subset = Tail.take_front(Subset.size() + 1);
4040 } while (!Tail.empty());
4041}
4042
4043/// Check if any recipe of \p Plan will generate a vector value, which will be
4044/// assigned a vector register.
4046 const TargetTransformInfo &TTI) {
4047 assert(VF.isVector() && "Checking a scalar VF?");
4048 VPTypeAnalysis TypeInfo(Plan);
4049 DenseSet<VPRecipeBase *> EphemeralRecipes;
4050 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4051 // Set of already visited types.
4052 DenseSet<Type *> Visited;
4053 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4055 for (VPRecipeBase &R : *VPBB) {
4056 if (EphemeralRecipes.contains(&R))
4057 continue;
4058 // Continue early if the recipe is considered to not produce a vector
4059 // result. Note that this includes VPInstruction where some opcodes may
4060 // produce a vector, to preserve existing behavior as VPInstructions model
4061 // aspects not directly mapped to existing IR instructions.
4062 switch (R.getVPDefID()) {
4063 case VPDef::VPDerivedIVSC:
4064 case VPDef::VPScalarIVStepsSC:
4065 case VPDef::VPReplicateSC:
4066 case VPDef::VPInstructionSC:
4067 case VPDef::VPCanonicalIVPHISC:
4068 case VPDef::VPVectorPointerSC:
4069 case VPDef::VPVectorEndPointerSC:
4070 case VPDef::VPExpandSCEVSC:
4071 case VPDef::VPEVLBasedIVPHISC:
4072 case VPDef::VPPredInstPHISC:
4073 case VPDef::VPBranchOnMaskSC:
4074 continue;
4075 case VPDef::VPReductionSC:
4076 case VPDef::VPActiveLaneMaskPHISC:
4077 case VPDef::VPWidenCallSC:
4078 case VPDef::VPWidenCanonicalIVSC:
4079 case VPDef::VPWidenCastSC:
4080 case VPDef::VPWidenGEPSC:
4081 case VPDef::VPWidenIntrinsicSC:
4082 case VPDef::VPWidenSC:
4083 case VPDef::VPWidenSelectSC:
4084 case VPDef::VPBlendSC:
4085 case VPDef::VPFirstOrderRecurrencePHISC:
4086 case VPDef::VPHistogramSC:
4087 case VPDef::VPWidenPHISC:
4088 case VPDef::VPWidenIntOrFpInductionSC:
4089 case VPDef::VPWidenPointerInductionSC:
4090 case VPDef::VPReductionPHISC:
4091 case VPDef::VPInterleaveEVLSC:
4092 case VPDef::VPInterleaveSC:
4093 case VPDef::VPWidenLoadEVLSC:
4094 case VPDef::VPWidenLoadSC:
4095 case VPDef::VPWidenStoreEVLSC:
4096 case VPDef::VPWidenStoreSC:
4097 break;
4098 default:
4099 llvm_unreachable("unhandled recipe");
4100 }
4101
4102 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4103 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4104 if (!NumLegalParts)
4105 return false;
4106 if (VF.isScalable()) {
4107 // <vscale x 1 x iN> is assumed to be profitable over iN because
4108 // scalable registers are a distinct register class from scalar
4109 // ones. If we ever find a target which wants to lower scalable
4110 // vectors back to scalars, we'll need to update this code to
4111 // explicitly ask TTI about the register class uses for each part.
4112 return NumLegalParts <= VF.getKnownMinValue();
4113 }
4114 // Two or more elements sharing a register are considered vectorized.
4115 return NumLegalParts < VF.getFixedValue();
4116 };
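// For illustration only, assuming a target with 128-bit vector registers and
// VF = 4: <4 x i32> legalizes to 1 part (1 < 4, counted as a vector) and
// <4 x i64> to 2 parts (still a vector), whereas a type needing 4 or more
// parts for 4 elements is not counted as producing target vectors.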
4117
4118 // If there is no def and it is not a store (e.g., a branch), continue - no value to check.
4119 if (R.getNumDefinedValues() == 0 &&
4120 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R))
4121 continue;
4122 // For multi-def recipes (currently only interleaved loads), it suffices to
4123 // check the first def only.
4124 // For stores, check their stored value; for interleaved stores it suffices
4125 // to check the first stored value only. In all cases this is the second
4126 // operand.
4127 VPValue *ToCheck =
4128 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4129 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4130 if (!Visited.insert({ScalarTy}).second)
4131 continue;
4132 Type *WideTy = toVectorizedTy(ScalarTy, VF);
4133 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
4134 return true;
4135 }
4136 }
4137
4138 return false;
4139}
4140
4141static bool hasReplicatorRegion(VPlan &Plan) {
4142 return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_shallow(
4143 Plan.getVectorLoopRegion()->getEntry())),
4144 [](auto *VPRB) { return VPRB->isReplicator(); });
4145}
4146
4147#ifndef NDEBUG
4148VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4150 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4151 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4152 assert(
4153 any_of(VPlans,
4154 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4155 "Expected Scalar VF to be a candidate");
4156
4157 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4158 ExpectedCost);
4159 VectorizationFactor ChosenFactor = ScalarCost;
4160
4161 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4162 if (ForceVectorization &&
4163 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4164 // Ignore scalar width, because the user explicitly wants vectorization.
4165 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4166 // evaluation.
4167 ChosenFactor.Cost = InstructionCost::getMax();
4168 }
4169
4170 for (auto &P : VPlans) {
4171 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4172 P->vectorFactors().end());
4173
4178
4179 for (unsigned I = 0; I < VFs.size(); I++) {
4180 ElementCount VF = VFs[I];
4181 // The cost for scalar VF=1 is already calculated, so ignore it.
4182 if (VF.isScalar())
4183 continue;
4184
4185 /// If the register pressure needs to be considered for VF,
4186 /// don't consider the VF as valid if it exceeds the number
4187 /// of registers for the target.
4189 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
4190 continue;
4191
4193
4194 // Add on other costs that are modelled in VPlan, but not in the legacy
4195 // cost model.
4196 VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
4197 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4198 assert(VectorRegion && "Expected to have a vector region!");
4199 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4200 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4201 for (VPRecipeBase &R : *VPBB) {
4202 auto *VPI = dyn_cast<VPInstruction>(&R);
4203 if (!VPI)
4204 continue;
4205 switch (VPI->getOpcode()) {
4206 // Selects are only modelled in the legacy cost model for safe
4207 // divisors.
4208 case Instruction::Select: {
4209 VPValue *VPV = VPI->getVPSingleValue();
4210 if (VPV->getNumUsers() == 1) {
4211 if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
4212 switch (WR->getOpcode()) {
4213 case Instruction::UDiv:
4214 case Instruction::SDiv:
4215 case Instruction::URem:
4216 case Instruction::SRem:
4217 continue;
4218 default:
4219 break;
4220 }
4221 }
4222 }
4223 C += VPI->cost(VF, CostCtx);
4224 break;
4225 }
4227 unsigned Multiplier =
4228 cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
4229 ->getZExtValue();
4230 C += VPI->cost(VF * Multiplier, CostCtx);
4231 break;
4232 }
4234 C += VPI->cost(VF, CostCtx);
4235 break;
4236 default:
4237 break;
4238 }
4239 }
4240 }
4241
4242 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4243 unsigned Width =
4244 estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
4245 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4246 << " costs: " << (Candidate.Cost / Width));
4247 if (VF.isScalable())
4248 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4249 << CM.getVScaleForTuning().value_or(1) << ")");
4250 LLVM_DEBUG(dbgs() << ".\n");
4251
4252 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4253 LLVM_DEBUG(
4254 dbgs()
4255 << "LV: Not considering vector loop of width " << VF
4256 << " because it will not generate any vector instructions.\n");
4257 continue;
4258 }
4259
4260 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4261 LLVM_DEBUG(
4262 dbgs()
4263 << "LV: Not considering vector loop of width " << VF
4264 << " because it would cause replicated blocks to be generated,"
4265 << " which isn't allowed when optimizing for size.\n");
4266 continue;
4267 }
4268
4269 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4270 ChosenFactor = Candidate;
4271 }
4272 }
4273
4276 "There are conditional stores.",
4277 "store that is conditionally executed prevents vectorization",
4278 "ConditionalStore", ORE, OrigLoop);
4279 ChosenFactor = ScalarCost;
4280 }
4281
4282 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4283 !isMoreProfitable(ChosenFactor, ScalarCost,
4284 !CM.foldTailByMasking())) dbgs()
4285 << "LV: Vectorization seems to be not beneficial, "
4286 << "but was forced by a user.\n");
4287 return ChosenFactor;
4288}
4289#endif
4290
4291bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4292 ElementCount VF) const {
4293 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4294 // reductions need special handling and are currently unsupported.
4295 if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4296 if (!Legal->isReductionVariable(&Phi))
4297 return Legal->isFixedOrderRecurrence(&Phi);
4298 RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
4299 return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
4300 }))
4301 return false;
4302
4303 // Phis with uses outside of the loop require special handling and are
4304 // currently unsupported.
4305 for (const auto &Entry : Legal->getInductionVars()) {
4306 // Look for uses of the value of the induction at the last iteration.
4307 Value *PostInc =
4308 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4309 for (User *U : PostInc->users())
4310 if (!OrigLoop->contains(cast<Instruction>(U)))
4311 return false;
4312 // Look for uses of penultimate value of the induction.
4313 for (User *U : Entry.first->users())
4314 if (!OrigLoop->contains(cast<Instruction>(U)))
4315 return false;
4316 }
4317
4318 // Epilogue vectorization code has not been audited to ensure it handles
4319 // non-latch exits properly. It may be fine, but it needs to be audited and
4320 // tested.
4321 // TODO: Add support for loops with an early exit.
4322 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4323 return false;
4324
4325 return true;
4326}
4327
4329 const ElementCount VF, const unsigned IC) const {
4330 // FIXME: We need a much better cost-model to take different parameters such
4331 // as register pressure, code size increase and cost of extra branches into
4332 // account. For now we apply a very crude heuristic and only consider loops
4333 // with vectorization factors larger than a certain value.
4334
4335 // Allow the target to opt out entirely.
4337 return false;
4338
4339 // We also consider epilogue vectorization unprofitable for targets that don't
4340 // consider interleaving beneficial (e.g. MVE).
4341 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4342 return false;
4343
4344 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4345 // VFs when deciding profitability.
4346 // See related "TODO: extend to support scalable VFs." in
4347 // selectEpilogueVectorizationFactor.
4348 unsigned Multiplier = VF.isFixed() ? IC : 1;
4349 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4352 return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
4353 MinVFThreshold;
4354}
4355
4357 const ElementCount MainLoopVF, unsigned IC) {
4360 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4361 return Result;
4362 }
4363
4364 if (!CM.isScalarEpilogueAllowed()) {
4365 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4366 "epilogue is allowed.\n");
4367 return Result;
4368 }
4369
4370 // Not really a cost consideration, but check for unsupported cases here to
4371 // simplify the logic.
4372 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4373 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4374 "is not a supported candidate.\n");
4375 return Result;
4376 }
4377
4379 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4381 if (hasPlanWithVF(ForcedEC))
4382 return {ForcedEC, 0, 0};
4383
4384 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4385 "viable.\n");
4386 return Result;
4387 }
4388
4389 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4390 LLVM_DEBUG(
4391 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4392 return Result;
4393 }
4394
4395 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4396 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4397 "this loop\n");
4398 return Result;
4399 }
4400
4401 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4402 // the main loop handles 8 lanes per iteration. We could still benefit from
4403 // vectorizing the epilogue loop with VF=4.
4404 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4405 estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
4406
4407 ScalarEvolution &SE = *PSE.getSE();
4408 Type *TCType = Legal->getWidestInductionType();
4409 const SCEV *RemainingIterations = nullptr;
4410 unsigned MaxTripCount = 0;
4411 const SCEV *TC =
4412 vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
4413 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4414 RemainingIterations =
4415 SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
4416
4417 // No iterations left to process in the epilogue.
4418 if (RemainingIterations->isZero())
4419 return Result;
4420
4421 if (MainLoopVF.isFixed()) {
4422 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4423 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4424 SE.getConstant(TCType, MaxTripCount))) {
4425 MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4426 }
4427 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4428 << MaxTripCount << "\n");
4429 }
4430
4431 for (auto &NextVF : ProfitableVFs) {
4432 // Skip candidate VFs without a corresponding VPlan.
4433 if (!hasPlanWithVF(NextVF.Width))
4434 continue;
4435
4436 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4437 // vectors) or > the VF of the main loop (fixed vectors).
4438 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4439 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4440 (NextVF.Width.isScalable() &&
4441 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4442 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4443 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4444 continue;
4445
4446 // If NextVF is greater than the number of remaining iterations, the
4447 // epilogue loop would be dead. Skip such factors.
4448 if (RemainingIterations && !NextVF.Width.isScalable()) {
4449 if (SE.isKnownPredicate(
4451 SE.getConstant(TCType, NextVF.Width.getFixedValue()),
4452 RemainingIterations))
4453 continue;
4454 }
4455
4456 if (Result.Width.isScalar() ||
4457 isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
4458 Result = NextVF;
4459 }
4460
4461 if (Result != VectorizationFactor::Disabled())
4462 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4463 << Result.Width << "\n");
4464 return Result;
4465}
4466
4467std::pair<unsigned, unsigned>
4469 unsigned MinWidth = -1U;
4470 unsigned MaxWidth = 8;
4472 // For in-loop reductions, no element types are added to ElementTypesInLoop
4473 // if there are no loads/stores in the loop. In this case, check through the
4474 // reduction variables to determine the maximum width.
4475 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4476 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4477 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4478 // When finding the min width used by the recurrence we need to account
4479 // for casts on the input operands of the recurrence.
4480 MinWidth = std::min(
4481 MinWidth,
4482 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4484 MaxWidth = std::max(MaxWidth,
4486 }
4487 } else {
4488 for (Type *T : ElementTypesInLoop) {
4489 MinWidth = std::min<unsigned>(
4490 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4491 MaxWidth = std::max<unsigned>(
4492 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4493 }
4494 }
4495 return {MinWidth, MaxWidth};
4496}
4497
4499 ElementTypesInLoop.clear();
4500 // For each block.
4501 for (BasicBlock *BB : TheLoop->blocks()) {
4502 // For each instruction in the loop.
4503 for (Instruction &I : BB->instructionsWithoutDebug()) {
4504 Type *T = I.getType();
4505
4506 // Skip ignored values.
4507 if (ValuesToIgnore.count(&I))
4508 continue;
4509
4510 // Only examine Loads, Stores and PHINodes.
4511 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4512 continue;
4513
4514 // Examine PHI nodes that are reduction variables. Update the type to
4515 // account for the recurrence type.
4516 if (auto *PN = dyn_cast<PHINode>(&I)) {
4517 if (!Legal->isReductionVariable(PN))
4518 continue;
4519 const RecurrenceDescriptor &RdxDesc =
4523 RdxDesc.getRecurrenceType()))
4524 continue;
4525 T = RdxDesc.getRecurrenceType();
4526 }
4527
4528 // Examine the stored values.
4529 if (auto *ST = dyn_cast<StoreInst>(&I))
4530 T = ST->getValueOperand()->getType();
4531
4532 assert(T->isSized() &&
4533 "Expected the load/store/recurrence type to be sized");
4534
4535 ElementTypesInLoop.insert(T);
4536 }
4537 }
4538}
4539
4540unsigned
4542 InstructionCost LoopCost) {
4543 // -- The interleave heuristics --
4544 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4545 // There are many micro-architectural considerations that we can't predict
4546 // at this level. For example, frontend pressure (on decode or fetch) due to
4547 // code size, or the number and capabilities of the execution ports.
4548 //
4549 // We use the following heuristics to select the interleave count:
4550 // 1. If the code has reductions, then we interleave to break the cross
4551 // iteration dependency.
4552 // 2. If the loop is really small, then we interleave to reduce the loop
4553 // overhead.
4554 // 3. We don't interleave if we think that we will spill registers to memory
4555 // due to the increased register pressure.
4556
4557 if (!CM.isScalarEpilogueAllowed())
4558 return 1;
4559
4561 IsaPred<VPEVLBasedIVPHIRecipe>)) {
4562 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4563 "Unroll factor forced to be 1.\n");
4564 return 1;
4565 }
4566
4567 // The maximum safe dependence distance already constrains the VF; do not interleave further.
4568 if (!Legal->isSafeForAnyVectorWidth())
4569 return 1;
4570
4571 // We don't attempt to perform interleaving for loops with uncountable early
4572 // exits because the VPInstruction::AnyOf code cannot currently handle
4573 // multiple parts.
4574 if (Plan.hasEarlyExit())
4575 return 1;
4576
4577 const bool HasReductions =
4579 IsaPred<VPReductionPHIRecipe>);
4580
4581 // If we did not calculate the cost for VF (because the user selected the VF)
4582 // then we calculate the cost of VF here.
4583 if (LoopCost == 0) {
4584 if (VF.isScalar())
4585 LoopCost = CM.expectedCost(VF);
4586 else
4587 LoopCost = cost(Plan, VF);
4588 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4589
4590 // Loop body is free and there is no need for interleaving.
4591 if (LoopCost == 0)
4592 return 1;
4593 }
4594
4595 VPRegisterUsage R =
4597 // We divide by these constants so assume that we have at least one
4598 // instruction that uses at least one register.
4599 for (auto &Pair : R.MaxLocalUsers) {
4600 Pair.second = std::max(Pair.second, 1U);
4601 }
4602
4603 // We calculate the interleave count using the following formula.
4604 // Subtract the number of loop invariants from the number of available
4605 // registers. These registers are used by all of the interleaved instances.
4606 // Next, divide the remaining registers by the number of registers that is
4607 // required by the loop, in order to estimate how many parallel instances
4608 // fit without causing spills. All of this is rounded down if necessary to be
4609 // a power of two. We want power of two interleave count to simplify any
4610 // addressing operations or alignment considerations.
4611 // We also want power of two interleave counts to ensure that the induction
4612 // variable of the vector loop wraps to zero, when tail is folded by masking;
4613 // this currently happens when OptForSize, in which case IC is set to 1 above.
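// For illustration only, with hypothetical numbers: 32 available registers in
// a class, 2 loop-invariant values and at most 5 simultaneously live values
// give bit_floor((32 - 2) / 5) = bit_floor(6) = 4 as the candidate interleave
// count for that register class.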
4614 unsigned IC = UINT_MAX;
4615
4616 for (const auto &Pair : R.MaxLocalUsers) {
4617 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4618 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4619 << " registers of "
4620 << TTI.getRegisterClassName(Pair.first)
4621 << " register class\n");
4622 if (VF.isScalar()) {
4623 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4624 TargetNumRegisters = ForceTargetNumScalarRegs;
4625 } else {
4626 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4627 TargetNumRegisters = ForceTargetNumVectorRegs;
4628 }
4629 unsigned MaxLocalUsers = Pair.second;
4630 unsigned LoopInvariantRegs = 0;
4631 if (R.LoopInvariantRegs.contains(Pair.first))
4632 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4633
4634 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4635 MaxLocalUsers);
4636 // Don't count the induction variable as interleaved.
4638 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4639 std::max(1U, (MaxLocalUsers - 1)));
4640 }
4641
4642 IC = std::min(IC, TmpIC);
4643 }
4644
4645 // Clamp the interleave ranges to reasonable counts.
4646 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4647
4648 // Check if the user has overridden the max.
4649 if (VF.isScalar()) {
4650 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4651 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4652 } else {
4653 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4654 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4655 }
4656
4657 // Try to get the exact trip count, or an estimate based on profiling data or
4658 // ConstantMax from PSE, failing that.
4659 auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop);
4660
4661 // For fixed length VFs treat a scalable trip count as unknown.
4662 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
4663 // Re-evaluate trip counts and VFs to be in the same numerical space.
4664 unsigned AvailableTC =
4665 estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
4666 unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());
4667
4668 // At least one iteration must be scalar when this constraint holds. So the
4669 // maximum available iterations for interleaving is one less.
4670 if (CM.requiresScalarEpilogue(VF.isVector()))
4671 --AvailableTC;
4672
4673 unsigned InterleaveCountLB = bit_floor(std::max(
4674 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4675
4676 if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
4677 // If the best known trip count is exact, we select between two
4678 // prospective ICs, where
4679 //
4680 // 1) the aggressive IC is capped by the trip count divided by VF
4681 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4682 //
4683 // The final IC is selected in a way that the epilogue loop trip count is
4684 // minimized while maximizing the IC itself, so that we either run the
4685 // vector loop at least once if it generates a small epilogue loop, or
4686 // else we run the vector loop at least twice.
4687
4688 unsigned InterleaveCountUB = bit_floor(std::max(
4689 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4690 MaxInterleaveCount = InterleaveCountLB;
4691
4692 if (InterleaveCountUB != InterleaveCountLB) {
4693 unsigned TailTripCountUB =
4694 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4695 unsigned TailTripCountLB =
4696 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4697 // If both produce the same scalar tail, maximize the IC to do the same work
4698 // in fewer vector loop iterations.
4699 if (TailTripCountUB == TailTripCountLB)
4700 MaxInterleaveCount = InterleaveCountUB;
4701 }
4702 } else {
4703 // If the trip count is an estimated compile-time constant, limit the
4704 // IC to be capped by the trip count divided by VF * 2, such that the
4705 // vector loop runs at least twice to make interleaving seem profitable
4706 // when there is an epilogue loop present. Since the exact trip count is
4707 // not known, we choose to be conservative in our IC estimate.
4708 MaxInterleaveCount = InterleaveCountLB;
4709 }
4710 }
4711
4712 assert(MaxInterleaveCount > 0 &&
4713 "Maximum interleave count must be greater than 0");
4714
4715 // Clamp the calculated IC to be between 1 and the max interleave count
4716 // that the target and trip count allow.
4717 if (IC > MaxInterleaveCount)
4718 IC = MaxInterleaveCount;
4719 else
4720 // Make sure IC is greater than 0.
4721 IC = std::max(1u, IC);
4722
4723 assert(IC > 0 && "Interleave count must be greater than 0.");
4724
4725 // Interleave if we vectorized this loop and there is a reduction that could
4726 // benefit from interleaving.
4727 if (VF.isVector() && HasReductions) {
4728 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4729 return IC;
4730 }
4731
4732 // For any scalar loop that either requires runtime checks or predication we
4733 // are better off leaving this to the unroller. Note that if we've already
4734 // vectorized the loop we will have done the runtime check and so interleaving
4735 // won't require further checks.
4736 bool ScalarInterleavingRequiresPredication =
4737 (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
4738 return Legal->blockNeedsPredication(BB);
4739 }));
4740 bool ScalarInterleavingRequiresRuntimePointerCheck =
4741 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4742
4743 // We want to interleave small loops in order to reduce the loop overhead and
4744 // potentially expose ILP opportunities.
4745 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4746 << "LV: IC is " << IC << '\n'
4747 << "LV: VF is " << VF << '\n');
4748 const bool AggressivelyInterleaveReductions =
4749 TTI.enableAggressiveInterleaving(HasReductions);
4750 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4751 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4752 // We assume that the cost overhead is 1 and we use the cost model
4753 // to estimate the cost of the loop and interleave until the cost of the
4754 // loop overhead is about 5% of the cost of the loop.
4755 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
4756 SmallLoopCost / LoopCost.getValue()));
4757
4758 // Interleave until store/load ports (estimated by max interleave count) are
4759 // saturated.
4760 unsigned NumStores = 0;
4761 unsigned NumLoads = 0;
4762 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4764 for (VPRecipeBase &R : *VPBB) {
4765 if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4766 NumLoads++;
4767 continue;
4768 }
4769 if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4770 NumStores++;
4771 continue;
4772 }
4773
4774 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4775 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
4776 NumStores += StoreOps;
4777 else
4778 NumLoads += InterleaveR->getNumDefinedValues();
4779 continue;
4780 }
4781 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4782 NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
4783 NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
4784 continue;
4785 }
4786 if (isa<VPHistogramRecipe>(&R)) {
4787 NumLoads++;
4788 NumStores++;
4789 continue;
4790 }
4791 }
4792 }
4793 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4794 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
4795
4796 // There is little point in interleaving for reductions containing selects
4797 // and compares when VF=1 since it may just create more overhead than it's
4798 // worth for loops with small trip counts. This is because we still have to
4799 // do the final reduction after the loop.
4800 bool HasSelectCmpReductions =
4801 HasReductions &&
4803 [](VPRecipeBase &R) {
4804 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4805 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
4806 RedR->getRecurrenceKind()) ||
4807 RecurrenceDescriptor::isFindIVRecurrenceKind(
4808 RedR->getRecurrenceKind()));
4809 });
4810 if (HasSelectCmpReductions) {
4811 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4812 return 1;
4813 }
4814
4815 // If we have a scalar reduction (vector reductions are already dealt with
4816 // by this point), we can increase the critical path length if the loop
4817 // we're interleaving is inside another loop. For tree-wise reductions
4818 // set the limit to 2, and for ordered reductions it's best to disable
4819 // interleaving entirely.
4820 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
4821 bool HasOrderedReductions =
4823 [](VPRecipeBase &R) {
4824 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4825
4826 return RedR && RedR->isOrdered();
4827 });
4828 if (HasOrderedReductions) {
4829 LLVM_DEBUG(
4830 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4831 return 1;
4832 }
4833
4834 unsigned F = MaxNestedScalarReductionIC;
4835 SmallIC = std::min(SmallIC, F);
4836 StoresIC = std::min(StoresIC, F);
4837 LoadsIC = std::min(LoadsIC, F);
4838 }
4839
4841 std::max(StoresIC, LoadsIC) > SmallIC) {
4842 LLVM_DEBUG(
4843 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4844 return std::max(StoresIC, LoadsIC);
4845 }
4846
4847 // If there are scalar reductions and TTI has enabled aggressive
4848 // interleaving for reductions, we will interleave to expose ILP.
4849 if (VF.isScalar() && AggressivelyInterleaveReductions) {
4850 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4851 // Interleave no less than SmallIC but not as aggressive as the normal IC
4852 // to satisfy the rare situation when resources are too limited.
4853 return std::max(IC / 2, SmallIC);
4854 }
4855
4856 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4857 return SmallIC;
4858 }
4859
4860 // Interleave if this is a large loop (small loops are already dealt with by
4861 // this point) that could benefit from interleaving.
4862 if (AggressivelyInterleaveReductions) {
4863 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4864 return IC;
4865 }
4866
4867 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4868 return 1;
4869}
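// Illustrative sketch, not part of LoopVectorize.cpp: a standalone restatement
// of the register-pressure formula used by the interleave-count selection
// above. The function name and the example numbers are assumptions made for
// illustration; the real code queries TTI and the VPlan register-usage
// analysis for these values.
static unsigned interleaveCountFromRegisterPressureSketch(
    unsigned TargetNumRegisters, unsigned LoopInvariantRegs,
    unsigned MaxLocalUsers) {
  // Assume at least one register is live per interleaved copy to avoid
  // dividing by zero, mirroring the clamp near the top of the function above.
  MaxLocalUsers = std::max(MaxLocalUsers, 1U);
  // Registers holding loop-invariant values are shared by all interleaved
  // copies; the remainder is divided among the copies and rounded down to a
  // power of two.
  return llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                         MaxLocalUsers);
}
// For example, 32 vector registers, 2 loop-invariant values and 10 local users
// give bit_floor(30 / 10) == 2 interleaved copies, before the trip-count and
// target clamps above are applied.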
4870
4871bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4872 ElementCount VF) {
4873 // TODO: Cost model for emulated masked load/store is completely
4874 // broken. This hack guides the cost model to use an artificially
4875 // high enough value to practically disable vectorization with such
4876 // operations, except where previously deployed legality hack allowed
4877 // using very low cost values. This is to avoid regressions coming simply
4878 // from moving "masked load/store" check from legality to cost model.
4879 // Masked Load/Gather emulation was previously never allowed.
4880 // Limited number of Masked Store/Scatter emulation was allowed.
4882 "Expecting a scalar emulated instruction");
4883 return isa<LoadInst>(I) ||
4884 (isa<StoreInst>(I) &&
4885 NumPredStores > NumberOfStoresToPredicate);
4886}
4887
4889 assert(VF.isVector() && "Expected VF >= 2");
4890
4891 // If we've already collected the instructions to scalarize or the predicated
4892 // BBs after vectorization, there's nothing to do. Collection may already have
4893 // occurred if we have a user-selected VF and are now computing the expected
4894 // cost for interleaving.
4895 if (InstsToScalarize.contains(VF) ||
4896 PredicatedBBsAfterVectorization.contains(VF))
4897 return;
4898
4899 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
4900 // not profitable to scalarize any instructions, the presence of VF in the
4901 // map will indicate that we've analyzed it already.
4902 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
4903
4904 // Find all the instructions that are scalar with predication in the loop and
4905 // determine if it would be better to not if-convert the blocks they are in.
4906 // If so, we also record the instructions to scalarize.
4907 for (BasicBlock *BB : TheLoop->blocks()) {
4909 continue;
4910 for (Instruction &I : *BB)
4911 if (isScalarWithPredication(&I, VF)) {
4912 ScalarCostsTy ScalarCosts;
4913 // Do not apply discount logic for:
4914 // 1. Scalars after vectorization, as there will only be a single copy
4915 // of the instruction.
4916 // 2. Scalable VF, as that would lead to invalid scalarization costs.
4917 // 3. Emulated masked memrefs, if a hacked cost is needed.
4918 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
4919 !useEmulatedMaskMemRefHack(&I, VF) &&
4920 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
4921 for (const auto &[I, IC] : ScalarCosts)
4922 ScalarCostsVF.insert({I, IC});
4923 // Check if we decided to scalarize a call. If so, update the widening
4924 // decision of the call to CM_Scalarize with the computed scalar cost.
4925 for (const auto &[I, Cost] : ScalarCosts) {
4926 auto *CI = dyn_cast<CallInst>(I);
4927 if (!CI || !CallWideningDecisions.contains({CI, VF}))
4928 continue;
4929 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
4930 CallWideningDecisions[{CI, VF}].Cost = Cost;
4931 }
4932 }
4933 // Remember that BB will remain after vectorization.
4934 PredicatedBBsAfterVectorization[VF].insert(BB);
4935 for (auto *Pred : predecessors(BB)) {
4936 if (Pred->getSingleSuccessor() == BB)
4937 PredicatedBBsAfterVectorization[VF].insert(Pred);
4938 }
4939 }
4940 }
4941}
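// Illustrative sketch, not part of this file: the profitability gate applied
// above before a predicated instruction (and the chain feeding it) is recorded
// for scalarization. The parameter names are assumptions for the example; the
// real code derives them from the cost model state and computePredInstDiscount.
static bool shouldScalarizePredicatedChainSketch(
    bool IsScalarAfterVectorization, bool IsScalableVF,
    bool NeedsEmulatedMaskMemRefHack, long Discount) {
  // Single-copy scalars, scalable VFs and hacked emulated masked memrefs are
  // excluded; otherwise a non-negative discount means the scalar form is at
  // least as cheap as the vectorized form.
  return !IsScalarAfterVectorization && !IsScalableVF &&
         !NeedsEmulatedMaskMemRefHack && Discount >= 0;
}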
4942
4943InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4944 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4945 assert(!isUniformAfterVectorization(PredInst, VF) &&
4946 "Instruction marked uniform-after-vectorization will be predicated");
4947
4948 // Initialize the discount to zero, meaning that the scalar version and the
4949 // vector version cost the same.
4950 InstructionCost Discount = 0;
4951
4952 // Holds instructions to analyze. The instructions we visit are mapped in
4953 // ScalarCosts. Those instructions are the ones that would be scalarized if
4954 // we find that the scalar version costs less.
4956
4957 // Returns true if the given instruction can be scalarized.
4958 auto CanBeScalarized = [&](Instruction *I) -> bool {
4959 // We only attempt to scalarize instructions forming a single-use chain
4960 // from the original predicated block that would otherwise be vectorized.
4961 // Although not strictly necessary, we give up on instructions we know will
4962 // already be scalar to avoid traversing chains that are unlikely to be
4963 // beneficial.
4964 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4966 return false;
4967
4968 // If the instruction is scalar with predication, it will be analyzed
4969 // separately. We ignore it within the context of PredInst.
4970 if (isScalarWithPredication(I, VF))
4971 return false;
4972
4973 // If any of the instruction's operands are uniform after vectorization,
4974 // the instruction cannot be scalarized. This prevents, for example, a
4975 // masked load from being scalarized.
4976 //
4977 // We assume we will only emit a value for lane zero of an instruction
4978 // marked uniform after vectorization, rather than VF identical values.
4979 // Thus, if we scalarize an instruction that uses a uniform, we would
4980 // create uses of values corresponding to the lanes we aren't emitting code
4981 // for. This behavior can be changed by allowing getScalarValue to clone
4982 // the lane zero values for uniforms rather than asserting.
4983 for (Use &U : I->operands())
4984 if (auto *J = dyn_cast<Instruction>(U.get()))
4985 if (isUniformAfterVectorization(J, VF))
4986 return false;
4987
4988 // Otherwise, we can scalarize the instruction.
4989 return true;
4990 };
4991
4992 // Compute the expected cost discount from scalarizing the entire expression
4993 // feeding the predicated instruction. We currently only consider expressions
4994 // that are single-use instruction chains.
4995 Worklist.push_back(PredInst);
4996 while (!Worklist.empty()) {
4997 Instruction *I = Worklist.pop_back_val();
4998
4999 // If we've already analyzed the instruction, there's nothing to do.
5000 if (ScalarCosts.contains(I))
5001 continue;
5002
5003 // Cannot scalarize fixed-order recurrence phis at the moment.
5004 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5005 continue;
5006
5007 // Compute the cost of the vector instruction. Note that this cost already
5008 // includes the scalarization overhead of the predicated instruction.
5009 InstructionCost VectorCost = getInstructionCost(I, VF);
5010
5011 // Compute the cost of the scalarized instruction. This cost is the cost of
5012 // the instruction as if it wasn't if-converted and instead remained in the
5013 // predicated block. We will scale this cost by block probability after
5014 // computing the scalarization overhead.
5015 InstructionCost ScalarCost =
5017
5018 // Compute the scalarization overhead of needed insertelement instructions
5019 // and phi nodes.
5020 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5021 Type *WideTy = toVectorizedTy(I->getType(), VF);
5022 for (Type *VectorTy : getContainedTypes(WideTy)) {
5023 ScalarCost += TTI.getScalarizationOverhead(
5024 cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5025 /*Insert=*/true,
5026 /*Extract=*/false, CostKind);
5027 }
5028 ScalarCost +=
5029 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5030 }
5031
5032 // Compute the scalarization overhead of needed extractelement
5033 // instructions. For each of the instruction's operands, if the operand can
5034 // be scalarized, add it to the worklist; otherwise, account for the
5035 // overhead.
5036 for (Use &U : I->operands())
5037 if (auto *J = dyn_cast<Instruction>(U.get())) {
5038 assert(canVectorizeTy(J->getType()) &&
5039 "Instruction has non-scalar type");
5040 if (CanBeScalarized(J))
5041 Worklist.push_back(J);
5042 else if (needsExtract(J, VF)) {
5043 Type *WideTy = toVectorizedTy(J->getType(), VF);
5044 for (Type *VectorTy : getContainedTypes(WideTy)) {
5045 ScalarCost += TTI.getScalarizationOverhead(
5046 cast<VectorType>(VectorTy),
5047 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5048 /*Extract*/ true, CostKind);
5049 }
5050 }
5051 }
5052
5053 // Scale the total scalar cost by block probability.
5054 ScalarCost /= getPredBlockCostDivisor(CostKind);
5055
5056 // Compute the discount. A non-negative discount means the vector version
5057 // of the instruction costs more, and scalarizing would be beneficial.
5058 Discount += VectorCost - ScalarCost;
5059 ScalarCosts[I] = ScalarCost;
5060 }
5061
5062 return Discount;
5063}
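// Illustrative sketch, not the actual implementation: how the discount above
// accumulates over a scalarizable chain. The pair elements stand in for the
// per-instruction vector and scalar costs, and PredBlockCostDivisor stands in
// for getPredBlockCostDivisor(); both are assumptions made for the example.
static long predInstDiscountSketch(
    ArrayRef<std::pair<unsigned, unsigned>> VectorAndScalarCosts,
    unsigned PredBlockCostDivisor) {
  long Discount = 0;
  for (const auto &[VectorCost, ScalarCost] : VectorAndScalarCosts)
    // The scalar version only runs when the predicated block executes, so its
    // cost is scaled down before being compared with the vector version.
    Discount += static_cast<long>(VectorCost) -
                static_cast<long>(ScalarCost / PredBlockCostDivisor);
  return Discount; // A non-negative result favors scalarizing the chain.
}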
5064
5067
5068 // If the vector loop gets executed exactly once with the given VF, ignore the
5069 // costs of comparison and induction instructions, as they'll get simplified
5070 // away.
5071 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5073 if (TC == VF && !foldTailByMasking())
5075 ValuesToIgnoreForVF);
5076
5077 // For each block.
5078 for (BasicBlock *BB : TheLoop->blocks()) {
5079 InstructionCost BlockCost;
5080
5081 // For each instruction in the old loop.
5082 for (Instruction &I : BB->instructionsWithoutDebug()) {
5083 // Skip ignored values.
5084 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5085 (VF.isVector() && VecValuesToIgnore.count(&I)))
5086 continue;
5087
5089
5090 // Check if we should override the cost.
5091 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5093
5094 BlockCost += C;
5095 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5096 << VF << " For instruction: " << I << '\n');
5097 }
5098
5099 // If we are vectorizing a predicated block, it will have been
5100 // if-converted. This means that the block's instructions (aside from
5101 // stores and instructions that may divide by zero) will now be
5102 // unconditionally executed. For the scalar case, we may not always execute
5103 // the predicated block, if it is an if-else block. Thus, scale the block's
5104 // cost by the probability of executing it. blockNeedsPredication from
5105 // Legal is used so as to not include all blocks in tail folded loops.
5106 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5107 BlockCost /= getPredBlockCostDivisor(CostKind);
5108
5109 Cost += BlockCost;
5110 }
5111
5112 return Cost;
5113}
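// Illustrative sketch with assumed inputs: the accumulation pattern of
// expectedCost above. Each entry pairs a block's summed instruction cost with
// a flag saying whether the block needs predication for the scalar VF;
// PredBlockCostDivisor stands in for getPredBlockCostDivisor().
static unsigned expectedLoopCostSketch(
    ArrayRef<std::pair<unsigned, bool>> BlockCostAndNeedsPredication,
    unsigned PredBlockCostDivisor) {
  unsigned Cost = 0;
  for (const auto &[BlockCost, NeedsPredication] : BlockCostAndNeedsPredication)
    // A predicated block is not executed on every scalar iteration, so its
    // cost is scaled by the probability of executing it.
    Cost += NeedsPredication ? BlockCost / PredBlockCostDivisor : BlockCost;
  return Cost;
}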
5114
5115/// Gets Address Access SCEV after verifying that the access pattern
5116/// is loop invariant except the induction variable dependence.
5117///
5118/// This SCEV can be sent to the Target in order to estimate the address
5119/// calculation cost.
5121 Value *Ptr,
5124 const Loop *TheLoop) {
5125
5126 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5127 if (!Gep)
5128 return nullptr;
5129
5130 // We are looking for a gep with all loop invariant indices except for one
5131 // which should be an induction variable.
5132 auto *SE = PSE.getSE();
5133 unsigned NumOperands = Gep->getNumOperands();
5134 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5135 Value *Opd = Gep->getOperand(Idx);
5136 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5137 !Legal->isInductionVariable(Opd))
5138 return nullptr;
5139 }
5140
5141 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5142 return PSE.getSCEV(Ptr);
5143}
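// Illustrative sketch, with the legality and SCEV queries abstracted into
// caller-provided predicates (an assumption for the example): the structural
// check performed by getAddressAccessSCEV above.
static bool hasAnalyzableAddressPatternSketch(
    const GetElementPtrInst *Gep, function_ref<bool(Value *)> IsLoopInvariant,
    function_ref<bool(Value *)> IsInductionVariable) {
  if (!Gep)
    return false;
  // Skip the pointer operand; every index must either be loop invariant or an
  // induction variable for the address SCEV to be worth passing to the target.
  for (Value *Idx : drop_begin(Gep->operands()))
    if (!IsLoopInvariant(Idx) && !IsInductionVariable(Idx))
      return false;
  return true;
}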
5144
5146LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5147 ElementCount VF) {
5148 assert(VF.isVector() &&
5149 "Scalarization cost of instruction implies vectorization.");
5150 if (VF.isScalable())
5152
5153 Type *ValTy = getLoadStoreType(I);
5154 auto *SE = PSE.getSE();
5155
5156 unsigned AS = getLoadStoreAddressSpace(I);
5158 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5159 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5160 // that it is being called from this specific place.
5161
5162 // Figure out whether the access is strided and get the stride value
5163 // if it's known at compile time.
5164 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5165
5166 // Get the cost of the scalar memory instruction and address computation.
5168 PtrTy, SE, PtrSCEV, CostKind);
5169
5170 // Don't pass *I here, since it is scalar but will actually be part of a
5171 // vectorized loop where the user of it is a vectorized instruction.
5172 const Align Alignment = getLoadStoreAlignment(I);
5173 Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
5174 ValTy->getScalarType(),
5175 Alignment, AS, CostKind);
5176
5177 // Get the overhead of the extractelement and insertelement instructions
5178 // we might create due to scalarization.
5179 Cost += getScalarizationOverhead(I, VF);
5180
5181 // If we have a predicated load/store, it will need extra i1 extracts and
5182 // conditional branches, but may not be executed for each vector lane. Scale
5183 // the cost by the probability of executing the predicated block.
5184 if (isPredicatedInst(I)) {
5186
5187 // Add the cost of an i1 extract and a branch
5188 auto *VecI1Ty =
5191 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5192 /*Insert=*/false, /*Extract=*/true, CostKind);
5193 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5194
5195 if (useEmulatedMaskMemRefHack(I, VF))
5196 // Artificially setting to a high enough value to practically disable
5197 // vectorization with such operations.
5198 Cost = 3000000;
5199 }
5200
5201 return Cost;
5202}
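// Illustrative sketch with assumed component costs: the overall shape of the
// scalarized memory-op cost computed above for a fixed VF. The real code also
// adds i1 extracts and branches for predicated accesses and applies the
// emulated-mask hack; those refinements are omitted here.
static unsigned memInstScalarizationCostSketch(unsigned VF, unsigned AddressCost,
                                               unsigned ScalarMemOpCost,
                                               unsigned InsertExtractOverhead,
                                               bool IsPredicated,
                                               unsigned PredBlockCostDivisor) {
  // One scalar load/store per lane, plus the cost of moving values between
  // vector and scalar registers.
  unsigned Cost = AddressCost + VF * ScalarMemOpCost + InsertExtractOverhead;
  // A predicated access only runs for active lanes, so scale by the
  // probability of executing the predicated block.
  if (IsPredicated)
    Cost /= PredBlockCostDivisor;
  return Cost;
}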
5203
5205LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5206 ElementCount VF) {
5207 Type *ValTy = getLoadStoreType(I);
5208 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5210 unsigned AS = getLoadStoreAddressSpace(I);
5211 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5212
5213 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5214 "Stride should be 1 or -1 for consecutive memory access");
5215 const Align Alignment = getLoadStoreAlignment(I);
5217 if (Legal->isMaskRequired(I)) {
5218 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5219 CostKind);
5220 } else {
5221 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5222 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5223 CostKind, OpInfo, I);
5224 }
5225
5226 bool Reverse = ConsecutiveStride < 0;
5227 if (Reverse)
5229 VectorTy, {}, CostKind, 0);
5230 return Cost;
5231}
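// Illustrative sketch with assumed component costs: a consecutive access above
// is costed as one wide load/store (a masked one when predication is
// required), plus a reverse shuffle when the stride is -1.
static unsigned consecutiveMemOpCostSketch(int ConsecutiveStride,
                                           bool MaskRequired,
                                           unsigned WideMemOpCost,
                                           unsigned MaskedMemOpCost,
                                           unsigned ReverseShuffleCost) {
  unsigned Cost = MaskRequired ? MaskedMemOpCost : WideMemOpCost;
  if (ConsecutiveStride < 0)
    Cost += ReverseShuffleCost; // Lanes must be reversed for a -1 stride.
  return Cost;
}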
5232
5234LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5235 ElementCount VF) {
5236 assert(Legal->isUniformMemOp(*I, VF));
5237
5238 Type *ValTy = getLoadStoreType(I);
5240 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5241 const Align Alignment = getLoadStoreAlignment(I);
5242 unsigned AS = getLoadStoreAddressSpace(I);
5243 if (isa<LoadInst>(I)) {
5244 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5245 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5246 CostKind) +
5248 VectorTy, {}, CostKind);
5249 }
5250 StoreInst *SI = cast<StoreInst>(I);
5251
5252 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5253 // TODO: We have existing tests that request the cost of extracting element
5254 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5255 // the actual generated code, which involves extracting the last element of
5256 // a scalable vector where the lane to extract is unknown at compile time.
5258 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5259 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5260 if (!IsLoopInvariantStoreValue)
5261 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
5262 VectorTy, CostKind, 0);
5263 return Cost;
5264}
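// Illustrative sketch with assumed component costs: the shape of the uniform
// memory-op cost above. A uniform load is one scalar load plus a broadcast; a
// uniform store is one scalar store plus, when the stored value is not loop
// invariant, an extract of the last vector lane.
static unsigned uniformMemOpCostSketch(bool IsLoad, unsigned AddressCost,
                                       unsigned ScalarMemOpCost,
                                       unsigned BroadcastCost,
                                       unsigned ExtractLastLaneCost,
                                       bool IsLoopInvariantStoreValue) {
  if (IsLoad)
    return AddressCost + ScalarMemOpCost + BroadcastCost;
  unsigned Cost = AddressCost + ScalarMemOpCost;
  if (!IsLoopInvariantStoreValue)
    Cost += ExtractLastLaneCost;
  return Cost;
}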
5265
5267LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5268 ElementCount VF) {
5269 Type *ValTy = getLoadStoreType(I);
5270 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5271 const Align Alignment = getLoadStoreAlignment(I);
5273 Type *PtrTy = Ptr->getType();
5274
5275 if (!Legal->isUniform(Ptr, VF))
5276 PtrTy = toVectorTy(PtrTy, VF);
5277
5278 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5279 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5280 Legal->isMaskRequired(I), Alignment,
5281 CostKind, I);
5282}
5283
5285LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5286 ElementCount VF) {
5287 const auto *Group = getInterleavedAccessGroup(I);
5288 assert(Group && "Fail to get an interleaved access group.");
5289
5290 Instruction *InsertPos = Group->getInsertPos();
5291 Type *ValTy = getLoadStoreType(InsertPos);
5292 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5293 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5294
5295 unsigned InterleaveFactor = Group->getFactor();
5296 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5297
5298 // Holds the indices of existing members in the interleaved group.
5300 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5301 if (Group->getMember(IF))
5302 Indices.push_back(IF);
5303
5304 // Calculate the cost of the whole interleaved group.
5305 bool UseMaskForGaps =
5306 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5307 (isa<StoreInst>(I) && !Group->isFull());
5309 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5310 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5311 UseMaskForGaps);
5312
5313 if (Group->isReverse()) {
5314 // TODO: Add support for reversed masked interleaved access.
5316 "Reverse masked interleaved access not supported.");
5317 Cost += Group->getNumMembers() *
5319 VectorTy, {}, CostKind, 0);
5320 }
5321 return Cost;
5322}
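// Illustrative sketch with assumed component costs: the pieces assembled above
// for an interleaved group. The group is costed as a single access of a
// VF * Factor element vector, plus one reverse shuffle per present member when
// the group is reversed.
static unsigned interleaveGroupCostSketch(unsigned NumPresentMembers,
                                          bool IsReverse,
                                          unsigned WideInterleavedAccessCost,
                                          unsigned ReverseShuffleCost) {
  unsigned Cost = WideInterleavedAccessCost;
  if (IsReverse)
    Cost += NumPresentMembers * ReverseShuffleCost;
  return Cost;
}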
5323
5324std::optional<InstructionCost>
5326 ElementCount VF,
5327 Type *Ty) const {
5328 using namespace llvm::PatternMatch;
5329 // Early exit if there are no in-loop reductions.
5330 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5331 return std::nullopt;
5332 auto *VectorTy = cast<VectorType>(Ty);
5333
5334 // We are looking for one of the following patterns, and finding the minimal acceptable cost:
5335 // reduce(mul(ext(A), ext(B))) or
5336 // reduce(mul(A, B)) or
5337 // reduce(ext(A)) or
5338 // reduce(A).
5339 // The basic idea is that we walk down the tree to do that, finding the root
5340 // reduction instruction in InLoopReductionImmediateChains. From there we find
5341 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5342 // of the components. If the reduction cost is lower, then we return it for the
5343 // reduction instruction and 0 for the other instructions in the pattern. If
5344 // it is not, we return an invalid cost specifying that the original cost method
5345 // should be used.
5346 Instruction *RetI = I;
5347 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5348 if (!RetI->hasOneUser())
5349 return std::nullopt;
5350 RetI = RetI->user_back();
5351 }
5352
5353 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5354 RetI->user_back()->getOpcode() == Instruction::Add) {
5355 RetI = RetI->user_back();
5356 }
5357
5358 // Test if the found instruction is a reduction, and if not return an invalid
5359 // cost specifying the parent to use the original cost modelling.
5360 Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5361 if (!LastChain)
5362 return std::nullopt;
5363
5364 // Find the reduction this chain is a part of and calculate the basic cost of
5365 // the reduction on its own.
5366 Instruction *ReductionPhi = LastChain;
5367 while (!isa<PHINode>(ReductionPhi))
5368 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5369
5370 const RecurrenceDescriptor &RdxDesc =
5371 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5372
5373 InstructionCost BaseCost;
5374 RecurKind RK = RdxDesc.getRecurrenceKind();
5377 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5378 RdxDesc.getFastMathFlags(), CostKind);
5379 } else {
5381 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5382 }
5383
5384 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5385 // normal fmul instruction to the cost of the fadd reduction.
5386 if (RK == RecurKind::FMulAdd)
5387 BaseCost +=
5388 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5389
5390 // If we're using ordered reductions then we can just return the base cost
5391 // here, since getArithmeticReductionCost calculates the full ordered
5392 // reduction cost when FP reassociation is not allowed.
5393 if (useOrderedReductions(RdxDesc))
5394 return BaseCost;
5395
5396 // Get the operand that was not the reduction chain and match it to one of the
5397 // patterns, returning the better cost if it is found.
5398 Instruction *RedOp = RetI->getOperand(1) == LastChain
5399 ? dyn_cast<Instruction>(RetI->getOperand(0))
5400 : dyn_cast<Instruction>(RetI->getOperand(1));
5401
5402 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5403
5404 Instruction *Op0, *Op1;
5405 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5406 match(RedOp,
5408 match(Op0, m_ZExtOrSExt(m_Value())) &&
5409 Op0->getOpcode() == Op1->getOpcode() &&
5410 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5412 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5413
5414 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5415 // Note that the extend opcodes need to all match, or if A==B they will have
5416 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5417 // which is equally fine.
5418 bool IsUnsigned = isa<ZExtInst>(Op0);
5419 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5420 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5421
5422 InstructionCost ExtCost =
5423 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5425 InstructionCost MulCost =
5426 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5427 InstructionCost Ext2Cost =
5428 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5430
5432 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5433 CostKind);
5434
5435 if (RedCost.isValid() &&
5436 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5437 return I == RetI ? RedCost : 0;
5438 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5439 !TheLoop->isLoopInvariant(RedOp)) {
5440 // Matched reduce(ext(A))
5441 bool IsUnsigned = isa<ZExtInst>(RedOp);
5442 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5444 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5445 RdxDesc.getFastMathFlags(), CostKind);
5446
5447 InstructionCost ExtCost =
5448 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5450 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5451 return I == RetI ? RedCost : 0;
5452 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5453 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5454 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5455 Op0->getOpcode() == Op1->getOpcode() &&
5457 bool IsUnsigned = isa<ZExtInst>(Op0);
5458 Type *Op0Ty = Op0->getOperand(0)->getType();
5459 Type *Op1Ty = Op1->getOperand(0)->getType();
5460 Type *LargestOpTy =
5461 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5462 : Op0Ty;
5463 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5464
5465 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5466 // different sizes. We take the largest type as the ext to reduce, and add
5467 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5469 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5472 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5474 InstructionCost MulCost =
5475 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5476
5478 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5479 CostKind);
5480 InstructionCost ExtraExtCost = 0;
5481 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5482 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5483 ExtraExtCost = TTI.getCastInstrCost(
5484 ExtraExtOp->getOpcode(), ExtType,
5485 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5487 }
5488
5489 if (RedCost.isValid() &&
5490 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5491 return I == RetI ? RedCost : 0;
5492 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5493 // Matched reduce.add(mul())
5494 InstructionCost MulCost =
5495 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5496
5498 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
5499 CostKind);
5500
5501 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5502 return I == RetI ? RedCost : 0;
5503 }
5504 }
5505
5506 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5507}
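// Illustrative sketch with assumed component costs: the comparison made above
// for the reduce.add(mul(ext(A), ext(B))) pattern. A fused extended-reduction
// cost from TTI replaces the two input extends, the multiply, the outer extend
// and the plain reduction only when it is both valid and strictly cheaper.
static bool preferFusedExtMulAccSketch(bool FusedCostValid,
                                       unsigned FusedRedCost, unsigned ExtCost,
                                       unsigned MulCost, unsigned Ext2Cost,
                                       unsigned BaseRedCost) {
  return FusedCostValid &&
         FusedRedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseRedCost;
}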
5508
5510LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5511 ElementCount VF) {
5512 // Calculate scalar cost only. Vectorization cost should be ready at this
5513 // moment.
5514 if (VF.isScalar()) {
5515 Type *ValTy = getLoadStoreType(I);
5517 const Align Alignment = getLoadStoreAlignment(I);
5518 unsigned AS = getLoadStoreAddressSpace(I);
5519
5520 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5521 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5522 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5523 OpInfo, I);
5524 }
5525 return getWideningCost(I, VF);
5526}
5527
5529LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5530 ElementCount VF) const {
5531
5532 // There is no mechanism yet to create a scalable scalarization loop,
5533 // so this is currently Invalid.
5534 if (VF.isScalable())
5536
5537 if (VF.isScalar())
5538 return 0;
5539
5541 Type *RetTy = toVectorizedTy(I->getType(), VF);
5542 if (!RetTy->isVoidTy() &&
5543 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5544
5545 for (Type *VectorTy : getContainedTypes(RetTy)) {
5547 cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5548 /*Insert=*/true,
5549 /*Extract=*/false, CostKind);
5550 }
5551 }
5552
5553 // Some targets keep addresses scalar.
5554 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5555 return Cost;
5556
5557 // Some targets support efficient element stores.
5558 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5559 return Cost;
5560
5561 // Collect operands to consider.
5562 CallInst *CI = dyn_cast<CallInst>(I);
5563 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5564
5565 // Skip operands that do not require extraction/scalarization and do not incur
5566 // any overhead.
5568 for (auto *V : filterExtractingOperands(Ops, VF))
5569 Tys.push_back(maybeVectorizeType(V->getType(), VF));
5571}
5572
5574 if (VF.isScalar())
5575 return;
5576 NumPredStores = 0;
5577 for (BasicBlock *BB : TheLoop->blocks()) {
5578 // For each instruction in the old loop.
5579 for (Instruction &I : *BB) {
5581 if (!Ptr)
5582 continue;
5583
5584 // TODO: We should generate better code and update the cost model for
5585 // predicated uniform stores. Today they are treated as any other
5586 // predicated store (see added test cases in
5587 // invariant-store-vectorization.ll).
5588 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
5589 NumPredStores++;
5590
5591 if (Legal->isUniformMemOp(I, VF)) {
5592 auto IsLegalToScalarize = [&]() {
5593 if (!VF.isScalable())
5594 // Scalarization of fixed length vectors "just works".
5595 return true;
5596
5597 // We have dedicated lowering for unpredicated uniform loads and
5598 // stores. Note that even with tail folding we know that at least
5599 // one lane is active (i.e. generalized predication is not possible
5600 // here), and the logic below depends on this fact.
5601 if (!foldTailByMasking())
5602 return true;
5603
5604 // For scalable vectors, a uniform memop load is always
5605 // uniform-by-parts and we know how to scalarize that.
5606 if (isa<LoadInst>(I))
5607 return true;
5608
5609 // A uniform store isn't necessarily uniform-by-parts,
5610 // so we can't assume scalarization.
5611 auto &SI = cast<StoreInst>(I);
5612 return TheLoop->isLoopInvariant(SI.getValueOperand());
5613 };
5614
5615 const InstructionCost GatherScatterCost =
5617 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5618
5619 // Load: Scalar load + broadcast
5620 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5621 // FIXME: This cost is a significant under-estimate for tail folded
5622 // memory ops.
5623 const InstructionCost ScalarizationCost =
5624 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
5626
5627 // Choose the better solution for the current VF. Note that Invalid
5628 // costs compare as maximally large. If both are invalid, we get a
5629 // scalable invalid cost, which signals a failure and a vectorization abort.
5630 if (GatherScatterCost < ScalarizationCost)
5631 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5632 else
5633 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5634 continue;
5635 }
5636
5637 // We assume that widening is the best solution when possible.
5638 if (memoryInstructionCanBeWidened(&I, VF)) {
5639 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5640 int ConsecutiveStride = Legal->isConsecutivePtr(
5642 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5643 "Expected consecutive stride.");
5644 InstWidening Decision =
5645 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5646 setWideningDecision(&I, VF, Decision, Cost);
5647 continue;
5648 }
5649
5650 // Choose between Interleaving, Gather/Scatter or Scalarization.
5652 unsigned NumAccesses = 1;
5653 if (isAccessInterleaved(&I)) {
5654 const auto *Group = getInterleavedAccessGroup(&I);
5655 assert(Group && "Fail to get an interleaved access group.");
5656
5657 // Make one decision for the whole group.
5658 if (getWideningDecision(&I, VF) != CM_Unknown)
5659 continue;
5660
5661 NumAccesses = Group->getNumMembers();
5663 InterleaveCost = getInterleaveGroupCost(&I, VF);
5664 }
5665
5666 InstructionCost GatherScatterCost =
5668 ? getGatherScatterCost(&I, VF) * NumAccesses
5670
5671 InstructionCost ScalarizationCost =
5672 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5673
5674 // Choose the better solution for the current VF,
5675 // write down this decision and use it during vectorization.
5677 InstWidening Decision;
5678 if (InterleaveCost <= GatherScatterCost &&
5679 InterleaveCost < ScalarizationCost) {
5680 Decision = CM_Interleave;
5681 Cost = InterleaveCost;
5682 } else if (GatherScatterCost < ScalarizationCost) {
5683 Decision = CM_GatherScatter;
5684 Cost = GatherScatterCost;
5685 } else {
5686 Decision = CM_Scalarize;
5687 Cost = ScalarizationCost;
5688 }
5689 // If the instruction belongs to an interleave group, the whole group
5690 // receives the same decision. The whole group receives the cost, but
5691 // the cost will actually be assigned to one instruction.
5692 if (const auto *Group = getInterleavedAccessGroup(&I))
5693 setWideningDecision(Group, VF, Decision, Cost);
5694 else
5695 setWideningDecision(&I, VF, Decision, Cost);
5696 }
5697 }
5698
5699 // Make sure that any load of address and any other address computation
5700 // remains scalar unless there is gather/scatter support. This avoids
5701 // inevitable extracts into address registers, and also has the benefit of
5702 // activating LSR more, since that pass can't optimize vectorized
5703 // addresses.
5705 return;
5706
5707 // Start with all scalar pointer uses.
5709 for (BasicBlock *BB : TheLoop->blocks())
5710 for (Instruction &I : *BB) {
5711 Instruction *PtrDef =
5712 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5713 if (PtrDef && TheLoop->contains(PtrDef) &&
5715 AddrDefs.insert(PtrDef);
5716 }
5717
5718 // Add all instructions used to generate the addresses.
5720 append_range(Worklist, AddrDefs);
5721 while (!Worklist.empty()) {
5722 Instruction *I = Worklist.pop_back_val();
5723 for (auto &Op : I->operands())
5724 if (auto *InstOp = dyn_cast<Instruction>(Op))
5725 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5726 AddrDefs.insert(InstOp).second)
5727 Worklist.push_back(InstOp);
5728 }
5729
5730 for (auto *I : AddrDefs) {
5731 if (isa<LoadInst>(I)) {
5732 // Setting the desired widening decision should ideally be handled
5733 // by cost functions, but since this involves the task of finding out
5734 // if the loaded register is involved in an address computation, it is
5735 // instead changed here when we know this is the case.
5736 InstWidening Decision = getWideningDecision(I, VF);
5737 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5738 // Scalarize a widened load of address.
5740 I, VF, CM_Scalarize,
5741 (VF.getKnownMinValue() *
5742 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5743 else if (const auto *Group = getInterleavedAccessGroup(I)) {
5744 // Scalarize an interleave group of address loads.
5745 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5746 if (Instruction *Member = Group->getMember(I))
5748 Member, VF, CM_Scalarize,
5749 (VF.getKnownMinValue() *
5750 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
5751 }
5752 }
5753 } else {
5754 // Cannot scalarize fixed-order recurrence phis at the moment.
5755 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5756 continue;
5757
5758 // Make sure I gets scalarized and a cost estimate without
5759 // scalarization overhead.
5760 ForcedScalars[VF].insert(I);
5761 }
5762 }
5763}
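// Illustrative sketch with assumed cost inputs: the three-way choice made
// above for a memory access that cannot simply be widened. Ties between
// interleaving and gather/scatter go to interleaving, matching the
// comparisons in the code.
enum class MemDecisionSketch { Interleave, GatherScatter, Scalarize };
static MemDecisionSketch chooseMemWideningSketch(unsigned InterleaveCost,
                                                 unsigned GatherScatterCost,
                                                 unsigned ScalarizationCost) {
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return MemDecisionSketch::Interleave;
  if (GatherScatterCost < ScalarizationCost)
    return MemDecisionSketch::GatherScatter;
  return MemDecisionSketch::Scalarize;
}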
5764
5766 assert(!VF.isScalar() &&
5767 "Trying to set a vectorization decision for a scalar VF");
5768
5769 auto ForcedScalar = ForcedScalars.find(VF);
5770 for (BasicBlock *BB : TheLoop->blocks()) {
5771 // For each instruction in the old loop.
5772 for (Instruction &I : *BB) {
5773 CallInst *CI = dyn_cast<CallInst>(&I);
5774
5775 if (!CI)
5776 continue;
5777
5781 Function *ScalarFunc = CI->getCalledFunction();
5782 Type *ScalarRetTy = CI->getType();
5783 SmallVector<Type *, 4> Tys, ScalarTys;
5784 for (auto &ArgOp : CI->args())
5785 ScalarTys.push_back(ArgOp->getType());
5786
5787 // Estimate cost of scalarized vector call. The source operands are
5788 // assumed to be vectors, so we need to extract individual elements from
5789 // there, execute VF scalar calls, and then gather the result into the
5790 // vector return value.
5791 if (VF.isFixed()) {
5792 InstructionCost ScalarCallCost =
5793 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
5794
5795 // Compute costs of unpacking argument values for the scalar calls and
5796 // packing the return values to a vector.
5797 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
5798 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5799 } else {
5800 // There is no point attempting to calculate the scalar cost for a
5801 // scalable VF as we know it will be Invalid.
5802 assert(!getScalarizationOverhead(CI, VF).isValid() &&
5803 "Unexpected valid cost for scalarizing scalable vectors");
5804 ScalarCost = InstructionCost::getInvalid();
5805 }
5806
5807 // Honor ForcedScalars and UniformAfterVectorization decisions.
5808 // TODO: For calls, it might still be more profitable to widen. Use
5809 // VPlan-based cost model to compare different options.
5810 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5811 ForcedScalar->second.contains(CI)) ||
5812 isUniformAfterVectorization(CI, VF))) {
5813 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
5814 Intrinsic::not_intrinsic, std::nullopt,
5815 ScalarCost);
5816 continue;
5817 }
5818
5819 bool MaskRequired = Legal->isMaskRequired(CI);
5820 // Compute corresponding vector type for return value and arguments.
5821 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
5822 for (Type *ScalarTy : ScalarTys)
5823 Tys.push_back(toVectorizedTy(ScalarTy, VF));
5824
5825 // An in-loop reduction using an fmuladd intrinsic is a special case;
5826 // we don't want the normal cost for that intrinsic.
5828 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
5831 std::nullopt, *RedCost);
5832 continue;
5833 }
5834
5835 // Find the cost of vectorizing the call, if we can find a suitable
5836 // vector variant of the function.
5837 VFInfo FuncInfo;
5838 Function *VecFunc = nullptr;
5839 // Search through any available variants for one we can use at this VF.
5840 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
5841 // Must match requested VF.
5842 if (Info.Shape.VF != VF)
5843 continue;
5844
5845 // Must take a mask argument if one is required
5846 if (MaskRequired && !Info.isMasked())
5847 continue;
5848
5849 // Check that all parameter kinds are supported
5850 bool ParamsOk = true;
5851 for (VFParameter Param : Info.Shape.Parameters) {
5852 switch (Param.ParamKind) {
5854 break;
5856 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5857 // Make sure the scalar parameter in the loop is invariant.
5858 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
5859 TheLoop))
5860 ParamsOk = false;
5861 break;
5862 }
5864 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5865 // Find the stride for the scalar parameter in this loop and see if
5866 // it matches the stride for the variant.
5867 // TODO: do we need to figure out the cost of an extract to get the
5868 // first lane? Or do we hope that it will be folded away?
5869 ScalarEvolution *SE = PSE.getSE();
5870 if (!match(SE->getSCEV(ScalarParam),
5872 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
5874 ParamsOk = false;
5875 break;
5876 }
5878 break;
5879 default:
5880 ParamsOk = false;
5881 break;
5882 }
5883 }
5884
5885 if (!ParamsOk)
5886 continue;
5887
5888 // Found a suitable candidate, stop here.
5889 VecFunc = CI->getModule()->getFunction(Info.VectorName);
5890 FuncInfo = Info;
5891 break;
5892 }
5893
5894 if (TLI && VecFunc && !CI->isNoBuiltin())
5895 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
5896
5897 // Find the cost of an intrinsic; some targets may have instructions that
5898 // perform the operation without needing an actual call.
5900 if (IID != Intrinsic::not_intrinsic)
5902
5903 InstructionCost Cost = ScalarCost;
5904 InstWidening Decision = CM_Scalarize;
5905
5906 if (VectorCost <= Cost) {
5907 Cost = VectorCost;
5908 Decision = CM_VectorCall;
5909 }
5910
5911 if (IntrinsicCost <= Cost) {
5913 Decision = CM_IntrinsicCall;
5914 }
5915
5916 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
5918 }
5919 }
5920}
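// Illustrative sketch with assumed costs: the final comparison made above for
// a call. Starting from the scalarization cost, a vector library call wins if
// it is no more expensive, and an intrinsic wins over whichever option is
// currently best.
enum class CallDecisionSketch { Scalarize, VectorCall, IntrinsicCall };
static CallDecisionSketch chooseCallWideningSketch(unsigned ScalarCost,
                                                   unsigned VectorCost,
                                                   unsigned IntrinsicCost) {
  unsigned Cost = ScalarCost;
  CallDecisionSketch Decision = CallDecisionSketch::Scalarize;
  if (VectorCost <= Cost) {
    Cost = VectorCost;
    Decision = CallDecisionSketch::VectorCall;
  }
  if (IntrinsicCost <= Cost)
    Decision = CallDecisionSketch::IntrinsicCall;
  return Decision;
}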
5921
5923 if (!Legal->isInvariant(Op))
5924 return false;
5925 // Consider Op invariant only if neither it nor its operands are predicated
5926 // instructions in the loop; a predicated instruction is not trivially hoistable.
5927 auto *OpI = dyn_cast<Instruction>(Op);
5928 return !OpI || !TheLoop->contains(OpI) ||
5929 (!isPredicatedInst(OpI) &&
5930 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5931 all_of(OpI->operands(),
5932 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5933}
5934
5937 ElementCount VF) {
5938 // If we know that this instruction will remain uniform, check the cost of
5939 // the scalar version.
5941 VF = ElementCount::getFixed(1);
5942
5943 if (VF.isVector() && isProfitableToScalarize(I, VF))
5944 return InstsToScalarize[VF][I];
5945
5946 // Forced scalars do not have any scalarization overhead.
5947 auto ForcedScalar = ForcedScalars.find(VF);
5948 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
5949 auto InstSet = ForcedScalar->second;
5950 if (InstSet.count(I))
5952 VF.getKnownMinValue();
5953 }
5954
5955 Type *RetTy = I->getType();
5957 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5958 auto *SE = PSE.getSE();
5959
5960 Type *VectorTy;
5961 if (isScalarAfterVectorization(I, VF)) {
5962 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
5963 [this](Instruction *I, ElementCount VF) -> bool {
5964 if (VF.isScalar())
5965 return true;
5966
5967 auto Scalarized = InstsToScalarize.find(VF);
5968 assert(Scalarized != InstsToScalarize.end() &&
5969 "VF not yet analyzed for scalarization profitability");
5970 return !Scalarized->second.count(I) &&
5971 llvm::all_of(I->users(), [&](User *U) {
5972 auto *UI = cast<Instruction>(U);
5973 return !Scalarized->second.count(UI);
5974 });
5975 };
5976
5977 // With the exception of GEPs and PHIs, after scalarization there should
5978 // only be one copy of the instruction generated in the loop. This is
5979 // because the VF is either 1, or any instructions that need scalarizing
5980 // have already been dealt with by the time we get here. As a result,
5981 // it means we don't have to multiply the instruction cost by VF.
5982 assert(I->getOpcode() == Instruction::GetElementPtr ||
5983 I->getOpcode() == Instruction::PHI ||
5984 (I->getOpcode() == Instruction::BitCast &&
5985 I->getType()->isPointerTy()) ||
5986 HasSingleCopyAfterVectorization(I, VF));
5987 VectorTy = RetTy;
5988 } else
5989 VectorTy = toVectorizedTy(RetTy, VF);
5990
5991 if (VF.isVector() && VectorTy->isVectorTy() &&
5992 !TTI.getNumberOfParts(VectorTy))
5994
5995 // TODO: We need to estimate the cost of intrinsic calls.
5996 switch (I->getOpcode()) {
5997 case Instruction::GetElementPtr:
5998 // We mark this instruction as zero-cost because the cost of GEPs in
5999 // vectorized code depends on whether the corresponding memory instruction
6000 // is scalarized or not. Therefore, we handle GEPs with the memory
6001 // instruction cost.
6002 return 0;
6003 case Instruction::Br: {
6004 // In cases of scalarized and predicated instructions, there will be VF
6005 // predicated blocks in the vectorized loop. Each branch around these
6006 // blocks requires also an extract of its vector compare i1 element.
6007 // Note that the conditional branch from the loop latch will be replaced by
6008 // a single branch controlling the loop, so there is no extra overhead from
6009 // scalarization.
6010 bool ScalarPredicatedBB = false;
6011 BranchInst *BI = cast<BranchInst>(I);
6012 if (VF.isVector() && BI->isConditional() &&
6013 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6014 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6015 BI->getParent() != TheLoop->getLoopLatch())
6016 ScalarPredicatedBB = true;
6017
6018 if (ScalarPredicatedBB) {
6019 // Not possible to scalarize a scalable vector with predicated instructions.
6020 if (VF.isScalable())
6022 // Return cost for branches around scalarized and predicated blocks.
6023 auto *VecI1Ty =
6024 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6025 return (
6027 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6028 /*Insert*/ false, /*Extract*/ true, CostKind) +
6029 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6030 }
6031
6032 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6033 // The back-edge branch will remain, as will all scalar branches.
6034 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6035
6036 // This branch will be eliminated by if-conversion.
6037 return 0;
6038 // Note: We currently assume zero cost for an unconditional branch inside
6039 // a predicated block since it will become a fall-through, although we
6040 // may decide in the future to call TTI for all branches.
6041 }
6042 case Instruction::Switch: {
6043 if (VF.isScalar())
6044 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6045 auto *Switch = cast<SwitchInst>(I);
6046 return Switch->getNumCases() *
6048 Instruction::ICmp,
6049 toVectorTy(Switch->getCondition()->getType(), VF),
6050 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6052 }
6053 case Instruction::PHI: {
6054 auto *Phi = cast<PHINode>(I);
6055
6056 // First-order recurrences are replaced by vector shuffles inside the loop.
6057 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6059 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6061 cast<VectorType>(VectorTy),
6062 cast<VectorType>(VectorTy), Mask, CostKind,
6063 VF.getKnownMinValue() - 1);
6064 }
6065
6066 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6067 // converted into select instructions. We require N - 1 selects per phi
6068 // node, where N is the number of incoming values.
6069 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6070 Type *ResultTy = Phi->getType();
6071
6072 // All instructions in an Any-of reduction chain are narrowed to bool.
6073 // Check if that is the case for this phi node.
6074 auto *HeaderUser = cast_if_present<PHINode>(
6075 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6076 auto *Phi = dyn_cast<PHINode>(U);
6077 if (Phi && Phi->getParent() == TheLoop->getHeader())
6078 return Phi;
6079 return nullptr;
6080 }));
6081 if (HeaderUser) {
6082 auto &ReductionVars = Legal->getReductionVars();
6083 auto Iter = ReductionVars.find(HeaderUser);
6084 if (Iter != ReductionVars.end() &&
6086 Iter->second.getRecurrenceKind()))
6087 ResultTy = Type::getInt1Ty(Phi->getContext());
6088 }
6089 return (Phi->getNumIncomingValues() - 1) *
6091 Instruction::Select, toVectorTy(ResultTy, VF),
6092 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6094 }
6095
6096 // When tail folding with EVL, if the phi is part of an out of loop
6097 // reduction then it will be transformed into a wide vp_merge.
6098 if (VF.isVector() && foldTailWithEVL() &&
6101 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6102 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6103 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6104 }
6105
6106 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6107 }
6108 case Instruction::UDiv:
6109 case Instruction::SDiv:
6110 case Instruction::URem:
6111 case Instruction::SRem:
6112 if (VF.isVector() && isPredicatedInst(I)) {
6113 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6114 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6115 ScalarCost : SafeDivisorCost;
6116 }
6117 // We've proven all lanes safe to speculate, fall through.
6118 [[fallthrough]];
6119 case Instruction::Add:
6120 case Instruction::Sub: {
6121 auto Info = Legal->getHistogramInfo(I);
6122 if (Info && VF.isVector()) {
6123 const HistogramInfo *HGram = Info.value();
6124 // Assume that a non-constant update value (or a constant != 1) requires
6125 // a multiply, and add that into the cost.
6127 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6128 if (!RHS || RHS->getZExtValue() != 1)
6129 MulCost =
6130 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6131
6132 // Find the cost of the histogram operation itself.
6133 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6134 Type *ScalarTy = I->getType();
6135 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6136 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6137 Type::getVoidTy(I->getContext()),
6138 {PtrTy, ScalarTy, MaskTy});
6139
6140 // Add the costs together with the add/sub operation.
6141 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6142 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6143 }
6144 [[fallthrough]];
6145 }
6146 case Instruction::FAdd:
6147 case Instruction::FSub:
6148 case Instruction::Mul:
6149 case Instruction::FMul:
6150 case Instruction::FDiv:
6151 case Instruction::FRem:
6152 case Instruction::Shl:
6153 case Instruction::LShr:
6154 case Instruction::AShr:
6155 case Instruction::And:
6156 case Instruction::Or:
6157 case Instruction::Xor: {
6158 // If we're speculating on the stride being 1, the multiplication may
6159 // fold away. We can generalize this for all operations using the notion
6160 // of neutral elements. (TODO)
6161 if (I->getOpcode() == Instruction::Mul &&
6162 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6163 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6164 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6165 PSE.getSCEV(I->getOperand(1))->isOne())))
6166 return 0;
6167
6168 // Detect reduction patterns
6169 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6170 return *RedCost;
6171
6172 // Certain instructions can be cheaper to vectorize if they have a constant
6173 // second vector operand. One example of this are shifts on x86.
6174 Value *Op2 = I->getOperand(1);
6175 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
6176 PSE.getSE()->isSCEVable(Op2->getType()) &&
6177 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6178 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6179 }
6180 auto Op2Info = TTI.getOperandInfo(Op2);
6181 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6184
6185 SmallVector<const Value *, 4> Operands(I->operand_values());
6187 I->getOpcode(), VectorTy, CostKind,
6188 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6189 Op2Info, Operands, I, TLI);
6190 }
6191 case Instruction::FNeg: {
6192 return TTI.getArithmeticInstrCost(
6193 I->getOpcode(), VectorTy, CostKind,
6194 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6195 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6196 I->getOperand(0), I);
6197 }
6198 case Instruction::Select: {
6199 SelectInst *SI = cast<SelectInst>(I);
6200 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6201 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6202
6203 const Value *Op0, *Op1;
6204 using namespace llvm::PatternMatch;
6205 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6206 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6207 // select x, y, false --> x & y
6208 // select x, true, y --> x | y
6209 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6210 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6211 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6212 Op1->getType()->getScalarSizeInBits() == 1);
6213
6214 return TTI.getArithmeticInstrCost(
6215 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
6216 VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
6217 }
6218
6219 Type *CondTy = SI->getCondition()->getType();
6220 if (!ScalarCond)
6221 CondTy = VectorType::get(CondTy, VF);
6222
6223 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6224 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6225 Pred = Cmp->getPredicate();
6226 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6227 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6228 {TTI::OK_AnyValue, TTI::OP_None}, I);
6229 }
6230 case Instruction::ICmp:
6231 case Instruction::FCmp: {
6232 Type *ValTy = I->getOperand(0)->getType();
6233
6234 if (canTruncateToMinimalBitwidth(I, VF)) {
6235 [[maybe_unused]] Instruction *Op0AsInstruction =
6236 dyn_cast<Instruction>(I->getOperand(0));
6237 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6238 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6239 "if both the operand and the compare are marked for "
6240 "truncation, they must have the same bitwidth");
6241 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6242 }
6243
6244 VectorTy = toVectorTy(ValTy, VF);
6245 return TTI.getCmpSelInstrCost(
6246 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6247 cast<CmpInst>(I)->getPredicate(), CostKind,
6248 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6249 }
6250 case Instruction::Store:
6251 case Instruction::Load: {
6252 ElementCount Width = VF;
6253 if (Width.isVector()) {
6254 InstWidening Decision = getWideningDecision(I, Width);
6255 assert(Decision != CM_Unknown &&
6256 "CM decision should be taken at this point");
6257 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6258 return InstructionCost::getInvalid();
6259 if (Decision == CM_Scalarize)
6260 Width = ElementCount::getFixed(1);
6261 }
6262 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6263 return getMemoryInstructionCost(I, VF);
6264 }
6265 case Instruction::BitCast:
6266 if (I->getType()->isPointerTy())
6267 return 0;
6268 [[fallthrough]];
6269 case Instruction::ZExt:
6270 case Instruction::SExt:
6271 case Instruction::FPToUI:
6272 case Instruction::FPToSI:
6273 case Instruction::FPExt:
6274 case Instruction::PtrToInt:
6275 case Instruction::IntToPtr:
6276 case Instruction::SIToFP:
6277 case Instruction::UIToFP:
6278 case Instruction::Trunc:
6279 case Instruction::FPTrunc: {
6280 // Computes the CastContextHint from a Load/Store instruction.
6281 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6282 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6283 "Expected a load or a store!");
6284
6285 if (VF.isScalar() || !TheLoop->contains(I))
6286 return TTI::CastContextHint::Normal;
6287
6288 switch (getWideningDecision(I, VF)) {
6289 case LoopVectorizationCostModel::CM_GatherScatter:
6290 return TTI::CastContextHint::GatherScatter;
6291 case LoopVectorizationCostModel::CM_Interleave:
6292 return TTI::CastContextHint::Interleave;
6293 case LoopVectorizationCostModel::CM_Scalarize:
6294 case LoopVectorizationCostModel::CM_Widen:
6295 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6296 : TTI::CastContextHint::Normal;
6297 case LoopVectorizationCostModel::CM_Widen_Reverse:
6298 return TTI::CastContextHint::Reversed;
6299 case LoopVectorizationCostModel::CM_Unknown:
6300 llvm_unreachable("Instr did not go through cost modelling?");
6301 case LoopVectorizationCostModel::CM_VectorCall:
6302 case LoopVectorizationCostModel::CM_IntrinsicCall:
6303 llvm_unreachable_internal("Instr has invalid widening decision");
6304 }
6305
6306 llvm_unreachable("Unhandled case!");
6307 };
6308
6309 unsigned Opcode = I->getOpcode();
6310 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6311 // For Trunc, the context is the only user, which must be a StoreInst.
6312 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6313 if (I->hasOneUse())
6314 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6315 CCH = ComputeCCH(Store);
6316 }
6317 // For Z/Sext, the context is the operand, which must be a LoadInst.
6318 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6319 Opcode == Instruction::FPExt) {
6320 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6321 CCH = ComputeCCH(Load);
6322 }
6323
6324 // We optimize the truncation of induction variables having constant
6325 // integer steps. The cost of these truncations is the same as the scalar
6326 // operation.
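// For example, 'trunc i64 %iv to i32' where %iv has a constant step can be
// generated directly as a 32-bit induction instead of widening the 64-bit
// induction and truncating every element, so it costs the same as the scalar
// truncate.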
6327 if (isOptimizableIVTruncate(I, VF)) {
6328 auto *Trunc = cast<TruncInst>(I);
6329 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6330 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6331 }
6332
6333 // Detect reduction patterns
6334 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6335 return *RedCost;
6336
6337 Type *SrcScalarTy = I->getOperand(0)->getType();
6338 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6339 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6340 SrcScalarTy =
6341 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6342 Type *SrcVecTy =
6343 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6344
6345 if (canTruncateToMinimalBitwidth(I, VF)) {
6346 // If the result type is <= the source type, there will be no extend
6347 // after truncating the users to the minimal required bitwidth.
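// E.g. if '%e = zext i8 %b to i32' only feeds computations that were shrunk
// back to i8, the extend disappears entirely and is modelled as free.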
6348 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6349 (I->getOpcode() == Instruction::ZExt ||
6350 I->getOpcode() == Instruction::SExt))
6351 return 0;
6352 }
6353
6354 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6355 }
6356 case Instruction::Call:
6357 return getVectorCallCost(cast<CallInst>(I), VF);
6358 case Instruction::ExtractValue:
6360 case Instruction::Alloca:
6361 // We cannot easily widen alloca to a scalable alloca, as
6362 // the result would need to be a vector of pointers.
6363 if (VF.isScalable())
6364 return InstructionCost::getInvalid();
6365 [[fallthrough]];
6366 default:
6367 // This opcode is unknown. Assume that it is the same as 'mul'.
6368 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6369 } // end of switch.
6370}
6371
6372void LoopVectorizationCostModel::collectValuesToIgnore() {
6373 // Ignore ephemeral values.
6374 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6375
6376 SmallVector<Value *, 4> DeadInterleavePointerOps;
6377 SmallVector<Value *, 4> DeadOps;
6378
6379 // If a scalar epilogue is required, users outside the loop won't use
6380 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6381 // that is the case.
6382 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6383 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6384 return RequiresScalarEpilogue &&
6385 !TheLoop->contains(cast<Instruction>(U)->getParent());
6386 };
6387
6388 LoopBlocksDFS DFS(TheLoop);
6389 DFS.perform(LI);
6390 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6391 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6392 for (Instruction &I : reverse(*BB)) {
6393 // Find all stores to invariant variables. Since they are going to sink
6394 // outside the loop, we do not need to calculate their cost.
6395 StoreInst *SI;
6396 if ((SI = dyn_cast<StoreInst>(&I)) &&
6397 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6398 ValuesToIgnore.insert(&I);
6399 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6400 SI->getValueOperand());
6401 }
6402
6403 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6404 continue;
6405
6406 // Add instructions that would be trivially dead and are only used by
6407 // values already ignored to DeadOps to seed worklist.
6408 if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6409 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6410 return VecValuesToIgnore.contains(U) ||
6411 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6412 }))
6413 DeadOps.push_back(&I);
6414
6415 // For interleave groups, we only create a pointer for the start of the
6416 // interleave group. Queue up addresses of group members except the insert
6417 // position for further processing.
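// E.g. for a group accessing A[2*i] and A[2*i+1], only the address of the
// insert position is materialized; the GEP feeding the other member may be
// dead and therefore free.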
6418 if (isAccessInterleaved(&I)) {
6419 auto *Group = getInterleavedAccessGroup(&I);
6420 if (Group->getInsertPos() == &I)
6421 continue;
6422 Value *PointerOp = getLoadStorePointerOperand(&I);
6423 DeadInterleavePointerOps.push_back(PointerOp);
6424 }
6425
6426 // Queue branches for analysis. They are dead if their successors only
6427 // contain dead instructions.
6428 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6429 if (Br->isConditional())
6430 DeadOps.push_back(&I);
6431 }
6432 }
6433
6434 // Mark ops feeding interleave group members as free, if they are only used
6435 // by other dead computations.
6436 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6437 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6438 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6439 Instruction *UI = cast<Instruction>(U);
6440 return !VecValuesToIgnore.contains(U) &&
6441 (!isAccessInterleaved(UI) ||
6442 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6443 }))
6444 continue;
6445 VecValuesToIgnore.insert(Op);
6446 append_range(DeadInterleavePointerOps, Op->operands());
6447 }
6448
6449 for (const auto &[_, Ops] : DeadInvariantStoreOps)
6450 llvm::append_range(DeadOps, drop_end(Ops));
6451
6452 // Mark ops that would be trivially dead and are only used by ignored
6453 // instructions as free.
6454 BasicBlock *Header = TheLoop->getHeader();
6455
6456 // Returns true if the block contains only dead instructions. Such blocks will
6457 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6458 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6459 auto IsEmptyBlock = [this](BasicBlock *BB) {
6460 return all_of(*BB, [this](Instruction &I) {
6461 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6462 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6463 });
6464 };
6465 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6466 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6467
6468 // Check if the branch should be considered dead.
6469 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6470 BasicBlock *ThenBB = Br->getSuccessor(0);
6471 BasicBlock *ElseBB = Br->getSuccessor(1);
6472 // Don't consider branches leaving the loop for simplification.
6473 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6474 continue;
6475 bool ThenEmpty = IsEmptyBlock(ThenBB);
6476 bool ElseEmpty = IsEmptyBlock(ElseBB);
6477 if ((ThenEmpty && ElseEmpty) ||
6478 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6479 ElseBB->phis().empty()) ||
6480 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6481 ThenBB->phis().empty())) {
6482 VecValuesToIgnore.insert(Br);
6483 DeadOps.push_back(Br->getCondition());
6484 }
6485 continue;
6486 }
6487
6488 // Skip any op that shouldn't be considered dead.
6489 if (!Op || !TheLoop->contains(Op) ||
6490 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6491 !wouldInstructionBeTriviallyDead(Op, TLI) ||
6492 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6493 return !VecValuesToIgnore.contains(U) &&
6494 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6495 }))
6496 continue;
6497
6498 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6499 // which applies for both scalar and vector versions. Otherwise it is only
6500 // dead in vector versions, so only add it to VecValuesToIgnore.
6501 if (all_of(Op->users(),
6502 [this](User *U) { return ValuesToIgnore.contains(U); }))
6503 ValuesToIgnore.insert(Op);
6504
6505 VecValuesToIgnore.insert(Op);
6506 append_range(DeadOps, Op->operands());
6507 }
6508
6509 // Ignore type-promoting instructions we identified during reduction
6510 // detection.
6511 for (const auto &Reduction : Legal->getReductionVars()) {
6512 const RecurrenceDescriptor &RedDes = Reduction.second;
6513 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6514 VecValuesToIgnore.insert_range(Casts);
6515 }
6516 // Ignore type-casting instructions we identified during induction
6517 // detection.
6518 for (const auto &Induction : Legal->getInductionVars()) {
6519 const InductionDescriptor &IndDes = Induction.second;
6520 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6521 VecValuesToIgnore.insert_range(Casts);
6522 }
6523}
6524
6525void LoopVectorizationCostModel::collectInLoopReductions() {
6526 // Avoid duplicating work finding in-loop reductions.
6527 if (!InLoopReductions.empty())
6528 return;
6529
6530 for (const auto &Reduction : Legal->getReductionVars()) {
6531 PHINode *Phi = Reduction.first;
6532 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6533
6534 // We don't collect reductions that are type promoted (yet).
6535 if (RdxDesc.getRecurrenceType() != Phi->getType())
6536 continue;
6537
6538 // If the target would prefer this reduction to happen "in-loop", then we
6539 // want to record it as such.
6540 RecurKind Kind = RdxDesc.getRecurrenceKind();
6541 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6542 !TTI.preferInLoopReduction(Kind, Phi->getType()))
6543 continue;
6544
6545 // Check that we can correctly put the reductions into the loop, by
6546 // finding the chain of operations that leads from the phi to the loop
6547 // exit value.
6548 SmallVector<Instruction *, 4> ReductionOperations =
6549 RdxDesc.getReductionOpChain(Phi, TheLoop);
6550 bool InLoop = !ReductionOperations.empty();
6551
6552 if (InLoop) {
6553 InLoopReductions.insert(Phi);
6554 // Add the elements to InLoopReductionImmediateChains for cost modelling.
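// For a chain phi -> %add1 -> %add2 (the loop-exit value) this records
//   { %add1 -> phi, %add2 -> %add1 },
// i.e. each link maps to its predecessor in the chain.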
6555 Instruction *LastChain = Phi;
6556 for (auto *I : ReductionOperations) {
6557 InLoopReductionImmediateChains[I] = LastChain;
6558 LastChain = I;
6559 }
6560 }
6561 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6562 << " reduction for phi: " << *Phi << "\n");
6563 }
6564}
6565
6566// This function will select a scalable VF if the target supports scalable
6567// vectors and a fixed one otherwise.
6568// TODO: we could return a pair of values that specify the max VF and
6569// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6570// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6571// doesn't have a cost model that can choose which plan to execute if
6572// more than one is generated.
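// For example, with 128-bit vector registers and a widest element type of
// i32, this returns a VF of 4 (fixed) or vscale x 4 (scalable).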
6575 unsigned WidestType;
6576 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6577
6582
6584 unsigned N = RegSize.getKnownMinValue() / WidestType;
6585 return ElementCount::get(N, RegSize.isScalable());
6586}
6587
6588VectorizationFactor
6589LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6590 ElementCount VF = UserVF;
6591 // Outer loop handling: They may require CFG and instruction level
6592 // transformations before even evaluating whether vectorization is profitable.
6593 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6594 // the vectorization pipeline.
6595 if (!OrigLoop->isInnermost()) {
6596 // If the user doesn't provide a vectorization factor, determine a
6597 // reasonable one.
6598 if (UserVF.isZero()) {
6599 VF = determineVPlanVF(TTI, CM);
6600 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6601
6602 // Make sure we have a VF > 1 for stress testing.
6603 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6604 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6605 << "overriding computed VF.\n");
6606 VF = ElementCount::getFixed(4);
6607 }
6608 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6610 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6611 << "not supported by the target.\n");
6613 "Scalable vectorization requested but not supported by the target",
6614 "the scalable user-specified vectorization width for outer-loop "
6615 "vectorization cannot be used because the target does not support "
6616 "scalable vectors.",
6617 "ScalableVFUnfeasible", ORE, OrigLoop);
6619 }
6620 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6622 "VF needs to be a power of two");
6623 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6624 << "VF " << VF << " to build VPlans.\n");
6625 buildVPlans(VF, VF);
6626
6627 if (VPlans.empty())
6629
6630 // For VPlan build stress testing, we bail out after VPlan construction.
6633
6634 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6635 }
6636
6637 LLVM_DEBUG(
6638 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6639 "VPlan-native path.\n");
6641}
6642
6643void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6644 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6645 CM.collectValuesToIgnore();
6646 CM.collectElementTypesForWidening();
6647
6648 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6649 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
6650 return;
6651
6652 // Invalidate interleave groups if all blocks of loop will be predicated.
6653 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6654 !useMaskedInterleavedAccesses(TTI)) {
6655 LLVM_DEBUG(
6656 dbgs()
6657 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6658 "which requires masked-interleaved support.\n");
6660 // Invalidating interleave groups also requires invalidating all decisions
6661 // based on them, which includes widening decisions and uniform and scalar
6662 // values.
6663 CM.invalidateCostModelingDecisions();
6664 }
6665
6666 if (CM.foldTailByMasking())
6667 Legal->prepareToFoldTailByMasking();
6668
6669 ElementCount MaxUserVF =
6670 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6671 if (UserVF) {
6672 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
6674 "UserVF ignored because it may be larger than the maximal safe VF",
6675 "InvalidUserVF", ORE, OrigLoop);
6676 } else {
6677 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6678 "VF needs to be a power of two");
6679 // Collect the instructions (and their associated costs) that will be more
6680 // profitable to scalarize.
6682 if (CM.selectUserVectorizationFactor(UserVF)) {
6683 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6684 buildVPlansWithVPRecipes(UserVF, UserVF);
6686 return;
6687 }
6688 reportVectorizationInfo("UserVF ignored because of invalid costs.",
6689 "InvalidCost", ORE, OrigLoop);
6690 }
6691 }
6692
6693 // Collect the Vectorization Factor Candidates.
6694 SmallVector<ElementCount> VFCandidates;
6695 for (auto VF = ElementCount::getFixed(1);
6696 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6697 VFCandidates.push_back(VF);
6698 for (auto VF = ElementCount::getScalable(1);
6699 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6700 VFCandidates.push_back(VF);
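// E.g. with MaxFactors.FixedVF = 8 and MaxFactors.ScalableVF = vscale x 4,
// the candidates are 1, 2, 4, 8, vscale x 1, vscale x 2 and vscale x 4.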
6701
6703 for (const auto &VF : VFCandidates) {
6704 // Collect Uniform and Scalar instructions after vectorization with VF.
6705 CM.collectUniformsAndScalars(VF);
6706 }
6707
6708 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6709 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6710
6712}
6713
6714InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6715 ElementCount VF) const {
6716 InstructionCost Cost = CM.getInstructionCost(UI, VF);
6717 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6718 return InstructionCost(ForceTargetInstructionCost);
6719 return Cost;
6720}
6721
6723 ElementCount VF) const {
6724 return CM.isUniformAfterVectorization(I, VF);
6725}
6726
6727bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6728 return CM.ValuesToIgnore.contains(UI) ||
6729 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6730 SkipCostComputation.contains(UI);
6731}
6732
6733InstructionCost
6734LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6735 VPCostContext &CostCtx) const {
6736 InstructionCost Cost;
6737 // Cost modeling for inductions is inaccurate in the legacy cost model
6738 // compared to the recipes that are generated. To match here initially during
6739 // VPlan cost model bring up directly use the induction costs from the legacy
6740 // cost model. Note that we do this as pre-processing; the VPlan may not have
6741 // any recipes associated with the original induction increment instruction
6742 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6743 // the cost of induction phis and increments (both that are represented by
6744 // recipes and those that are not), to avoid distinguishing between them here,
6745 // and skip all recipes that represent induction phis and increments (the
6746 // former case) later on, if they exist, to avoid counting them twice.
6747 // Similarly we pre-compute the cost of any optimized truncates.
6748 // TODO: Switch to more accurate costing based on VPlan.
6749 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6750 Instruction *IVInc = cast<Instruction>(
6751 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6752 SmallVector<Instruction *> IVInsts = {IVInc};
6753 for (unsigned I = 0; I != IVInsts.size(); I++) {
6754 for (Value *Op : IVInsts[I]->operands()) {
6755 auto *OpI = dyn_cast<Instruction>(Op);
6756 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
6757 continue;
6758 IVInsts.push_back(OpI);
6759 }
6760 }
6761 IVInsts.push_back(IV);
6762 for (User *U : IV->users()) {
6763 auto *CI = cast<Instruction>(U);
6764 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6765 continue;
6766 IVInsts.push_back(CI);
6767 }
6768
6769 // If the vector loop gets executed exactly once with the given VF, ignore
6770 // the costs of comparison and induction instructions, as they'll get
6771 // simplified away.
6772 // TODO: Remove this code after stepping away from the legacy cost model and
6773 // adding code to simplify VPlans before calculating their costs.
6774 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6775 if (TC == VF && !CM.foldTailByMasking())
6776 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
6777 CostCtx.SkipCostComputation);
6778
6779 for (Instruction *IVInst : IVInsts) {
6780 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
6781 continue;
6782 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6783 LLVM_DEBUG({
6784 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6785 << ": induction instruction " << *IVInst << "\n";
6786 });
6787 Cost += InductionCost;
6788 CostCtx.SkipCostComputation.insert(IVInst);
6789 }
6790 }
6791
6792 /// Compute the cost of all exiting conditions of the loop using the legacy
6793 /// cost model. This is to match the legacy behavior, which adds the cost of
6794 /// all exit conditions. Note that this over-estimates the cost, as there will
6795 /// be a single condition to control the vector loop.
6796 SmallVector<BasicBlock *> Exiting;
6797 CM.TheLoop->getExitingBlocks(Exiting);
6798 SetVector<Instruction *> ExitInstrs;
6799 // Collect all exit conditions.
6800 for (BasicBlock *EB : Exiting) {
6801 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
6802 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
6803 continue;
6804 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
6805 ExitInstrs.insert(CondI);
6806 }
6807 }
6808 // Compute the cost of all instructions only feeding the exit conditions.
6809 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6810 Instruction *CondI = ExitInstrs[I];
6811 if (!OrigLoop->contains(CondI) ||
6812 !CostCtx.SkipCostComputation.insert(CondI).second)
6813 continue;
6814 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
6815 LLVM_DEBUG({
6816 dbgs() << "Cost of " << CondICost << " for VF " << VF
6817 << ": exit condition instruction " << *CondI << "\n";
6818 });
6819 Cost += CondICost;
6820 for (Value *Op : CondI->operands()) {
6821 auto *OpI = dyn_cast<Instruction>(Op);
6822 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
6823 any_of(OpI->users(), [&ExitInstrs, this](User *U) {
6824 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
6825 !ExitInstrs.contains(cast<Instruction>(U));
6826 }))
6827 continue;
6828 ExitInstrs.insert(OpI);
6829 }
6830 }
6831
6832 // Pre-compute the costs for branches except for the backedge, as the number
6833 // of replicate regions in a VPlan may not directly match the number of
6834 // branches, which would lead to different decisions.
6835 // TODO: Compute cost of branches for each replicate region in the VPlan,
6836 // which is more accurate than the legacy cost model.
6837 for (BasicBlock *BB : OrigLoop->blocks()) {
6838 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
6839 continue;
6840 CostCtx.SkipCostComputation.insert(BB->getTerminator());
6841 if (BB == OrigLoop->getLoopLatch())
6842 continue;
6843 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
6844 Cost += BranchCost;
6845 }
6846
6847 // Pre-compute costs for instructions that are forced-scalar or profitable to
6848 // scalarize. Their costs will be computed separately in the legacy cost
6849 // model.
6850 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
6851 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
6852 continue;
6853 CostCtx.SkipCostComputation.insert(ForcedScalar);
6854 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
6855 LLVM_DEBUG({
6856 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
6857 << ": forced scalar " << *ForcedScalar << "\n";
6858 });
6859 Cost += ForcedCost;
6860 }
6861 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
6862 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
6863 continue;
6864 CostCtx.SkipCostComputation.insert(Scalarized);
6865 LLVM_DEBUG({
6866 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
6867 << ": profitable to scalarize " << *Scalarized << "\n";
6868 });
6869 Cost += ScalarCost;
6870 }
6871
6872 return Cost;
6873}
6874
6875InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6876 ElementCount VF) const {
6877 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
6878 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
6879
6880 // Now compute and add the VPlan-based cost.
6881 Cost += Plan.cost(VF, CostCtx);
6882#ifndef NDEBUG
6883 unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
6884 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
6885 << " (Estimated cost per lane: ");
6886 if (Cost.isValid()) {
6887 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
6888 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
6889 } else /* No point dividing an invalid cost - it will still be invalid */
6890 LLVM_DEBUG(dbgs() << "Invalid");
6891 LLVM_DEBUG(dbgs() << ")\n");
6892#endif
6893 return Cost;
6894}
6895
6896#ifndef NDEBUG
6897/// Return true if the original loop \p TheLoop contains any instructions that do
6898/// not have corresponding recipes in \p Plan and are not marked to be ignored
6899/// in \p CostCtx. This means the VPlan contains simplification that the legacy
6900/// cost-model did not account for.
6901static bool planContainsAdditionalSimplifications(VPlan &Plan,
6902 VPCostContext &CostCtx,
6903 Loop *TheLoop,
6904 ElementCount VF) {
6905 // First collect all instructions for the recipes in Plan.
6906 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
6907 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
6908 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
6909 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
6910 return &WidenMem->getIngredient();
6911 return nullptr;
6912 };
6913
6914 DenseSet<Instruction *> SeenInstrs;
6915 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
6916 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
6917 for (VPRecipeBase &R : *VPBB) {
6918 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
6919 auto *IG = IR->getInterleaveGroup();
6920 unsigned NumMembers = IG->getNumMembers();
6921 for (unsigned I = 0; I != NumMembers; ++I) {
6922 if (Instruction *M = IG->getMember(I))
6923 SeenInstrs.insert(M);
6924 }
6925 continue;
6926 }
6927 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
6928 // cost model won't cost it whilst the legacy will.
6929 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
6930 if (none_of(FOR->users(), [](VPUser *U) {
6931 auto *VPI = dyn_cast<VPInstruction>(U);
6932 return VPI && VPI->getOpcode() ==
6933 VPInstruction::FirstOrderRecurrenceSplice;
6934 }))
6935 return true;
6936 }
6937 // The VPlan-based cost model is more accurate for partial reduction and
6938 // comparing against the legacy cost isn't desirable.
6939 if (isa<VPPartialReductionRecipe>(&R))
6940 return true;
6941
6942 /// If a VPlan transform folded a recipe to one producing a single-scalar,
6943 /// but the original instruction wasn't uniform-after-vectorization in the
6944 /// legacy cost model, the legacy cost overestimates the actual cost.
6945 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
6946 if (RepR->isSingleScalar() &&
6948 RepR->getUnderlyingInstr(), VF))
6949 return true;
6950 }
6951 if (Instruction *UI = GetInstructionForCost(&R)) {
6952 // If we adjusted the predicate of the recipe, the cost in the legacy
6953 // cost model may be different.
6954 using namespace VPlanPatternMatch;
6955 CmpPredicate Pred;
6956 if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
6957 cast<VPRecipeWithIRFlags>(R).getPredicate() !=
6958 cast<CmpInst>(UI)->getPredicate())
6959 return true;
6960 SeenInstrs.insert(UI);
6961 }
6962 }
6963 }
6964
6965 // Return true if the loop contains any instructions that are not also part of
6966 // the VPlan or are skipped for VPlan-based cost computations. This indicates
6967 // that the VPlan contains extra simplifications.
6968 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
6969 TheLoop](BasicBlock *BB) {
6970 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
6971 // Skip induction phis when checking for simplifications, as they may not
6972 // be lowered directly to a corresponding PHI recipe.
6973 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
6974 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
6975 return false;
6976 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
6977 });
6978 });
6979}
6980#endif
6981
6982VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
6983 if (VPlans.empty())
6984 return VectorizationFactor::Disabled();
6985 // If there is a single VPlan with a single VF, return it directly.
6986 VPlan &FirstPlan = *VPlans[0];
6987 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
6988 return {*FirstPlan.vectorFactors().begin(), 0, 0};
6989
6990 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
6991 << (CM.CostKind == TTI::TCK_RecipThroughput
6992 ? "Reciprocal Throughput\n"
6993 : CM.CostKind == TTI::TCK_Latency
6994 ? "Instruction Latency\n"
6995 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
6996 : CM.CostKind == TTI::TCK_SizeAndLatency
6997 ? "Code Size and Latency\n"
6998 : "Unknown\n"));
6999
7000 ElementCount ScalarVF = ElementCount::getFixed(1);
7001 assert(hasPlanWithVF(ScalarVF) &&
7002 "More than a single plan/VF w/o any plan having scalar VF");
7003
7004 // TODO: Compute scalar cost using VPlan-based cost model.
7005 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7006 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7007 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7008 VectorizationFactor BestFactor = ScalarFactor;
7009
7010 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7011 if (ForceVectorization) {
7012 // Ignore scalar width, because the user explicitly wants vectorization.
7013 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7014 // evaluation.
7015 BestFactor.Cost = InstructionCost::getMax();
7016 }
7017
7018 for (auto &P : VPlans) {
7019 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7020 P->vectorFactors().end());
7021
7026
7027 for (unsigned I = 0; I < VFs.size(); I++) {
7028 ElementCount VF = VFs[I];
7029 if (VF.isScalar())
7030 continue;
7031 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7032 LLVM_DEBUG(
7033 dbgs()
7034 << "LV: Not considering vector loop of width " << VF
7035 << " because it will not generate any vector instructions.\n");
7036 continue;
7037 }
7038 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7039 LLVM_DEBUG(
7040 dbgs()
7041 << "LV: Not considering vector loop of width " << VF
7042 << " because it would cause replicated blocks to be generated,"
7043 << " which isn't allowed when optimizing for size.\n");
7044 continue;
7045 }
7046
7047 InstructionCost Cost = cost(*P, VF);
7048 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7049
7051 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
7052 LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7053 << VF << " because it uses too many registers\n");
7054 continue;
7055 }
7056
7057 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7058 BestFactor = CurrentFactor;
7059
7060 // If profitable add it to ProfitableVF list.
7061 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
7062 ProfitableVFs.push_back(CurrentFactor);
7063 }
7064 }
7065
7066#ifndef NDEBUG
7067 // Select the optimal vectorization factor according to the legacy cost-model.
7068 // This is now only used to verify the decisions by the new VPlan-based
7069 // cost-model and will be retired once the VPlan-based cost-model is
7070 // stabilized.
7071 VectorizationFactor LegacyVF = selectVectorizationFactor();
7072 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7073
7074 // Pre-compute the cost and use it to check if BestPlan contains any
7075 // simplifications not accounted for in the legacy cost model. If that's the
7076 // case, don't trigger the assertion, as the extra simplifications may cause a
7077 // different VF to be picked by the VPlan-based cost model.
7078 VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
7079 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7080 // Verify that the VPlan-based and legacy cost models agree, except for VPlans
7081 // with early exits and plans with additional VPlan simplifications. The
7082 // legacy cost model doesn't properly model costs for such loops.
7083 assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7084 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7085 CostCtx, OrigLoop,
7086 BestFactor.Width) ||
7087 planContainsAdditionalSimplifications(
7088 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7089 " VPlan cost model and legacy cost model disagreed");
7090 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7091 "when vectorizing, the scalar cost must be computed.");
7092#endif
7093
7094 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7095 return BestFactor;
7096}
7097
7098static void addRuntimeUnrollDisableMetaData(Loop *L) {
7099 SmallVector<Metadata *, 4> MDs;
7100 // Reserve first location for self reference to the LoopID metadata node.
7101 MDs.push_back(nullptr);
7102 bool IsUnrollMetadata = false;
7103 MDNode *LoopID = L->getLoopID();
7104 if (LoopID) {
7105 // First find existing loop unrolling disable metadata.
7106 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7107 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7108 if (MD) {
7109 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7110 IsUnrollMetadata =
7111 S && S->getString().starts_with("llvm.loop.unroll.disable");
7112 }
7113 MDs.push_back(LoopID->getOperand(I));
7114 }
7115 }
7116
7117 if (!IsUnrollMetadata) {
7118 // Add runtime unroll disable metadata.
7119 LLVMContext &Context = L->getHeader()->getContext();
7120 SmallVector<Metadata *, 1> DisableOperands;
7121 DisableOperands.push_back(
7122 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7123 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7124 MDs.push_back(DisableNode);
7125 MDNode *NewLoopID = MDNode::get(Context, MDs);
7126 // Set operand 0 to refer to the loop id itself.
7127 NewLoopID->replaceOperandWith(0, NewLoopID);
7128 L->setLoopID(NewLoopID);
7129 }
7130}
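// After this transform the loop metadata looks roughly like:
//   !llvm.loop !0
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}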
7131
7132static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
7133 using namespace VPlanPatternMatch;
7134 assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
7135 "RdxResult must be ComputeFindIVResult");
7136 VPValue *StartVPV = RdxResult->getOperand(1);
7137 match(StartVPV, m_Freeze(m_VPValue(StartVPV)));
7138 return StartVPV->getLiveInIRValue();
7139}
7140
7141// If \p EpiResumePhiR is resume VPPhi for a reduction when vectorizing the
7142// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
7143// from the main vector loop.
7144static void fixReductionScalarResumeWhenVectorizingEpilog(
7145 VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
7146 // Get the VPInstruction computing the reduction result in the middle block.
7147 // The first operand may not be from the middle block if it is not connected
7148 // to the scalar preheader. In that case, there's nothing to fix.
7149 VPValue *Incoming = EpiResumePhiR->getOperand(0);
7152 auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
7153 if (!EpiRedResult ||
7154 (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
7155 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
7156 EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
7157 return;
7158
7159 auto *EpiRedHeaderPhi =
7160 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7161 RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
7162 Value *MainResumeValue;
7163 if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
7164 assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7165 VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7166 "unexpected start recipe");
7167 MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
7168 } else
7169 MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7170 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) {
7171 [[maybe_unused]] Value *StartV =
7172 EpiRedResult->getOperand(1)->getLiveInIRValue();
7173 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7174 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7175 "AnyOf expected to start with ICMP_NE");
7176 assert(Cmp->getOperand(1) == StartV &&
7177 "AnyOf expected to start by comparing main resume value to original "
7178 "start value");
7179 MainResumeValue = Cmp->getOperand(0);
7180 } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
7181 Value *StartV = getStartValueFromReductionResult(EpiRedResult);
7182 Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
7183 using namespace llvm::PatternMatch;
7184 Value *Cmp, *OrigResumeV, *CmpOp;
7185 [[maybe_unused]] bool IsExpectedPattern =
7186 match(MainResumeValue,
7187 m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
7188 m_Value(OrigResumeV))) &&
7190 m_Value(CmpOp))) &&
7191 ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
7192 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7193 MainResumeValue = OrigResumeV;
7194 }
7195 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7196
7197 // When fixing reductions in the epilogue loop we should already have
7198 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7199 // over the incoming values correctly.
7200 EpiResumePhi.setIncomingValueForBlock(
7201 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7202}
7203
7205 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7206 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7207 assert(BestVPlan.hasVF(BestVF) &&
7208 "Trying to execute plan with unsupported VF");
7209 assert(BestVPlan.hasUF(BestUF) &&
7210 "Trying to execute plan with unsupported UF");
7211 if (BestVPlan.hasEarlyExit())
7212 ++LoopsEarlyExitVectorized;
7213 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7214 // cost model is complete for better cost estimates.
7219 bool HasBranchWeights =
7221 if (HasBranchWeights) {
7222 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7224 BestVPlan, BestVF, VScale);
7225 }
7226
7227 // Checks are the same for all VPlans, added to BestVPlan only for
7228 // compactness.
7229 attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
7230
7231 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7232 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
7233
7234 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7238 BestVPlan, BestVF,
7240 VPlanTransforms::cse(BestVPlan);
7242
7244 // Regions are dissolved after optimizing for VF and UF, which completely
7245 // removes unneeded loop regions first.
7247 // Canonicalize EVL loops after regions are dissolved.
7251 BestVPlan, VectorPH, CM.foldTailByMasking(),
7252 CM.requiresScalarEpilogue(BestVF.isVector()));
7253 VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF);
7255
7256 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7257 // making any changes to the CFG.
7258 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7259 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
7260 if (!ILV.getTripCount())
7261 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7262 else
7263 assert(VectorizingEpilogue && "should only re-use the existing trip "
7264 "count during epilogue vectorization");
7265
7266 // Perform the actual loop transformation.
7267 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7268 OrigLoop->getParentLoop(),
7269 Legal->getWidestInductionType());
7270
7271#ifdef EXPENSIVE_CHECKS
7272 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7273#endif
7274
7275 // 1. Set up the skeleton for vectorization, including vector pre-header and
7276 // middle block. The vector loop is created during VPlan execution.
7279 State.CFG.PrevBB->getSingleSuccessor());
7281
7282 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7283 "final VPlan is invalid");
7284
7285 // After vectorization, the exit blocks of the original loop will have
7286 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7287 // looked through single-entry phis.
7288 ScalarEvolution &SE = *PSE.getSE();
7289 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7290 if (!Exit->hasPredecessors())
7291 continue;
7292 for (VPRecipeBase &PhiR : Exit->phis())
7293 SE.forgetLcssaPhiWithNewPredecessor(
7294 OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction()));
7295 }
7296 // Forget the original loop and block dispositions.
7297 SE.forgetLoop(OrigLoop);
7298 SE.forgetBlockAndLoopDispositions();
7299
7301
7302 //===------------------------------------------------===//
7303 //
7304 // Notice: any optimization or new instruction that go
7305 // into the code below should also be implemented in
7306 // the cost-model.
7307 //
7308 //===------------------------------------------------===//
7309
7310 BestVPlan.execute(&State);
7311
7312 // 2.6. Maintain Loop Hints
7313 // Keep all loop hints from the original loop on the vector loop (we'll
7314 // replace the vectorizer-specific hints below).
7315 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
7316 if (HeaderVPBB) {
7317 MDNode *OrigLoopID = OrigLoop->getLoopID();
7318
7319 std::optional<MDNode *> VectorizedLoopID =
7320 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7321 LLVMLoopVectorizeFollowupVectorized});
7322
7323 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7324 if (VectorizedLoopID) {
7325 L->setLoopID(*VectorizedLoopID);
7326 } else {
7327 // Keep all loop hints from the original loop on the vector loop (we'll
7328 // replace the vectorizer-specific hints below).
7329 if (MDNode *LID = OrigLoop->getLoopID())
7330 L->setLoopID(LID);
7331
7332 LoopVectorizeHints Hints(L, true, *ORE);
7333 Hints.setAlreadyVectorized();
7334
7335 // Check if it's EVL-vectorized and mark the corresponding metadata.
7336 bool IsEVLVectorized =
7337 llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
7338 // Looking for the ExplictVectorLength VPInstruction.
7339 if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
7340 return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
7341 return false;
7342 });
7343 if (IsEVLVectorized) {
7344 LLVMContext &Context = L->getHeader()->getContext();
7345 MDNode *LoopID = L->getLoopID();
7346 auto *IsEVLVectorizedMD = MDNode::get(
7347 Context,
7348 {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
7349 MDString::get(Context, "evl")});
7350 MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
7351 {IsEVLVectorizedMD});
7352 L->setLoopID(NewLoopID);
7353 }
7354 }
7355 TargetTransformInfo::UnrollingPreferences UP;
7356 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7357 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7358 addRuntimeUnrollDisableMetaData(L);
7359 }
7360
7361 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7362 // predication, updating analyses.
7363 ILV.fixVectorizedLoop(State);
7364
7366
7367 return ExpandedSCEVs;
7368}
7369
7370//===--------------------------------------------------------------------===//
7371// EpilogueVectorizerMainLoop
7372//===--------------------------------------------------------------------===//
7373
7374/// This function is partially responsible for generating the control flow
7375/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7377 BasicBlock *ScalarPH = createScalarPreheader("");
7378
7379 // Generate the code to check the minimum iteration count of the vector
7380 // epilogue (see below).
7383
7384 // Generate the iteration count check for the main loop, *after* the check
7385 // for the epilogue loop, so that the path-length is shorter for the case
7386 // that goes directly through the vector epilogue. The longer-path length for
7387 // the main loop is compensated for, by the gain from vectorizing the larger
7388 // trip count. Note: the branch will get updated later on when we vectorize
7389 // the epilogue.
7391
7392 return LoopVectorPreHeader;
7393}
7394
7396 LLVM_DEBUG({
7397 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7398 << "Main Loop VF:" << EPI.MainLoopVF
7399 << ", Main Loop UF:" << EPI.MainLoopUF
7400 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7401 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7402 });
7403}
7404
7407 dbgs() << "intermediate fn:\n"
7408 << *OrigLoop->getHeader()->getParent() << "\n";
7409 });
7410}
7411
7412BasicBlock *
7413EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7414 bool ForEpilogue) {
7415 assert(Bypass && "Expected valid bypass basic block.");
7416 Value *Count = getTripCount();
7418 Value *CheckMinIters =
7420 ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
7421
7422 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7423 if (!ForEpilogue)
7424 TCCheckBlock->setName("vector.main.loop.iter.check");
7425
7426 // Create new preheader for vector loop.
7427 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7428 static_cast<DominatorTree *>(nullptr), LI,
7429 nullptr, "vector.ph");
7430 if (ForEpilogue) {
7431 // Save the trip count so we don't have to regenerate it in the
7432 // vec.epilog.iter.check. This is safe to do because the trip count
7433 // generated here dominates the vector epilog iter check.
7434 EPI.TripCount = Count;
7435 } else {
7437 }
7438
7439 BranchInst &BI =
7440 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7442 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7443 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7444
7445 // When vectorizing the main loop, its trip-count check is placed in a new
7446 // block, whereas the overall trip-count check is placed in the VPlan entry
7447 // block. When vectorizing the epilogue loop, its trip-count check is placed
7448 // in the VPlan entry block.
7449 if (!ForEpilogue)
7450 introduceCheckBlockInVPlan(TCCheckBlock);
7451 return TCCheckBlock;
7452}
7453
7454//===--------------------------------------------------------------------===//
7455// EpilogueVectorizerEpilogueLoop
7456//===--------------------------------------------------------------------===//
7457
7458/// This function is partially responsible for generating the control flow
7459/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7461 BasicBlock *ScalarPH = createScalarPreheader("vec.epilog.");
7462
7463 // Now, compare the remaining count and if there aren't enough iterations to
7464 // execute the vectorized epilogue skip to the scalar part.
7465 LoopVectorPreHeader->setName("vec.epilog.ph");
7466 BasicBlock *VecEpilogueIterationCountCheck =
7468 nullptr, "vec.epilog.iter.check", true);
7470
7472 VecEpilogueIterationCountCheck);
7473 AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7474
7475 // Adjust the control flow taking the state info from the main loop
7476 // vectorization into account.
7478 "expected this to be saved from the previous pass.");
7480 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7481
7483 VecEpilogueIterationCountCheck, ScalarPH);
7484
7485 // Adjust the terminators of runtime check blocks and phis using them.
7486 BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
7487 BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
7488 if (SCEVCheckBlock)
7489 SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
7490 VecEpilogueIterationCountCheck, ScalarPH);
7491 if (MemCheckBlock)
7492 MemCheckBlock->getTerminator()->replaceUsesOfWith(
7493 VecEpilogueIterationCountCheck, ScalarPH);
7494
7496
7497 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7498 // reductions which merge control-flow from the latch block and the middle
7499 // block. Update the incoming values here and move the Phi into the preheader.
7500 SmallVector<PHINode *, 4> PhisInBlock(
7501 llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
7502
7503 for (PHINode *Phi : PhisInBlock) {
7504 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7505 Phi->replaceIncomingBlockWith(
7506 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7507 VecEpilogueIterationCountCheck);
7508
7509 // If the phi doesn't have an incoming value from the
7510 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7511 // value and also those from other check blocks. This is needed for
7512 // reduction phis only.
7513 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7514 return EPI.EpilogueIterationCountCheck == IncB;
7515 }))
7516 continue;
7517 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7518 if (SCEVCheckBlock)
7519 Phi->removeIncomingValue(SCEVCheckBlock);
7520 if (MemCheckBlock)
7521 Phi->removeIncomingValue(MemCheckBlock);
7522 }
7523
7524 return LoopVectorPreHeader;
7525}
7526
7527BasicBlock *
7529 BasicBlock *Bypass, BasicBlock *Insert) {
7530
7532 "Expected trip count to have been saved in the first pass.");
7533 Value *TC = EPI.TripCount;
7534 IRBuilder<> Builder(Insert->getTerminator());
7535 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7536
7537 // Generate code to check if the loop's trip count is less than VF * UF of the
7538 // vector epilogue loop.
7539 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7540 ? ICmpInst::ICMP_ULE
7541 : ICmpInst::ICMP_ULT;
7542
7543 Value *CheckMinIters =
7544 Builder.CreateICmp(P, Count,
7547 "min.epilog.iters.check");
7548
7549 BranchInst &BI =
7550 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7551 auto VScale = Cost->getVScaleForTuning();
7552 unsigned MainLoopStep =
7553 estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
7554 unsigned EpilogueLoopStep =
7555 estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
7556 // We assume the remaining `Count` is equally distributed in
7557 // [0, MainLoopStep)
7558 // So the probability for `Count < EpilogueLoopStep` should be
7559 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7560 // TODO: Improve the estimate by taking the estimated trip count into
7561 // consideration.
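// For example, MainLoopStep = 16 and EpilogueLoopStep = 4 gives weights
// {4, 12}, i.e. an estimated 1-in-4 chance of skipping the vector epilogue.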
7562 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7563 const uint32_t Weights[] = {EstimatedSkipCount,
7564 MainLoopStep - EstimatedSkipCount};
7565 setBranchWeights(BI, Weights, /*IsExpected=*/false);
7566 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7567
7568 // A new entry block has been created for the epilogue VPlan. Hook it in, as
7569 // otherwise we would try to modify the entry to the main vector loop.
7570 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
7571 VPBasicBlock *OldEntry = Plan.getEntry();
7572 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7573 Plan.setEntry(NewEntry);
7574 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7575
7576 return Insert;
7577}
7578
7580 LLVM_DEBUG({
7581 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7582 << "Epilogue Loop VF:" << EPI.EpilogueVF
7583 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7584 });
7585}
7586
7589 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7590 });
7591}
7592
7594VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7595 VFRange &Range) {
7596 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7597 "Must be called with either a load or store");
7598
7599 auto WillWiden = [&](ElementCount VF) -> bool {
7600 LoopVectorizationCostModel::InstWidening Decision =
7601 CM.getWideningDecision(I, VF);
7602 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7603 "CM decision should be taken at this point.");
7604 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7605 return true;
7606 if (CM.isScalarAfterVectorization(I, VF) ||
7607 CM.isProfitableToScalarize(I, VF))
7608 return false;
7609 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7610 };
7611
7612 if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
7613 return nullptr;
7614
7615 VPValue *Mask = nullptr;
7616 if (Legal->isMaskRequired(I))
7617 Mask = getBlockInMask(Builder.getInsertBlock());
7618
7619 // Determine if the pointer operand of the access is either consecutive or
7620 // reverse consecutive.
7621 LoopVectorizationCostModel::InstWidening Decision =
7622 CM.getWideningDecision(I, Range.Start);
7623 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
7624 bool Consecutive =
7625 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
7626
7627 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
7628 if (Consecutive) {
7629 auto *GEP = dyn_cast<GetElementPtrInst>(
7630 Ptr->getUnderlyingValue()->stripPointerCasts());
7631 VPSingleDefRecipe *VectorPtr;
7632 if (Reverse) {
7633 // When folding the tail, we may compute an address that we don't in the
7634 // original scalar loop and it may not be inbounds. Drop Inbounds in that
7635 // case.
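// E.g. with tail folding, the last vector iteration may form addresses for
// lanes that are masked off and lie past the end of the underlying object,
// so the GEP must not claim inbounds.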
7636 GEPNoWrapFlags Flags =
7637 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
7638 ? GEPNoWrapFlags::none()
7639 : GEPNoWrapFlags::inBounds();
7640 VectorPtr =
7642 /*Stride*/ -1, Flags, I->getDebugLoc());
7643 } else {
7644 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7645 GEP ? GEP->getNoWrapFlags()
7647 I->getDebugLoc());
7648 }
7649 Builder.insert(VectorPtr);
7650 Ptr = VectorPtr;
7651 }
7652 if (LoadInst *Load = dyn_cast<LoadInst>(I))
7653 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7654 VPIRMetadata(*Load, LVer), I->getDebugLoc());
7655
7656 StoreInst *Store = cast<StoreInst>(I);
7657 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7658 Reverse, VPIRMetadata(*Store, LVer),
7659 I->getDebugLoc());
7660}
7661
7662/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7663/// insert a recipe to expand the step for the induction recipe.
7666 VPValue *Start, const InductionDescriptor &IndDesc,
7667 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7668 assert(IndDesc.getStartValue() ==
7669 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7670 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7671 "step must be loop invariant");
7672
7673 VPValue *Step =
7675 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
7676 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7677 IndDesc, TruncI,
7678 TruncI->getDebugLoc());
7679 }
7680 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7681 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7682 IndDesc, Phi->getDebugLoc());
7683}
7684
7685VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7687
7688 // Check if this is an integer or fp induction. If so, build the recipe that
7689 // produces its scalar and vector values.
7690 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7691 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
7692 *PSE.getSE(), *OrigLoop);
7693
7694 // Check if this is pointer induction. If so, build the recipe for it.
7695 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7696 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
7698 Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
7700 [&](ElementCount VF) {
7701 return CM.isScalarAfterVectorization(Phi, VF);
7702 },
7703 Range),
7704 Phi->getDebugLoc());
7705 }
7706 return nullptr;
7707}
7708
7709VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7711 // Optimize the special case where the source is a constant integer
7712 // induction variable. Notice that we can only optimize the 'trunc' case
7713 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7714 // (c) other casts depend on pointer size.
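// E.g. for 'trunc i64 %iv to i32' the recipe built below produces the
// truncated 32-bit induction directly rather than widening %iv and then
// truncating it.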
7715
7716 // Determine whether \p K is a truncation based on an induction variable that
7717 // can be optimized.
7718 auto IsOptimizableIVTruncate =
7719 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7720 return [=](ElementCount VF) -> bool {
7721 return CM.isOptimizableIVTruncate(K, VF);
7722 };
7723 };
7724
7725 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7726 IsOptimizableIVTruncate(I), Range)) {
7727
7728 auto *Phi = cast<PHINode>(I->getOperand(0));
7729 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
7730 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
7731 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
7732 *OrigLoop);
7733 }
7734 return nullptr;
7735}
7736
7737VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
7739 VFRange &Range) {
7740 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7741 [this, CI](ElementCount VF) {
7742 return CM.isScalarWithPredication(CI, VF);
7743 },
7744 Range);
7745
7746 if (IsPredicated)
7747 return nullptr;
7748
7749 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7750 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7751 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7752 ID == Intrinsic::pseudoprobe ||
7753 ID == Intrinsic::experimental_noalias_scope_decl))
7754 return nullptr;
7755
7756 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
7757
7758  // Is it beneficial to perform an intrinsic call compared to a lib call?
7759 bool ShouldUseVectorIntrinsic =
7760      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
7761                [&](ElementCount VF) -> bool {
7762 return CM.getCallWideningDecision(CI, VF).Kind ==
7763                         LoopVectorizationCostModel::CM_IntrinsicCall;
7764                },
7765 Range);
7766 if (ShouldUseVectorIntrinsic)
7767 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
7768 CI->getDebugLoc());
7769
7770 Function *Variant = nullptr;
7771 std::optional<unsigned> MaskPos;
7772  // Is it better to call a vectorized version of the function than to
7773  // scalarize the call?
7774 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7775 [&](ElementCount VF) -> bool {
7776 // The following case may be scalarized depending on the VF.
7777 // The flag shows whether we can use a usual Call for vectorized
7778 // version of the instruction.
7779
7780 // If we've found a variant at a previous VF, then stop looking. A
7781 // vectorized variant of a function expects input in a certain shape
7782 // -- basically the number of input registers, the number of lanes
7783 // per register, and whether there's a mask required.
7784 // We store a pointer to the variant in the VPWidenCallRecipe, so
7785 // once we have an appropriate variant it's only valid for that VF.
7786 // This will force a different vplan to be generated for each VF that
7787 // finds a valid variant.
7788 if (Variant)
7789 return false;
7790        LoopVectorizationCostModel::CallWideningDecision Decision =
7791            CM.getCallWideningDecision(CI, VF);
7792        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
7793 Variant = Decision.Variant;
7794 MaskPos = Decision.MaskPos;
7795 return true;
7796 }
7797
7798 return false;
7799 },
7800 Range);
7801 if (ShouldUseVectorCall) {
7802 if (MaskPos.has_value()) {
7803 // We have 2 cases that would require a mask:
7804 // 1) The block needs to be predicated, either due to a conditional
7805 // in the scalar loop or use of an active lane mask with
7806 // tail-folding, and we use the appropriate mask for the block.
7807 // 2) No mask is required for the block, but the only available
7808 // vector variant at this VF requires a mask, so we synthesize an
7809 // all-true mask.
7810 VPValue *Mask = nullptr;
7811 if (Legal->isMaskRequired(CI))
7812 Mask = getBlockInMask(Builder.getInsertBlock());
7813 else
7814 Mask = Plan.getOrAddLiveIn(
7815            ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
7816
7817 Ops.insert(Ops.begin() + *MaskPos, Mask);
7818 }
7819
7820 Ops.push_back(Operands.back());
7821 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
7822 }
7823
7824 return nullptr;
7825}
7826
7827bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7828 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7829 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7830  // The instruction should be widened, unless it is scalar after vectorization,
7831  // scalarization is profitable, or it is predicated.
7832 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7833 return CM.isScalarAfterVectorization(I, VF) ||
7834 CM.isProfitableToScalarize(I, VF) ||
7835 CM.isScalarWithPredication(I, VF);
7836 };
7837  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7838                                                             Range);
7839}
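// Rough sketch of how the per-VF predicates used above interact with VFRange
// clamping (simplified; the actual logic lives in
// LoopVectorizationPlanner::getDecisionAndClampRange): the decision taken for
// Range.Start is assumed for the whole range, and Range.End is clamped down to
// the first VF that would decide differently, so each VPlan only covers VFs
// sharing the same decision.
//
//   bool Decision = Predicate(Range.Start);
//   for (ElementCount VF = Range.Start * 2;
//        ElementCount::isKnownLT(VF, Range.End); VF *= 2)
//     if (Predicate(VF) != Decision) {
//       Range.End = VF;
//       break;
//     }
//   return Decision;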
7840
7841VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
7842                                           ArrayRef<VPValue *> Operands) {
7843  switch (I->getOpcode()) {
7844 default:
7845 return nullptr;
7846 case Instruction::SDiv:
7847 case Instruction::UDiv:
7848 case Instruction::SRem:
7849 case Instruction::URem: {
7850 // If not provably safe, use a select to form a safe divisor before widening the
7851 // div/rem operation itself. Otherwise fall through to general handling below.
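    // For illustration (shorthand IR, not emitted verbatim): a predicated
    //   %q = udiv i32 %x, %d
    // with block mask %m becomes
    //   %d.safe = select i1 %m, i32 %d, i32 1
    //   %q      = udiv i32 %x, %d.safe
    // so lanes that are masked off divide by 1 instead of a possibly trapping
    // divisor.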
7852 if (CM.isPredicatedInst(I)) {
7853      SmallVector<VPValue *> Ops(Operands);
7854      VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
7855      VPValue *One =
7856 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
7857 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
7858 Ops[1] = SafeRHS;
7859 return new VPWidenRecipe(*I, Ops);
7860 }
7861 [[fallthrough]];
7862 }
7863 case Instruction::Add:
7864 case Instruction::And:
7865 case Instruction::AShr:
7866 case Instruction::FAdd:
7867 case Instruction::FCmp:
7868 case Instruction::FDiv:
7869 case Instruction::FMul:
7870 case Instruction::FNeg:
7871 case Instruction::FRem:
7872 case Instruction::FSub:
7873 case Instruction::ICmp:
7874 case Instruction::LShr:
7875 case Instruction::Mul:
7876 case Instruction::Or:
7877 case Instruction::Select:
7878 case Instruction::Shl:
7879 case Instruction::Sub:
7880 case Instruction::Xor:
7881 case Instruction::Freeze: {
7882    SmallVector<VPValue *> NewOps(Operands);
7883    if (Instruction::isBinaryOp(I->getOpcode())) {
7884 // The legacy cost model uses SCEV to check if some of the operands are
7885 // constants. To match the legacy cost model's behavior, use SCEV to try
7886 // to replace operands with constants.
7887 ScalarEvolution &SE = *PSE.getSE();
7888 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
7889 if (!Op->isLiveIn())
7890 return Op;
7891 Value *V = Op->getUnderlyingValue();
7892 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
7893 return Op;
7894 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
7895 if (!C)
7896 return Op;
7897 return Plan.getOrAddLiveIn(C->getValue());
7898 };
7899 // For Mul, the legacy cost model checks both operands.
7900 if (I->getOpcode() == Instruction::Mul)
7901 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
7902 // For other binops, the legacy cost model only checks the second operand.
7903 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
7904 }
7905 return new VPWidenRecipe(*I, NewOps);
7906 }
7907 case Instruction::ExtractValue: {
7908    SmallVector<VPValue *> NewOps(Operands);
7909    Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
7910 auto *EVI = cast<ExtractValueInst>(I);
7911 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7912 unsigned Idx = EVI->getIndices()[0];
7913 NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
7914 return new VPWidenRecipe(*I, NewOps);
7915 }
7916 };
7917}
7918
7919VPHistogramRecipe *
7920VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7921                                     ArrayRef<VPValue *> Operands) {
7922 // FIXME: Support other operations.
7923 unsigned Opcode = HI->Update->getOpcode();
7924 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7925 "Histogram update operation must be an Add or Sub");
7926
7927  SmallVector<VPValue *, 3> HGramOps;
7928  // Bucket address.
7929 HGramOps.push_back(Operands[1]);
7930 // Increment value.
7931 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
7932
7933 // In case of predicated execution (due to tail-folding, or conditional
7934 // execution, or both), pass the relevant mask.
7935 if (Legal->isMaskRequired(HI->Store))
7936 HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
7937
7938 return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
7939}
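// For illustration, the kind of scalar update this recipe models (hypothetical
// source loop, not taken from the code above):
//
//   for (i = 0; i < n; i++)
//     buckets[idx[i]] += inc;
//
// The bucket address &buckets[idx[i]] becomes the first recipe operand, the
// increment the second, and the block-in mask an optional third operand when
// the store is predicated.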
7940
7941VPReplicateRecipe *
7942VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
7943                                   VFRange &Range) {
7944  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7945      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7946 Range);
7947
7948 bool IsPredicated = CM.isPredicatedInst(I);
7949
7950 // Even if the instruction is not marked as uniform, there are certain
7951 // intrinsic calls that can be effectively treated as such, so we check for
7952 // them here. Conservatively, we only do this for scalable vectors, since
7953 // for fixed-width VFs we can always fall back on full scalarization.
7954 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
7955 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
7956 case Intrinsic::assume:
7957 case Intrinsic::lifetime_start:
7958 case Intrinsic::lifetime_end:
7959      // For scalable vectors, if one of the operands is variant then we still
7960      // want to mark the call as uniform, which will generate one instruction for just
7961 // the first lane of the vector. We can't scalarize the call in the same
7962 // way as for fixed-width vectors because we don't know how many lanes
7963 // there are.
7964 //
7965 // The reasons for doing it this way for scalable vectors are:
7966 // 1. For the assume intrinsic generating the instruction for the first
7967      //    lane is still better than not generating any at all. For
7968 // example, the input may be a splat across all lanes.
7969 // 2. For the lifetime start/end intrinsics the pointer operand only
7970 // does anything useful when the input comes from a stack object,
7971 // which suggests it should always be uniform. For non-stack objects
7972 // the effect is to poison the object, which still allows us to
7973 // remove the call.
7974 IsUniform = true;
7975 break;
7976 default:
7977 break;
7978 }
7979 }
7980 VPValue *BlockInMask = nullptr;
7981 if (!IsPredicated) {
7982    // If Instr is not predicated, finalize the recipe for it directly.
7983 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7984 } else {
7985 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7986 // Instructions marked for predication are replicated and a mask operand is
7987 // added initially. Masked replicate recipes will later be placed under an
7988 // if-then construct to prevent side-effects. Generate recipes to compute
7989 // the block mask for this region.
7990 BlockInMask = getBlockInMask(Builder.getInsertBlock());
7991 }
7992
7993 // Note that there is some custom logic to mark some intrinsics as uniform
7994 // manually above for scalable vectors, which this assert needs to account for
7995 // as well.
7996 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
7997 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
7998 "Should not predicate a uniform recipe");
7999 auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
8000 VPIRMetadata(*I, LVer));
8001 return Recipe;
8002}
8003
8004/// Find all possible partial reductions in the loop and track all of those that
8005/// are valid so recipes can be formed later.
8006void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8007  // Find all possible partial reductions.
8008  SmallVector<std::pair<PartialReductionChain, unsigned>>
8009      PartialReductionChains;
8010 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8011 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
8012 PartialReductionChains);
8013 }
8014
8015 // A partial reduction is invalid if any of its extends are used by
8016 // something that isn't another partial reduction. This is because the
8017 // extends are intended to be lowered along with the reduction itself.
8018
8019 // Build up a set of partial reduction ops for efficient use checking.
8020 SmallPtrSet<User *, 4> PartialReductionOps;
8021 for (const auto &[PartialRdx, _] : PartialReductionChains)
8022 PartialReductionOps.insert(PartialRdx.ExtendUser);
8023
8024 auto ExtendIsOnlyUsedByPartialReductions =
8025 [&PartialReductionOps](Instruction *Extend) {
8026 return all_of(Extend->users(), [&](const User *U) {
8027 return PartialReductionOps.contains(U);
8028 });
8029 };
8030
8031 // Check if each use of a chain's two extends is a partial reduction
8032 // and only add those that don't have non-partial reduction users.
8033 for (auto Pair : PartialReductionChains) {
8034 PartialReductionChain Chain = Pair.first;
8035 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8036 (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
8037 ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
8038 }
8039}
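// For illustration (hypothetical source, not from this file): in a dot-product
// style loop
//
//   acc += (i32)(i8)a[i] * (i32)(i8)b[i];
//
// both extends feed only the multiply (the extend-user), which feeds the
// reduction add, so the chain is recorded with a scale factor of 32/8 = 4;
// if either extend had another user outside the chain, the chain would be
// dropped here.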
8040
8041bool VPRecipeBuilder::getScaledReductions(
8042 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
8043 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
8044 if (!CM.TheLoop->contains(RdxExitInstr))
8045 return false;
8046
8047 auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8048 if (!Update)
8049 return false;
8050
8051 Value *Op = Update->getOperand(0);
8052 Value *PhiOp = Update->getOperand(1);
8053 if (Op == PHI)
8054 std::swap(Op, PhiOp);
8055
8056 // Try and get a scaled reduction from the first non-phi operand.
8057 // If one is found, we use the discovered reduction instruction in
8058 // place of the accumulator for costing.
8059 if (auto *OpInst = dyn_cast<Instruction>(Op)) {
8060 if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8061 PHI = Chains.rbegin()->first.Reduction;
8062
8063 Op = Update->getOperand(0);
8064 PhiOp = Update->getOperand(1);
8065 if (Op == PHI)
8066 std::swap(Op, PhiOp);
8067 }
8068 }
8069 if (PhiOp != PHI)
8070 return false;
8071
8072 using namespace llvm::PatternMatch;
8073
8074 // If the update is a binary operator, check both of its operands to see if
8075 // they are extends. Otherwise, see if the update comes directly from an
8076 // extend.
8077 Instruction *Exts[2] = {nullptr};
8078 BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Op);
8079 std::optional<unsigned> BinOpc;
8080 Type *ExtOpTypes[2] = {nullptr};
8081
8082 auto CollectExtInfo = [this, &Exts,
8083 &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
8084 unsigned I = 0;
8085 for (Value *OpI : Ops) {
8086 Value *ExtOp;
8087 if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
8088 return false;
8089 Exts[I] = cast<Instruction>(OpI);
8090
8091 // TODO: We should be able to support live-ins.
8092 if (!CM.TheLoop->contains(Exts[I]))
8093 return false;
8094
8095 ExtOpTypes[I] = ExtOp->getType();
8096 I++;
8097 }
8098 return true;
8099 };
8100
8101 if (ExtendUser) {
8102 if (!ExtendUser->hasOneUse())
8103 return false;
8104
8105    // Use the side effect of match to replace ExtendUser only if the pattern
8106    // is matched; we don't care at this point whether it actually matched.
8107 match(ExtendUser, m_Neg(m_BinOp(ExtendUser)));
8108
8109 SmallVector<Value *> Ops(ExtendUser->operands());
8110 if (!CollectExtInfo(Ops))
8111 return false;
8112
8113 BinOpc = std::make_optional(ExtendUser->getOpcode());
8114 } else if (match(Update, m_Add(m_Value(), m_Value()))) {
8115 // We already know the operands for Update are Op and PhiOp.
8116 SmallVector<Value *> Ops({Op});
8117 if (!CollectExtInfo(Ops))
8118 return false;
8119
8120 ExtendUser = Update;
8121 BinOpc = std::nullopt;
8122 } else
8123 return false;
8124
8125  TTI::PartialReductionExtendKind OpAExtend =
8126      TTI::getPartialReductionExtendKind(Exts[0]);
8127  TTI::PartialReductionExtendKind OpBExtend =
8128      Exts[1] ? TTI::getPartialReductionExtendKind(Exts[1]) : TTI::PR_None;
8129 PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);
8130
8131 TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
8132 TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
8133 if (!PHISize.hasKnownScalarFactor(ASize))
8134 return false;
8135 unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize);
8136
8138 [&](ElementCount VF) {
8140 Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1],
8141 PHI->getType(), VF, OpAExtend, OpBExtend, BinOpc, CM.CostKind);
8142 return Cost.isValid();
8143 },
8144 Range)) {
8145 Chains.emplace_back(Chain, TargetScaleFactor);
8146 return true;
8147 }
8148
8149 return false;
8150}
8151
8152VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
8153                                                      VFRange &Range) {
8154 // First, check for specific widening recipes that deal with inductions, Phi
8155 // nodes, calls and memory operations.
8156 VPRecipeBase *Recipe;
8157 Instruction *Instr = R->getUnderlyingInstr();
8158 SmallVector<VPValue *, 4> Operands(R->operands());
8159 if (auto *PhiR = dyn_cast<VPPhi>(R)) {
8160 VPBasicBlock *Parent = PhiR->getParent();
8161 [[maybe_unused]] VPRegionBlock *LoopRegionOf =
8162 Parent->getEnclosingLoopRegion();
8163 assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
8164 "Non-header phis should have been handled during predication");
8165 auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
8166 assert(Operands.size() == 2 && "Must have 2 operands for header phis");
8167 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8168 return Recipe;
8169
8170 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8171 assert((Legal->isReductionVariable(Phi) ||
8172 Legal->isFixedOrderRecurrence(Phi)) &&
8173 "can only widen reductions and fixed-order recurrences here");
8174 VPValue *StartV = Operands[0];
8175 if (Legal->isReductionVariable(Phi)) {
8176 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi);
8177 assert(RdxDesc.getRecurrenceStartValue() ==
8178 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8179
8180 // If the PHI is used by a partial reduction, set the scale factor.
8181 unsigned ScaleFactor =
8182 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8183 PhiRecipe = new VPReductionPHIRecipe(
8184 Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
8185 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8186 } else {
8187 // TODO: Currently fixed-order recurrences are modeled as chains of
8188 // first-order recurrences. If there are no users of the intermediate
8189 // recurrences in the chain, the fixed order recurrence should be modeled
8190 // directly, enabling more efficient codegen.
8191 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8192 }
8193 // Add backedge value.
8194 PhiRecipe->addOperand(Operands[1]);
8195 return PhiRecipe;
8196 }
8197 assert(!R->isPhi() && "only VPPhi nodes expected at this point");
8198
8199 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8200 cast<TruncInst>(Instr), Operands, Range)))
8201 return Recipe;
8202
8203 // All widen recipes below deal only with VF > 1.
8205 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8206 return nullptr;
8207
8208 if (auto *CI = dyn_cast<CallInst>(Instr))
8209 return tryToWidenCall(CI, Operands, Range);
8210
8211 if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8212 if (auto HistInfo = Legal->getHistogramInfo(SI))
8213 return tryToWidenHistogram(*HistInfo, Operands);
8214
8215 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8216 return tryToWidenMemory(Instr, Operands, Range);
8217
8218 if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
8219 return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
8220
8221 if (!shouldWiden(Instr, Range))
8222 return nullptr;
8223
8224 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8225 return new VPWidenGEPRecipe(GEP, Operands);
8226
8227 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8228 return new VPWidenSelectRecipe(*SI, Operands);
8229 }
8230
8231 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8232 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8233 *CI);
8234 }
8235
8236 return tryToWiden(Instr, Operands);
8237}
8238
8239VPRecipeBase *
8240VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8241                                             ArrayRef<VPValue *> Operands,
8242                                             unsigned ScaleFactor) {
8243 assert(Operands.size() == 2 &&
8244 "Unexpected number of operands for partial reduction");
8245
8246 VPValue *BinOp = Operands[0];
8247  VPValue *Accumulator = Operands[1];
8248  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8249 if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8250 isa<VPPartialReductionRecipe>(BinOpRecipe))
8251 std::swap(BinOp, Accumulator);
8252
8253 unsigned ReductionOpcode = Reduction->getOpcode();
8254 if (ReductionOpcode == Instruction::Sub) {
8255 auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
8256    SmallVector<VPValue *, 2> Ops;
8257    Ops.push_back(Plan.getOrAddLiveIn(Zero));
8258 Ops.push_back(BinOp);
8259 BinOp = new VPWidenRecipe(*Reduction, Ops);
8260 Builder.insert(BinOp->getDefiningRecipe());
8261 ReductionOpcode = Instruction::Add;
8262 }
8263
8264 VPValue *Cond = nullptr;
8265 if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
8266 assert((ReductionOpcode == Instruction::Add ||
8267 ReductionOpcode == Instruction::Sub) &&
8268 "Expected an ADD or SUB operation for predicated partial "
8269 "reductions (because the neutral element in the mask is zero)!");
8270 Cond = getBlockInMask(Builder.getInsertBlock());
8271 VPValue *Zero =
8272 Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
8273 BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
8274 }
8275 return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
8276 ScaleFactor, Reduction);
8277}
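// For illustration (shorthand, simplified): under tail folding a partial
// reduction of %mul into %acc is emitted roughly as
//
//   %masked   = select <VF x i1> %block.mask, %mul, zeroinitializer
//   %acc.next = partial.reduce.add(%acc, %masked)
//
// i.e. masked-off lanes contribute the neutral element 0 up front, which is
// why only Add/Sub updates are accepted in the predicated case above.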
8278
8279void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8280 ElementCount MaxVF) {
8281 if (ElementCount::isKnownGT(MinVF, MaxVF))
8282 return;
8283
8284 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8285
8286 const LoopAccessInfo *LAI = Legal->getLAI();
8287  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
8288                      OrigLoop, LI, DT, PSE.getSE());
8289 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
8290      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
8291    // Only use noalias metadata when using memory checks guaranteeing no
8292 // overlap across all iterations.
8293 LVer.prepareNoAliasMetadata();
8294 }
8295
8296 // Create initial base VPlan0, to serve as common starting point for all
8297 // candidates built later for specific VF ranges.
8298 auto VPlan0 = VPlanTransforms::buildVPlan0(
8299 OrigLoop, *LI, Legal->getWidestInductionType(),
8301
8302 auto MaxVFTimes2 = MaxVF * 2;
8303 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8304 VFRange SubRange = {VF, MaxVFTimes2};
8305 if (auto Plan = tryToBuildVPlanWithVPRecipes(
8306 std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
8307 bool HasScalarVF = Plan->hasScalarVFOnly();
8308 // Now optimize the initial VPlan.
8309 if (!HasScalarVF)
8311 *Plan, CM.getMinimalBitwidths());
8313 // TODO: try to put it close to addActiveLaneMask().
8314 if (CM.foldTailWithEVL() && !HasScalarVF)
8315        VPlanTransforms::addExplicitVectorLength(
8316            *Plan, CM.getMaxSafeElements());
8317 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8318 VPlans.push_back(std::move(Plan));
8319 }
8320 VF = SubRange.End;
8321 }
8322}
8323
8324/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
8325/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
8326/// the end value of the induction.
8327static VPInstruction *addResumePhiRecipeForInduction(
8328    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8329 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8330 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8331 // Truncated wide inductions resume from the last lane of their vector value
8332 // in the last vector iteration which is handled elsewhere.
8333 if (WideIntOrFp && WideIntOrFp->getTruncInst())
8334 return nullptr;
8335
8336 VPValue *Start = WideIV->getStartValue();
8337 VPValue *Step = WideIV->getStepValue();
8338  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
8339  VPValue *EndValue = VectorTC;
8340 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8341 EndValue = VectorPHBuilder.createDerivedIV(
8342 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
8343 Start, VectorTC, Step);
8344 }
8345
8346 // EndValue is derived from the vector trip count (which has the same type as
8347 // the widest induction) and thus may be wider than the induction here.
8348 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
8349 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
8350 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
8351 ScalarTypeOfWideIV,
8352 WideIV->getDebugLoc());
8353 }
8354
8355 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
8356 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
8357 return ResumePhiRecipe;
8358}
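// For illustration (shorthand, simplified): for an induction
//   %iv = phi [ %start, %ph ], [ %iv.next, %latch ]   with loop-invariant %step
// the recipe created above yields, in the scalar preheader,
//   %bc.resume.val = phi [ %end, %middle.block ], [ %start, other preds ]
// where %end is %start + %vector.trip.count * %step (or the vector trip count
// itself when the induction is the canonical one), possibly truncated to the
// induction's narrower type.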
8359
8360/// Create resume phis in the scalar preheader for first-order recurrences,
8361/// reductions and inductions, and update the VPIRInstructions wrapping the
8362/// original phis in the scalar header. End values for inductions are added to
8363/// \p IVEndValues.
8364static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
8365 DenseMap<VPValue *, VPValue *> &IVEndValues) {
8366 VPTypeAnalysis TypeInfo(Plan);
8367 auto *ScalarPH = Plan.getScalarPreheader();
8368 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
8369 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8370 VPBuilder VectorPHBuilder(
8371 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
8372 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8373 VPBuilder ScalarPHBuilder(ScalarPH);
8374 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
8375 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
8376
8377 // TODO: Extract final value from induction recipe initially, optimize to
8378 // pre-computed end value together in optimizeInductionExitUsers.
8379 auto *VectorPhiR =
8380 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
8381 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
8382      if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
8383              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
8384 &Plan.getVectorTripCount())) {
8385 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
8386 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
8387 ScalarPhiIRI->addOperand(ResumePhi);
8388 continue;
8389 }
8390 // TODO: Also handle truncated inductions here. Computing end-values
8391 // separately should be done as VPlan-to-VPlan optimization, after
8392 // legalizing all resume values to use the last lane from the loop.
8393 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
8394 "should only skip truncated wide inductions");
8395 continue;
8396 }
8397
8398 // The backedge value provides the value to resume coming out of a loop,
8399 // which for FORs is a vector whose last element needs to be extracted. The
8400 // start value provides the value if the loop is bypassed.
8401 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
8402 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
8403 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
8404 "Cannot handle loops with uncountable early exits");
8405 if (IsFOR)
8406 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
8407 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
8408 "vector.recur.extract");
8409 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
8410 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
8411 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
8412 ScalarPhiIRI->addOperand(ResumePhiR);
8413 }
8414}
8415
8416/// Handle users in the exit block for first order reductions in the original
8417/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
8418/// users in the original exit block using the VPIRInstruction wrapping to the
8419/// LCSSA phi.
8420static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
8421  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8422 auto *ScalarPHVPBB = Plan.getScalarPreheader();
8423 auto *MiddleVPBB = Plan.getMiddleBlock();
8424 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8425 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8426
8427 auto IsScalableOne = [](ElementCount VF) -> bool {
8428 return VF == ElementCount::getScalable(1);
8429 };
8430
8431 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8432 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8433 if (!FOR)
8434 continue;
8435
8436 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
8437 "Cannot handle loops with uncountable early exits");
8438
8439 // This is the second phase of vectorizing first-order recurrences, creating
8440    // extracts for users outside the loop. An overview of the transformation is
8441 // described below. Suppose we have the following loop with some use after
8442 // the loop of the last a[i-1],
8443 //
8444 // for (int i = 0; i < n; ++i) {
8445 // t = a[i - 1];
8446 // b[i] = a[i] - t;
8447 // }
8448 // use t;
8449 //
8450 // There is a first-order recurrence on "a". For this loop, the shorthand
8451 // scalar IR looks like:
8452 //
8453 // scalar.ph:
8454 // s.init = a[-1]
8455 // br scalar.body
8456 //
8457 // scalar.body:
8458 // i = phi [0, scalar.ph], [i+1, scalar.body]
8459 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
8460 // s2 = a[i]
8461 // b[i] = s2 - s1
8462 // br cond, scalar.body, exit.block
8463 //
8464 // exit.block:
8465 // use = lcssa.phi [s1, scalar.body]
8466 //
8467    // In this example, s1 is a recurrence because its value depends on the
8468 // previous iteration. In the first phase of vectorization, we created a
8469 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
8470 // for users in the scalar preheader and exit block.
8471 //
8472 // vector.ph:
8473 // v_init = vector(..., ..., ..., a[-1])
8474 // br vector.body
8475 //
8476 // vector.body
8477 // i = phi [0, vector.ph], [i+4, vector.body]
8478 // v1 = phi [v_init, vector.ph], [v2, vector.body]
8479 // v2 = a[i, i+1, i+2, i+3]
8480 // b[i] = v2 - v1
8481 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
8482 // b[i, i+1, i+2, i+3] = v2 - v1
8483 // br cond, vector.body, middle.block
8484 //
8485 // middle.block:
8486 // vector.recur.extract.for.phi = v2(2)
8487 // vector.recur.extract = v2(3)
8488 // br cond, scalar.ph, exit.block
8489 //
8490 // scalar.ph:
8491 // scalar.recur.init = phi [vector.recur.extract, middle.block],
8492 // [s.init, otherwise]
8493 // br scalar.body
8494 //
8495 // scalar.body:
8496 // i = phi [0, scalar.ph], [i+1, scalar.body]
8497 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
8498 // s2 = a[i]
8499 // b[i] = s2 - s1
8500 // br cond, scalar.body, exit.block
8501 //
8502 // exit.block:
8503 // lo = lcssa.phi [s1, scalar.body],
8504 // [vector.recur.extract.for.phi, middle.block]
8505 //
8506 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
8507 // Extract the penultimate value of the recurrence and use it as operand for
8508 // the VPIRInstruction modeling the phi.
8509 for (VPUser *U : FOR->users()) {
8510 using namespace llvm::VPlanPatternMatch;
8511 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
8512 continue;
8513 // For VF vscale x 1, if vscale = 1, we are unable to extract the
8514 // penultimate value of the recurrence. Instead we rely on the existing
8515 // extract of the last element from the result of
8516 // VPInstruction::FirstOrderRecurrenceSplice.
8517 // TODO: Consider vscale_range info and UF.
8518      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
8519                                                             Range))
8520 return;
8521 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
8522 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
8523 {}, "vector.recur.extract.for.phi");
8524 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
8525 }
8526 }
8527}
8528
8529VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8530 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8531
8532 using namespace llvm::VPlanPatternMatch;
8534
8535 // ---------------------------------------------------------------------------
8536 // Build initial VPlan: Scan the body of the loop in a topological order to
8537 // visit each basic block after having visited its predecessor basic blocks.
8538 // ---------------------------------------------------------------------------
8539
8540 bool RequiresScalarEpilogueCheck =
8541      LoopVectorizationPlanner::getDecisionAndClampRange(
8542          [this](ElementCount VF) {
8543 return !CM.requiresScalarEpilogue(VF.isVector());
8544 },
8545 Range);
8547 VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
8548 CM.foldTailByMasking());
8549
8551
8552 // Don't use getDecisionAndClampRange here, because we don't know the UF
8553  // so it is better for this function to be conservative rather than to split
8554 // it up into different VPlans.
8555 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8556 bool IVUpdateMayOverflow = false;
8557 for (ElementCount VF : Range)
8558 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8559
8560 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8561 // Use NUW for the induction increment if we proved that it won't overflow in
8562  // the vector loop or when not folding the tail. In the latter case, we know
8563 // that the canonical induction increment will not overflow as the vector trip
8564 // count is >= increment and a multiple of the increment.
8565 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8566 if (!HasNUW) {
8567 auto *IVInc = Plan->getVectorLoopRegion()
8568 ->getExitingBasicBlock()
8569 ->getTerminator()
8570 ->getOperand(0);
8571 assert(match(IVInc, m_VPInstruction<Instruction::Add>(
8572 m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
8573 "Did not find the canonical IV increment");
8574 cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
8575 }
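  // For illustration (an assumed scenario, not derived from this code): with an
  // i8 canonical IV, VF=4 and a trip count of 254 under tail folding, the
  // rounded-up vector trip count is 256, so the final IV update 252 -> 256
  // wraps to 0; an NUW flag on that add would turn it into poison, which is
  // why the flag is dropped whenever overflow cannot be ruled out.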
8576
8577 // ---------------------------------------------------------------------------
8578 // Pre-construction: record ingredients whose recipes we'll need to further
8579 // process after constructing the initial VPlan.
8580 // ---------------------------------------------------------------------------
8581
8582 // For each interleave group which is relevant for this (possibly trimmed)
8583 // Range, add it to the set of groups to be later applied to the VPlan and add
8584 // placeholders for its members' Recipes which we'll be replacing with a
8585 // single VPInterleaveRecipe.
8587 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8588 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8589 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8590                       LoopVectorizationCostModel::CM_Interleave);
8591    // For scalable vectors, the interleave factors must be <= 8 since we
8592 // require the (de)interleaveN intrinsics instead of shufflevectors.
8593 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8594 "Unsupported interleave factor for scalable vectors");
8595 return Result;
8596 };
8597 if (!getDecisionAndClampRange(ApplyIG, Range))
8598 continue;
8599 InterleaveGroups.insert(IG);
8600 }
8601
8602 // ---------------------------------------------------------------------------
8603 // Predicate and linearize the top-level loop region.
8604 // ---------------------------------------------------------------------------
8605 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8606 *Plan, CM.foldTailByMasking());
8607
8608 // ---------------------------------------------------------------------------
8609 // Construct wide recipes and apply predication for original scalar
8610 // VPInstructions in the loop.
8611 // ---------------------------------------------------------------------------
8612 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8613 Builder, BlockMaskCache, LVer);
8614 RecipeBuilder.collectScaledReductions(Range);
8615
8616 // Scan the body of the loop in a topological order to visit each basic block
8617 // after having visited its predecessor basic blocks.
8618 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8619 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8620  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8621      HeaderVPBB);
8622
8623 auto *MiddleVPBB = Plan->getMiddleBlock();
8624 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8625 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8626 // temporarily to update created block masks.
8627  DenseMap<VPValue *, VPValue *> Old2New;
8628  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
8629 // Convert input VPInstructions to widened recipes.
8630 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
8631 auto *SingleDef = cast<VPSingleDefRecipe>(&R);
8632 auto *UnderlyingValue = SingleDef->getUnderlyingValue();
8633 // Skip recipes that do not need transforming, including canonical IV,
8634 // wide canonical IV and VPInstructions without underlying values. The
8635 // latter are added above for masking.
8636 // FIXME: Migrate code relying on the underlying instruction from VPlan0
8637 // to construct recipes below to not use the underlying instruction.
8638 if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
8639 &R) ||
8640 (isa<VPInstruction>(&R) && !UnderlyingValue))
8641 continue;
8642
8643 // FIXME: VPlan0, which models a copy of the original scalar loop, should
8644 // not use VPWidenPHIRecipe to model the phis.
8645 assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
8646 UnderlyingValue && "unsupported recipe");
8647
8648 // TODO: Gradually replace uses of underlying instruction by analyses on
8649 // VPlan.
8650 Instruction *Instr = cast<Instruction>(UnderlyingValue);
8651 Builder.setInsertPoint(SingleDef);
8652
8653 // The stores with invariant address inside the loop will be deleted, and
8654 // in the exit block, a uniform store recipe will be created for the final
8655 // invariant store of the reduction.
8656 StoreInst *SI;
8657 if ((SI = dyn_cast<StoreInst>(Instr)) &&
8658 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8659 // Only create recipe for the final invariant store of the reduction.
8660 if (Legal->isInvariantStoreOfReduction(SI)) {
8661 auto *Recipe =
8662 new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
8663 nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
8664 Recipe->insertBefore(*MiddleVPBB, MBIP);
8665 }
8666 R.eraseFromParent();
8667 continue;
8668 }
8669
8670 VPRecipeBase *Recipe =
8671 RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
8672 if (!Recipe)
8673 Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range);
8674
8675 RecipeBuilder.setRecipe(Instr, Recipe);
8676 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
8677 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8678 // moved to the phi section in the header.
8679 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8680 } else {
8681 Builder.insert(Recipe);
8682 }
8683 if (Recipe->getNumDefinedValues() == 1) {
8684 SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
8685 Old2New[SingleDef] = Recipe->getVPSingleValue();
8686 } else {
8687 assert(Recipe->getNumDefinedValues() == 0 &&
8688 "Unexpected multidef recipe");
8689 R.eraseFromParent();
8690 }
8691 }
8692 }
8693
8694 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8695 // TODO: Include the masks as operands in the predicated VPlan directly
8696 // to remove the need to keep a map of masks beyond the predication
8697 // transform.
8698 RecipeBuilder.updateBlockMaskCache(Old2New);
8699 for (VPValue *Old : Old2New.keys())
8700 Old->getDefiningRecipe()->eraseFromParent();
8701
8702 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8703 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8704 "entry block must be set to a VPRegionBlock having a non-empty entry "
8705 "VPBasicBlock");
8706
8707 // Update wide induction increments to use the same step as the corresponding
8708 // wide induction. This enables detecting induction increments directly in
8709 // VPlan and removes redundant splats.
8710 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
8711 auto *IVInc = cast<Instruction>(
8712 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8713 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
8714 continue;
8715 VPWidenInductionRecipe *WideIV =
8716 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
8717 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
8718 R->setOperand(1, WideIV->getStepValue());
8719 }
8720
8722  DenseMap<VPValue *, VPValue *> IVEndValues;
8723  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
8724
8725 // ---------------------------------------------------------------------------
8726 // Transform initial VPlan: Apply previously taken decisions, in order, to
8727 // bring the VPlan to its final state.
8728 // ---------------------------------------------------------------------------
8729
8730 // Adjust the recipes for any inloop reductions.
8731 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8732
8733 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8734 // NaNs if possible, bail out otherwise.
8736 *Plan))
8737 return nullptr;
8738
8739 // Transform recipes to abstract recipes if it is legal and beneficial and
8740 // clamp the range for better cost estimation.
8741 // TODO: Enable following transform when the EVL-version of extended-reduction
8742 // and mulacc-reduction are implemented.
8743 if (!CM.foldTailWithEVL()) {
8744 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
8746 CostCtx, Range);
8747 }
8748
8749 for (ElementCount VF : Range)
8750 Plan->addVF(VF);
8751 Plan->setName("Initial VPlan");
8752
8753 // Interleave memory: for each Interleave Group we marked earlier as relevant
8754 // for this VPlan, replace the Recipes widening its memory instructions with a
8755 // single VPInterleaveRecipe at its insertion point.
8757 InterleaveGroups, RecipeBuilder,
8759
8760 // Replace VPValues for known constant strides.
8762 Legal->getLAI()->getSymbolicStrides());
8763
8764 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8765 return Legal->blockNeedsPredication(BB);
8766 };
8767  VPlanTransforms::dropPoisonGeneratingRecipes(*Plan,
8768                                               BlockNeedsPredication);
8769
8770 // Sink users of fixed-order recurrence past the recipe defining the previous
8771 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8773 *Plan, Builder))
8774 return nullptr;
8775
8776 if (useActiveLaneMask(Style)) {
8777 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8778 // TailFoldingStyle is visible there.
8779 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8780 bool WithoutRuntimeCheck =
8781        Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8782    VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8783 WithoutRuntimeCheck);
8784 }
8785 VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues, *PSE.getSE());
8786
8787 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8788 return Plan;
8789}
8790
8791VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
8792  // Outer loop handling: outer loops may require CFG and instruction level
8793 // transformations before even evaluating whether vectorization is profitable.
8794 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8795 // the vectorization pipeline.
8796 assert(!OrigLoop->isInnermost());
8797 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8798
8799 auto Plan = VPlanTransforms::buildVPlan0(
8800 OrigLoop, *LI, Legal->getWidestInductionType(),
8803 /*HasUncountableExit*/ false);
8804 VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
8805 /*TailFolded*/ false);
8806
8808
8809 for (ElementCount VF : Range)
8810 Plan->addVF(VF);
8811
8813 Plan,
8814 [this](PHINode *P) {
8815 return Legal->getIntOrFpInductionDescriptor(P);
8816 },
8817 *TLI))
8818 return nullptr;
8819
8820 // Collect mapping of IR header phis to header phi recipes, to be used in
8821 // addScalarResumePhis.
8823 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8824 Builder, BlockMaskCache, nullptr /*LVer*/);
8825 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8826 if (isa<VPCanonicalIVPHIRecipe>(&R))
8827 continue;
8828 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
8829 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
8830 }
8831  DenseMap<VPValue *, VPValue *> IVEndValues;
8832  // TODO: IVEndValues are not used yet in the native path, to optimize exit
8833 // values.
8834 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
8835
8836 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8837 return Plan;
8838}
8839
8840// Adjust the recipes for reductions. For in-loop reductions the chain of
8841// instructions leading from the loop exit instr to the phi needs to be converted
8842// to reductions, with one operand being vector and the other being the scalar
8843// reduction chain. For other reductions, a select is introduced between the phi
8844// and users outside the vector region when folding the tail.
8845//
8846// A ComputeReductionResult recipe is added to the middle block, also for
8847// in-loop reductions which compute their result in-loop, because generating
8848// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8849//
8850// Adjust AnyOf reductions; replace the reduction phi for the selected value
8851// with a boolean reduction phi node to check if the condition is true in any
8852// iteration. The final value is selected by the final ComputeReductionResult.
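// For illustration of the AnyOf adjustment (hypothetical source, shorthand):
// for
//
//   for (i) if (a[i] > 3) r = 42;
//
// the vector-loop select  %r.next = select (a[i] > 3), 42, %r.phi  is replaced
// below by a boolean reduction  %any.next = or %any.phi, (a[i] > 3)  with a
// false start value, and the final ComputeReductionResult in the middle block
// then selects between 42 and the original start value of r.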
8853void LoopVectorizationPlanner::adjustRecipesForReductions(
8854 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8855 using namespace VPlanPatternMatch;
8856 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8857 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8858 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8859  SmallVector<VPRecipeBase *> ToDelete;
8860
8861 for (VPRecipeBase &R : Header->phis()) {
8862 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8863 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8864 continue;
8865
8866 RecurKind Kind = PhiR->getRecurrenceKind();
8867 assert(
8870 "AnyOf and FindIV reductions are not allowed for in-loop reductions");
8871
8872 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8873    SetVector<VPSingleDefRecipe *> Worklist;
8874    Worklist.insert(PhiR);
8875 for (unsigned I = 0; I != Worklist.size(); ++I) {
8876 VPSingleDefRecipe *Cur = Worklist[I];
8877 for (VPUser *U : Cur->users()) {
8878 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
8879 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
8880 assert((UserRecipe->getParent() == MiddleVPBB ||
8881 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
8882 "U must be either in the loop region, the middle block or the "
8883 "scalar preheader.");
8884 continue;
8885 }
8886 Worklist.insert(UserRecipe);
8887 }
8888 }
8889
8890 // Visit operation "Links" along the reduction chain top-down starting from
8891 // the phi until LoopExitValue. We keep track of the previous item
8892 // (PreviousLink) to tell which of the two operands of a Link will remain
8893 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8894 // the select instructions. Blend recipes of in-loop reduction phi's will
8895 // get folded to their non-phi operand, as the reduction recipe handles the
8896 // condition directly.
8897 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8898 for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) {
8899 if (auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink)) {
8900 assert(Blend->getNumIncomingValues() == 2 &&
8901 "Blend must have 2 incoming values");
8902 if (Blend->getIncomingValue(0) == PhiR) {
8903 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8904 } else {
8905 assert(Blend->getIncomingValue(1) == PhiR &&
8906 "PhiR must be an operand of the blend");
8907 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8908 }
8909 continue;
8910 }
8911
8912 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8913
8914 // Index of the first operand which holds a non-mask vector operand.
8915 unsigned IndexOfFirstOperand;
8916 // Recognize a call to the llvm.fmuladd intrinsic.
8917 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8918 VPValue *VecOp;
8919 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8920 if (IsFMulAdd) {
8921 assert(
8923 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8924 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8925 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
8926 CurrentLink->getOperand(2) == PreviousLink &&
8927 "expected a call where the previous link is the added operand");
8928
8929 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8930 // need to create an fmul recipe (multiplying the first two operands of
8931 // the fmuladd together) to use as the vector operand for the fadd
8932 // reduction.
8933 VPInstruction *FMulRecipe = new VPInstruction(
8934 Instruction::FMul,
8935 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8936 CurrentLinkI->getFastMathFlags());
8937 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8938 VecOp = FMulRecipe;
8939 } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
8940 CurrentLinkI->getOpcode() == Instruction::Sub) {
8941 Type *PhiTy = PhiR->getUnderlyingValue()->getType();
8942 auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
8944 Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
8945 VPIRMetadata(), CurrentLinkI->getDebugLoc());
8946 Sub->setUnderlyingValue(CurrentLinkI);
8947 LinkVPBB->insert(Sub, CurrentLink->getIterator());
8948 VecOp = Sub;
8949 } else {
8950        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8951          if (isa<VPWidenRecipe>(CurrentLink)) {
8952 assert(isa<CmpInst>(CurrentLinkI) &&
8953 "need to have the compare of the select");
8954 continue;
8955 }
8956 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8957 "must be a select recipe");
8958 IndexOfFirstOperand = 1;
8959 } else {
8960 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8961 "Expected to replace a VPWidenSC");
8962 IndexOfFirstOperand = 0;
8963 }
8964 // Note that for non-commutable operands (cmp-selects), the semantics of
8965 // the cmp-select are captured in the recurrence kind.
8966 unsigned VecOpId =
8967 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8968 ? IndexOfFirstOperand + 1
8969 : IndexOfFirstOperand;
8970 VecOp = CurrentLink->getOperand(VecOpId);
8971 assert(VecOp != PreviousLink &&
8972 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8973 (VecOpId - IndexOfFirstOperand)) ==
8974 PreviousLink &&
8975 "PreviousLink must be the operand other than VecOp");
8976 }
8977
8978 VPValue *CondOp = nullptr;
8979 if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent()))
8980 CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());
8981
8982 // TODO: Retrieve FMFs from recipes directly.
8983      const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
8984          cast<PHINode>(PhiR->getUnderlyingInstr()));
8985 // Non-FP RdxDescs will have all fast math flags set, so clear them.
8986 FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
8987 ? RdxDesc.getFastMathFlags()
8988 : FastMathFlags();
8989 auto *RedRecipe = new VPReductionRecipe(
8990 Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
8991 PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
8992 // Append the recipe to the end of the VPBasicBlock because we need to
8993      // ensure that it comes after all of its inputs, including CondOp.
8994 // Delete CurrentLink as it will be invalid if its operand is replaced
8995 // with a reduction defined at the bottom of the block in the next link.
8996 if (LinkVPBB->getNumSuccessors() == 0)
8997 RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end())));
8998 else
8999 LinkVPBB->appendRecipe(RedRecipe);
9000
9001 CurrentLink->replaceAllUsesWith(RedRecipe);
9002 ToDelete.push_back(CurrentLink);
9003 PreviousLink = RedRecipe;
9004 }
9005 }
9006 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9007 Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
9008 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9009 for (VPRecipeBase &R :
9010 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9011 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9012 if (!PhiR)
9013 continue;
9014
9015 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
9016 cast<PHINode>(PhiR->getUnderlyingInstr()));
9017 Type *PhiTy = PhiR->getUnderlyingValue()->getType();
9018 // If tail is folded by masking, introduce selects between the phi
9019 // and the users outside the vector region of each reduction, at the
9020 // beginning of the dedicated latch block.
9021 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9022 auto *NewExitingVPV = PhiR->getBackedgeValue();
9023 // Don't output selects for partial reductions because they have an output
9024 // with fewer lanes than the VF. So the operands of the select would have
9025 // different numbers of lanes. Partial reductions mask the input instead.
9026 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
9027 !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
9028 VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
9029 std::optional<FastMathFlags> FMFs =
9030 PhiTy->isFloatingPointTy()
9031 ? std::make_optional(RdxDesc.getFastMathFlags())
9032 : std::nullopt;
9033 NewExitingVPV =
9034 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9035 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9036 return isa<VPInstruction>(&U) &&
9037 (cast<VPInstruction>(&U)->getOpcode() ==
9039 cast<VPInstruction>(&U)->getOpcode() ==
9041 cast<VPInstruction>(&U)->getOpcode() ==
9043 });
9045 PhiR->setOperand(1, NewExitingVPV);
9046 }
9047
9048 // We want code in the middle block to appear to execute on the location of
9049 // the scalar loop's latch terminator because: (a) it is all compiler
9050 // generated, (b) these instructions are always executed after evaluating
9051 // the latch conditional branch, and (c) other passes may add new
9052 // predecessors which terminate on this line. This is the easiest way to
9053 // ensure we don't accidentally cause an extra step back into the loop while
9054 // debugging.
9055 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9056
9057 // TODO: At the moment ComputeReductionResult also drives creation of the
9058 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9059 // even for in-loop reductions, until the reduction resume value handling is
9060 // also modeled in VPlan.
9061 VPInstruction *FinalReductionResult;
9062 VPBuilder::InsertPointGuard Guard(Builder);
9063 Builder.setInsertPoint(MiddleVPBB, IP);
9064 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
9066 VPValue *Start = PhiR->getStartValue();
9067 VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
9068 FinalReductionResult =
9070 {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
9071 } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
9072 VPValue *Start = PhiR->getStartValue();
9073 FinalReductionResult =
9075 {PhiR, Start, NewExitingVPV}, ExitDL);
9076 } else {
9079 ? VPIRFlags(RdxDesc.getFastMathFlags())
9080 : VPIRFlags();
9081 FinalReductionResult =
9083 {PhiR, NewExitingVPV}, Flags, ExitDL);
9084 }
9085 // If the vector reduction can be performed in a smaller type, we truncate
9086 // then extend the loop exit value to enable InstCombine to evaluate the
9087 // entire expression in the smaller type.
9088 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9090 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9092 "Unexpected truncated min-max recurrence!");
9093 Type *RdxTy = RdxDesc.getRecurrenceType();
9094 auto *Trunc =
9095 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9096 Instruction::CastOps ExtendOpc =
9097 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
9098 auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
9099 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9100 Extnd->insertAfter(Trunc);
9101 if (PhiR->getOperand(1) == NewExitingVPV)
9102 PhiR->setOperand(1, Extnd->getVPSingleValue());
9103
9104 // Update ComputeReductionResult with the truncated exiting value and
9105 // extend its result.
9106 FinalReductionResult->setOperand(1, Trunc);
9107 FinalReductionResult =
9108 Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
9109 }
9110
9111 // Update all users outside the vector region. Also replace redundant
9112 // ExtractLastElement.
9113 for (auto *U : to_vector(OrigExitingVPV->users())) {
9114 auto *Parent = cast<VPRecipeBase>(U)->getParent();
9115 if (FinalReductionResult == U || Parent->getParent())
9116 continue;
9117 U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
9119 cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
9120 }
9121
9122 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9123 // with a boolean reduction phi node to check if the condition is true in
9124 // any iteration. The final value is selected by the final
9125 // ComputeReductionResult.
9126 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
9127 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9128 return isa<VPWidenSelectRecipe>(U) ||
9129 (isa<VPReplicateRecipe>(U) &&
9130 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9131 Instruction::Select);
9132 }));
9133 VPValue *Cmp = Select->getOperand(0);
9134 // If the compare is checking the reduction PHI node, adjust it to check
9135 // the start value.
9136 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
9137 CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
9138 Builder.setInsertPoint(Select);
9139
9140 // If the true value of the select is the reduction phi, the new value is
9141 // selected if the negated condition is true in any iteration.
9142 if (Select->getOperand(1) == PhiR)
9143 Cmp = Builder.createNot(Cmp);
9144 VPValue *Or = Builder.createOr(PhiR, Cmp);
9145 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9146 // Delete Select now that it has invalid types.
9147 ToDelete.push_back(Select);
9148
9149 // Convert the reduction phi to operate on bools.
9150 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9151 OrigLoop->getHeader()->getContext())));
9152 continue;
9153 }
9154
9156 RdxDesc.getRecurrenceKind())) {
9157 // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
9158 // the sentinel value after generating the ResumePhi recipe, which uses
9159 // the original start value.
9160 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9161 }
9162 RecurKind RK = RdxDesc.getRecurrenceKind();
9166 VPBuilder PHBuilder(Plan->getVectorPreheader());
9167 VPValue *Iden = Plan->getOrAddLiveIn(
9168 getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
9169 // If the PHI is used by a partial reduction, set the scale factor.
9170 unsigned ScaleFactor =
9171 RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
9172 .value_or(1);
9173 Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
9174 auto *ScaleFactorVPV =
9175 Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
9176 VPValue *StartV = PHBuilder.createNaryOp(
9178 {PhiR->getStartValue(), Iden, ScaleFactorVPV},
9179 PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
9180 : FastMathFlags());
9181 PhiR->setOperand(0, StartV);
9182 }
9183 }
9184 for (VPRecipeBase *R : ToDelete)
9185 R->eraseFromParent();
9186
9188}
9189
9190void LoopVectorizationPlanner::attachRuntimeChecks(
9191 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9192 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
9193 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
9194 assert((!CM.OptForSize ||
9196 "Cannot SCEV check stride or overflow when optimizing for size");
9197 VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
9198 HasBranchWeights);
9199 }
9200 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
9201 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
9202 // VPlan-native path does not do any analysis for runtime checks
9203 // currently.
9204 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
9205 "Runtime checks are not supported for outer loops yet");
9206
9207 if (CM.OptForSize) {
9208 assert(
9210 "Cannot emit memory checks when optimizing for size, unless forced "
9211 "to vectorize.");
9212 ORE->emit([&]() {
9213 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
9214 OrigLoop->getStartLoc(),
9215 OrigLoop->getHeader())
9216 << "Code-size may be reduced by not forcing "
9217 "vectorization, or by source-code modifications "
9218 "eliminating the need for runtime checks "
9219 "(e.g., adding 'restrict').";
9220 });
9221 }
9222 VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
9223 HasBranchWeights);
9224 }
9225}
9226
9228 VPlan &Plan, ElementCount VF, unsigned UF,
9229 ElementCount MinProfitableTripCount) const {
9230 // vscale is not necessarily a power-of-2, which means we cannot guarantee
9231 // an overflow to zero when updating induction variables and so an
9232 // additional overflow check is required before entering the vector loop.
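  // For example (illustrative): with a scalable VF and vscale = 3, the step
  // VF * UF is not a power of 2, so an induction update near the maximum of
  // the IV type may wrap to a non-zero value rather than exactly to zero; the
  // check computed below guards against entering the vector loop in that case.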
9233 bool IsIndvarOverflowCheckNeededForVF =
9235 !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
9236 CM.getTailFoldingStyle() !=
 9238   const uint32_t *BranchWeights =
9241 : nullptr;
9243 Plan, VF, UF, MinProfitableTripCount,
 9245       IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeights,
9247 *PSE.getSE());
9248}
9249
9251 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9252
9253 // Fast-math-flags propagate from the original induction instruction.
9255 if (FPBinOp)
9256 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9257
9258 Value *Step = State.get(getStepValue(), VPLane(0));
9259 Value *Index = State.get(getOperand(1), VPLane(0));
9260 Value *DerivedIV = emitTransformedIndex(
9261 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9262 cast_if_present<BinaryOperator>(FPBinOp));
9263 DerivedIV->setName(Name);
9264 State.set(this, DerivedIV, VPLane(0));
9265}
9266
9267// Determine how to lower the scalar epilogue, which depends on 1) optimising
9268// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9269// predication, and 4) a TTI hook that analyses whether the loop is suitable
9270// for predication.
9275 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9276 // don't look at hints or options, and don't request a scalar epilogue.
9277 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9278 // LoopAccessInfo (due to code dependency and not being able to reliably get
9279 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9280 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9281 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9282 // back to the old way and vectorize with versioning when forced. See D81345.)
9283 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9287
9288 // 2) If set, obey the directives
9289 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9297 };
9298 }
9299
9300 // 3) If set, obey the hints
9301 switch (Hints.getPredicate()) {
9306 };
9307
9308 // 4) if the TTI hook indicates this is profitable, request predication.
9309 TailFoldingInfo TFI(TLI, &LVL, IAI);
9312
9314}
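// A rough sketch of the cascade implemented above (simplified; the actual
// predicate and enum names are elided):
//   optimizing for size                          -> no scalar epilogue
//   -prefer-predicate-over-epilogue option given -> obey the option
//   loop hint requests predication               -> obey the hint
//   TTI prefers tail-folding for this loop       -> predicate, no epilogue
//   otherwise                                    -> scalar epilogue allowed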
9315
9316// Process the loop in the VPlan-native vectorization path. This path builds
 9317 // VPlan upfront in the vectorization pipeline, which allows applying
9318// VPlan-to-VPlan transformations from the very beginning without modifying the
9319// input LLVM IR.
9326 LoopVectorizationRequirements &Requirements) {
9327
9328 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9329 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9330 return false;
9331 }
9332 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9333 Function *F = L->getHeader()->getParent();
9334 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9335
9337 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9338
9339 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9340 &Hints, IAI, PSI, BFI);
9341 // Use the planner for outer loop vectorization.
9342 // TODO: CM is not used at this point inside the planner. Turn CM into an
9343 // optional argument if we don't need it in the future.
9344 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9345 ORE);
9346
9347 // Get user vectorization factor.
9348 ElementCount UserVF = Hints.getWidth();
9349
9351
9352 // Plan how to best vectorize, return the best VF and its cost.
9353 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9354
9355 // If we are stress testing VPlan builds, do not attempt to generate vector
9356 // code. Masked vector code generation support will follow soon.
9357 // Also, do not attempt to vectorize if no vector code will be produced.
9359 return false;
9360
9361 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9362
9363 {
9364 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9365 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
9366 BFI, PSI, Checks, BestPlan);
9367 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9368 << L->getHeader()->getParent()->getName() << "\"\n");
9369 LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
9371
9372 LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
9373 }
9374
9375 reportVectorization(ORE, L, VF, 1);
9376
9377 // Mark the loop as already vectorized to avoid vectorizing again.
9378 Hints.setAlreadyVectorized();
9379 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9380 return true;
9381}
9382
9383// Emit a remark if there are stores to floats that required a floating point
 9384 // extension. If the vectorized loop was generated with mixed floating point
 9385 // precision, there will be a performance penalty from the conversion overhead
 9386 // and the change in the vector width.
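// A typical source pattern that triggers the remark (illustrative):
//   a[i] = b[i] + 1.0;   // a, b are float; 1.0 is a double literal
// The implicit fpext/fptrunc around the add force double-width vector
// operations, halving the effective vector width.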
9389 for (BasicBlock *BB : L->getBlocks()) {
9390 for (Instruction &Inst : *BB) {
9391 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9392 if (S->getValueOperand()->getType()->isFloatTy())
9393 Worklist.push_back(S);
9394 }
9395 }
9396 }
9397
 9398   // Traverse the floating point stores upwards, searching for floating point
9399 // conversions.
9402 while (!Worklist.empty()) {
9403 auto *I = Worklist.pop_back_val();
9404 if (!L->contains(I))
9405 continue;
9406 if (!Visited.insert(I).second)
9407 continue;
9408
9409 // Emit a remark if the floating point store required a floating
9410 // point conversion.
9411 // TODO: More work could be done to identify the root cause such as a
9412 // constant or a function return type and point the user to it.
9413 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9414 ORE->emit([&]() {
9415 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9416 I->getDebugLoc(), L->getHeader())
9417 << "floating point conversion changes vector width. "
9418 << "Mixed floating point precision requires an up/down "
9419 << "cast that will negatively impact performance.";
9420 });
9421
9422 for (Use &Op : I->operands())
9423 if (auto *OpI = dyn_cast<Instruction>(Op))
9424 Worklist.push_back(OpI);
9425 }
9426}
9427
9428/// For loops with uncountable early exits, find the cost of doing work when
9429/// exiting the loop early, such as calculating the final exit values of
9430/// variables used outside the loop.
9431/// TODO: This is currently overly pessimistic because the loop may not take
9432/// the early exit, but better to keep this conservative for now. In future,
9433/// it might be possible to relax this by using branch probabilities.
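/// E.g. (an illustrative sketch):
///   for (i = 0; i < n; i++)
///     if (a[i] == x)
///       break;
///   use(i);
/// Computing the exit value of 'i' for the early exit happens in the
/// vector.early.exit block, and that extra work is included in the cost.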
9435 VPlan &Plan, ElementCount VF) {
9437 for (auto *ExitVPBB : Plan.getExitBlocks()) {
9438 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9439 // If the predecessor is not the middle.block, then it must be the
9440 // vector.early.exit block, which may contain work to calculate the exit
9441 // values of variables used outside the loop.
9442 if (PredVPBB != Plan.getMiddleBlock()) {
9443 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9444 << PredVPBB->getName() << ":\n");
9445 Cost += PredVPBB->cost(VF, CostCtx);
9446 }
9447 }
9448 }
9449 return Cost;
9450}
9451
9452/// This function determines whether or not it's still profitable to vectorize
9453/// the loop given the extra work we have to do outside of the loop:
9454/// 1. Perform the runtime checks before entering the loop to ensure it's safe
9455/// to vectorize.
9456/// 2. In the case of loops with uncountable early exits, we may have to do
9457/// extra work when exiting the loop early, such as calculating the final
9458/// exit values of variables used outside the loop.
9459static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9460 VectorizationFactor &VF, Loop *L,
9462 VPCostContext &CostCtx, VPlan &Plan,
9464 std::optional<unsigned> VScale) {
9465 InstructionCost TotalCost = Checks.getCost();
9466 if (!TotalCost.isValid())
9467 return false;
9468
9469 // Add on the cost of any work required in the vector early exit block, if
9470 // one exists.
9471 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
9472
 9473   // When interleaving only, the scalar and vector costs will be equal, which
 9474   // in turn would lead to a divide by 0. Fall back to a hard threshold.
9475 if (VF.Width.isScalar()) {
9476 // TODO: Should we rename VectorizeMemoryCheckThreshold?
9477 if (TotalCost > VectorizeMemoryCheckThreshold) {
9478 LLVM_DEBUG(
9479 dbgs()
9480 << "LV: Interleaving only is not profitable due to runtime checks\n");
9481 return false;
9482 }
9483 return true;
9484 }
9485
9486 // The scalar cost should only be 0 when vectorizing with a user specified
9487 // VF/IC. In those cases, runtime checks should always be generated.
9488 uint64_t ScalarC = VF.ScalarCost.getValue();
9489 if (ScalarC == 0)
9490 return true;
9491
9492 // First, compute the minimum iteration count required so that the vector
9493 // loop outperforms the scalar loop.
9494 // The total cost of the scalar loop is
9495 // ScalarC * TC
9496 // where
9497 // * TC is the actual trip count of the loop.
9498 // * ScalarC is the cost of a single scalar iteration.
9499 //
9500 // The total cost of the vector loop is
9501 // RtC + VecC * (TC / VF) + EpiC
9502 // where
9503 // * RtC is the cost of the generated runtime checks plus the cost of
9504 // performing any additional work in the vector.early.exit block for loops
9505 // with uncountable early exits.
9506 // * VecC is the cost of a single vector iteration.
9507 // * TC is the actual trip count of the loop
9508 // * VF is the vectorization factor
 9509   //  * EpiC is the cost of the generated epilogue, including the cost
9510 // of the remaining scalar operations.
9511 //
9512 // Vectorization is profitable once the total vector cost is less than the
9513 // total scalar cost:
9514 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9515 //
9516 // Now we can compute the minimum required trip count TC as
9517 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9518 //
9519 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9520 // the computations are performed on doubles, not integers and the result
9521 // is rounded up, hence we get an upper estimate of the TC.
9522 unsigned IntVF = estimateElementCount(VF.Width, VScale);
9523 uint64_t RtC = TotalCost.getValue();
9524 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9525 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9526
9527 // Second, compute a minimum iteration count so that the cost of the
9528 // runtime checks is only a fraction of the total scalar loop cost. This
9529 // adds a loop-dependent bound on the overhead incurred if the runtime
9530 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9531 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9532 // cost, compute
9533 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9534 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
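  // Worked example with illustrative numbers (not from any particular
  // target): ScalarC = 4, VecC = 10, IntVF = 8 and RtC = 24 give
  //   MinTC1 = ceil(24 * 8 / (4 * 8 - 10)) = ceil(192 / 22) = 9
  //   MinTC2 = ceil(24 * 10 / 4)           = 60
  // so the runtime-check bound dominates, and with a scalar epilogue allowed
  // the result is rounded up to the next multiple of VF below.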
9535
9536 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9537 // epilogue is allowed, choose the next closest multiple of VF. This should
9538 // partly compensate for ignoring the epilogue cost.
9539 uint64_t MinTC = std::max(MinTC1, MinTC2);
9540 if (SEL == CM_ScalarEpilogueAllowed)
9541 MinTC = alignTo(MinTC, IntVF);
9543
9544 LLVM_DEBUG(
9545 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9546 << VF.MinProfitableTripCount << "\n");
9547
9548 // Skip vectorization if the expected trip count is less than the minimum
9549 // required trip count.
9550 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9551 if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
9552 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9553 "trip count < minimum profitable VF ("
9554 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9555 << ")\n");
9556
9557 return false;
9558 }
9559 }
9560 return true;
9561}
9562
9564 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9566 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9568
9569/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9570/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9571/// don't have a corresponding wide induction in \p EpiPlan.
9572static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9573 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9574 // will need their resume-values computed in the main vector loop. Others
9575 // can be removed from the main VPlan.
9576 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9577 for (VPRecipeBase &R :
9579 if (isa<VPCanonicalIVPHIRecipe>(&R))
9580 continue;
9581 EpiWidenedPhis.insert(
9582 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
9583 }
9584 for (VPRecipeBase &R :
9585 make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
9586 auto *VPIRInst = cast<VPIRPhi>(&R);
9587 if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
9588 continue;
9589 // There is no corresponding wide induction in the epilogue plan that would
9590 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9591 // together with the corresponding ResumePhi. The resume values for the
9592 // scalar loop will be created during execution of EpiPlan.
9593 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
9594 VPIRInst->eraseFromParent();
9595 ResumePhi->eraseFromParent();
9596 }
9598
9599 using namespace VPlanPatternMatch;
9600 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9601 // introduce multiple uses of undef/poison. If the reduction start value may
9602 // be undef or poison it needs to be frozen and the frozen start has to be
9603 // used when computing the reduction result. We also need to use the frozen
9604 // value in the resume phi generated by the main vector loop, as this is also
9605 // used to compute the reduction result after the epilogue vector loop.
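  // E.g. (sketch, made-up names): if the start value %start may be poison,
  //   ComputeFindIVResult(..., %start, ...)
  // in the middle block becomes
  //   %fr = freeze %start
  //   ComputeFindIVResult(..., %fr, ...)
  // and, for the main plan only, resume phis that used %start are rewired to
  // %fr so the epilogue loop sees the same frozen value.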
9606 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9607 bool UpdateResumePhis) {
9608 VPBuilder Builder(Plan.getEntry());
9609 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9610 auto *VPI = dyn_cast<VPInstruction>(&R);
9611 if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9612 continue;
9613 VPValue *OrigStart = VPI->getOperand(1);
9615 continue;
9616 VPInstruction *Freeze =
9617 Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
9618 VPI->setOperand(1, Freeze);
9619 if (UpdateResumePhis)
9620 OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
9621 return Freeze != &U && isa<VPPhi>(&U);
9622 });
9623 }
9624 };
9625 AddFreezeForFindLastIVReductions(MainPlan, true);
9626 AddFreezeForFindLastIVReductions(EpiPlan, false);
9627
9628 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9629 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9630 // If there is a suitable resume value for the canonical induction in the
9631 // scalar (which will become vector) epilogue loop, use it and move it to the
9632 // beginning of the scalar preheader. Otherwise create it below.
9633 auto ResumePhiIter =
9634 find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
9635 return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
9636 m_SpecificInt(0)));
9637 });
9638 VPPhi *ResumePhi = nullptr;
9639 if (ResumePhiIter == MainScalarPH->phis().end()) {
9640 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9641 ResumePhi = ScalarPHBuilder.createScalarPhi(
9642 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
9643 "vec.epilog.resume.val");
9644 } else {
9645 ResumePhi = cast<VPPhi>(&*ResumePhiIter);
9646 if (MainScalarPH->begin() == MainScalarPH->end())
9647 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->end());
9648 else if (&*MainScalarPH->begin() != ResumePhi)
9649 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
9650 }
 9651   // Add a user to make sure the resume phi won't get removed.
9652 VPBuilder(MainScalarPH)
9654}
9655
9656/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9657/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
9658static void
9660 const SCEV2ValueTy &ExpandedSCEVs,
9662 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9663 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9664 Header->setName("vec.epilog.vector.body");
9665
9667 // Ensure that the start values for all header phi recipes are updated before
9668 // vectorizing the epilogue loop.
9669 for (VPRecipeBase &R : Header->phis()) {
9670 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
9671 // When vectorizing the epilogue loop, the canonical induction start
9672 // value needs to be changed from zero to the value after the main
9673 // vector loop. Find the resume value created during execution of the main
9674 // VPlan. It must be the first phi in the loop preheader.
9675 // FIXME: Improve modeling for canonical IV start values in the epilogue
9676 // loop.
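      // E.g. (illustrative names): if the main vector loop left
      //   %vec.epilog.resume.val = phi [ %n.vec, %middle ], [ 0, %iter.check ]
      // in the preheader, the epilogue's canonical IV starts at that phi
      // rather than at 0, and %n.vec is recorded as the vector trip count.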
9677 using namespace llvm::PatternMatch;
9678 PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
9679 for (Value *Inc : EPResumeVal->incoming_values()) {
9680 if (match(Inc, m_SpecificInt(0)))
9681 continue;
9682 assert(!EPI.VectorTripCount &&
9683 "Must only have a single non-zero incoming value");
9684 EPI.VectorTripCount = Inc;
9685 }
9686 // If we didn't find a non-zero vector trip count, all incoming values
9687 // must be zero, which also means the vector trip count is zero. Pick the
9688 // first zero as vector trip count.
9689 // TODO: We should not choose VF * UF so the main vector loop is known to
9690 // be dead.
9691 if (!EPI.VectorTripCount) {
9692 assert(
9693 EPResumeVal->getNumIncomingValues() > 0 &&
9694 all_of(EPResumeVal->incoming_values(),
9695 [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
9696 "all incoming values must be 0");
9697 EPI.VectorTripCount = EPResumeVal->getOperand(0);
9698 }
9699 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
9700 assert(all_of(IV->users(),
9701 [](const VPUser *U) {
9702 return isa<VPScalarIVStepsRecipe>(U) ||
9703 isa<VPDerivedIVRecipe>(U) ||
9704 cast<VPRecipeBase>(U)->isScalarCast() ||
9705 cast<VPInstruction>(U)->getOpcode() ==
9706 Instruction::Add;
9707 }) &&
9708 "the canonical IV should only be used by its increment or "
9709 "ScalarIVSteps when resetting the start value");
9710 IV->setOperand(0, VPV);
9711 continue;
9712 }
9713
9714 Value *ResumeV = nullptr;
9715 // TODO: Move setting of resume values to prepareToExecute.
9716 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
9717 auto *RdxResult =
9718 cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
9719 auto *VPI = dyn_cast<VPInstruction>(U);
9720 return VPI &&
9721 (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9722 VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9723 VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9724 }));
9725 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
9726 ->getIncomingValueForBlock(L->getLoopPreheader());
9727 RecurKind RK = ReductionPhi->getRecurrenceKind();
9729 Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
9730 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9731 // start value; compare the final value from the main vector loop
9732 // to the start value.
9733 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
9734 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9735 ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
9737 Value *StartV = getStartValueFromReductionResult(RdxResult);
9738 ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
9740
9741 // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9742 // an adjustment to the resume value. The resume value is adjusted to
9743 // the sentinel value when the final value from the main vector loop
9744 // equals the start value. This ensures correctness when the start value
9745 // might not be less than the minimum value of a monotonically
9746 // increasing induction variable.
9747 BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
9748 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9749 Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
9750 Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
9751 ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
9752 } else {
9753 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9754 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9755 if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9756 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9757 "unexpected start value");
9758 VPI->setOperand(0, StartVal);
9759 continue;
9760 }
9761 }
9762 } else {
9763 // Retrieve the induction resume values for wide inductions from
9764 // their original phi nodes in the scalar loop.
9765 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
9766 // Hook up to the PHINode generated by a ResumePhi recipe of main
9767 // loop VPlan, which feeds the scalar loop.
9768 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
9769 }
9770 assert(ResumeV && "Must have a resume value");
9771 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9772 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
9773 }
9774
9775 // For some VPValues in the epilogue plan we must re-use the generated IR
9776 // values from the main plan. Replace them with live-in VPValues.
9777 // TODO: This is a workaround needed for epilogue vectorization and it
9778 // should be removed once induction resume value creation is done
9779 // directly in VPlan.
9780 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
9781 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9782 // epilogue plan. This ensures all users use the same frozen value.
9783 auto *VPI = dyn_cast<VPInstruction>(&R);
9784 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9785 VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
9786 ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
9787 continue;
9788 }
9789
 9790     // Re-use the trip count and steps expanded for the main loop, as
 9791     // skeleton creation needs them as values that dominate both the scalar
 9792     // and vector epilogue loops.
9793 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
9794 if (!ExpandR)
9795 continue;
9796 VPValue *ExpandedVal =
9797 Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
9798 ExpandR->replaceAllUsesWith(ExpandedVal);
9799 if (Plan.getTripCount() == ExpandR)
9800 Plan.resetTripCount(ExpandedVal);
9801 ExpandR->eraseFromParent();
9802 }
9803}
9804
9805// Generate bypass values from the additional bypass block. Note that when the
 9806 // vectorized epilogue is skipped due to the iteration count check, the
9807// resume value for the induction variable comes from the trip count of the
9808// main vector loop, passed as the second argument.
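// E.g. (sketch): for a non-primary add induction with start %s and step %st,
// the value generated in the bypass block is equivalent to
//   ind.end = %s + MainVectorTripCount * %st
// (emitTransformedIndex also handles the other induction kinds and propagates
// fast-math flags for FP inductions).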
9810 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9811 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9812 Instruction *OldInduction) {
9813 Value *Step = getExpandedStep(II, ExpandedSCEVs);
9814 // For the primary induction the additional bypass end value is known.
9815 // Otherwise it is computed.
9816 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9817 if (OrigPhi != OldInduction) {
9818 auto *BinOp = II.getInductionBinOp();
9819 // Fast-math-flags propagate from the original induction instruction.
9820 if (isa_and_nonnull<FPMathOperator>(BinOp))
9821 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9822
9823 // Compute the end value for the additional bypass.
9824 EndValueFromAdditionalBypass =
9825 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
9826 II.getStartValue(), Step, II.getKind(), BinOp);
9827 EndValueFromAdditionalBypass->setName("ind.end");
9828 }
9829 return EndValueFromAdditionalBypass;
9830}
9831
9833 VPlan &BestEpiPlan,
9835 const SCEV2ValueTy &ExpandedSCEVs,
9836 Value *MainVectorTripCount) {
9837 // Fix reduction resume values from the additional bypass block.
9838 BasicBlock *PH = L->getLoopPreheader();
9839 for (auto *Pred : predecessors(PH)) {
9840 for (PHINode &Phi : PH->phis()) {
9841 if (Phi.getBasicBlockIndex(Pred) != -1)
9842 continue;
9843 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
9844 }
9845 }
9846 auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
9847 if (ScalarPH->hasPredecessors()) {
9848 // If ScalarPH has predecessors, we may need to update its reduction
9849 // resume values.
9850 for (const auto &[R, IRPhi] :
9851 zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
9852 fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), IRPhi,
9853 BypassBlock);
9854 }
9855 }
9856
9857 // Fix induction resume values from the additional bypass block.
9858 IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
9859 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
9860 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
9862 IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
9863 LVL.getPrimaryInduction());
9864 // TODO: Directly add as extra operand to the VPResumePHI recipe.
9865 Inc->setIncomingValueForBlock(BypassBlock, V);
9866 }
9867}
9868
9870 assert((EnableVPlanNativePath || L->isInnermost()) &&
9871 "VPlan-native path is not enabled. Only process inner loops.");
9872
9873 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9874 << L->getHeader()->getParent()->getName() << "' from "
9875 << L->getLocStr() << "\n");
9876
9877 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9878
9879 LLVM_DEBUG(
9880 dbgs() << "LV: Loop hints:"
9881 << " force="
9883 ? "disabled"
9885 ? "enabled"
9886 : "?"))
9887 << " width=" << Hints.getWidth()
9888 << " interleave=" << Hints.getInterleave() << "\n");
9889
9890 // Function containing loop
9891 Function *F = L->getHeader()->getParent();
9892
9893 // Looking at the diagnostic output is the only way to determine if a loop
9894 // was vectorized (other than looking at the IR or machine code), so it
9895 // is important to generate an optimization remark for each loop. Most of
9896 // these messages are generated as OptimizationRemarkAnalysis. Remarks
 9897   // generated as OptimizationRemark and OptimizationRemarkMissed are less
 9898   // verbose and report vectorized loops and unvectorized loops that may
 9899   // benefit from vectorization, respectively.
9900
9901 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9902 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9903 return false;
9904 }
9905
9906 PredicatedScalarEvolution PSE(*SE, *L);
9907
9908 // Check if it is legal to vectorize the loop.
9909 LoopVectorizationRequirements Requirements;
9910 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9911 &Requirements, &Hints, DB, AC, BFI, PSI);
9913 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9914 Hints.emitRemarkWithHints();
9915 return false;
9916 }
9917
9919 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9920 "early exit is not enabled",
9921 "UncountableEarlyExitLoopsDisabled", ORE, L);
9922 return false;
9923 }
9924
9925 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9926 // here. They may require CFG and instruction level transformations before
9927 // even evaluating whether vectorization is profitable. Since we cannot modify
9928 // the incoming IR, we need to build VPlan upfront in the vectorization
9929 // pipeline.
9930 if (!L->isInnermost())
9931 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9932 ORE, BFI, PSI, Hints, Requirements);
9933
9934 assert(L->isInnermost() && "Inner loop expected.");
9935
9936 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9937 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9938
9939 // If an override option has been passed in for interleaved accesses, use it.
9941 UseInterleaved = EnableInterleavedMemAccesses;
9942
9943 // Analyze interleaved memory accesses.
9944 if (UseInterleaved)
9946
9947 if (LVL.hasUncountableEarlyExit()) {
9948 BasicBlock *LoopLatch = L->getLoopLatch();
9949 if (IAI.requiresScalarEpilogue() ||
9951 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9952 reportVectorizationFailure("Auto-vectorization of early exit loops "
9953 "requiring a scalar epilogue is unsupported",
9954 "UncountableEarlyExitUnsupported", ORE, L);
9955 return false;
9956 }
9957 }
9958
9959 // Check the function attributes and profiles to find out if this function
9960 // should be optimized for size.
9962 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9963
9964 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9965 // count by optimizing for size, to minimize overheads.
9966 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9967 if (ExpectedTC && ExpectedTC->isFixed() &&
9968 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9969 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9970 << "This loop is worth vectorizing only if no scalar "
9971 << "iteration overheads are incurred.");
9973 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9974 else {
9975 LLVM_DEBUG(dbgs() << "\n");
9976 // Predicate tail-folded loops are efficient even when the loop
9977 // iteration count is low. However, setting the epilogue policy to
9978 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9979 // with runtime checks. It's more effective to let
9980 // `isOutsideLoopWorkProfitable` determine if vectorization is
9981 // beneficial for the loop.
9984 }
9985 }
9986
9987 // Check the function attributes to see if implicit floats or vectors are
9988 // allowed.
9989 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9991 "Can't vectorize when the NoImplicitFloat attribute is used",
9992 "loop not vectorized due to NoImplicitFloat attribute",
9993 "NoImplicitFloat", ORE, L);
9994 Hints.emitRemarkWithHints();
9995 return false;
9996 }
9997
9998 // Check if the target supports potentially unsafe FP vectorization.
9999 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10000 // for the target we're vectorizing for, to make sure none of the
10001 // additional fp-math flags can help.
10002 if (Hints.isPotentiallyUnsafe() &&
10005 "Potentially unsafe FP op prevents vectorization",
10006 "loop not vectorized due to unsafe FP support.",
10007 "UnsafeFP", ORE, L);
10008 Hints.emitRemarkWithHints();
10009 return false;
10010 }
10011
10012 bool AllowOrderedReductions;
10013 // If the flag is set, use that instead and override the TTI behaviour.
10015 AllowOrderedReductions = ForceOrderedReductions;
10016 else
10017 AllowOrderedReductions = TTI->enableOrderedReductions();
10018 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10019 ORE->emit([&]() {
10020 auto *ExactFPMathInst = Requirements.getExactFPInst();
10021 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10022 ExactFPMathInst->getDebugLoc(),
10023 ExactFPMathInst->getParent())
10024 << "loop not vectorized: cannot prove it is safe to reorder "
10025 "floating-point operations";
10026 });
10027 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10028 "reorder floating-point operations\n");
10029 Hints.emitRemarkWithHints();
10030 return false;
10031 }
10032
10033 // Use the cost model.
10034 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10035 F, &Hints, IAI, PSI, BFI);
10036 // Use the planner for vectorization.
10037 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10038 ORE);
10039
10040 // Get user vectorization factor and interleave count.
10041 ElementCount UserVF = Hints.getWidth();
10042 unsigned UserIC = Hints.getInterleave();
10043
10044 // Plan how to best vectorize.
10045 LVP.plan(UserVF, UserIC);
10047 unsigned IC = 1;
10048
10051
10052 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
10053 if (LVP.hasPlanWithVF(VF.Width)) {
10054 // Select the interleave count.
10055 IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
10056
10057 unsigned SelectedIC = std::max(IC, UserIC);
10058 // Optimistically generate runtime checks if they are needed. Drop them if
10059 // they turn out to not be profitable.
10060 if (VF.Width.isVector() || SelectedIC > 1) {
10061 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10062
10063 // Bail out early if either the SCEV or memory runtime checks are known to
10064 // fail. In that case, the vector loop would never execute.
10065 using namespace llvm::PatternMatch;
10066 if (Checks.getSCEVChecks().first &&
10067 match(Checks.getSCEVChecks().first, m_One()))
10068 return false;
10069 if (Checks.getMemRuntimeChecks().first &&
10070 match(Checks.getMemRuntimeChecks().first, m_One()))
10071 return false;
10072 }
10073
10074 // Check if it is profitable to vectorize with runtime checks.
10075 bool ForceVectorization =
10077 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
10078 CM.CostKind);
10079 if (!ForceVectorization &&
10080 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
10081 LVP.getPlanFor(VF.Width), SEL,
10082 CM.getVScaleForTuning())) {
10083 ORE->emit([&]() {
10085 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10086 L->getHeader())
10087 << "loop not vectorized: cannot prove it is safe to reorder "
10088 "memory operations";
10089 });
10090 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10091 Hints.emitRemarkWithHints();
10092 return false;
10093 }
10094 }
10095
10096 // Identify the diagnostic messages that should be produced.
10097 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10098 bool VectorizeLoop = true, InterleaveLoop = true;
10099 if (VF.Width.isScalar()) {
10100 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10101 VecDiagMsg = {
10102 "VectorizationNotBeneficial",
10103 "the cost-model indicates that vectorization is not beneficial"};
10104 VectorizeLoop = false;
10105 }
10106
10107 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10108 // Tell the user interleaving was avoided up-front, despite being explicitly
10109 // requested.
10110 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10111 "interleaving should be avoided up front\n");
10112 IntDiagMsg = {"InterleavingAvoided",
10113 "Ignoring UserIC, because interleaving was avoided up front"};
10114 InterleaveLoop = false;
10115 } else if (IC == 1 && UserIC <= 1) {
10116 // Tell the user interleaving is not beneficial.
10117 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10118 IntDiagMsg = {
10119 "InterleavingNotBeneficial",
10120 "the cost-model indicates that interleaving is not beneficial"};
10121 InterleaveLoop = false;
10122 if (UserIC == 1) {
10123 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10124 IntDiagMsg.second +=
10125 " and is explicitly disabled or interleave count is set to 1";
10126 }
10127 } else if (IC > 1 && UserIC == 1) {
 10128     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10129 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
10130 "disabled.\n");
10131 IntDiagMsg = {"InterleavingBeneficialButDisabled",
10132 "the cost-model indicates that interleaving is beneficial "
10133 "but is explicitly disabled or interleave count is set to 1"};
10134 InterleaveLoop = false;
10135 }
10136
10137 // If there is a histogram in the loop, do not just interleave without
10138 // vectorizing. The order of operations will be incorrect without the
10139 // histogram intrinsics, which are only used for recipes with VF > 1.
10140 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10141 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10142 << "to histogram operations.\n");
10143 IntDiagMsg = {
10144 "HistogramPreventsScalarInterleaving",
10145 "Unable to interleave without vectorization due to constraints on "
10146 "the order of histogram operations"};
10147 InterleaveLoop = false;
10148 }
10149
10150 // Override IC if user provided an interleave count.
10151 IC = UserIC > 0 ? UserIC : IC;
10152
10153 // Emit diagnostic messages, if any.
10154 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10155 if (!VectorizeLoop && !InterleaveLoop) {
 10156     // Do not vectorize or interleave the loop.
10157 ORE->emit([&]() {
10158 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10159 L->getStartLoc(), L->getHeader())
10160 << VecDiagMsg.second;
10161 });
10162 ORE->emit([&]() {
10163 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10164 L->getStartLoc(), L->getHeader())
10165 << IntDiagMsg.second;
10166 });
10167 return false;
10168 }
10169
10170 if (!VectorizeLoop && InterleaveLoop) {
10171 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10172 ORE->emit([&]() {
10173 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10174 L->getStartLoc(), L->getHeader())
10175 << VecDiagMsg.second;
10176 });
10177 } else if (VectorizeLoop && !InterleaveLoop) {
10178 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10179 << ") in " << L->getLocStr() << '\n');
10180 ORE->emit([&]() {
10181 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10182 L->getStartLoc(), L->getHeader())
10183 << IntDiagMsg.second;
10184 });
10185 } else if (VectorizeLoop && InterleaveLoop) {
10186 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10187 << ") in " << L->getLocStr() << '\n');
10188 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10189 }
10190
10191 bool DisableRuntimeUnroll = false;
10192 MDNode *OrigLoopID = L->getLoopID();
10193
10194 // Report the vectorization decision.
10195 if (VF.Width.isScalar()) {
10196 using namespace ore;
10197 assert(IC > 1);
10198 ORE->emit([&]() {
10199 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10200 L->getHeader())
10201 << "interleaved loop (interleaved count: "
10202 << NV("InterleaveCount", IC) << ")";
10203 });
10204 } else {
10205 // Report the vectorization decision.
10206 reportVectorization(ORE, L, VF, IC);
10207 }
10210
10211 // If we decided that it is *legal* to interleave or vectorize the loop, then
10212 // do it.
10213
10214 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10215 // Consider vectorizing the epilogue too if it's profitable.
10216 VectorizationFactor EpilogueVF =
10218 if (EpilogueVF.Width.isVector()) {
10219 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10220
10221 // The first pass vectorizes the main loop and creates a scalar epilogue
10222 // to be vectorized by executing the plan (potentially with a different
10223 // factor) again shortly afterwards.
10224 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10225 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10226 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10227 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10228 BestEpiPlan);
10229 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI,
10230 PSI, Checks, *BestMainPlan);
10231 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10232 *BestMainPlan, MainILV, DT, false);
10233 ++LoopsVectorized;
10234
10235 // Second pass vectorizes the epilogue and adjusts the control flow
10236 // edges from the first pass.
10237 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
10238 BFI, PSI, Checks, BestEpiPlan);
10239 EpilogILV.setTripCount(MainILV.getTripCount());
10240 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10241
10242 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
10243 true);
10244
10246 BestEpiPlan, LVL, ExpandedSCEVs,
10247 EPI.VectorTripCount);
10248 ++LoopsEpilogueVectorized;
10249
10250 if (!Checks.hasChecks())
10251 DisableRuntimeUnroll = true;
10252 } else {
10253 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
10254 Checks, BestPlan);
10255 // TODO: Move to general VPlan pipeline once epilogue loops are also
10256 // supported.
10259 IC, PSE);
10260 LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
10262
10263 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10264 ++LoopsVectorized;
10265
10266 // Add metadata to disable runtime unrolling a scalar loop when there
10267 // are no runtime checks about strides and memory. A scalar loop that is
10268 // rarely used is not worth unrolling.
10269 if (!Checks.hasChecks() && !VF.Width.isScalar())
10270 DisableRuntimeUnroll = true;
10271 }
10272
10273 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10274 "DT not preserved correctly");
10275
10276 std::optional<MDNode *> RemainderLoopID =
10279 if (RemainderLoopID) {
10280 L->setLoopID(*RemainderLoopID);
10281 } else {
10282 if (DisableRuntimeUnroll)
10284
10285 // Mark the loop as already vectorized to avoid vectorizing again.
10286 Hints.setAlreadyVectorized();
10287 }
10288
10289 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10290 return true;
10291}
10292
10294
10295 // Don't attempt if
10296 // 1. the target claims to have no vector registers, and
10297 // 2. interleaving won't help ILP.
10298 //
10299 // The second condition is necessary because, even if the target has no
10300 // vector registers, loop vectorization may still enable scalar
10301 // interleaving.
10304 return LoopVectorizeResult(false, false);
10305
10306 bool Changed = false, CFGChanged = false;
10307
10308 // The vectorizer requires loops to be in simplified form.
10309 // Since simplification may add new inner loops, it has to run before the
10310 // legality and profitability checks. This means running the loop vectorizer
 10311   // will simplify all loops, regardless of whether anything ends up being
10312 // vectorized.
10313 for (const auto &L : *LI)
10314 Changed |= CFGChanged |=
10315 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10316
10317 // Build up a worklist of inner-loops to vectorize. This is necessary as
10318 // the act of vectorizing or partially unrolling a loop creates new loops
10319 // and can invalidate iterators across the loops.
10320 SmallVector<Loop *, 8> Worklist;
10321
10322 for (Loop *L : *LI)
10323 collectSupportedLoops(*L, LI, ORE, Worklist);
10324
10325 LoopsAnalyzed += Worklist.size();
10326
10327 // Now walk the identified inner loops.
10328 while (!Worklist.empty()) {
10329 Loop *L = Worklist.pop_back_val();
10330
10331 // For the inner loops we actually process, form LCSSA to simplify the
10332 // transform.
10333 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10334
10335 Changed |= CFGChanged |= processLoop(L);
10336
10337 if (Changed) {
10338 LAIs->clear();
10339
10340#ifndef NDEBUG
10341 if (VerifySCEV)
10342 SE->verify();
10343#endif
10344 }
10345 }
10346
10347 // Process each loop nest in the function.
10348 return LoopVectorizeResult(Changed, CFGChanged);
10349}
10350
10353 LI = &AM.getResult<LoopAnalysis>(F);
10354 // There are no loops in the function. Return before computing other
10355 // expensive analyses.
10356 if (LI->empty())
10357 return PreservedAnalyses::all();
10366
10367 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10368 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10369 BFI = nullptr;
10370 if (PSI && PSI->hasProfileSummary())
10372 LoopVectorizeResult Result = runImpl(F);
10373 if (!Result.MadeAnyChange)
10374 return PreservedAnalyses::all();
10376
10377 if (isAssignmentTrackingEnabled(*F.getParent())) {
10378 for (auto &BB : F)
10380 }
10381
10382 PA.preserve<LoopAnalysis>();
10386
10387 if (Result.MadeCFGChange) {
10388 // Making CFG changes likely means a loop got vectorized. Indicate that
10389 // extra simplification passes should be run.
 10390     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10391 // be run if runtime checks have been added.
10394 } else {
10396 }
10397 return PA;
10398}
10399
10401 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10402 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10403 OS, MapClassName2PassName);
10404
10405 OS << '<';
10406 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10407 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10408 OS << '>';
10409}
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void addRuntimeUnrollDisableMetaData(Loop *L)
static Type * maybeVectorizeType(Type *Ty, ElementCount VF)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L)
A version of ScalarEvolution::getSmallConstantTripCount that returns an ElementCount to include loops...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static VPIRBasicBlock * replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static Value * createInductionAdditionalBypassValues(PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount, Instruction *OldInduction)
static void fixReductionScalarResumeWhenVectorizingEpilog(VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock)
static Value * getStartValueFromReductionResult(VPInstruction *RdxResult)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static unsigned estimateElementCount(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the ElementCount at runtime.
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop, ElementCount VF)
Return true if the original loop \ TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecipe for Phi.
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool hasReplicatorRegion(VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform common subexpression elimination (CSE) of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the original exit block for first-order recurrences.
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, LoopVectorizationLegality &LVL, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define OP(OPC)
Definition: Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:77
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512
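As a quick orientation, here is a minimal, self-contained sketch using only the APInt members listed above (the helper name maskValueIfSmall is illustrative, not from this file):
  #include "llvm/ADT/APInt.h"
  #include <cstdint>
  using namespace llvm;

  // Build an all-ones mask and read it back only while it fits in 64 bits,
  // since getZExtValue() asserts when more than 64 bits are active.
  static uint64_t maskValueIfSmall(unsigned NumBits) {
    APInt Mask = APInt::getAllOnes(NumBits); // e.g. 0xFF for NumBits == 8
    if (Mask.getActiveBits() <= 64)
      return Mask.getZExtValue();
    return 0;
  }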
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:468
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:459
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:528
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:393
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:337
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:437
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:467
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
Definition: BasicBlock.cpp:252
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:131
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
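A minimal sketch of how the BasicBlock queries listed above compose (the helper isSimpleForwardingBlock is illustrative and not part of this pass):
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // A block with exactly one predecessor, one successor, no PHIs and a
  // well-formed terminator.
  static bool isSimpleForwardingBlock(const BasicBlock *BB) {
    if (!BB->getSinglePredecessor() || !BB->getSingleSuccessor())
      return false;
    for (const PHINode &Phi : BB->phis()) {
      (void)Phi;
      return false; // any PHI disqualifies the block
    }
    return BB->getTerminator() != nullptr;
  }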
BinaryOps getOpcode() const
Definition: InstrTypes.h:374
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1905
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1283
unsigned arg_size() const
Definition: InstrTypes.h:1290
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:868
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:875
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:124
static DebugLoc getTemporary()
Definition: DebugLoc.h:161
static DebugLoc getUnknown()
Definition: DebugLoc.h:162
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:104
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:187
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:229
bool empty() const
Definition: DenseMap.h:107
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:161
iterator end()
Definition: DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:205
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:156
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition: DenseMap.h:267
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:214
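The DenseMap members above cover the usual lookup/insert idioms; a minimal sketch assuming only the members listed (denseMapExample is an illustrative name):
  #include "llvm/ADT/DenseMap.h"
  using namespace llvm;

  static void denseMapExample() {
    DenseMap<int, int> Costs;
    Costs.try_emplace(1, 10);      // inserts only if the key is absent
    Costs.insert({2, 20});         // pair-based insert
    if (Costs.contains(1))
      (void)Costs.at(1);           // aborts if the key were missing
    int Missing = Costs.lookup(3); // default-constructed value (0) for absent keys
    (void)Missing;
    (void)Costs.empty();           // false here
  }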
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:327
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:315
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:318
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:323
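ElementCount is how fixed and scalable vectorization factors are represented; a minimal sketch assuming only the factory functions and predicates listed above (elementCountExample is an illustrative name):
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  static void elementCountExample() {
    ElementCount Fixed4 = ElementCount::getFixed(4);   // exactly 4 lanes
    ElementCount Scal4 = ElementCount::getScalable(4); // 4 x vscale lanes at runtime
    ElementCount One = ElementCount::get(1, /*Scalable=*/false);
    (void)Fixed4.isVector(); // true
    (void)Scal4.isVector();  // true
    (void)One.isScalar();    // true: exactly one element
  }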
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
Value * createIterationCountCheck(ElementCount VF, unsigned UF) const
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the main loop strategy (i....
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:333
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Class to represent function types.
Definition: DerivedTypes.h:105
param_iterator param_begin() const
Definition: DerivedTypes.h:130
param_iterator param_end() const
Definition: DerivedTypes.h:131
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:706
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:762
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags inBounds()
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:502
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:345
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2333
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:823
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2329
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1420
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:507
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2439
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
Definition: IRBuilder.cpp:123
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
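A minimal sketch of the IRBuilder calls listed above in the shape of a trip-count check; this is illustrative only (emitShortTripCheck is not a function in this file) and is not the pass's actual skeleton codegen:
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  // Emit an i1 that is true when the trip count is smaller than one vector
  // iteration; CreateElementCount expands to VF or VF * vscale as appropriate.
  static Value *emitShortTripCheck(BasicBlock *InsertAtEnd, Value *TripCount,
                                   ElementCount VF) {
    IRBuilder<> B(InsertAtEnd->getContext());
    B.SetInsertPoint(InsertAtEnd); // append to the end of the block
    Value *RuntimeVF = B.CreateElementCount(TripCount->getType(), VF);
    return B.CreateICmp(CmpInst::ICMP_ULT, TripCount, RuntimeVF, "short.trip");
  }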
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitability analysis.
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:78
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:171
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
Definition: Instruction.h:314
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
Class to represent integer types.
Definition: DerivedTypes.h:42
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition: Type.cpp:343
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:524
uint32_t getFactor() const
Definition: VectorUtils.h:540
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:594
InstTy * getInsertPos() const
Definition: VectorUtils.h:610
uint32_t getNumMembers() const
Definition: VectorUtils.h:542
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:669
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:714
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:725
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:706
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:689
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:719
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
LLVM_ABI void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
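A minimal sketch of walking an interleave group via the accessors listed above (countGroupMembers is an illustrative name; the pass itself consumes groups through its cost model and recipes):
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/Instruction.h"
  #include <cstdint>
  using namespace llvm;

  // Count the members actually present in I's group; gaps show up as null
  // entries, so the result equals getNumMembers().
  static unsigned countGroupMembers(const InterleavedAccessInfo &IAI,
                                    const Instruction *I) {
    const InterleaveGroup<Instruction> *Group = IAI.getInterleaveGroup(I);
    if (!Group)
      return 0; // I is not part of any group
    unsigned Present = 0;
    for (uint32_t Idx = 0; Idx < Group->getFactor(); ++Idx)
      if (Group->getMember(Idx))
        ++Present;
    return Present;
  }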
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
Type * getPointerOperandType() const
Definition: Instructions.h:262
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:570
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
BlockT * getUniqueLatchExitBlock() const
Return the unique exit block for the latch, or null if there are multiple different exit blocks or th...
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPredecessor() const
If the given loop's header has exactly one unique predecessor outside the loop, return it.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1276
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
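A minimal sketch of the loop-shape queries listed above (looksLikeSimpleInnerLoop is an illustrative name; the real candidate filtering lives in LoopVectorizationLegality):
  #include "llvm/Analysis/LoopInfo.h"
  using namespace llvm;

  static bool looksLikeSimpleInnerLoop(const LoopInfo &LI, const BasicBlock *BB) {
    const Loop *L = LI.getLoopFor(BB);
    if (!L || !L->isInnermost())
      return false; // only consider innermost loops here
    // A preheader, a single latch and a single exiting block keep the CFG simple.
    return L->getLoopPreheader() && L->getLoopLatch() && L->getExitingBlock();
  }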
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool OptForSize
Whether this loop should be optimized for size based on function attribute or profile information.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind)
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
LoopInfo * LI
Loop Info analysis.
bool shouldCalculateRegPressureForVF(ElementCount VF)
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool usePredicatedReductionSelect() const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block require predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for both cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW
The highest VF possible for this loop, without using MaxBandwidth.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
const RecurrenceDescriptor & getRecurrenceDescriptor(PHINode *PN) const
Returns the recurrence descriptor associated with a given phi node PN, expecting one to exist.
uint64_t getMaxStoreLoadForwardSafeDistanceInBits() const
Return a safe power-of-2 number of elements, which does not prevent store-load forwarding and is safe to ope...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
IntegerType * getWidestInductionType()
Returns the widest induction type.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
bool isUniform(Value *V, ElementCount VF) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool isSafeForAnyStoreLoadForwardDistances() const
Return true if there are store-load forwarding dependencies.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns a list of all known histogram operations in the loop.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
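A minimal sketch of querying the legality analysis listed above, assuming canVectorize() has already succeeded; classifyHeaderPhi is an illustrative helper, not part of this file:
  #include "llvm/IR/Instructions.h"
  #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
  using namespace llvm;

  static const char *classifyHeaderPhi(const LoopVectorizationLegality &LVL,
                                       PHINode *Phi) {
    if (LVL.isInductionPhi(Phi))
      return "induction";
    if (LVL.isReductionVariable(Phi))
      return "reduction";
    if (LVL.isFixedOrderRecurrence(Phi))
      return "fixed-order recurrence";
    return "unhandled";
  }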
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition: VPlan.cpp:1605
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition: VPlan.cpp:1589
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1570
void printPlans(raw_ostream &O)
Definition: VPlan.cpp:1619
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check in Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
bool isLoopInvariant(const Value *V, bool HasCoroSuspendInst=false) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:644
bool hasLoopInvariantOperands(const Instruction *I, bool HasCoroSuspendInst=false) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:76
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:514
Metadata node.
Definition: Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1078
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1445
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1565
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1451
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:607
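A minimal sketch of building a metadata tuple with the MDString/MDTuple factories listed above; makeHintTuple is an illustrative name and stands in for the shared loop-metadata helpers the pass actually uses:
  #include "llvm/ADT/StringRef.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
  using namespace llvm;

  // Produces a tuple such as !{!"llvm.loop.isvectorized"}; attaching it to a
  // loop and updating operands (replaceOperandWith) is left out here.
  static MDNode *makeHintTuple(LLVMContext &Ctx, StringRef Name) {
    Metadata *Ops[] = {MDString::get(Ctx, Name)};
    return MDTuple::get(Ctx, Ops);
  }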
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:67
iterator find(const KeyT &Key)
Definition: MapVector.h:141
bool contains(const KeyT &Key) const
Definition: MapVector.h:137
bool empty() const
Definition: MapVector.h:75
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:115
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:229
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:716
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
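A minimal sketch of the PHI-wiring calls listed above, in the spirit of the resume-value fix-ups this file performs; wireResumePhi and its parameters are illustrative names:
  #include "llvm/IR/Instructions.h"
  #include <cassert>
  using namespace llvm;

  // Give a resume PHI one incoming value per predecessor, then overwrite one
  // edge in place the way later fix-ups do.
  static void wireResumePhi(PHINode *ResumePhi, BasicBlock *BypassBlock,
                            Value *BypassVal, BasicBlock *MiddleBlock,
                            Value *VectorVal) {
    ResumePhi->addIncoming(BypassVal, BypassBlock);
    ResumePhi->addIncoming(VectorVal, MiddleBlock);
    ResumePhi->setIncomingValueForBlock(BypassBlock, BypassVal);
    assert(ResumePhi->getNumIncomingValues() == 2 && "expected two predecessors");
  }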
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition: Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:90
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static LLVM_ABI bool isFloatingPointRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is a floating point kind.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
Value * getSentinelValue() const
Returns the sentinel value for FindFirstIV & FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
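A minimal sketch of querying a reduction descriptor via the members listed above (isStrictFPReduction is an illustrative name; the cost model's actual ordered-reduction decision also consults the target and flags):
  #include "llvm/Analysis/IVDescriptors.h"
  using namespace llvm;

  // Ordered (in-order / strict) FP reductions must not be reassociated.
  static bool isStrictFPReduction(const RecurrenceDescriptor &RdxDesc) {
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    return RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind) &&
           RdxDesc.isOrdered();
  }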
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isOne() const
Return true if the expression is a constant one.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
LLVM_ABI void verify() const
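An illustrative sketch of the ScalarEvolution trip-count queries listed above, assuming an existing analysis result; the function name is invented for the example.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

static void inspectTripCount(llvm::ScalarEvolution &SE, const llvm::Loop *L) {
  // Exact trip count as a small constant, or 0 if it cannot be computed.
  if (unsigned TC = SE.getSmallConstantTripCount(L))
    llvm::errs() << "constant trip count: " << TC << "\n";

  // Otherwise fall back to the symbolic backedge-taken count, which the
  // vectorizer reasons about when building its minimum-iteration checks.
  const llvm::SCEV *BTC = SE.getBackedgeTakenCount(L);
  if (!llvm::isa<llvm::SCEVCouldNotCompute>(BTC))
    BTC->print(llvm::errs());
}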
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
void insert_range(Range &&R)
Definition: SetVector.h:193
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:279
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:541
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
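A small, self-contained sketch (illustrative, not from this file) of the ADT containers listed above: a SmallSetVector performs ordered deduplication and feeds a SmallVector result.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"

static llvm::SmallVector<llvm::Value *, 8>
uniqueValues(llvm::ArrayRef<llvm::Value *> In) {
  // A SetVector rejects duplicates while preserving insertion order.
  llvm::SmallSetVector<llvm::Value *, 8> Seen;
  for (llvm::Value *V : In)
    Seen.insert(V);
  // Copy the deduplicated, ordered contents into the SmallVector result.
  return llvm::SmallVector<llvm::Value *, 8>(Seen.begin(), Seen.end());
}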
An instruction for storing to memory.
Definition: Instructions.h:296
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI std::optional< unsigned > getVScaleForTuning() const
LLVM_ABI bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
LLVM_ABI InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add/...
LLVM_ABI bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const
Return true if the target supports masked load.
LLVM_ABI bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
LLVM_ABI bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
LLVM_ABI void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI bool hasActiveVectorLength() const
LLVM_ABI bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI std::optional< unsigned > getMaxVScale() const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
LLVM_ABI bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
LLVM_ABI TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isElementTypeLegalForScalableVector(Type *Ty) const
LLVM_ABI ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
LLVM_ABI bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of the instruction.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI bool isVScaleKnownToBeAPowerOfTwo() const
LLVM_ABI InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const
LLVM_ABI const char * getRegisterClassName(unsigned ClassID) const
LLVM_ABI bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
LLVM_ABI bool preferInLoopReduction(RecurKind Kind, Type *Ty) const
LLVM_ABI InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI unsigned getEpilogueVectorizationMinVF() const
LLVM_ABI bool preferPredicatedReductionSelect() const
LLVM_ABI bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace) const
Return true if the target supports masked store.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool supportsScalableVectors() const
LLVM_ABI bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
LLVM_ABI unsigned getMinTripCountTailFoldingThreshold() const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI unsigned getMaxInterleaveFactor(ElementCount VF) const
LLVM_ABI bool enableScalableVectorization() const
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
LLVM_ABI unsigned getNumberOfParts(Type *Tp) const
LLVM_ABI InstructionCost getOperandsScalarizationOverhead(ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing operands with the given types.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
LLVM_ABI bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
LLVM_ABI InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
LLVM_ABI InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
LLVM_ABI bool preferFixedOverScalableIfEqualCost() const
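An illustrative cost-model query built from the TargetTransformInfo hooks listed above; the alignment and address-space values are arbitrary example inputs, not defaults taken from this pass.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/InstructionCost.h"

static llvm::InstructionCost
widenedAddAndLoadCost(const llvm::TargetTransformInfo &TTI,
                      llvm::VectorType *VecTy) {
  // Reciprocal-throughput cost of a vector add of type VecTy.
  llvm::InstructionCost AddCost = TTI.getArithmeticInstrCost(
      llvm::Instruction::Add, VecTy,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
  // Reciprocal-throughput cost of a 16-byte aligned vector load from AS 0.
  llvm::InstructionCost LoadCost = TTI.getMemoryOpCost(
      llvm::Instruction::Load, VecTy, llvm::Align(16), /*AddressSpace=*/0,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
  return AddCost + LoadCost;
}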
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
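A tiny sketch of the Type queries listed above; the 32-bit threshold is just an example value.

#include "llvm/IR/Type.h"

static bool isNarrowIntOrIntVector(llvm::Type *Ty) {
  // getScalarType() strips a vector wrapper, if there is one.
  llvm::Type *ScalarTy = Ty->getScalarType();
  // Accept only integer (or integer-vector) types of at most 32 bits.
  return ScalarTy->isIntegerTy() && ScalarTy->getScalarSizeInBits() <= 32;
}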
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
op_range operands()
Definition: User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
Value * getOperand(unsigned i) const
Definition: User.h:232
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated with the CallInst CI.
Definition: VectorUtils.h:74
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3745
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3820
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3772
iterator end()
Definition: VPlan.h:3782
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3780
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3833
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:236
VPRegionBlock * getEnclosingLoopRegion()
Definition: VPlan.cpp:625
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3811
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:81
VPRegionBlock * getParent()
Definition: VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:180
void setName(const Twine &newName)
Definition: VPlan.h:166
size_t getNumSuccessors() const
Definition: VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:322
size_t getNumPredecessors() const
Definition: VPlan.h:220
VPlan * getPlan()
Definition: VPlan.cpp:155
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:160
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:198
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlanUtils.h:237
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:175
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition: VPlanUtils.h:202
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
void insert(VPRecipeBase *R)
Insert R at the current insertion point.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
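A sketch of how the VPBuilder helpers listed above might be combined; it uses only the signatures shown here, and the include line and surrounding setup are assumptions about the vectorizer-internal headers rather than code from this pass.

#include "VPlan.h" // vectorizer-internal header; exact header layout is an assumption

static llvm::VPInstruction *buildMaskedSelect(llvm::VPBuilder &Builder,
                                              llvm::VPBasicBlock *VPBB,
                                              llvm::VPValue *MaskA,
                                              llvm::VPValue *MaskB,
                                              llvm::VPValue *TrueVal,
                                              llvm::VPValue *FalseVal) {
  // Append newly created VPInstructions at the end of VPBB.
  Builder.setInsertPoint(VPBB);
  // Combine the two masks and select between the incoming values.
  llvm::VPInstruction *Mask = Builder.createOr(MaskA, MaskB);
  return Builder.createSelect(Mask, TrueVal, FalseVal);
}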
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition: VPlanValue.h:422
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:395
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:3622
VPValue * getStartValue() const
Definition: VPlan.h:3621
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1964
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:2012
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:2001
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition: VPlan.h:1679
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3898
Class to record and manage LLVM IR flags.
Definition: VPlan.h:600
Helper to manage IR metadata for recipes.
Definition: VPlan.h:935
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:976
@ ComputeAnyOfResult
Compute the final result of an AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition: VPlan.h:1009
@ ExtractPenultimateElement
Definition: VPlan.h:1019
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition: VPlan.h:1056
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition: VPlan.h:1047
unsigned getOpcode() const
Definition: VPlan.h:1117
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition: VPlan.h:2557
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Definition: VPlanHelpers.h:125
A recipe for forming partial reductions.
Definition: VPlan.h:2734
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition: VPlan.h:1288
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:394
VPBasicBlock * getParent()
Definition: VPlan.h:415
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:482
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreateWidenRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for R if one can be created within the given VF Range.
VPValue * getBlockInMask(VPBasicBlock *VPBB) const
Returns the entry mask for block VPBB or null if the mask is all-true.
VPValue * getVPValueOrAddLiveIn(Value *V)
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPReplicateRecipe * handleReplication(Instruction *I, ArrayRef< VPValue * > Operands, VFRange &Range)
Build a VPReplicateRecipe for I using Operands.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands, unsigned ScaleFactor)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe for handling reduction phis.
Definition: VPlan.h:2318
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition: VPlan.h:2378
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition: VPlan.h:2372
A recipe to represent in-loop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2647
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3933
const VPBlockBase * getEntry() const
Definition: VPlan.h:3969
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2837
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:521
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:586
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:43
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:197
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:241
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:236
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:230
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:125
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:174
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:85
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1406
user_iterator user_begin()
Definition: VPlanValue.h:130
unsigned getNumUsers() const
Definition: VPlanValue.h:113
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1410
user_range users()
Definition: VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition: VPlan.h:1830
A recipe to compute the pointers for widened memory accesses of IndexTy.
Definition: VPlan.h:1889
A recipe for widening Call instructions using library calls.
Definition: VPlan.h:1626
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1480
A recipe for handling GEP instructions.
Definition: VPlan.h:1766
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:2029
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:2057
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:2074
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:2104
A recipe for widening vector intrinsics.
Definition: VPlan.h:1537
A common base class for widening memory operations.
Definition: VPlan.h:3114
A recipe for widened phis.
Definition: VPlan.h:2240
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1437
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:4036
bool hasVF(ElementCount VF) const
Definition: VPlan.h:4245
VPBasicBlock * getEntry()
Definition: VPlan.h:4135
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:4225
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:4231
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:4228
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:4197
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:4252
bool hasUF(unsigned UF) const
Definition: VPlan.h:4263
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition: VPlan.h:4187
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1040
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition: VPlan.h:4408
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1022
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:4211
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:4160
void setEntry(VPBasicBlock *VPBB)
Definition: VPlan.h:4124
LLVM_ABI_FOR_TEST VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
Definition: VPlan.cpp:1255
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:4287
bool hasScalarVFOnly() const
Definition: VPlan.h:4256
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:4178
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:958
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:4341
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:4183
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition: VPlan.h:4140
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1182
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:166
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1101
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
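Two illustrative helpers built on the Value and VectorType entries above; the names and the fixed element count of 4 are example choices only.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/TypeSize.h"

// Rewrite every use of From to use To, but only when From has exactly one
// user and the types agree.
static bool replaceIfSingleUser(llvm::Value *From, llvm::Value *To) {
  if (!From->hasOneUser() || From->getType() != To->getType())
    return false;
  From->replaceAllUsesWith(To);
  return true;
}

// Build a fixed 4-element vector over the scalar type of V (assumed to be a
// valid vector element type).
static llvm::VectorType *fixedVectorTypeFor(llvm::Value *V) {
  return llvm::VectorType::get(V->getType()->getScalarType(),
                               llvm::ElementCount::getFixed(4));
}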
int getNumOccurrences() const
Definition: CommandLine.h:400
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition: TypeSize.h:272
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:233
constexpr bool isNonZero() const
Definition: TypeSize.h:159
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition: TypeSize.h:280
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:219
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:259
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:175
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
constexpr bool isZero() const
Definition: TypeSize.h:157
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:226
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:255
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:240
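A short sketch of the ElementCount / FixedOrScalableQuantity queries above; doubling a VF scales its coefficient whether the VF is fixed (e.g. 4) or scalable (e.g. vscale x 4).

#include "llvm/Support/TypeSize.h"

static unsigned knownMinLanesWhenDoubled(llvm::ElementCount VF) {
  // For a scalable VF the runtime multiple of vscale stays unknown, so only
  // the known minimum number of lanes can be reported.
  llvm::ElementCount Doubled = VF.multiplyCoefficientBy(2);
  return Doubled.isScalable() ? Doubled.getKnownMinValue()
                              : Doubled.getFixedValue();
}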
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
A range adaptor for a pair of iterators.
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:662
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
OneOps_match< OpTy, Instruction::Freeze > m_Freeze(const OpTy &Op)
Matches FreezeInst.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:862
specificval_ty m_Specific(const Value *V)
Match only the specific value given.
Definition: PatternMatch.h:962
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t > m_scev_Mul(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
match_combine_or< AllRecipe_match< Instruction::ZExt, Op0_t >, AllRecipe_match< Instruction::SExt, Op0_t > > m_ZExtOrSExt(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
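A minimal IR pattern-matching sketch using the matchers listed above; the pattern and names are invented for illustration.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Recognize "(X * Y) + 1", capturing the multiplied operands. Note that
// m_Add as written requires the multiply to be the first add operand.
static bool matchMulPlusOne(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y) {
  using namespace llvm::PatternMatch;
  return match(V, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_One()));
}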
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:712
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:32
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
Definition: VPlanUtils.cpp:138
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:79
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
LLVM_ABI void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1980
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from the given basic block.
cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:841
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1023
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate=false)
Verify invariants for general VPlans.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7513
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:264
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition: MathExtras.h:293
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:216
LLVM_ABI bool VerifySCEV
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:243
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:77
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
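An illustrative use of the range-based STLExtras wrappers referenced above (all_of, none_of); the predicates are arbitrary examples.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"

static bool allEvenAndBounded(llvm::ArrayRef<int> Xs) {
  // The wrappers take whole ranges, avoiding explicit begin()/end() pairs.
  return llvm::all_of(Xs, [](int X) { return X % 2 == 0; }) &&
         llvm::none_of(Xs, [](int X) { return X > 100; });
}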
LLVM_ABI cl::opt< bool > EnableLoopVectorization
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:421
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Definition: SmallVector.h:1300
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
cl::opt< unsigned > ForceTargetInstructionCost
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:126
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:345
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2259
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ AddChainWithSubs
A chain of adds and subs.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
Definition: LoopUtils.cpp:1305
LLVM_ABI void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1786
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
LLVM_ABI llvm::MDNode * makePostTransformationMetadata(llvm::LLVMContext &Context, MDNode *OrigLoopID, llvm::ArrayRef< llvm::StringRef > RemovePrefixes, llvm::ArrayRef< llvm::MDNode * > AddAttrs)
Create a new LoopID after the loop has been transformed.
Definition: LoopInfo.cpp:1182
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition: iterator.h:363
cl::opt< bool > EnableVPlanNativePath
Definition: VPlan.cpp:55
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:2038
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:119
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind)
A helper function that returns how much we should divide the cost of a predicated block by.
Definition: VPlanHelpers.h:64
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:595
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:280
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:469
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:851
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:54
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:70
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlanHelpers.h:71
ElementCount End
Definition: VPlanHelpers.h:76
Struct to hold various analysis needed for cost computations.
Definition: VPlanHelpers.h:344
LoopVectorizationCostModel & CM
Definition: VPlanHelpers.h:349
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const
Return true if I is considered uniform-after-vectorization in the legacy cost model for VF.
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlanHelpers.h:350
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2283
A struct that represents some properties of the register usage of a loop.
Definition: VPlanAnalysis.h:76
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlanHelpers.h:303
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlanHelpers.h:311
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlanHelpers.h:205
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition: VPlan.cpp:283
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
Definition: VPlanHelpers.h:219
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlanHelpers.h:328
VPlan * Plan
Pointer to the VPlan for which code is generated.
Definition: VPlanHelpers.h:331
void set(const VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
Definition: VPlanHelpers.h:250
VPDominatorTree VPDT
VPlan-based dominator tree.
Definition: VPlanHelpers.h:340
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:3196
A recipe for widening select instructions.
Definition: VPlan.h:1720
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:3276
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE)
Create a base VPlan0, serving as the common starting point for all later candidates.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static LLVM_ABI_FOR_TEST void handleEarlyExits(VPlan &Plan, bool HasUncountableExit)
Update Plan to account for all early exits.
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
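A hypothetical call site for the runPass helper above, using a bool-returning transform from this list; the exact invocation is an assumption sketched from the signatures shown, with Plan assumed to be a VPlan & and Builder a VPBuilder &.
// Sketch only: forward the plan plus the extra VPBuilder argument that
// adjustFixedOrderRecurrences expects.
bool Adjusted = VPlanTransforms::runPass(
    VPlanTransforms::adjustFixedOrderRecurrences, Plan, Builder);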
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be code-generated.
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE)
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static DenseMap< VPBasicBlock *, VPValue * > introduceMasksAndLinearize(VPlan &Plan, bool FoldTail)
Predicate and linearize the control-flow in the only loop region of Plan.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace each VPReplicateRecipe outside of any replicate region in Plan with VF single-scalar recipes.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool RequiresScalarEpilogueCheck, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
TODO: The following VectorizationFactor was pulled out of the LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization; cost 0 means uncomputed cost.
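A small sketch tying the VectorizationFactor fields above together, illustrative only: the disabled factor stands for the scalar loop, and a candidate is only interesting when its cost beats the recorded scalar cost. The in-tree cost model performs a more careful per-lane comparison; this helper is a hypothetical simplification.
// Sketch only: treat Width 1 as "keep the scalar loop" and otherwise compare
// the vector cost against the scalar cost recorded in the same struct.
bool worthVectorizing(const VectorizationFactor &VF) {
  if (VF.Width.isScalar())
    return false;                  // Width 1 means no vectorization
  return VF.Cost < VF.ScalarCost;  // only worthwhile if cheaper than scalar
}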
static LLVM_ABI bool HoistRuntimeChecks