LoopStrengthReduce.cpp
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs strength reduction on array references inside loops that
14// use the loop induction variable as one of their components. It rewrites
15// expressions to take advantage of scaled-index addressing modes available on
16// the target, and it performs a variety of other optimizations related to loop
17// induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
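// For example (illustrative only, not from the original comment): a user of
// %i.next that sits after the loop, such as
//
//   exit:
//     %last = add %i.next, 7
//
// is also a post-increment user, even though no increment addressing mode is
// involved.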
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
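// For instance (sketch), a formula like reg1 + (-1 * reg2) could be emitted as
// "sub reg1, reg2" instead of materializing (-1 * reg2) in its own register,
// which is what the "Special" use kind partially works around today.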
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Clean up congruent phis after LSR phi expansion.
147static cl::opt<bool> EnablePhiElim(
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// This flag adds the instruction count to the solution cost comparison.
152static cl::opt<bool> InsnsCost(
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow a complex LSR solution.
157static cl::opt<bool> LSRExpNarrow(
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
164static cl::opt<bool> FilterSameScaledReg(
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
169static cl::opt<TTI::AddressingModeKind> PreferredAddressingMode(
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
172 cl::values(
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
180static cl::opt<unsigned> ComplexityLimit(
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
185static cl::opt<unsigned> SetupCostDepthLimit(
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
189static cl::opt<cl::boolOrDefault> AllowDropSolutionIfLessProfitable(
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
193static cl::opt<bool> EnableVScaleImmediates(
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
197static cl::opt<bool> DropScaledForVScale(
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
203static cl::opt<bool> StressIVChain(
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
237/// This class holds data which is used to order reuse candidates.
238class RegSortData {
239public:
240 /// This represents the set of LSRUse indices which reference
241 /// a particular register.
242 SmallBitVector UsedByIndices;
243
244 void print(raw_ostream &OS) const;
245 void dump() const;
246};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
335 if (Scalable)
336 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
337 return SU;
338 }
339};
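// Illustrative note (not part of the original source): an offset of
// "16 * vscale" bytes would be modelled as Immediate::getScalable(16), while a
// plain byte offset of 16 is Immediate::getFixed(16). A fixed and a scalable
// immediate are only treated as compatible when one of them is zero.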
340
341// This is needed for the Compare type of std::map when Immediate is used
342// as a key. We don't need it to be fully correct against any value of vscale,
343// just to make sure that vscale-related terms in the map are considered against
344// each other rather than being mixed up and potentially missing opportunities.
345struct KeyOrderTargetImmediate {
346 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
347 if (LHS.isScalable() && !RHS.isScalable())
348 return false;
349 if (!LHS.isScalable() && RHS.isScalable())
350 return true;
351 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
352 }
353};
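// For illustration: under this ordering every fixed immediate sorts before
// every scalable one (e.g. getFixed(100) orders before getScalable(1)), so
// vscale-relative keys are never interleaved with fixed ones by value.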
354
355// This would be nicer if we could be generic instead of directly using size_t,
356// but there doesn't seem to be a type trait for is_orderable or
357// is_lessthan_comparable or similar.
358struct KeyOrderSizeTAndImmediate {
359 bool operator()(const std::pair<size_t, Immediate> &LHS,
360 const std::pair<size_t, Immediate> &RHS) const {
361 size_t LSize = LHS.first;
362 size_t RSize = RHS.first;
363 if (LSize != RSize)
364 return LSize < RSize;
365 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
366 }
367};
368} // end anonymous namespace
369
370#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
371void RegSortData::print(raw_ostream &OS) const {
372 OS << "[NumUses=" << UsedByIndices.count() << ']';
373}
374
375LLVM_DUMP_METHOD void RegSortData::dump() const {
376 print(errs()); errs() << '\n';
377}
378#endif
379
380namespace {
381
382/// Map register candidates to information about how they are used.
383class RegUseTracker {
384 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
385
386 RegUsesTy RegUsesMap;
387 SmallVector<const SCEV *, 16> RegSequence;
388
389public:
390 void countRegister(const SCEV *Reg, size_t LUIdx);
391 void dropRegister(const SCEV *Reg, size_t LUIdx);
392 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
393
394 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
395
396 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
397
398 void clear();
399
400 using iterator = SmallVectorImpl<const SCEV *>::iterator;
401 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
402
403 iterator begin() { return RegSequence.begin(); }
404 iterator end() { return RegSequence.end(); }
405 const_iterator begin() const { return RegSequence.begin(); }
406 const_iterator end() const { return RegSequence.end(); }
407};
408
409} // end anonymous namespace
410
411void
412RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
413 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
414 RegSortData &RSD = Pair.first->second;
415 if (Pair.second)
416 RegSequence.push_back(Reg);
417 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
418 RSD.UsedByIndices.set(LUIdx);
419}
420
421void
422RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
423 RegUsesTy::iterator It = RegUsesMap.find(Reg);
424 assert(It != RegUsesMap.end());
425 RegSortData &RSD = It->second;
426 assert(RSD.UsedByIndices.size() > LUIdx);
427 RSD.UsedByIndices.reset(LUIdx);
428}
429
430void
431RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
432 assert(LUIdx <= LastLUIdx);
433
434 // Update RegUses. The data structure is not optimized for this purpose;
435 // we must iterate through it and update each of the bit vectors.
436 for (auto &Pair : RegUsesMap) {
437 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
438 if (LUIdx < UsedByIndices.size())
439 UsedByIndices[LUIdx] =
440 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
441 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
442 }
443}
444
445bool
446RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
447 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
448 if (I == RegUsesMap.end())
449 return false;
450 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
451 int i = UsedByIndices.find_first();
452 if (i == -1) return false;
453 if ((size_t)i != LUIdx) return true;
454 return UsedByIndices.find_next(i) != -1;
455}
456
457const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
458 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
459 assert(I != RegUsesMap.end() && "Unknown register!");
460 return I->second.UsedByIndices;
461}
462
463void RegUseTracker::clear() {
464 RegUsesMap.clear();
465 RegSequence.clear();
466}
467
468namespace {
469
470/// This class holds information that describes a formula for computing a value
471/// satisfying a use. It may include broken-out immediates and scaled registers.
472struct Formula {
473 /// Global base address used for complex addressing.
474 GlobalValue *BaseGV = nullptr;
475
476 /// Base offset for complex addressing.
477 Immediate BaseOffset = Immediate::getZero();
478
479 /// Whether any complex addressing has a base register.
480 bool HasBaseReg = false;
481
482 /// The scale of any complex addressing.
483 int64_t Scale = 0;
484
485 /// The list of "base" registers for this use. When this is non-empty, the
486 /// canonical representation of a formula is
487 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
488 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
489 /// 3. The reg containing the recurrent expr related to the current loop in
490 /// the formula should be put in the ScaledReg.
491 /// #1 enforces that the scaled register is always used when at least two
492 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
493 /// #2 enforces that 1 * reg is reg.
494 /// #3 ensures invariant regs with respect to current loop can be combined
495 /// together in LSR codegen.
496 /// This invariant can be temporarily broken while building a formula.
497 /// However, every formula inserted into the LSRInstance must be in canonical
498 /// form.
499 SmallVector<const SCEV *, 4> BaseRegs;
500
501 /// The 'scaled' register for this use. This should be non-null when Scale is
502 /// not zero.
503 const SCEV *ScaledReg = nullptr;
504
505 /// An additional constant offset which is added near the use. This requires a
506 /// temporary register, but the offset itself can live in an add immediate
507 /// field rather than a register.
508 Immediate UnfoldedOffset = Immediate::getZero();
509
510 Formula() = default;
511
512 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
513
514 bool isCanonical(const Loop &L) const;
515
516 void canonicalize(const Loop &L);
517
518 bool unscale();
519
520 bool hasZeroEnd() const;
521
522 bool countsDownToZero() const;
523
524 size_t getNumRegs() const;
525 Type *getType() const;
526
527 void deleteBaseReg(const SCEV *&S);
528
529 bool referencesReg(const SCEV *S) const;
530 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
531 const RegUseTracker &RegUses) const;
532
533 void print(raw_ostream &OS) const;
534 void dump() const;
535};
536
537} // end anonymous namespace
538
539/// Recursion helper for initialMatch.
540static void DoInitialMatch(const SCEV *S, Loop *L,
541 SmallVectorImpl<const SCEV *> &Good,
542 SmallVectorImpl<const SCEV *> &Bad,
543 ScalarEvolution &SE) {
544 // Collect expressions which properly dominate the loop header.
545 if (SE.properlyDominates(S, L->getHeader())) {
546 Good.push_back(S);
547 return;
548 }
549
550 // Look at add operands.
551 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
552 for (const SCEV *S : Add->operands())
553 DoInitialMatch(S, L, Good, Bad, SE);
554 return;
555 }
556
557 // Look at addrec operands.
558 const SCEV *Start, *Step;
559 const Loop *ARLoop;
560 if (match(S,
561 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
562 !Start->isZero()) {
563 DoInitialMatch(Start, L, Good, Bad, SE);
564 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
565 // FIXME: AR->getNoWrapFlags()
566 ARLoop, SCEV::FlagAnyWrap),
567 L, Good, Bad, SE);
568 return;
569 }
570
571 // Handle a multiplication by -1 (negation) if it didn't fold.
572 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
573 if (Mul->getOperand(0)->isAllOnesValue()) {
574 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
575 const SCEV *NewMul = SE.getMulExpr(Ops);
576
577 SmallVector<const SCEV *, 4> MyGood;
578 SmallVector<const SCEV *, 4> MyBad;
579 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
580 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
581 SE.getEffectiveSCEVType(NewMul->getType())));
582 for (const SCEV *S : MyGood)
583 Good.push_back(SE.getMulExpr(NegOne, S));
584 for (const SCEV *S : MyBad)
585 Bad.push_back(SE.getMulExpr(NegOne, S));
586 return;
587 }
588
589 // Ok, we can't do anything interesting. Just stuff the whole thing into a
590 // register and hope for the best.
591 Bad.push_back(S);
592}
593
594/// Incorporate loop-variant parts of S into this Formula, attempting to keep
595/// all loop-invariant and loop-computable values in a single base register.
596void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
597 SmallVector<const SCEV *, 4> Good;
598 SmallVector<const SCEV *, 4> Bad;
599 DoInitialMatch(S, L, Good, Bad, SE);
600 if (!Good.empty()) {
601 const SCEV *Sum = SE.getAddExpr(Good);
602 if (!Sum->isZero())
603 BaseRegs.push_back(Sum);
604 HasBaseReg = true;
605 }
606 if (!Bad.empty()) {
607 const SCEV *Sum = SE.getAddExpr(Bad);
608 if (!Sum->isZero())
609 BaseRegs.push_back(Sum);
610 HasBaseReg = true;
611 }
612 canonicalize(*L);
613}
614
615static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
616 return SCEVExprContains(S, [&L](const SCEV *S) {
617 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
618 });
619}
620
621/// Check whether or not this formula satisfies the canonical
622/// representation.
623/// \see Formula::BaseRegs.
624bool Formula::isCanonical(const Loop &L) const {
625 assert((Scale == 0 || ScaledReg) &&
626 "ScaledReg must be non-null if Scale is non-zero");
627
628 if (!ScaledReg)
629 return BaseRegs.size() <= 1;
630
631 if (Scale != 1)
632 return true;
633
634 if (Scale == 1 && BaseRegs.empty())
635 return false;
636
637 if (containsAddRecDependentOnLoop(ScaledReg, L))
638 return true;
639
640 // If ScaledReg is not a recurrent expr, or it is one but its loop is not the
641 // current loop, while BaseRegs contains a recurrent expr register related to
642 // the current loop, we want to swap that register in BaseRegs with ScaledReg.
643 return none_of(BaseRegs, [&L](const SCEV *S) {
644 return containsAddRecDependentOnLoop(S, L);
645 });
646}
647
648/// Helper method to morph a formula into its canonical representation.
649/// \see Formula::BaseRegs.
650/// Every formula having more than one base register must use the ScaledReg
651/// field. Otherwise, we would have to do special cases everywhere in LSR
652/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
653/// On the other hand, 1*reg should be canonicalized into reg.
654void Formula::canonicalize(const Loop &L) {
655 if (isCanonical(L))
656 return;
657
658 if (BaseRegs.empty()) {
659 // No base reg? Use scale reg with scale = 1 as such.
660 assert(ScaledReg && "Expected 1*reg => reg");
661 assert(Scale == 1 && "Expected 1*reg => reg");
662 BaseRegs.push_back(ScaledReg);
663 Scale = 0;
664 ScaledReg = nullptr;
665 return;
666 }
667
668 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
669 if (!ScaledReg) {
670 ScaledReg = BaseRegs.pop_back_val();
671 Scale = 1;
672 }
673
674 // If ScaledReg is an invariant with respect to L, find the reg from
675 // BaseRegs containing the recurrent expr related with Loop L. Swap the
676 // reg with ScaledReg.
677 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
678 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
679 return containsAddRecDependentOnLoop(S, L);
680 });
681 if (I != BaseRegs.end())
682 std::swap(ScaledReg, *I);
683 }
684 assert(isCanonical(L) && "Failed to canonicalize?");
685}
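// Worked example (sketch, not from the original source): starting from the
// non-canonical formula
//   BaseRegs = { {0,+,1}<%L>, %invariant }, ScaledReg = null
// canonicalize() first pops %invariant into ScaledReg with Scale = 1, then
// notices it is invariant with respect to L and swaps it with the addrec from
// BaseRegs, ending at
//   BaseRegs = { %invariant }, ScaledReg = {0,+,1}<%L>, Scale = 1.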
686
687/// Get rid of the scale in the formula.
688/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
689/// \return true if it was possible to get rid of the scale, false otherwise.
690/// \note After this operation the formula may not be in the canonical form.
691bool Formula::unscale() {
692 if (Scale != 1)
693 return false;
694 Scale = 0;
695 BaseRegs.push_back(ScaledReg);
696 ScaledReg = nullptr;
697 return true;
698}
699
700bool Formula::hasZeroEnd() const {
701 if (UnfoldedOffset || BaseOffset)
702 return false;
703 if (BaseRegs.size() != 1 || ScaledReg)
704 return false;
705 return true;
706}
707
708bool Formula::countsDownToZero() const {
709 if (!hasZeroEnd())
710 return false;
711 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
712 const APInt *StepInt;
713 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
714 return false;
715 return StepInt->isNegative();
716}
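// For example (illustrative), a formula whose only base register is the addrec
// {%n,+,-1}<%L>, with no offsets and no scaled register, counts down to zero:
// it has a single base register and a negative constant step.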
717
718/// Return the total number of register operands used by this formula. This does
719/// not include register uses implied by non-constant addrec strides.
720size_t Formula::getNumRegs() const {
721 return !!ScaledReg + BaseRegs.size();
722}
723
724/// Return the type of this formula, if it has one, or null otherwise. This type
725/// is meaningless except for the bit size.
726Type *Formula::getType() const {
727 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
728 ScaledReg ? ScaledReg->getType() :
729 BaseGV ? BaseGV->getType() :
730 nullptr;
731}
732
733/// Delete the given base reg from the BaseRegs list.
734void Formula::deleteBaseReg(const SCEV *&S) {
735 if (&S != &BaseRegs.back())
736 std::swap(S, BaseRegs.back());
737 BaseRegs.pop_back();
738}
739
740/// Test if this formula references the given register.
741bool Formula::referencesReg(const SCEV *S) const {
742 return S == ScaledReg || is_contained(BaseRegs, S);
743}
744
745/// Test whether this formula uses registers which are used by uses other than
746/// the use with the given index.
747bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
748 const RegUseTracker &RegUses) const {
749 if (ScaledReg)
750 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
751 return true;
752 for (const SCEV *BaseReg : BaseRegs)
753 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
754 return true;
755 return false;
756}
757
758#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
759void Formula::print(raw_ostream &OS) const {
760 bool First = true;
761 if (BaseGV) {
762 if (!First) OS << " + "; else First = false;
763 BaseGV->printAsOperand(OS, /*PrintType=*/false);
764 }
765 if (BaseOffset.isNonZero()) {
766 if (!First) OS << " + "; else First = false;
767 OS << BaseOffset;
768 }
769 for (const SCEV *BaseReg : BaseRegs) {
770 if (!First) OS << " + "; else First = false;
771 OS << "reg(" << *BaseReg << ')';
772 }
773 if (HasBaseReg && BaseRegs.empty()) {
774 if (!First) OS << " + "; else First = false;
775 OS << "**error: HasBaseReg**";
776 } else if (!HasBaseReg && !BaseRegs.empty()) {
777 if (!First) OS << " + "; else First = false;
778 OS << "**error: !HasBaseReg**";
779 }
780 if (Scale != 0) {
781 if (!First) OS << " + "; else First = false;
782 OS << Scale << "*reg(";
783 if (ScaledReg)
784 OS << *ScaledReg;
785 else
786 OS << "<unknown>";
787 OS << ')';
788 }
789 if (UnfoldedOffset.isNonZero()) {
790 if (!First) OS << " + ";
791 OS << "imm(" << UnfoldedOffset << ')';
792 }
793}
794
795LLVM_DUMP_METHOD void Formula::dump() const {
796 print(errs()); errs() << '\n';
797}
798#endif
799
800/// Return true if the given addrec can be sign-extended without changing its
801/// value.
802static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
803 Type *WideTy =
804 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
805 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
806}
807
808/// Return true if the given add can be sign-extended without changing its
809/// value.
810static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
811 Type *WideTy =
812 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
813 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
814}
815
816/// Return true if the given mul can be sign-extended without changing its
817/// value.
818static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
819 Type *WideTy =
820 IntegerType::get(SE.getContext(),
821 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
822 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
823}
824
825/// Return an expression for LHS /s RHS, if it can be determined and if the
826/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
827/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
828/// the multiplication may overflow, which is useful when the result will be
829/// used in a context where the most significant bits are ignored.
830static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
831 ScalarEvolution &SE,
832 bool IgnoreSignificantBits = false) {
833 // Handle the trivial case, which works for any SCEV type.
834 if (LHS == RHS)
835 return SE.getConstant(LHS->getType(), 1);
836
837 // Handle a few RHS special cases.
838 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
839 if (RC) {
840 const APInt &RA = RC->getAPInt();
841 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
842 // some folding.
843 if (RA.isAllOnes()) {
844 if (LHS->getType()->isPointerTy())
845 return nullptr;
846 return SE.getMulExpr(LHS, RC);
847 }
848 // Handle x /s 1 as x.
849 if (RA == 1)
850 return LHS;
851 }
852
853 // Check for a division of a constant by a constant.
854 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
855 if (!RC)
856 return nullptr;
857 const APInt &LA = C->getAPInt();
858 const APInt &RA = RC->getAPInt();
859 if (LA.srem(RA) != 0)
860 return nullptr;
861 return SE.getConstant(LA.sdiv(RA));
862 }
863
864 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
865 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
866 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
867 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
868 IgnoreSignificantBits);
869 if (!Step) return nullptr;
870 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
871 IgnoreSignificantBits);
872 if (!Start) return nullptr;
873 // FlagNW is independent of the start value, step direction, and is
874 // preserved with smaller magnitude steps.
875 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
876 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
877 }
878 return nullptr;
879 }
880
881 // Distribute the sdiv over add operands, if the add doesn't overflow.
882 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
883 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
884 SmallVector<const SCEV *, 4> Ops;
885 for (const SCEV *S : Add->operands()) {
886 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
887 if (!Op) return nullptr;
888 Ops.push_back(Op);
889 }
890 return SE.getAddExpr(Ops);
891 }
892 return nullptr;
893 }
894
895 // Check for a multiply operand that we can pull RHS out of.
896 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
897 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
898 // Handle special case C1*X*Y /s C2*X*Y.
899 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
900 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
901 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
902 const SCEVConstant *RC =
903 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
904 if (LC && RC) {
905 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
906 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
907 if (LOps == ROps)
908 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
909 }
910 }
911 }
912
913 SmallVector<const SCEV *, 4> Ops;
914 bool Found = false;
915 for (const SCEV *S : Mul->operands()) {
916 if (!Found)
917 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
918 IgnoreSignificantBits)) {
919 S = Q;
920 Found = true;
921 }
922 Ops.push_back(S);
923 }
924 return Found ? SE.getMulExpr(Ops) : nullptr;
925 }
926 return nullptr;
927 }
928
929 // Otherwise we don't know.
930 return nullptr;
931}
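// Illustrative examples (not exhaustive): getExactSDiv((8 * %x), 4, SE) can
// yield (2 * %x), and getExactSDiv({8,+,4}<%L>, 4, SE) can yield {2,+,1}<%L>,
// while getExactSDiv(7, 2, SE) returns null because the remainder is non-zero.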
932
933/// If S involves the addition of a constant integer value, return that integer
934/// value, and mutate S to point to a new SCEV with that value excluded.
935static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
936 const APInt *C;
937 if (match(S, m_scev_APInt(C))) {
938 if (C->getSignificantBits() <= 64) {
939 S = SE.getConstant(S->getType(), 0);
940 return Immediate::getFixed(C->getSExtValue());
941 }
942 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
943 SmallVector<const SCEV *, 8> NewOps(Add->operands());
944 Immediate Result = ExtractImmediate(NewOps.front(), SE);
945 if (Result.isNonZero())
946 S = SE.getAddExpr(NewOps);
947 return Result;
948 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
949 SmallVector<const SCEV *, 8> NewOps(AR->operands());
950 Immediate Result = ExtractImmediate(NewOps.front(), SE);
951 if (Result.isNonZero())
952 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
953 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
954 SCEV::FlagAnyWrap);
955 return Result;
956 } else if (EnableVScaleImmediates &&
957 match(S, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale()))) {
958 S = SE.getConstant(S->getType(), 0);
959 return Immediate::getScalable(C->getSExtValue());
960 }
961 return Immediate::getZero();
962}
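// For example (sketch): given S = {42,+,1}<%L>, ExtractImmediate returns
// Immediate::getFixed(42) and rewrites S to {0,+,1}<%L>; for a SCEV with no
// extractable constant addend it returns Immediate::getZero() and leaves S
// unchanged.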
963
964/// If S involves the addition of a GlobalValue address, return that symbol, and
965/// mutate S to point to a new SCEV with that value excluded.
966static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
967 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
968 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
969 S = SE.getConstant(GV->getType(), 0);
970 return GV;
971 }
972 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
973 SmallVector<const SCEV *, 8> NewOps(Add->operands());
974 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
975 if (Result)
976 S = SE.getAddExpr(NewOps);
977 return Result;
978 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
979 SmallVector<const SCEV *, 8> NewOps(AR->operands());
980 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
981 if (Result)
982 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
983 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
984 SCEV::FlagAnyWrap);
985 return Result;
986 }
987 return nullptr;
988}
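// Similarly (illustrative), when the last add operand of S is a global such as
// @g, ExtractSymbol returns @g and rewrites S so that the global's contribution
// becomes zero, allowing the symbol to be folded into the addressing mode as
// BaseGV.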
989
990/// Returns true if the specified instruction is using the specified value as an
991/// address.
992static bool isAddressUse(const TargetTransformInfo &TTI,
993 Instruction *Inst, Value *OperandVal) {
994 bool isAddress = isa<LoadInst>(Inst);
995 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
996 if (SI->getPointerOperand() == OperandVal)
997 isAddress = true;
998 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
999 // Addressing modes can also be folded into prefetches and a variety
1000 // of intrinsics.
1001 switch (II->getIntrinsicID()) {
1002 case Intrinsic::memset:
1003 case Intrinsic::prefetch:
1004 case Intrinsic::masked_load:
1005 if (II->getArgOperand(0) == OperandVal)
1006 isAddress = true;
1007 break;
1008 case Intrinsic::masked_store:
1009 if (II->getArgOperand(1) == OperandVal)
1010 isAddress = true;
1011 break;
1012 case Intrinsic::memmove:
1013 case Intrinsic::memcpy:
1014 if (II->getArgOperand(0) == OperandVal ||
1015 II->getArgOperand(1) == OperandVal)
1016 isAddress = true;
1017 break;
1018 default: {
1019 MemIntrinsicInfo IntrInfo;
1020 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1021 if (IntrInfo.PtrVal == OperandVal)
1022 isAddress = true;
1023 }
1024 }
1025 }
1026 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1027 if (RMW->getPointerOperand() == OperandVal)
1028 isAddress = true;
1029 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1030 if (CmpX->getPointerOperand() == OperandVal)
1031 isAddress = true;
1032 }
1033 return isAddress;
1034}
1035
1036/// Return the type of the memory being accessed.
1037static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1038 Instruction *Inst, Value *OperandVal) {
1039 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1040
1041 // First get the type of memory being accessed.
1042 if (Type *Ty = Inst->getAccessType())
1043 AccessTy.MemTy = Ty;
1044
1045 // Then get the pointer address space.
1046 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1047 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1048 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1049 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1050 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1051 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1052 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1053 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1054 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1055 switch (II->getIntrinsicID()) {
1056 case Intrinsic::prefetch:
1057 case Intrinsic::memset:
1058 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1059 AccessTy.MemTy = OperandVal->getType();
1060 break;
1061 case Intrinsic::memmove:
1062 case Intrinsic::memcpy:
1063 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1064 AccessTy.MemTy = OperandVal->getType();
1065 break;
1066 case Intrinsic::masked_load:
1067 AccessTy.AddrSpace =
1068 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1069 break;
1070 case Intrinsic::masked_store:
1071 AccessTy.AddrSpace =
1072 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1073 break;
1074 default: {
1075 MemIntrinsicInfo IntrInfo;
1076 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1077 AccessTy.AddrSpace
1078 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1079 }
1080
1081 break;
1082 }
1083 }
1084 }
1085
1086 return AccessTy;
1087}
1088
1089/// Return true if this AddRec is already a phi in its loop.
1090static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1091 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1092 if (SE.isSCEVable(PN.getType()) &&
1093 (SE.getEffectiveSCEVType(PN.getType()) ==
1094 SE.getEffectiveSCEVType(AR->getType())) &&
1095 SE.getSCEV(&PN) == AR)
1096 return true;
1097 }
1098 return false;
1099}
1100
1101/// Check if expanding this expression is likely to incur significant cost. This
1102/// is tricky because SCEV doesn't track which expressions are actually computed
1103/// by the current IR.
1104///
1105/// We currently allow expansion of IV increments that involve adds,
1106/// multiplication by constants, and AddRecs from existing phis.
1107///
1108/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1109/// obvious multiple of the UDivExpr.
1110static bool isHighCostExpansion(const SCEV *S,
1111 SmallPtrSetImpl<const SCEV *> &Processed,
1112 ScalarEvolution &SE) {
1113 // Zero/One operand expressions
1114 switch (S->getSCEVType()) {
1115 case scUnknown:
1116 case scConstant:
1117 case scVScale:
1118 return false;
1119 case scTruncate:
1120 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1121 Processed, SE);
1122 case scZeroExtend:
1123 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1124 Processed, SE);
1125 case scSignExtend:
1126 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1127 Processed, SE);
1128 default:
1129 break;
1130 }
1131
1132 if (!Processed.insert(S).second)
1133 return false;
1134
1135 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1136 for (const SCEV *S : Add->operands()) {
1137 if (isHighCostExpansion(S, Processed, SE))
1138 return true;
1139 }
1140 return false;
1141 }
1142
1143 const SCEV *Op0, *Op1;
1144 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1145 // Multiplication by a constant is ok
1146 if (isa<SCEVConstant>(Op0))
1147 return isHighCostExpansion(Op1, Processed, SE);
1148
1149 // If we have the value of one operand, check if an existing
1150 // multiplication already generates this expression.
1151 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1152 Value *UVal = U->getValue();
1153 for (User *UR : UVal->users()) {
1154 // If U is a constant, it may be used by a ConstantExpr.
1155 const Instruction *UI = dyn_cast<Instruction>(UR);
1156 if (UI && UI->getOpcode() == Instruction::Mul &&
1157 SE.isSCEVable(UI->getType())) {
1158 return SE.getSCEV(UI) == S;
1159 }
1160 }
1161 }
1162 }
1163
1164 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1165 if (isExistingPhi(AR, SE))
1166 return false;
1167 }
1168
1169 // For now, consider any other type of expression (div/mul/min/max) high cost.
1170 return true;
1171}
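// For instance (sketch): {0,+,4}<%L> is cheap when a phi for it already exists,
// (4 * %x) is cheap because the multiplier is a constant, but a UDivExpr is
// currently always treated as a high-cost expansion.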
1172
1173namespace {
1174
1175class LSRUse;
1176
1177} // end anonymous namespace
1178
1179/// Check if the addressing mode defined by \p F is completely
1180/// folded in \p LU at isel time.
1181/// This includes address-mode folding and special icmp tricks.
1182/// This function returns true if \p LU can accommodate what \p F
1183/// defines and up to 1 base + 1 scaled + offset.
1184/// In other words, if \p F has several base registers, this function may
1185/// still return true. Therefore, users still need to account for
1186/// additional base registers and/or unfolded offsets to derive an
1187/// accurate cost model.
1188static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1189 const LSRUse &LU, const Formula &F);
1190
1191// Get the cost of the scaling factor used in F for LU.
1192static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1193 const LSRUse &LU, const Formula &F,
1194 const Loop &L);
1195
1196namespace {
1197
1198/// This class is used to measure and compare candidate formulae.
1199class Cost {
1200 const Loop *L = nullptr;
1201 ScalarEvolution *SE = nullptr;
1202 const TargetTransformInfo *TTI = nullptr;
1203 TargetTransformInfo::LSRCost C;
1204 TTI::AddressingModeKind AMK;
1205
1206public:
1207 Cost() = delete;
1208 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1209 TTI::AddressingModeKind AMK) :
1210 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1211 C.Insns = 0;
1212 C.NumRegs = 0;
1213 C.AddRecCost = 0;
1214 C.NumIVMuls = 0;
1215 C.NumBaseAdds = 0;
1216 C.ImmCost = 0;
1217 C.SetupCost = 0;
1218 C.ScaleCost = 0;
1219 }
1220
1221 bool isLess(const Cost &Other) const;
1222
1223 void Lose();
1224
1225#ifndef NDEBUG
1226 // Once any of the metrics loses, they must all remain losers.
1227 bool isValid() {
1228 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1229 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1230 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1231 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1232 }
1233#endif
1234
1235 bool isLoser() {
1236 assert(isValid() && "invalid cost");
1237 return C.NumRegs == ~0u;
1238 }
1239
1240 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1241 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1242 bool HardwareLoopProfitable,
1243 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1244
1245 void print(raw_ostream &OS) const;
1246 void dump() const;
1247
1248private:
1249 void RateRegister(const Formula &F, const SCEV *Reg,
1250 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1251 bool HardwareLoopProfitable);
1252 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1253 SmallPtrSetImpl<const SCEV *> &Regs,
1254 const LSRUse &LU, bool HardwareLoopProfitable,
1255 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1256};
1257
1258/// An operand value in an instruction which is to be replaced with some
1259/// equivalent, possibly strength-reduced, replacement.
1260struct LSRFixup {
1261 /// The instruction which will be updated.
1262 Instruction *UserInst = nullptr;
1263
1264 /// The operand of the instruction which will be replaced. The operand may be
1265 /// used more than once; every instance will be replaced.
1266 Value *OperandValToReplace = nullptr;
1267
1268 /// If this user is to use the post-incremented value of an induction
1269 /// variable, this set is non-empty and holds the loops associated with the
1270 /// induction variable.
1271 PostIncLoopSet PostIncLoops;
1272
1273 /// A constant offset to be added to the LSRUse expression. This allows
1274 /// multiple fixups to share the same LSRUse with different offsets, for
1275 /// example in an unrolled loop.
1276 Immediate Offset = Immediate::getZero();
1277
1278 LSRFixup() = default;
1279
1280 bool isUseFullyOutsideLoop(const Loop *L) const;
1281
1282 void print(raw_ostream &OS) const;
1283 void dump() const;
1284};
1285
1286/// This class holds the state that LSR keeps for each use in IVUsers, as well
1287/// as uses invented by LSR itself. It includes information about what kinds of
1288/// things can be folded into the user, information about the user itself, and
1289/// information about how the use may be satisfied. TODO: Represent multiple
1290/// users of the same expression in common?
1291class LSRUse {
1292 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1293
1294public:
1295 /// An enum for a kind of use, indicating what types of scaled and immediate
1296 /// operands it might support.
1297 enum KindType {
1298 Basic, ///< A normal use, with no folding.
1299 Special, ///< A special case of basic, allowing -1 scales.
1300 Address, ///< An address use; folding according to TargetLowering
1301 ICmpZero ///< An equality icmp with both operands folded into one.
1302 // TODO: Add a generic icmp too?
1303 };
1304
1305 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1306
1307 KindType Kind;
1308 MemAccessTy AccessTy;
1309
1310 /// The list of operands which are to be replaced.
1311 SmallVector<LSRFixup, 8> Fixups;
1312
1313 /// Keep track of the min and max offsets of the fixups.
1314 Immediate MinOffset = Immediate::getFixedMax();
1315 Immediate MaxOffset = Immediate::getFixedMin();
1316
1317 /// This records whether all of the fixups using this LSRUse are outside of
1318 /// the loop, in which case some special-case heuristics may be used.
1319 bool AllFixupsOutsideLoop = true;
1320
1321 /// RigidFormula is set to true to guarantee that this use will be associated
1322 /// with a single formula--the one that initially matched. Some SCEV
1323 /// expressions cannot be expanded. This allows LSR to consider the registers
1324 /// used by those expressions without the need to expand them later after
1325 /// changing the formula.
1326 bool RigidFormula = false;
1327
1328 /// This records the widest use type for any fixup using this
1329 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1330 /// fixup widths to be equivalent, because the narrower one may be relying on
1331 /// the implicit truncation to truncate away bogus bits.
1332 Type *WidestFixupType = nullptr;
1333
1334 /// A list of ways to build a value that can satisfy this user. After the
1335 /// list is populated, one of these is selected heuristically and used to
1336 /// formulate a replacement for OperandValToReplace in UserInst.
1337 SmallVector<Formula, 12> Formulae;
1338
1339 /// The set of register candidates used by all formulae in this LSRUse.
1340 SmallPtrSet<const SCEV *, 4> Regs;
1341
1342 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1343
1344 LSRFixup &getNewFixup() {
1345 Fixups.push_back(LSRFixup());
1346 return Fixups.back();
1347 }
1348
1349 void pushFixup(LSRFixup &f) {
1350 Fixups.push_back(f);
1351 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1352 MaxOffset = f.Offset;
1353 if (Immediate::isKnownLT(f.Offset, MinOffset))
1354 MinOffset = f.Offset;
1355 }
1356
1357 bool HasFormulaWithSameRegs(const Formula &F) const;
1358 float getNotSelectedProbability(const SCEV *Reg) const;
1359 bool InsertFormula(const Formula &F, const Loop &L);
1360 void DeleteFormula(Formula &F);
1361 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1362
1363 void print(raw_ostream &OS) const;
1364 void dump() const;
1365};
1366
1367} // end anonymous namespace
1368
1369static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1370 LSRUse::KindType Kind, MemAccessTy AccessTy,
1371 GlobalValue *BaseGV, Immediate BaseOffset,
1372 bool HasBaseReg, int64_t Scale,
1373 Instruction *Fixup = nullptr);
1374
1375static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1376 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1377 return 1;
1378 if (Depth == 0)
1379 return 0;
1380 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1381 return getSetupCost(S->getStart(), Depth - 1);
1382 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1383 return getSetupCost(S->getOperand(), Depth - 1);
1384 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1385 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1386 [&](unsigned i, const SCEV *Reg) {
1387 return i + getSetupCost(Reg, Depth - 1);
1388 });
1389 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1390 return getSetupCost(S->getLHS(), Depth - 1) +
1391 getSetupCost(S->getRHS(), Depth - 1);
1392 return 0;
1393}
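// Illustrative sketch: getSetupCost({(%a + %b),+,1}<%L>, Depth) recurses into
// the start value and charges 1 per leaf SCEVUnknown or constant, so the setup
// cost here would be 2 (%a and %b), subject to the recursion depth limit.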
1394
1395/// Tally up interesting quantities from the given register.
1396void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1397 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1398 bool HardwareLoopProfitable) {
1399 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1400 // If this is an addrec for another loop, it should be an invariant
1401 // with respect to L since L is the innermost loop (at least
1402 // for now LSR only handles innermost loops).
1403 if (AR->getLoop() != L) {
1404 // If the AddRec exists, consider its register free and leave it alone.
1405 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1406 return;
1407
1408 // It is bad to allow LSR for current loop to add induction variables
1409 // for its sibling loops.
1410 if (!AR->getLoop()->contains(L)) {
1411 Lose();
1412 return;
1413 }
1414
1415 // Otherwise, it will be an invariant with respect to Loop L.
1416 ++C.NumRegs;
1417 return;
1418 }
1419
1420 unsigned LoopCost = 1;
1421 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1422 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1423 const SCEV *Start;
1424 const SCEVConstant *Step;
1425 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
1426 // If the step size matches the base offset, we could use pre-indexed
1427 // addressing.
1428 if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
1429 Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
1430 ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
1431 SE->isLoopInvariant(Start, L)))
1432 LoopCost = 0;
1433 }
1434 // If the loop counts down to zero and we'll be using a hardware loop then
1435 // the addrec will be combined into the hardware loop instruction.
1436 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1437 HardwareLoopProfitable)
1438 LoopCost = 0;
1439 C.AddRecCost += LoopCost;
1440
1441 // Add the step value register, if it needs one.
1442 // TODO: The non-affine case isn't precisely modeled here.
1443 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1444 if (!Regs.count(AR->getOperand(1))) {
1445 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1446 if (isLoser())
1447 return;
1448 }
1449 }
1450 }
1451 ++C.NumRegs;
1452
1453 // Rough heuristic; favor registers which don't require extra setup
1454 // instructions in the preheader.
1455 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1456 // Ensure we don't, even with the recursion limit, produce invalid costs.
1457 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1458
1459 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1460 SE->hasComputableLoopEvolution(Reg, L);
1461}
1462
1463/// Record this register in the set. If we haven't seen it before, rate
1464/// it. Optional LoserRegs provides a way to declare any formula that refers to
1465/// one of those regs an instant loser.
1466void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1467 SmallPtrSetImpl<const SCEV *> &Regs,
1468 const LSRUse &LU, bool HardwareLoopProfitable,
1469 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1470 if (LoserRegs && LoserRegs->count(Reg)) {
1471 Lose();
1472 return;
1473 }
1474 if (Regs.insert(Reg).second) {
1475 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1476 if (LoserRegs && isLoser())
1477 LoserRegs->insert(Reg);
1478 }
1479}
1480
1481void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1482 const DenseSet<const SCEV *> &VisitedRegs,
1483 const LSRUse &LU, bool HardwareLoopProfitable,
1484 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1485 if (isLoser())
1486 return;
1487 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1488 // Tally up the registers.
1489 unsigned PrevAddRecCost = C.AddRecCost;
1490 unsigned PrevNumRegs = C.NumRegs;
1491 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1492 if (const SCEV *ScaledReg = F.ScaledReg) {
1493 if (VisitedRegs.count(ScaledReg)) {
1494 Lose();
1495 return;
1496 }
1497 RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1498 LoserRegs);
1499 if (isLoser())
1500 return;
1501 }
1502 for (const SCEV *BaseReg : F.BaseRegs) {
1503 if (VisitedRegs.count(BaseReg)) {
1504 Lose();
1505 return;
1506 }
1507 RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1508 LoserRegs);
1509 if (isLoser())
1510 return;
1511 }
1512
1513 // Determine how many (unfolded) adds we'll need inside the loop.
1514 size_t NumBaseParts = F.getNumRegs();
1515 if (NumBaseParts > 1)
1516 // Do not count the base and a possible second register if the target
1517 // can fold 2 registers.
1518 C.NumBaseAdds +=
1519 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1520 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1521
1522 // Accumulate non-free scaling amounts.
1523 C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1524
1525 // Tally up the non-zero immediates.
1526 for (const LSRFixup &Fixup : LU.Fixups) {
1527 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1528 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1529 if (F.BaseGV)
1530 C.ImmCost += 64; // Handle symbolic values conservatively.
1531 // TODO: This should probably be the pointer size.
1532 else if (Offset.isNonZero())
1533 C.ImmCost +=
1534 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1535
1536 // Check with target if this offset with this instruction is
1537 // specifically not supported.
1538 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1539 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1540 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1541 C.NumBaseAdds++;
1542 } else {
1543 // Incompatible immediate type; increase the cost to avoid using this formula.
1544 C.ImmCost += 2048;
1545 }
1546 }
1547
1548 // If we don't count instruction cost exit here.
1549 if (!InsnsCost) {
1550 assert(isValid() && "invalid cost");
1551 return;
1552 }
1553
1554 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as an
1555 // additional instruction (at least a fill).
1556 // TODO: Need distinguish register class?
1557 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1558 TTI->getRegisterClassForType(false, F.getType())) - 1;
1559 if (C.NumRegs > TTIRegNum) {
1560 // The cost already exceeded TTIRegNum, so only newly added registers can
1561 // add new instructions.
1562 if (PrevNumRegs > TTIRegNum)
1563 C.Insns += (C.NumRegs - PrevNumRegs);
1564 else
1565 C.Insns += (C.NumRegs - TTIRegNum);
1566 }
1567
1568 // If the ICmpZero formula does not end at 0, it cannot be replaced by just an
1569 // add or sub. We'll need to compare the final result of the AddRec.
1570 // That means we'll need an additional instruction. But if the target can
1571 // macro-fuse a compare with a branch, don't count this extra instruction.
1572 // For -10 + {0, +, 1}:
1573 // i = i + 1;
1574 // cmp i, 10
1575 //
1576 // For {-10, +, 1}:
1577 // i = i + 1;
1578 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1579 !TTI->canMacroFuseCmp())
1580 C.Insns++;
1581 // Each new AddRec adds 1 instruction to calculation.
1582 C.Insns += (C.AddRecCost - PrevAddRecCost);
1583
1584 // BaseAdds adds instructions for unfolded registers.
1585 if (LU.Kind != LSRUse::ICmpZero)
1586 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1587 assert(isValid() && "invalid cost");
1588}
1589
1590/// Set this cost to a losing value.
1591void Cost::Lose() {
1592 C.Insns = std::numeric_limits<unsigned>::max();
1593 C.NumRegs = std::numeric_limits<unsigned>::max();
1594 C.AddRecCost = std::numeric_limits<unsigned>::max();
1595 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1596 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1597 C.ImmCost = std::numeric_limits<unsigned>::max();
1598 C.SetupCost = std::numeric_limits<unsigned>::max();
1599 C.ScaleCost = std::numeric_limits<unsigned>::max();
1600}
1601
1602/// Choose the lower cost.
1603bool Cost::isLess(const Cost &Other) const {
1604 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1605 C.Insns != Other.C.Insns)
1606 return C.Insns < Other.C.Insns;
1607 return TTI->isLSRCostLess(C, Other.C);
1608}
1609
1610#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1611void Cost::print(raw_ostream &OS) const {
1612 if (InsnsCost)
1613 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1614 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1615 if (C.AddRecCost != 0)
1616 OS << ", with addrec cost " << C.AddRecCost;
1617 if (C.NumIVMuls != 0)
1618 OS << ", plus " << C.NumIVMuls << " IV mul"
1619 << (C.NumIVMuls == 1 ? "" : "s");
1620 if (C.NumBaseAdds != 0)
1621 OS << ", plus " << C.NumBaseAdds << " base add"
1622 << (C.NumBaseAdds == 1 ? "" : "s");
1623 if (C.ScaleCost != 0)
1624 OS << ", plus " << C.ScaleCost << " scale cost";
1625 if (C.ImmCost != 0)
1626 OS << ", plus " << C.ImmCost << " imm cost";
1627 if (C.SetupCost != 0)
1628 OS << ", plus " << C.SetupCost << " setup cost";
1629}
1630
1631LLVM_DUMP_METHOD void Cost::dump() const {
1632 print(errs()); errs() << '\n';
1633}
1634#endif
1635
1636/// Test whether this fixup always uses its value outside of the given loop.
1637bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1638 // PHI nodes use their value in their incoming blocks.
1639 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1640 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1641 if (PN->getIncomingValue(i) == OperandValToReplace &&
1642 L->contains(PN->getIncomingBlock(i)))
1643 return false;
1644 return true;
1645 }
1646
1647 return !L->contains(UserInst);
1648}
1649
1650#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1651void LSRFixup::print(raw_ostream &OS) const {
1652 OS << "UserInst=";
1653 // Store is common and interesting enough to be worth special-casing.
1654 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1655 OS << "store ";
1656 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1657 } else if (UserInst->getType()->isVoidTy())
1658 OS << UserInst->getOpcodeName();
1659 else
1660 UserInst->printAsOperand(OS, /*PrintType=*/false);
1661
1662 OS << ", OperandValToReplace=";
1663 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1664
1665 for (const Loop *PIL : PostIncLoops) {
1666 OS << ", PostIncLoop=";
1667 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1668 }
1669
1670 if (Offset.isNonZero())
1671 OS << ", Offset=" << Offset;
1672}
1673
1674LLVM_DUMP_METHOD void LSRFixup::dump() const {
1675 print(errs()); errs() << '\n';
1676}
1677#endif
1678
1679 /// Test whether this use has a formula with the same registers as the given
1680 /// formula.
1681 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1682 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1683 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1684 // Unstable sort by host order ok, because this is only used for uniquifying.
1685 llvm::sort(Key);
1686 return Uniquifier.count(Key);
1687}
1688
1689 /// Return the probability of selecting a formula that does not reference Reg.
1690float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1691 unsigned FNum = 0;
1692 for (const Formula &F : Formulae)
1693 if (F.referencesReg(Reg))
1694 FNum++;
1695 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1696}
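// A quick worked example with invented counts: if this use has 4 formulae and
// exactly 1 of them references Reg, the probability of selecting a formula
// without Reg is (4 - 1) / 4 = 0.75.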
1697
1698/// If the given formula has not yet been inserted, add it to the list, and
1699/// return true. Return false otherwise. The formula must be in canonical form.
1700bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1701 assert(F.isCanonical(L) && "Invalid canonical representation");
1702
1703 if (!Formulae.empty() && RigidFormula)
1704 return false;
1705
1706 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1707 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1708 // Unstable sort by host order ok, because this is only used for uniquifying.
1709 llvm::sort(Key);
1710
1711 if (!Uniquifier.insert(Key).second)
1712 return false;
1713
1714 // Using a register to hold the value of 0 is not profitable.
1715 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1716 "Zero allocated in a scaled register!");
1717#ifndef NDEBUG
1718 for (const SCEV *BaseReg : F.BaseRegs)
1719 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1720#endif
1721
1722 // Add the formula to the list.
1723 Formulae.push_back(F);
1724
1725 // Record registers now being used by this use.
1726 Regs.insert_range(F.BaseRegs);
1727 if (F.ScaledReg)
1728 Regs.insert(F.ScaledReg);
1729
1730 return true;
1731}
1732
1733/// Remove the given formula from this use's list.
1734void LSRUse::DeleteFormula(Formula &F) {
1735 if (&F != &Formulae.back())
1736 std::swap(F, Formulae.back());
1737 Formulae.pop_back();
1738}
1739
1740/// Recompute the Regs field, and update RegUses.
1741void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1742 // Now that we've filtered out some formulae, recompute the Regs set.
1743 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1744 Regs.clear();
1745 for (const Formula &F : Formulae) {
1746 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1747 Regs.insert_range(F.BaseRegs);
1748 }
1749
1750 // Update the RegTracker.
1751 for (const SCEV *S : OldRegs)
1752 if (!Regs.count(S))
1753 RegUses.dropRegister(S, LUIdx);
1754}
1755
1756#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1757void LSRUse::print(raw_ostream &OS) const {
1758 OS << "LSR Use: Kind=";
1759 switch (Kind) {
1760 case Basic: OS << "Basic"; break;
1761 case Special: OS << "Special"; break;
1762 case ICmpZero: OS << "ICmpZero"; break;
1763 case Address:
1764 OS << "Address of ";
1765 if (AccessTy.MemTy->isPointerTy())
1766 OS << "pointer"; // the full pointer type could be really verbose
1767 else {
1768 OS << *AccessTy.MemTy;
1769 }
1770
1771 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1772 }
1773
1774 OS << ", Offsets={";
1775 bool NeedComma = false;
1776 for (const LSRFixup &Fixup : Fixups) {
1777 if (NeedComma) OS << ',';
1778 OS << Fixup.Offset;
1779 NeedComma = true;
1780 }
1781 OS << '}';
1782
1783 if (AllFixupsOutsideLoop)
1784 OS << ", all-fixups-outside-loop";
1785
1786 if (WidestFixupType)
1787 OS << ", widest fixup type: " << *WidestFixupType;
1788}
1789
1790LLVM_DUMP_METHOD void LSRUse::dump() const {
1791 print(errs()); errs() << '\n';
1792}
1793#endif
1794
1795 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1796 LSRUse::KindType Kind, MemAccessTy AccessTy,
1797 GlobalValue *BaseGV, Immediate BaseOffset,
1798 bool HasBaseReg, int64_t Scale,
1799 Instruction *Fixup /* = nullptr */) {
1800 switch (Kind) {
1801 case LSRUse::Address: {
1802 int64_t FixedOffset =
1803 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1804 int64_t ScalableOffset =
1805 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1806 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1807 HasBaseReg, Scale, AccessTy.AddrSpace,
1808 Fixup, ScalableOffset);
1809 }
1810 case LSRUse::ICmpZero:
1811 // There's not even a target hook for querying whether it would be legal to
1812 // fold a GV into an ICmp.
1813 if (BaseGV)
1814 return false;
1815
1816 // ICmp only has two operands; don't allow more than two non-trivial parts.
1817 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1818 return false;
1819
1820 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1821 // putting the scaled register in the other operand of the icmp.
1822 if (Scale != 0 && Scale != -1)
1823 return false;
1824
1825 // If we have low-level target information, ask the target if it can fold an
1826 // integer immediate on an icmp.
1827 if (BaseOffset.isNonZero()) {
1828 // We don't have an interface to query whether the target supports
1829 // icmpzero against scalable quantities yet.
1830 if (BaseOffset.isScalable())
1831 return false;
1832
1833 // We have one of:
1834 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1835 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1836 // Offs is the ICmp immediate.
1837 if (Scale == 0)
1838 // The cast does the right thing with
1839 // std::numeric_limits<int64_t>::min().
1840 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1841 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1842 }
1843
1844 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1845 return true;
1846
1847 case LSRUse::Basic:
1848 // Only handle single-register values.
1849 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1850
1851 case LSRUse::Special:
1852 // Special case Basic to handle -1 scales.
1853 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1854 }
1855
1856 llvm_unreachable("Invalid LSRUse Kind!");
1857}
1858
1859 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1860 Immediate MinOffset, Immediate MaxOffset,
1861 LSRUse::KindType Kind, MemAccessTy AccessTy,
1862 GlobalValue *BaseGV, Immediate BaseOffset,
1863 bool HasBaseReg, int64_t Scale) {
1864 if (BaseOffset.isNonZero() &&
1865 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1866 BaseOffset.isScalable() != MaxOffset.isScalable()))
1867 return false;
1868 // Check for overflow.
1869 int64_t Base = BaseOffset.getKnownMinValue();
1870 int64_t Min = MinOffset.getKnownMinValue();
1871 int64_t Max = MaxOffset.getKnownMinValue();
1872 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1873 return false;
1874 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1875 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1876 return false;
1877 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1878
1879 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1880 HasBaseReg, Scale) &&
1881 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1882 HasBaseReg, Scale);
1883}
1884
1885 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1886 Immediate MinOffset, Immediate MaxOffset,
1887 LSRUse::KindType Kind, MemAccessTy AccessTy,
1888 const Formula &F, const Loop &L) {
1889 // For the purpose of isAMCompletelyFolded, either having a canonical formula
1890 // or a scale not equal to zero is correct.
1891 // Problems may arise from non-canonical formulae having a scale == 0.
1892 // Strictly speaking, it would be best to rely only on canonical formulae.
1893 // However, when we generate the scaled formulae, we first check that the
1894 // scaling factor is profitable before computing the actual ScaledReg, for
1895 // compile time's sake.
1896 assert((F.isCanonical(L) || F.Scale != 0));
1897 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1898 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1899}
1900
1901/// Test whether we know how to expand the current formula.
1902static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1903 Immediate MaxOffset, LSRUse::KindType Kind,
1904 MemAccessTy AccessTy, GlobalValue *BaseGV,
1905 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1906 // We know how to expand completely foldable formulae.
1907 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1908 BaseOffset, HasBaseReg, Scale) ||
1909 // Or formulae that use a base register produced by a sum of base
1910 // registers.
1911 (Scale == 1 &&
1912 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1913 BaseGV, BaseOffset, true, 0));
1914}
1915
1916static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1917 Immediate MaxOffset, LSRUse::KindType Kind,
1918 MemAccessTy AccessTy, const Formula &F) {
1919 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1920 F.BaseOffset, F.HasBaseReg, F.Scale);
1921}
1922
1923 static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1924 Immediate Offset) {
1925 if (Offset.isScalable())
1926 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1927
1928 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1929}
1930
1931 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1932 const LSRUse &LU, const Formula &F) {
1933 // Target may want to look at the user instructions.
1934 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1935 for (const LSRFixup &Fixup : LU.Fixups)
1936 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1937 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1938 F.Scale, Fixup.UserInst))
1939 return false;
1940 return true;
1941 }
1942
1943 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1944 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1945 F.Scale);
1946}
1947
1948 static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1949 const LSRUse &LU, const Formula &F,
1950 const Loop &L) {
1951 if (!F.Scale)
1952 return 0;
1953
1954 // If the use is not completely folded in that instruction, we will have to
1955 // pay an extra cost only for scale != 1.
1956 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1957 LU.AccessTy, F, L))
1958 return F.Scale != 1;
1959
1960 switch (LU.Kind) {
1961 case LSRUse::Address: {
1962 // Check the scaling factor cost with both the min and max offsets.
1963 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1964 if (F.BaseOffset.isScalable()) {
1965 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1966 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1967 } else {
1968 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1969 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1970 }
1971 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1972 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1973 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1974 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1975 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1976 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1977
1978 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1979 "Legal addressing mode has an illegal cost!");
1980 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1981 }
1982 case LSRUse::ICmpZero:
1983 case LSRUse::Basic:
1984 case LSRUse::Special:
1985 // The use is completely folded, i.e., everything is folded into the
1986 // instruction.
1987 return 0;
1988 }
1989
1990 llvm_unreachable("Invalid LSRUse Kind!");
1991}
1992
1993 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1994 LSRUse::KindType Kind, MemAccessTy AccessTy,
1995 GlobalValue *BaseGV, Immediate BaseOffset,
1996 bool HasBaseReg) {
1997 // Fast-path: zero is always foldable.
1998 if (BaseOffset.isZero() && !BaseGV)
1999 return true;
2000
2001 // Conservatively, create an address with an immediate and a
2002 // base and a scale.
2003 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2004
2005 // Canonicalize a scale of 1 to a base register if the formula doesn't
2006 // already have a base register.
2007 if (!HasBaseReg && Scale == 1) {
2008 Scale = 0;
2009 HasBaseReg = true;
2010 }
2011
2012 // FIXME: Try with + without a scale? Maybe based on TTI?
2013 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2014 // default for many architectures, not just AArch64 SVE. More investigation
2015 // needed later to determine if this should be used more widely than just
2016 // on scalable types.
2017 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2018 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2019 Scale = 0;
2020
2021 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2022 HasBaseReg, Scale);
2023}
2024
2025 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2026 ScalarEvolution &SE, Immediate MinOffset,
2027 Immediate MaxOffset, LSRUse::KindType Kind,
2028 MemAccessTy AccessTy, const SCEV *S,
2029 bool HasBaseReg) {
2030 // Fast-path: zero is always foldable.
2031 if (S->isZero()) return true;
2032
2033 // Conservatively, create an address with an immediate and a
2034 // base and a scale.
2035 Immediate BaseOffset = ExtractImmediate(S, SE);
2036 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2037
2038 // If there's anything else involved, it's not foldable.
2039 if (!S->isZero()) return false;
2040
2041 // Fast-path: zero is always foldable.
2042 if (BaseOffset.isZero() && !BaseGV)
2043 return true;
2044
2045 if (BaseOffset.isScalable())
2046 return false;
2047
2048 // Conservatively, create an address with an immediate and a
2049 // base and a scale.
2050 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2051
2052 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2053 BaseOffset, HasBaseReg, Scale);
2054}
2055
2056namespace {
2057
2058/// An individual increment in a Chain of IV increments. Relate an IV user to
2059/// an expression that computes the IV it uses from the IV used by the previous
2060/// link in the Chain.
2061///
2062/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2063/// original IVOperand. The head of the chain's IVOperand is only valid during
2064/// chain collection, before LSR replaces IV users. During chain generation,
2065/// IncExpr can be used to find the new IVOperand that computes the same
2066/// expression.
2067struct IVInc {
2068 Instruction *UserInst;
2069 Value* IVOperand;
2070 const SCEV *IncExpr;
2071
2072 IVInc(Instruction *U, Value *O, const SCEV *E)
2073 : UserInst(U), IVOperand(O), IncExpr(E) {}
2074};
2075
2076// The list of IV increments in program order. We typically add the head of a
2077// chain without finding subsequent links.
2078 struct IVChain {
2079 SmallVector<IVInc, 1> Incs;
2080 const SCEV *ExprBase = nullptr;
2081
2082 IVChain() = default;
2083 IVChain(const IVInc &Head, const SCEV *Base)
2084 : Incs(1, Head), ExprBase(Base) {}
2085
2086 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2087
2088 // Return the first increment in the chain.
2089 const_iterator begin() const {
2090 assert(!Incs.empty());
2091 return std::next(Incs.begin());
2092 }
2093 const_iterator end() const {
2094 return Incs.end();
2095 }
2096
2097 // Returns true if this chain contains any increments.
2098 bool hasIncs() const { return Incs.size() >= 2; }
2099
2100 // Add an IVInc to the end of this chain.
2101 void add(const IVInc &X) { Incs.push_back(X); }
2102
2103 // Returns the last UserInst in the chain.
2104 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2105
2106 // Returns true if IncExpr can be profitably added to this chain.
2107 bool isProfitableIncrement(const SCEV *OperExpr,
2108 const SCEV *IncExpr,
2109 ScalarEvolution&);
2110};
2111
2112/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2113/// between FarUsers that definitely cross IV increments and NearUsers that may
2114/// be used between IV increments.
2115struct ChainUsers {
2116 SmallPtrSet<Instruction*, 4> FarUsers;
2117 SmallPtrSet<Instruction*, 4> NearUsers;
2118};
2119
2120/// This class holds state for the main loop strength reduction logic.
2121class LSRInstance {
2122 IVUsers &IU;
2123 ScalarEvolution &SE;
2124 DominatorTree &DT;
2125 LoopInfo &LI;
2126 AssumptionCache &AC;
2127 TargetLibraryInfo &TLI;
2128 const TargetTransformInfo &TTI;
2129 Loop *const L;
2130 MemorySSAUpdater *MSSAU;
2132 mutable SCEVExpander Rewriter;
2133 bool Changed = false;
2134 bool HardwareLoopProfitable = false;
2135
2136 /// This is the insert position that the current loop's induction variable
2137 /// increment should be placed. In simple loops, this is the latch block's
2138 /// terminator. But in more complicated cases, this is a position which will
2139 /// dominate all the in-loop post-increment users.
2140 Instruction *IVIncInsertPos = nullptr;
2141
2142 /// Interesting factors between use strides.
2143 ///
2144 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2145 /// default, a SmallDenseSet, because we need to use the full range of
2146 /// int64_ts, and there's currently no good way of doing that with
2147 /// SmallDenseSet.
2148 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2149
2150 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2151 /// the solution is not profitable.
2152 Cost BaselineCost;
2153
2154 /// Interesting use types, to facilitate truncation reuse.
2155 SmallSetVector<Type *, 4> Types;
2156
2157 /// The list of interesting uses.
2158 SmallVector<LSRUse, 16> Uses;
2159
2160 /// Track which uses use which register candidates.
2161 RegUseTracker RegUses;
2162
2163 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2164 // have more than a few IV increment chains in a loop. Missing a Chain falls
2165 // back to normal LSR behavior for those uses.
2166 static const unsigned MaxChains = 8;
2167
2168 /// IV users can form a chain of IV increments.
2169 SmallVector<IVChain, MaxChains> IVChainVec;
2170
2171 /// IV users that belong to profitable IVChains.
2172 SmallPtrSet<Use*, MaxChains> IVIncSet;
2173
2174 /// Induction variables that were generated and inserted by the SCEV Expander.
2175 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2176
2177 // Inserting instructions in the loop and using them as a PHI's input could
2178 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
2179 // corresponding incoming block is not loop-exiting). So collect all such
2180 // instructions to form LCSSA for them later.
2181 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2182
2183 void OptimizeShadowIV();
2184 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2185 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2186 void OptimizeLoopTermCond();
2187
2188 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2189 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2190 void FinalizeChain(IVChain &Chain);
2191 void CollectChains();
2192 void GenerateIVChain(const IVChain &Chain,
2193 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2194
2195 void CollectInterestingTypesAndFactors();
2196 void CollectFixupsAndInitialFormulae();
2197
2198 // Support for sharing of LSRUses between LSRFixups.
2199 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2200 UseMapTy UseMap;
2201
2202 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2203 LSRUse::KindType Kind, MemAccessTy AccessTy);
2204
2205 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2206 MemAccessTy AccessTy);
2207
2208 void DeleteUse(LSRUse &LU, size_t LUIdx);
2209
2210 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2211
2212 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2213 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2214 void CountRegisters(const Formula &F, size_t LUIdx);
2215 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2216
2217 void CollectLoopInvariantFixupsAndFormulae();
2218
2219 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2220 unsigned Depth = 0);
2221
2222 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2223 const Formula &Base, unsigned Depth,
2224 size_t Idx, bool IsScaledReg = false);
2225 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2226 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2227 const Formula &Base, size_t Idx,
2228 bool IsScaledReg = false);
2229 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2230 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2231 const Formula &Base,
2232 const SmallVectorImpl<Immediate> &Worklist,
2233 size_t Idx, bool IsScaledReg = false);
2234 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2235 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2236 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2237 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2238 void GenerateCrossUseConstantOffsets();
2239 void GenerateAllReuseFormulae();
2240
2241 void FilterOutUndesirableDedicatedRegisters();
2242
2243 size_t EstimateSearchSpaceComplexity() const;
2244 void NarrowSearchSpaceByDetectingSupersets();
2245 void NarrowSearchSpaceByCollapsingUnrolledCode();
2246 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2247 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2248 void NarrowSearchSpaceByFilterPostInc();
2249 void NarrowSearchSpaceByDeletingCostlyFormulas();
2250 void NarrowSearchSpaceByPickingWinnerRegs();
2251 void NarrowSearchSpaceUsingHeuristics();
2252
2253 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2254 Cost &SolutionCost,
2255 SmallVectorImpl<const Formula *> &Workspace,
2256 const Cost &CurCost,
2257 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2258 DenseSet<const SCEV *> &VisitedRegs) const;
2259 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2260
2261 BasicBlock::iterator
2262 HoistInsertPosition(BasicBlock::iterator IP,
2263 const SmallVectorImpl<Instruction *> &Inputs) const;
2264 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2265 const LSRFixup &LF,
2266 const LSRUse &LU) const;
2267
2268 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2269 BasicBlock::iterator IP,
2270 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2271 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2272 const Formula &F,
2273 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2274 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2275 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2276 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2277
2278public:
2279 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2280 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2281 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2282
2283 bool getChanged() const { return Changed; }
2284 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2285 return ScalarEvolutionIVs;
2286 }
2287
2288 void print_factors_and_types(raw_ostream &OS) const;
2289 void print_fixups(raw_ostream &OS) const;
2290 void print_uses(raw_ostream &OS) const;
2291 void print(raw_ostream &OS) const;
2292 void dump() const;
2293};
2294
2295} // end anonymous namespace
2296
2297/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2298/// the cast operation.
2299void LSRInstance::OptimizeShadowIV() {
2300 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2301 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2302 return;
2303
2304 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2305 UI != E; /* empty */) {
2306 IVUsers::const_iterator CandidateUI = UI;
2307 ++UI;
2308 Instruction *ShadowUse = CandidateUI->getUser();
2309 Type *DestTy = nullptr;
2310 bool IsSigned = false;
2311
2312 /* If shadow use is a int->float cast then insert a second IV
2313 to eliminate this cast.
2314
2315 for (unsigned i = 0; i < n; ++i)
2316 foo((double)i);
2317
2318 is transformed into
2319
2320 double d = 0.0;
2321 for (unsigned i = 0; i < n; ++i, ++d)
2322 foo(d);
2323 */
2324 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2325 IsSigned = false;
2326 DestTy = UCast->getDestTy();
2327 }
2328 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2329 IsSigned = true;
2330 DestTy = SCast->getDestTy();
2331 }
2332 if (!DestTy) continue;
2333
2334 // If target does not support DestTy natively then do not apply
2335 // this transformation.
2336 if (!TTI.isTypeLegal(DestTy)) continue;
2337
2338 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2339 if (!PH) continue;
2340 if (PH->getNumIncomingValues() != 2) continue;
2341
2342 // If the calculation in integers overflows, the result in the FP type will
2343 // differ. So we can only do this transformation if we are guaranteed not to
2344 // deal with overflowing values.
2345 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2346 if (!AR) continue;
2347 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2348 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2349
2350 Type *SrcTy = PH->getType();
2351 int Mantissa = DestTy->getFPMantissaWidth();
2352 if (Mantissa == -1) continue;
2353 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2354 continue;
2355
2356 unsigned Entry, Latch;
2357 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2358 Entry = 0;
2359 Latch = 1;
2360 } else {
2361 Entry = 1;
2362 Latch = 0;
2363 }
2364
2365 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2366 if (!Init) continue;
2367 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2368 (double)Init->getSExtValue() :
2369 (double)Init->getZExtValue());
2370
2371 BinaryOperator *Incr =
2372 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2373 if (!Incr) continue;
2374 if (Incr->getOpcode() != Instruction::Add
2375 && Incr->getOpcode() != Instruction::Sub)
2376 continue;
2377
2378 /* Initialize new IV, double d = 0.0 in above example. */
2379 ConstantInt *C = nullptr;
2380 if (Incr->getOperand(0) == PH)
2381 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2382 else if (Incr->getOperand(1) == PH)
2383 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2384 else
2385 continue;
2386
2387 if (!C) continue;
2388
2389 // Ignore negative constants, as the code below doesn't handle them
2390 // correctly. TODO: Remove this restriction.
2391 if (!C->getValue().isStrictlyPositive())
2392 continue;
2393
2394 /* Add new PHINode. */
2395 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2396 NewPH->setDebugLoc(PH->getDebugLoc());
2397
2398 /* create new increment. '++d' in above example. */
2399 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2400 BinaryOperator *NewIncr = BinaryOperator::Create(
2401 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2402 : Instruction::FSub,
2403 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2404 NewIncr->setDebugLoc(Incr->getDebugLoc());
2405
2406 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2407 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2408
2409 /* Remove cast operation */
2410 ShadowUse->replaceAllUsesWith(NewPH);
2411 ShadowUse->eraseFromParent();
2412 Changed = true;
2413 break;
2414 }
2415}
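// A minimal IR-level sketch of the shadow IV (value names invented):
//   %i         = phi i32    [ 0,   %entry ], [ %i.next,    %latch ]
//   %iv.s      = phi double [ 0.0, %entry ], [ %iv.s.next, %latch ] ; new shadow IV
//   %i.next    = add i32 %i, 1
//   %iv.s.next = fadd double %iv.s, 1.0                             ; new increment
// and all uses of the original 'uitofp i32 %i to double' are replaced with
// %iv.s, removing the int-to-float cast from the loop.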
2416
2417/// If Cond has an operand that is an expression of an IV, set the IV user and
2418/// stride information and return true, otherwise return false.
2419bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2420 for (IVStrideUse &U : IU)
2421 if (U.getUser() == Cond) {
2422 // NOTE: we could handle setcc instructions with multiple uses here, but
2423 // InstCombine handles the simple uses as well, and it's not clear that this
2424 // occurs often enough in real life to be worth handling.
2425 CondUse = &U;
2426 return true;
2427 }
2428 return false;
2429}
2430
2431/// Rewrite the loop's terminating condition if it uses a max computation.
2432///
2433/// This is a narrow solution to a specific, but acute, problem. For loops
2434/// like this:
2435///
2436/// i = 0;
2437/// do {
2438/// p[i] = 0.0;
2439/// } while (++i < n);
2440///
2441/// the trip count isn't just 'n', because 'n' might not be positive. And
2442/// unfortunately this can come up even for loops where the user didn't use
2443/// a C do-while loop. For example, seemingly well-behaved top-test loops
2444/// will commonly be lowered like this:
2445///
2446/// if (n > 0) {
2447/// i = 0;
2448/// do {
2449/// p[i] = 0.0;
2450/// } while (++i < n);
2451/// }
2452///
2453/// and then it's possible for subsequent optimization to obscure the if
2454/// test in such a way that indvars can't find it.
2455///
2456/// When indvars can't find the if test in loops like this, it creates a
2457/// max expression, which allows it to give the loop a canonical
2458/// induction variable:
2459///
2460/// i = 0;
2461/// max = n < 1 ? 1 : n;
2462/// do {
2463/// p[i] = 0.0;
2464/// } while (++i != max);
2465///
2466/// Canonical induction variables are necessary because the loop passes
2467/// are designed around them. The most obvious example of this is the
2468/// LoopInfo analysis, which doesn't remember trip count values. It
2469/// expects to be able to rediscover the trip count each time it is
2470/// needed, and it does this using a simple analysis that only succeeds if
2471/// the loop has a canonical induction variable.
2472///
2473/// However, when it comes time to generate code, the maximum operation
2474/// can be quite costly, especially if it's inside of an outer loop.
2475///
2476/// This function solves this problem by detecting this type of loop and
2477/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2478/// the instructions for the maximum computation.
2479ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2480 // Check that the loop matches the pattern we're looking for.
2481 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2482 Cond->getPredicate() != CmpInst::ICMP_NE)
2483 return Cond;
2484
2485 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2486 if (!Sel || !Sel->hasOneUse()) return Cond;
2487
2488 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2489 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2490 return Cond;
2491 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2492
2493 // Add one to the backedge-taken count to get the trip count.
2494 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2495 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2496
2497 // Check for a max calculation that matches the pattern. There's no check
2498 // for ICMP_ULE here because the comparison would be with zero, which
2499 // isn't interesting.
2500 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2501 const SCEVNAryExpr *Max = nullptr;
2502 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2503 Pred = ICmpInst::ICMP_SLE;
2504 Max = S;
2505 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2506 Pred = ICmpInst::ICMP_SLT;
2507 Max = S;
2508 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2509 Pred = ICmpInst::ICMP_ULT;
2510 Max = U;
2511 } else {
2512 // No match; bail.
2513 return Cond;
2514 }
2515
2516 // To handle a max with more than two operands, this optimization would
2517 // require additional checking and setup.
2518 if (Max->getNumOperands() != 2)
2519 return Cond;
2520
2521 const SCEV *MaxLHS = Max->getOperand(0);
2522 const SCEV *MaxRHS = Max->getOperand(1);
2523
2524 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2525 // for a comparison with 1. For <= and >=, a comparison with zero.
2526 if (!MaxLHS ||
2527 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2528 return Cond;
2529
2530 // Check the relevant induction variable for conformance to
2531 // the pattern.
2532 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2533 if (!match(IV,
2535 return Cond;
2536
2537 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2538 "Loop condition operand is an addrec in a different loop!");
2539
2540 // Check the right operand of the select, and remember it, as it will
2541 // be used in the new comparison instruction.
2542 Value *NewRHS = nullptr;
2543 if (ICmpInst::isTrueWhenEqual(Pred)) {
2544 // Look for n+1, and grab n.
2545 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2546 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2547 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2548 NewRHS = BO->getOperand(0);
2549 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2550 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2551 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2552 NewRHS = BO->getOperand(0);
2553 if (!NewRHS)
2554 return Cond;
2555 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2556 NewRHS = Sel->getOperand(1);
2557 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2558 NewRHS = Sel->getOperand(2);
2559 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2560 NewRHS = SU->getValue();
2561 else
2562 // Max doesn't match expected pattern.
2563 return Cond;
2564
2565 // Determine the new comparison opcode. It may be signed or unsigned,
2566 // and the original comparison may be either equality or inequality.
2567 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2568 Pred = CmpInst::getInversePredicate(Pred);
2569
2570 // Ok, everything looks ok to change the condition into an SLT or SGE and
2571 // delete the max calculation.
2572 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2573 Cond->getOperand(0), NewRHS, "scmp");
2574
2575 // Delete the max calculation instructions.
2576 NewCond->setDebugLoc(Cond->getDebugLoc());
2577 Cond->replaceAllUsesWith(NewCond);
2578 CondUse->setUser(NewCond);
2579 Instruction *Cmp = cast<Instruction>(Sel->getCondition());
2580 Cond->eraseFromParent();
2581 Sel->eraseFromParent();
2582 if (Cmp->use_empty()) {
2583 salvageDebugInfo(*Cmp);
2584 Cmp->eraseFromParent();
2585 }
2586 return NewCond;
2587}
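// A hedged before/after sketch (value names invented):
//   before:  %cmp  = icmp slt i32 %n, 1
//            %max  = select i1 %cmp, i32 1, i32 %n
//            %exit = icmp ne i32 %i.next, %max
//   after:   %scmp = icmp slt i32 %i.next, %n
// with the select (and, if otherwise unused, its feeding compare) deleted.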
2588
2589/// Change loop terminating condition to use the postinc iv when possible.
2590void
2591LSRInstance::OptimizeLoopTermCond() {
2592 SmallPtrSet<Instruction *, 4> PostIncs;
2593
2594 // We need a different set of heuristics for rotated and non-rotated loops.
2595 // If a loop is rotated then the latch is also the backedge, so inserting
2596 // post-inc expressions just before the latch is ideal. To reduce live ranges
2597 // it also makes sense to rewrite terminating conditions to use post-inc
2598 // expressions.
2599 //
2600 // If the loop is not rotated then the latch is not a backedge; the latch
2601 // check is done in the loop head. Adding post-inc expressions before the
2602 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2603 // in the loop body. In this case we do *not* want to use post-inc expressions
2604 // in the latch check, and we want to insert post-inc expressions before
2605 // the backedge.
2606 BasicBlock *LatchBlock = L->getLoopLatch();
2607 SmallVector<BasicBlock*, 8> ExitingBlocks;
2608 L->getExitingBlocks(ExitingBlocks);
2609 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2610 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2611 IVIncInsertPos = LatchBlock->getTerminator();
2612 return;
2613 }
2614
2615 // Otherwise treat this as a rotated loop.
2616 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2617 // Get the terminating condition for the loop if possible. If we
2618 // can, we want to change it to use a post-incremented version of its
2619 // induction variable, to allow coalescing the live ranges for the IV into
2620 // one register value.
2621
2622 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2623 if (!TermBr)
2624 continue;
2625 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2626 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2627 continue;
2628
2629 // Search IVUsesByStride to find Cond's IVUse if there is one.
2630 IVStrideUse *CondUse = nullptr;
2631 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2632 if (!FindIVUserForCond(Cond, CondUse))
2633 continue;
2634
2635 // If the trip count is computed in terms of a max (due to ScalarEvolution
2636 // being unable to find a sufficient guard, for example), change the loop
2637 // comparison to use SLT or ULT instead of NE.
2638 // One consequence of doing this now is that it disrupts the count-down
2639 // optimization. That's not always a bad thing though, because in such
2640 // cases it may still be worthwhile to avoid a max.
2641 Cond = OptimizeMax(Cond, CondUse);
2642
2643 // If this exiting block dominates the latch block, it may also use
2644 // the post-inc value if it won't be shared with other uses.
2645 // Check for dominance.
2646 if (!DT.dominates(ExitingBlock, LatchBlock))
2647 continue;
2648
2649 // Conservatively avoid trying to use the post-inc value in non-latch
2650 // exits if there may be pre-inc users in intervening blocks.
2651 if (LatchBlock != ExitingBlock)
2652 for (const IVStrideUse &UI : IU)
2653 // Test if the use is reachable from the exiting block. This dominator
2654 // query is a conservative approximation of reachability.
2655 if (&UI != CondUse &&
2656 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2657 // Conservatively assume there may be reuse if the quotient of their
2658 // strides could be a legal scale.
2659 const SCEV *A = IU.getStride(*CondUse, L);
2660 const SCEV *B = IU.getStride(UI, L);
2661 if (!A || !B) continue;
2662 if (SE.getTypeSizeInBits(A->getType()) !=
2663 SE.getTypeSizeInBits(B->getType())) {
2664 if (SE.getTypeSizeInBits(A->getType()) >
2665 SE.getTypeSizeInBits(B->getType()))
2666 B = SE.getSignExtendExpr(B, A->getType());
2667 else
2668 A = SE.getSignExtendExpr(A, B->getType());
2669 }
2670 if (const SCEVConstant *D =
2671 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2672 const ConstantInt *C = D->getValue();
2673 // Stride of one or negative one can have reuse with non-addresses.
2674 if (C->isOne() || C->isMinusOne())
2675 goto decline_post_inc;
2676 // Avoid weird situations.
2677 if (C->getValue().getSignificantBits() >= 64 ||
2678 C->getValue().isMinSignedValue())
2679 goto decline_post_inc;
2680 // Check for possible scaled-address reuse.
2681 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2682 MemAccessTy AccessTy =
2683 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2684 int64_t Scale = C->getSExtValue();
2685 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2686 /*BaseOffset=*/0,
2687 /*HasBaseReg=*/true, Scale,
2688 AccessTy.AddrSpace))
2689 goto decline_post_inc;
2690 Scale = -Scale;
2691 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2692 /*BaseOffset=*/0,
2693 /*HasBaseReg=*/true, Scale,
2694 AccessTy.AddrSpace))
2695 goto decline_post_inc;
2696 }
2697 }
2698 }
2699
2700 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2701 << *Cond << '\n');
2702
2703 // It's possible for the setcc instruction to be anywhere in the loop, and
2704 // possible for it to have multiple users. If it is not immediately before
2705 // the exiting block branch, move it.
2706 if (Cond->getNextNode() != TermBr) {
2707 if (Cond->hasOneUse()) {
2708 Cond->moveBefore(TermBr->getIterator());
2709 } else {
2710 // Clone the terminating condition and insert it into the loop end.
2711 ICmpInst *OldCond = Cond;
2712 Cond = cast<ICmpInst>(Cond->clone());
2713 Cond->setName(L->getHeader()->getName() + ".termcond");
2714 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2715
2716 // Clone the IVUse, as the old use still exists!
2717 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2718 TermBr->replaceUsesOfWith(OldCond, Cond);
2719 }
2720 }
2721
2722 // If we get to here, we know that we can transform the setcc instruction to
2723 // use the post-incremented version of the IV, allowing us to coalesce the
2724 // live ranges for the IV correctly.
2725 CondUse->transformToPostInc(L);
2726 Changed = true;
2727
2728 PostIncs.insert(Cond);
2729 decline_post_inc:;
2730 }
2731
2732 // Determine an insertion point for the loop induction variable increment. It
2733 // must dominate all the post-inc comparisons we just set up, and it must
2734 // dominate the loop latch edge.
2735 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2736 for (Instruction *Inst : PostIncs)
2737 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2738}
2739
2740/// Determine if the given use can accommodate a fixup at the given offset and
2741/// other details. If so, update the use and return true.
2742bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2743 bool HasBaseReg, LSRUse::KindType Kind,
2744 MemAccessTy AccessTy) {
2745 Immediate NewMinOffset = LU.MinOffset;
2746 Immediate NewMaxOffset = LU.MaxOffset;
2747 MemAccessTy NewAccessTy = AccessTy;
2748
2749 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2750 // something conservative, however this can pessimize in the case that one of
2751 // the uses will have all its uses outside the loop, for example.
2752 if (LU.Kind != Kind)
2753 return false;
2754
2755 // Check for a mismatched access type, and fall back conservatively as needed.
2756 // TODO: Be less conservative when the type is similar and can use the same
2757 // addressing modes.
2758 if (Kind == LSRUse::Address) {
2759 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2760 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2761 AccessTy.AddrSpace);
2762 }
2763 }
2764
2765 // Conservatively assume HasBaseReg is true for now.
2766 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2767 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2768 LU.MaxOffset - NewOffset, HasBaseReg))
2769 return false;
2770 NewMinOffset = NewOffset;
2771 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2772 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2773 NewOffset - LU.MinOffset, HasBaseReg))
2774 return false;
2775 NewMaxOffset = NewOffset;
2776 }
2777
2778 // FIXME: We should be able to handle some level of scalable offset support
2779 // for 'void', but in order to get basic support up and running this is
2780 // being left out.
2781 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2782 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2783 return false;
2784
2785 // Update the use.
2786 LU.MinOffset = NewMinOffset;
2787 LU.MaxOffset = NewMaxOffset;
2788 LU.AccessTy = NewAccessTy;
2789 return true;
2790}
2791
2792/// Return an LSRUse index and an offset value for a fixup which needs the given
2793/// expression, with the given kind and optional access type. Either reuse an
2794/// existing use or create a new one, as needed.
2795std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2796 LSRUse::KindType Kind,
2797 MemAccessTy AccessTy) {
2798 const SCEV *Copy = Expr;
2799 Immediate Offset = ExtractImmediate(Expr, SE);
2800
2801 // Basic uses can't accept any offset, for example.
2802 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2803 Offset, /*HasBaseReg=*/ true)) {
2804 Expr = Copy;
2805 Offset = Immediate::getFixed(0);
2806 }
2807
2808 std::pair<UseMapTy::iterator, bool> P =
2809 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2810 if (!P.second) {
2811 // A use already existed with this base.
2812 size_t LUIdx = P.first->second;
2813 LSRUse &LU = Uses[LUIdx];
2814 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2815 // Reuse this use.
2816 return std::make_pair(LUIdx, Offset);
2817 }
2818
2819 // Create a new use.
2820 size_t LUIdx = Uses.size();
2821 P.first->second = LUIdx;
2822 Uses.push_back(LSRUse(Kind, AccessTy));
2823 LSRUse &LU = Uses[LUIdx];
2824
2825 LU.MinOffset = Offset;
2826 LU.MaxOffset = Offset;
2827 return std::make_pair(LUIdx, Offset);
2828}
2829
2830/// Delete the given use from the Uses list.
2831void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2832 if (&LU != &Uses.back())
2833 std::swap(LU, Uses.back());
2834 Uses.pop_back();
2835
2836 // Update RegUses.
2837 RegUses.swapAndDropUse(LUIdx, Uses.size());
2838}
2839
2840 /// Look for a use distinct from OrigLU which has a formula with the same
2841 /// registers as the given formula.
2842LSRUse *
2843LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2844 const LSRUse &OrigLU) {
2845 // Search all uses for the formula. This could be more clever.
2846 for (LSRUse &LU : Uses) {
2847 // Check whether this use is close enough to OrigLU, to see whether it's
2848 // worthwhile looking through its formulae.
2849 // Ignore ICmpZero uses because they may contain formulae generated by
2850 // GenerateICmpZeroScales, in which case adding fixup offsets may
2851 // be invalid.
2852 if (&LU != &OrigLU &&
2853 LU.Kind != LSRUse::ICmpZero &&
2854 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2855 LU.WidestFixupType == OrigLU.WidestFixupType &&
2856 LU.HasFormulaWithSameRegs(OrigF)) {
2857 // Scan through this use's formulae.
2858 for (const Formula &F : LU.Formulae) {
2859 // Check to see if this formula has the same registers and symbols
2860 // as OrigF.
2861 if (F.BaseRegs == OrigF.BaseRegs &&
2862 F.ScaledReg == OrigF.ScaledReg &&
2863 F.BaseGV == OrigF.BaseGV &&
2864 F.Scale == OrigF.Scale &&
2865 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2866 if (F.BaseOffset.isZero())
2867 return &LU;
2868 // This is the formula where all the registers and symbols matched;
2869 // there aren't going to be any others. Since we declined it, we
2870 // can skip the rest of the formulae and proceed to the next LSRUse.
2871 break;
2872 }
2873 }
2874 }
2875 }
2876
2877 // Nothing looked good.
2878 return nullptr;
2879}
2880
2881void LSRInstance::CollectInterestingTypesAndFactors() {
2882 SmallSetVector<const SCEV *, 4> Strides;
2883
2884 // Collect interesting types and strides.
2885 SmallVector<const SCEV *, 4> Worklist;
2886 for (const IVStrideUse &U : IU) {
2887 const SCEV *Expr = IU.getExpr(U);
2888 if (!Expr)
2889 continue;
2890
2891 // Collect interesting types.
2892 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2893
2894 // Add strides for mentioned loops.
2895 Worklist.push_back(Expr);
2896 do {
2897 const SCEV *S = Worklist.pop_back_val();
2898 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2899 if (AR->getLoop() == L)
2900 Strides.insert(AR->getStepRecurrence(SE));
2901 Worklist.push_back(AR->getStart());
2902 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2903 append_range(Worklist, Add->operands());
2904 }
2905 } while (!Worklist.empty());
2906 }
2907
2908 // Compute interesting factors from the set of interesting strides.
2909 for (SmallSetVector<const SCEV *, 4>::const_iterator
2910 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2911 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2912 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2913 const SCEV *OldStride = *I;
2914 const SCEV *NewStride = *NewStrideIter;
2915
2916 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2917 SE.getTypeSizeInBits(NewStride->getType())) {
2918 if (SE.getTypeSizeInBits(OldStride->getType()) >
2919 SE.getTypeSizeInBits(NewStride->getType()))
2920 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2921 else
2922 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2923 }
2924 if (const SCEVConstant *Factor =
2925 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2926 SE, true))) {
2927 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2928 Factors.insert(Factor->getAPInt().getSExtValue());
2929 } else if (const SCEVConstant *Factor =
2930 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2931 NewStride,
2932 SE, true))) {
2933 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2934 Factors.insert(Factor->getAPInt().getSExtValue());
2935 }
2936 }
2937
2938 // If all uses use the same type, don't bother looking for truncation-based
2939 // reuse.
2940 if (Types.size() == 1)
2941 Types.clear();
2942
2943 LLVM_DEBUG(print_factors_and_types(dbgs()));
2944}
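// A small worked example (strides invented): for the stride pair (4, 12),
// getExactSDiv(12, 4) is the constant 3, so 3 is recorded as a factor; had
// that division not been exact, the code would instead try the reverse
// division 4/12, which here would also fail and record nothing.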
2945
2946/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2947/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2948/// IVStrideUses, we could partially skip this.
2949 static User::op_iterator
2950 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2951 Loop *L, ScalarEvolution &SE) {
2952 for(; OI != OE; ++OI) {
2953 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2954 if (!SE.isSCEVable(Oper->getType()))
2955 continue;
2956
2957 if (const SCEVAddRecExpr *AR =
2958 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2959 if (AR->getLoop() == L)
2960 break;
2961 }
2962 }
2963 }
2964 return OI;
2965}
2966
2967/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2968 /// a convenient helper.
2969 static Value *getWideOperand(Value *Oper) {
2970 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2971 return Trunc->getOperand(0);
2972 return Oper;
2973}
2974
2975/// Return an approximation of this SCEV expression's "base", or NULL for any
2976/// constant. Returning the expression itself is conservative. Returning a
2977/// deeper subexpression is more precise and valid as long as it isn't less
2978/// complex than another subexpression. For expressions involving multiple
2979/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2980/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2981/// IVInc==b-a.
2982///
2983/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2984/// SCEVUnknown, we simply return the rightmost SCEV operand.
2985static const SCEV *getExprBase(const SCEV *S) {
2986 switch (S->getSCEVType()) {
2987 default: // including scUnknown.
2988 return S;
2989 case scConstant:
2990 case scVScale:
2991 return nullptr;
2992 case scTruncate:
2993 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2994 case scZeroExtend:
2995 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2996 case scSignExtend:
2997 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2998 case scAddExpr: {
2999 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3000 // there's nothing more complex.
3001 // FIXME: not sure if we want to recognize negation.
3002 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3003 for (const SCEV *SubExpr : reverse(Add->operands())) {
3004 if (SubExpr->getSCEVType() == scAddExpr)
3005 return getExprBase(SubExpr);
3006
3007 if (SubExpr->getSCEVType() != scMulExpr)
3008 return SubExpr;
3009 }
3010 return S; // all operands are scaled, be conservative.
3011 }
3012 case scAddRecExpr:
3013 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3014 }
3015 llvm_unreachable("Unknown SCEV kind!");
3016}
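// For example (SCEVs written informally): given {(8 + %a),+,4}<%L>, the addrec
// case recurses into the start expression (8 + %a); the add case then walks
// its operands from the right and returns the SCEVUnknown %a as the
// approximate base.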
3017
3018/// Return true if the chain increment is profitable to expand into a loop
3019/// invariant value, which may require its own register. A profitable chain
3020/// increment will be an offset relative to the same base. We allow such offsets
3021/// to potentially be used as chain increment as long as it's not obviously
3022/// expensive to expand using real instructions.
3023bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3024 const SCEV *IncExpr,
3025 ScalarEvolution &SE) {
3026 // Aggressively form chains when -stress-ivchain.
3027 if (StressIVChain)
3028 return true;
3029
3030 // Do not replace a constant offset from IV head with a nonconstant IV
3031 // increment.
3032 if (!isa<SCEVConstant>(IncExpr)) {
3033 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3034 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3035 return false;
3036 }
3037
3038 SmallPtrSet<const SCEV*, 8> Processed;
3039 return !isHighCostExpansion(IncExpr, Processed, SE);
3040}
3041
3042/// Return true if the number of registers needed for the chain is estimated to
3043/// be less than the number required for the individual IV users. First prohibit
3044/// any IV users that keep the IV live across increments (the Users set should
3045/// be empty). Next count the number and type of increments in the chain.
3046///
3047/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3048 /// effectively use postinc addressing modes. Only consider it profitable if the
3049/// increments can be computed in fewer registers when chained.
3050///
3051/// TODO: Consider IVInc free if it's already used in another chains.
3052 static bool isProfitableChain(IVChain &Chain,
3053 SmallPtrSetImpl<Instruction *> &Users,
3054 ScalarEvolution &SE,
3055 const TargetTransformInfo &TTI) {
3056 if (StressIVChain)
3057 return true;
3058
3059 if (!Chain.hasIncs())
3060 return false;
3061
3062 if (!Users.empty()) {
3063 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3064 for (Instruction *Inst
3065 : Users) { dbgs() << " " << *Inst << "\n"; });
3066 return false;
3067 }
3068 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3069
3070 // The chain itself may require a register, so initialize cost to 1.
3071 int cost = 1;
3072
3073 // A complete chain likely eliminates the need for keeping the original IV in
3074 // a register. LSR does not currently know how to form a complete chain unless
3075 // the header phi already exists.
3076 if (isa<PHINode>(Chain.tailUserInst())
3077 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3078 --cost;
3079 }
3080 const SCEV *LastIncExpr = nullptr;
3081 unsigned NumConstIncrements = 0;
3082 unsigned NumVarIncrements = 0;
3083 unsigned NumReusedIncrements = 0;
3084
3085 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3086 return true;
3087
3088 for (const IVInc &Inc : Chain) {
3089 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3090 return true;
3091 if (Inc.IncExpr->isZero())
3092 continue;
3093
3094 // Incrementing by zero or some constant is neutral. We assume constants can
3095 // be folded into an addressing mode or an add's immediate operand.
3096 if (isa<SCEVConstant>(Inc.IncExpr)) {
3097 ++NumConstIncrements;
3098 continue;
3099 }
3100
3101 if (Inc.IncExpr == LastIncExpr)
3102 ++NumReusedIncrements;
3103 else
3104 ++NumVarIncrements;
3105
3106 LastIncExpr = Inc.IncExpr;
3107 }
3108 // An IV chain with a single increment is handled by LSR's postinc
3109 // uses. However, a chain with multiple increments requires keeping the IV's
3110 // value live longer than it needs to be if chained.
3111 if (NumConstIncrements > 1)
3112 --cost;
3113
3114 // Materializing increment expressions in the preheader that didn't exist in
3115 // the original code may cost a register. For example, sign-extended array
3116 // indices can produce ridiculous increments like this:
3117 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3118 cost += NumVarIncrements;
3119
3120 // Reusing variable increments likely saves a register to hold the multiple of
3121 // the stride.
3122 cost -= NumReusedIncrements;
3123
3124 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3125 << "\n");
3126
3127 return cost < 0;
3128}
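// A worked example with invented counts: a chain whose tail is the header phi
// (with SCEV equal to the head's IncExpr) starts at cost 1, drops to 0 for the
// phi, drops to -1 for having more than one constant increment, and with no
// variable or reused increments finishes at -1 < 0, i.e. profitable.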
3129
3130/// Add this IV user to an existing chain or make it the head of a new chain.
3131void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3132 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3133 // When IVs are used as types of varying widths, they are generally converted
3134 // to a wider type with some uses remaining narrow under a (free) trunc.
3135 Value *const NextIV = getWideOperand(IVOper);
3136 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3137 const SCEV *const OperExprBase = getExprBase(OperExpr);
3138
3139 // Visit all existing chains. Check if its IVOper can be computed as a
3140 // profitable loop invariant increment from the last link in the Chain.
3141 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3142 const SCEV *LastIncExpr = nullptr;
3143 for (; ChainIdx < NChains; ++ChainIdx) {
3144 IVChain &Chain = IVChainVec[ChainIdx];
3145
3146 // Prune the solution space aggressively by checking that both IV operands
3147 // are expressions that operate on the same unscaled SCEVUnknown. This
3148 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3149 // first avoids creating extra SCEV expressions.
3150 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3151 continue;
3152
3153 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3154 if (PrevIV->getType() != NextIV->getType())
3155 continue;
3156
3157 // A phi node terminates a chain.
3158 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3159 continue;
3160
3161 // The increment must be loop-invariant so it can be kept in a register.
3162 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3163 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3164 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3165 continue;
3166
3167 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3168 LastIncExpr = IncExpr;
3169 break;
3170 }
3171 }
3172 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3173 // bother for phi nodes, because they must be last in the chain.
3174 if (ChainIdx == NChains) {
3175 if (isa<PHINode>(UserInst))
3176 return;
3177 if (NChains >= MaxChains && !StressIVChain) {
3178 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3179 return;
3180 }
3181 LastIncExpr = OperExpr;
3182 // IVUsers may have skipped over sign/zero extensions. We don't currently
3183 // attempt to form chains involving extensions unless they can be hoisted
3184 // into this loop's AddRec.
3185 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3186 return;
3187 ++NChains;
3188 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3189 OperExprBase));
3190 ChainUsersVec.resize(NChains);
3191 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3192 << ") IV=" << *LastIncExpr << "\n");
3193 } else {
3194 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3195 << ") IV+" << *LastIncExpr << "\n");
3196 // Add this IV user to the end of the chain.
3197 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3198 }
3199 IVChain &Chain = IVChainVec[ChainIdx];
3200
3201 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3202 // This chain's NearUsers become FarUsers.
3203 if (!LastIncExpr->isZero()) {
3204 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3205 NearUsers.clear();
3206 }
3207
3208 // All other uses of IVOperand become near uses of the chain.
3209 // We currently ignore intermediate values within SCEV expressions, assuming
3210 // they will eventually be used by the current chain, or can be computed
3211 // from one of the chain increments. To be more precise we could
3212 // transitively follow its users and only add leaf IV users to the set.
3213 for (User *U : IVOper->users()) {
3214 Instruction *OtherUse = dyn_cast<Instruction>(U);
3215 if (!OtherUse)
3216 continue;
3217 // Uses in the chain will no longer be uses if the chain is formed.
3218 // Include the head of the chain in this iteration (not Chain.begin()).
3219 IVChain::const_iterator IncIter = Chain.Incs.begin();
3220 IVChain::const_iterator IncEnd = Chain.Incs.end();
3221 for (; IncIter != IncEnd; ++IncIter) {
3222 if (IncIter->UserInst == OtherUse)
3223 break;
3224 }
3225 if (IncIter != IncEnd)
3226 continue;
3227
3228 if (SE.isSCEVable(OtherUse->getType())
3229 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3230 && IU.isIVUserOrOperand(OtherUse)) {
3231 continue;
3232 }
3233 NearUsers.insert(OtherUse);
3234 }
3235
3236 // Since this user is part of the chain, it's no longer considered a use
3237 // of the chain.
3238 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3239}
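// An illustrative walk-through (hypothetical IR, not from a real test case):
// for a loop that loads from addresses based on %i, %i + 4 and %i + 8, where
// %i is the {0,+,1}<%L> induction variable, the difference between each pair
// of consecutive IV operands is the loop-invariant constant 4, so the three
// users are linked into a single IVChain. Unrelated users of the same IV
// operands are recorded as NearUsers and, once a non-zero increment separates
// them from the chain, migrate to FarUsers for the later profitability check.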
3240
3241/// Populate the vector of Chains.
3242///
3243/// This decreases ILP at the architecture level. Targets with ample registers,
3244/// multiple memory ports, and no register renaming probably don't want
3245/// this. However, such targets should probably disable LSR altogether.
3246///
3247/// The job of LSR is to make a reasonable choice of induction variables across
3248/// the loop. Subsequent passes can easily "unchain" computation exposing more
3249/// ILP *within the loop* if the target wants it.
3250///
3251/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3252/// will not reorder memory operations, it will recognize this as a chain, but
3253/// will generate redundant IV increments. Ideally this would be corrected later
3254/// by a smart scheduler:
3255/// = A[i]
3256/// = A[i+x]
3257/// A[i] =
3258/// A[i+x] =
3259///
3260/// TODO: Walk the entire domtree within this loop, not just the path to the
3261/// loop latch. This will discover chains on side paths, but requires
3262/// maintaining multiple copies of the Chains state.
3263void LSRInstance::CollectChains() {
3264 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3265 SmallVector<ChainUsers, 8> ChainUsersVec;
3266
3267 SmallVector<BasicBlock *,8> LatchPath;
3268 BasicBlock *LoopHeader = L->getHeader();
3269 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3270 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3271 LatchPath.push_back(Rung->getBlock());
3272 }
3273 LatchPath.push_back(LoopHeader);
3274
3275 // Walk the instruction stream from the loop header to the loop latch.
3276 for (BasicBlock *BB : reverse(LatchPath)) {
3277 for (Instruction &I : *BB) {
3278 // Skip instructions that weren't seen by IVUsers analysis.
3279 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3280 continue;
3281
3282 // Ignore users that are part of a SCEV expression. This way we only
3283 // consider leaf IV Users. This effectively rediscovers a portion of
3284 // IVUsers analysis but in program order this time.
3285 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3286 continue;
3287
3288 // Remove this instruction from any NearUsers set it may be in.
3289 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3290 ChainIdx < NChains; ++ChainIdx) {
3291 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3292 }
3293 // Search for operands that can be chained.
3294 SmallPtrSet<Instruction*, 4> UniqueOperands;
3295 User::op_iterator IVOpEnd = I.op_end();
3296 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3297 while (IVOpIter != IVOpEnd) {
3298 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3299 if (UniqueOperands.insert(IVOpInst).second)
3300 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3301 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3302 }
3303 } // Continue walking down the instructions.
3304 } // Continue walking down the domtree.
3305 // Visit phi backedges to determine if the chain can generate the IV postinc.
3306 for (PHINode &PN : L->getHeader()->phis()) {
3307 if (!SE.isSCEVable(PN.getType()))
3308 continue;
3309
3310 Instruction *IncV =
3311 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3312 if (IncV)
3313 ChainInstruction(&PN, IncV, ChainUsersVec);
3314 }
3315 // Remove any unprofitable chains.
3316 unsigned ChainIdx = 0;
3317 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3318 UsersIdx < NChains; ++UsersIdx) {
3319 if (!isProfitableChain(IVChainVec[UsersIdx],
3320 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3321 continue;
3322 // Preserve the chain at UsersIdx.
3323 if (ChainIdx != UsersIdx)
3324 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3325 FinalizeChain(IVChainVec[ChainIdx]);
3326 ++ChainIdx;
3327 }
3328 IVChainVec.resize(ChainIdx);
3329}
3330
3331void LSRInstance::FinalizeChain(IVChain &Chain) {
3332 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3333 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3334
3335 for (const IVInc &Inc : Chain) {
3336 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3337 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3338 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3339 IVIncSet.insert(UseI);
3340 }
3341}
3342
3343/// Return true if the IVInc can be folded into an addressing mode.
3344static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3345 Value *Operand, const TargetTransformInfo &TTI) {
3346 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3347 Immediate IncOffset = Immediate::getZero();
3348 if (IncConst) {
3349 if (IncConst->getAPInt().getSignificantBits() > 64)
3350 return false;
3351 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3352 } else {
3353 // Look for mul(vscale, constant), to detect a scalable offset.
3354 const APInt *C;
3355 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3356 C->getSignificantBits() > 64)
3357 return false;
3358 IncOffset = Immediate::getScalable(C->getSExtValue());
3359 }
3360
3361 if (!isAddressUse(TTI, UserInst, Operand))
3362 return false;
3363
3364 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3365 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3366 IncOffset, /*HasBaseReg=*/false))
3367 return false;
3368
3369 return true;
3370}
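// For illustration (hypothetical values): an increment expression of
// (i64 16) becomes Immediate::getFixed(16), while (4 * vscale) becomes
// Immediate::getScalable(4). Either immediate is then tested with
// isAlwaysFoldable against the user's access type, so a small fixed stride
// typically folds into a reg+imm addressing mode, whereas an oversized or
// unsupported offset forces the increment to stay in a register.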
3371
3372/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3373/// user's operand from the previous IV user's operand.
3374void LSRInstance::GenerateIVChain(const IVChain &Chain,
3375 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3376 // Find the new IVOperand for the head of the chain. It may have been replaced
3377 // by LSR.
3378 const IVInc &Head = Chain.Incs[0];
3379 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3380 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3381 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3382 IVOpEnd, L, SE);
3383 Value *IVSrc = nullptr;
3384 while (IVOpIter != IVOpEnd) {
3385 IVSrc = getWideOperand(*IVOpIter);
3386
3387 // If this operand computes the expression that the chain needs, we may use
3388 // it. (Check this after setting IVSrc which is used below.)
3389 //
3390 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3391 // narrow for the chain, so we can no longer use it. We do allow using a
3392 // wider phi, assuming the LSR checked for free truncation. In that case we
3393 // should already have a truncate on this operand such that
3394 // getSCEV(IVSrc) == IncExpr.
3395 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3396 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3397 break;
3398 }
3399 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3400 }
3401 if (IVOpIter == IVOpEnd) {
3402 // Gracefully give up on this chain.
3403 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3404 return;
3405 }
3406 assert(IVSrc && "Failed to find IV chain source");
3407
3408 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3409 Type *IVTy = IVSrc->getType();
3410 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3411 const SCEV *LeftOverExpr = nullptr;
3412 const SCEV *Accum = SE.getZero(IntTy);
3413 SmallVector<std::pair<const SCEV *, Value *>> Bases;
3414 Bases.emplace_back(Accum, IVSrc);
3415
3416 for (const IVInc &Inc : Chain) {
3417 Instruction *InsertPt = Inc.UserInst;
3418 if (isa<PHINode>(InsertPt))
3419 InsertPt = L->getLoopLatch()->getTerminator();
3420
3421 // IVOper will replace the current IV User's operand. IVSrc is the IV
3422 // value currently held in a register.
3423 Value *IVOper = IVSrc;
3424 if (!Inc.IncExpr->isZero()) {
3425 // IncExpr was the result of subtraction of two narrow values, so must
3426 // be signed.
3427 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3428 Accum = SE.getAddExpr(Accum, IncExpr);
3429 LeftOverExpr = LeftOverExpr ?
3430 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3431 }
3432
3433 // Look through each base to see if any can produce a nice addressing mode.
3434 bool FoundBase = false;
3435 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3436 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3437 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3438 if (!Remainder->isZero()) {
3439 Rewriter.clearPostInc();
3440 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3441 const SCEV *IVOperExpr =
3442 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3443 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3444 } else {
3445 IVOper = MapIVOper;
3446 }
3447
3448 FoundBase = true;
3449 break;
3450 }
3451 }
3452 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3453 // Expand the IV increment.
3454 Rewriter.clearPostInc();
3455 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3456 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3457 SE.getUnknown(IncV));
3458 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3459
3460 // If an IV increment can't be folded, use it as the next IV value.
3461 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3462 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3463 Bases.emplace_back(Accum, IVOper);
3464 IVSrc = IVOper;
3465 LeftOverExpr = nullptr;
3466 }
3467 }
3468 Type *OperTy = Inc.IVOperand->getType();
3469 if (IVTy != OperTy) {
3470 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3471 "cannot extend a chained IV");
3472 IRBuilder<> Builder(InsertPt);
3473 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3474 }
3475 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3476 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3477 DeadInsts.emplace_back(OperandIsInstr);
3478 }
3479 // If LSR created a new, wider phi, we may also replace its postinc. We only
3480 // do this if we also found a wide value for the head of the chain.
3481 if (isa<PHINode>(Chain.tailUserInst())) {
3482 for (PHINode &Phi : L->getHeader()->phis()) {
3483 if (Phi.getType() != IVSrc->getType())
3484 continue;
3485 Instruction *PostIncV = dyn_cast<Instruction>(
3486 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3487 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3488 continue;
3489 Value *IVOper = IVSrc;
3490 Type *PostIncTy = PostIncV->getType();
3491 if (IVTy != PostIncTy) {
3492 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3493 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3494 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3495 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3496 }
3497 Phi.replaceUsesOfWith(PostIncV, IVOper);
3498 DeadInsts.emplace_back(PostIncV);
3499 }
3500 }
3501}
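// Sketch of the transformation (hypothetical IR; exact output depends on
// SCEVExpander and on what canFoldIVIncExpr accepts): for a chain of three
// address users at IVSrc, IVSrc + 16 and IVSrc + 32, increments that fold
// into the addressing mode leave IVSrc as the only live IV value, while a
// non-foldable increment is materialized as something like
//   %lsr.iv.next = getelementptr i8, ptr %lsr.iv, i64 16
// and becomes the new IVSrc for the remaining links; the replaced IV
// operands are queued in DeadInsts for cleanup.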
3502
3503void LSRInstance::CollectFixupsAndInitialFormulae() {
3504 BranchInst *ExitBranch = nullptr;
3505 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3506
3507 // For calculating baseline cost
3508 SmallPtrSet<const SCEV *, 16> Regs;
3509 DenseSet<const SCEV *> VisitedRegs;
3510 DenseSet<size_t> VisitedLSRUse;
3511
3512 for (const IVStrideUse &U : IU) {
3513 Instruction *UserInst = U.getUser();
3514 // Skip IV users that are part of profitable IV Chains.
3515 User::op_iterator UseI =
3516 find(UserInst->operands(), U.getOperandValToReplace());
3517 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3518 if (IVIncSet.count(UseI)) {
3519 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3520 continue;
3521 }
3522
3523 LSRUse::KindType Kind = LSRUse::Basic;
3524 MemAccessTy AccessTy;
3525 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3526 Kind = LSRUse::Address;
3527 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3528 }
3529
3530 const SCEV *S = IU.getExpr(U);
3531 if (!S)
3532 continue;
3533 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3534
3535 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3536 // (N - i == 0), and this allows (N - i) to be the expression that we work
3537 // with rather than just N or i, so we can consider the register
3538 // requirements for both N and i at the same time. Limiting this code to
3539 // equality icmps is not a problem because all interesting loops use
3540 // equality icmps, thanks to IndVarSimplify.
3541 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3542 // If CI can be saved on some targets, e.g. replaced by a hardware loop
3543 // on PowerPC, there is no need to generate initial formulae for it.
3544 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3545 continue;
3546 if (CI->isEquality()) {
3547 // Swap the operands if needed to put the OperandValToReplace on the
3548 // left, for consistency.
3549 Value *NV = CI->getOperand(1);
3550 if (NV == U.getOperandValToReplace()) {
3551 CI->setOperand(1, CI->getOperand(0));
3552 CI->setOperand(0, NV);
3553 NV = CI->getOperand(1);
3554 Changed = true;
3555 }
3556
3557 // x == y --> x - y == 0
3558 const SCEV *N = SE.getSCEV(NV);
3559 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3560 (!NV->getType()->isPointerTy() ||
3561 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3562 // S is normalized, so normalize N before folding it into S
3563 // to keep the result normalized.
3564 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3565 if (!N)
3566 continue;
3567 Kind = LSRUse::ICmpZero;
3568 S = SE.getMinusSCEV(N, S);
3569 } else if (L->isLoopInvariant(NV) &&
3570 (!isa<Instruction>(NV) ||
3571 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3572 !NV->getType()->isPointerTy()) {
3573 // If we can't generally expand the expression (e.g. it contains
3574 // a divide), but it is already at a loop invariant point before the
3575 // loop, wrap it in an unknown (to prevent the expander from trying
3576 // to re-expand in a potentially unsafe way.) The restriction to
3577 // integer types is required because the unknown hides the base, and
3578 // SCEV can't compute the difference of two unknown pointers.
3579 N = SE.getUnknown(NV);
3580 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3581 if (!N)
3582 continue;
3583 Kind = LSRUse::ICmpZero;
3584 S = SE.getMinusSCEV(N, S);
3586 }
3587
3588 // -1 and the negations of all interesting strides (except the negation
3589 // of -1) are now also interesting.
3590 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3591 if (Factors[i] != -1)
3592 Factors.insert(-(uint64_t)Factors[i]);
3593 Factors.insert(-1);
3594 }
3595 }
3596
3597 // Get or create an LSRUse.
3598 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3599 size_t LUIdx = P.first;
3600 Immediate Offset = P.second;
3601 LSRUse &LU = Uses[LUIdx];
3602
3603 // Record the fixup.
3604 LSRFixup &LF = LU.getNewFixup();
3605 LF.UserInst = UserInst;
3606 LF.OperandValToReplace = U.getOperandValToReplace();
3607 LF.PostIncLoops = TmpPostIncLoops;
3608 LF.Offset = Offset;
3609 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3610
3611 // Create a Formula from the SCEV for calculating the baseline cost.
3612 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3613 Formula F;
3614 F.initialMatch(S, L, SE);
3615 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3616 HardwareLoopProfitable);
3617 VisitedLSRUse.insert(LUIdx);
3618 }
3619
3620 if (!LU.WidestFixupType ||
3621 SE.getTypeSizeInBits(LU.WidestFixupType) <
3622 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3623 LU.WidestFixupType = LF.OperandValToReplace->getType();
3624
3625 // If this is the first use of this LSRUse, give it a formula.
3626 if (LU.Formulae.empty()) {
3627 InsertInitialFormula(S, LU, LUIdx);
3628 CountRegisters(LU.Formulae.back(), LUIdx);
3629 }
3630 }
3631
3632 LLVM_DEBUG(print_fixups(dbgs()));
3633}
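// Example of the ICmpZero rewrite above (hypothetical loop): for
//   %c = icmp eq i64 %i.next, %n
// where %i.next is {1,+,1}<%L> and %n is loop invariant, S becomes
// (%n - {1,+,1}<%L>) = {(-1 + %n),+,-1}<%L>, the use kind becomes
// LSRUse::ICmpZero, and -1 plus the negations of the interesting strides are
// added to Factors so that scaled variants of the comparison can be tried by
// GenerateICmpZeroScales later.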
3634
3635/// Insert a formula for the given expression into the given use, separating out
3636/// loop-variant portions from loop-invariant and loop-computable portions.
3637void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3638 size_t LUIdx) {
3639 // Mark uses whose expressions cannot be expanded.
3640 if (!Rewriter.isSafeToExpand(S))
3641 LU.RigidFormula = true;
3642
3643 Formula F;
3644 F.initialMatch(S, L, SE);
3645 bool Inserted = InsertFormula(LU, LUIdx, F);
3646 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3647}
3648
3649/// Insert a simple single-register formula for the given expression into the
3650/// given use.
3651void
3652LSRInstance::InsertSupplementalFormula(const SCEV *S,
3653 LSRUse &LU, size_t LUIdx) {
3654 Formula F;
3655 F.BaseRegs.push_back(S);
3656 F.HasBaseReg = true;
3657 bool Inserted = InsertFormula(LU, LUIdx, F);
3658 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3659}
3660
3661/// Note which registers are used by the given formula, updating RegUses.
3662void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3663 if (F.ScaledReg)
3664 RegUses.countRegister(F.ScaledReg, LUIdx);
3665 for (const SCEV *BaseReg : F.BaseRegs)
3666 RegUses.countRegister(BaseReg, LUIdx);
3667}
3668
3669/// If the given formula has not yet been inserted, add it to the list, and
3670/// return true. Return false otherwise.
3671bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3672 // Do not insert formula that we will not be able to expand.
3673 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3674 "Formula is illegal");
3675
3676 if (!LU.InsertFormula(F, *L))
3677 return false;
3678
3679 CountRegisters(F, LUIdx);
3680 return true;
3681}
3682
3683/// Check for other uses of loop-invariant values which we're tracking. These
3684/// other uses will pin these values in registers, making them less profitable
3685/// for elimination.
3686/// TODO: This currently misses non-constant addrec step registers.
3687/// TODO: Should this give more weight to users inside the loop?
3688void
3689LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3690 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3691 SmallPtrSet<const SCEV *, 32> Visited;
3692
3693 // Don't collect outside uses if we are favoring postinc - the instructions in
3694 // the loop are more important than the ones outside of it.
3695 if (AMK == TTI::AMK_PostIndexed)
3696 return;
3697
3698 while (!Worklist.empty()) {
3699 const SCEV *S = Worklist.pop_back_val();
3700
3701 // Don't process the same SCEV twice
3702 if (!Visited.insert(S).second)
3703 continue;
3704
3705 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3706 append_range(Worklist, N->operands());
3707 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3708 Worklist.push_back(C->getOperand());
3709 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3710 Worklist.push_back(D->getLHS());
3711 Worklist.push_back(D->getRHS());
3712 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3713 const Value *V = US->getValue();
3714 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3715 // Look for instructions defined outside the loop.
3716 if (L->contains(Inst)) continue;
3717 } else if (isa<Constant>(V))
3718 // Constants can be re-materialized.
3719 continue;
3720 for (const Use &U : V->uses()) {
3721 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3722 // Ignore non-instructions.
3723 if (!UserInst)
3724 continue;
3725 // Don't bother if the instruction is an EHPad.
3726 if (UserInst->isEHPad())
3727 continue;
3728 // Ignore instructions in other functions (as can happen with
3729 // Constants).
3730 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3731 continue;
3732 // Ignore instructions not dominated by the loop.
3733 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3734 UserInst->getParent() :
3735 cast<PHINode>(UserInst)->getIncomingBlock(
3736 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3737 if (!DT.dominates(L->getHeader(), UseBB))
3738 continue;
3739 // Don't bother if the instruction is in a BB which ends in an EHPad.
3740 if (UseBB->getTerminator()->isEHPad())
3741 continue;
3742
3743 // Ignore cases in which the currently-examined value could come from
3744 // a basic block terminated with an EHPad. This checks all incoming
3745 // blocks of the phi node since it is possible that the same incoming
3746 // value comes from multiple basic blocks, only some of which may end
3747 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3748 // pass would try to insert instructions into an EHPad, hitting an
3749 // assertion.
3750 if (isa<PHINode>(UserInst)) {
3751 const auto *PhiNode = cast<PHINode>(UserInst);
3752 bool HasIncompatibleEHPTerminatedBlock = false;
3753 llvm::Value *ExpectedValue = U;
3754 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3755 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3756 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3757 HasIncompatibleEHPTerminatedBlock = true;
3758 break;
3759 }
3760 }
3761 }
3762 if (HasIncompatibleEHPTerminatedBlock) {
3763 continue;
3764 }
3765 }
3766
3767 // Don't bother rewriting PHIs in catchswitch blocks.
3768 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3769 continue;
3770 // Ignore uses which are part of other SCEV expressions, to avoid
3771 // analyzing them multiple times.
3772 if (SE.isSCEVable(UserInst->getType())) {
3773 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3774 // If the user is a no-op, look through to its uses.
3775 if (!isa<SCEVUnknown>(UserS))
3776 continue;
3777 if (UserS == US) {
3778 Worklist.push_back(
3779 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3780 continue;
3781 }
3782 }
3783 // Ignore icmp instructions which are already being analyzed.
3784 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3785 unsigned OtherIdx = !U.getOperandNo();
3786 Value *OtherOp = ICI->getOperand(OtherIdx);
3787 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3788 continue;
3789 }
3790
3791 // Do not consider uses inside lifetime intrinsics. These are not
3792 // actually materialized.
3793 if (UserInst->isLifetimeStartOrEnd())
3794 continue;
3795
3796 std::pair<size_t, Immediate> P =
3797 getUse(S, LSRUse::Basic, MemAccessTy());
3798 size_t LUIdx = P.first;
3799 Immediate Offset = P.second;
3800 LSRUse &LU = Uses[LUIdx];
3801 LSRFixup &LF = LU.getNewFixup();
3802 LF.UserInst = const_cast<Instruction *>(UserInst);
3803 LF.OperandValToReplace = U;
3804 LF.Offset = Offset;
3805 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3806 if (!LU.WidestFixupType ||
3807 SE.getTypeSizeInBits(LU.WidestFixupType) <
3808 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3809 LU.WidestFixupType = LF.OperandValToReplace->getType();
3810 InsertSupplementalFormula(US, LU, LUIdx);
3811 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3812 break;
3813 }
3814 }
3815 }
3816}
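// Illustrative case (hypothetical IR): if a loop-invariant value %n appears
// as a register in some formula and is also used by a store that executes
// after the loop (and is dominated by the loop header), that outside use pins
// %n in a register anyway, so a Basic use with a single-register supplemental
// formula is recorded for it; the solver then tends to prefer formulae that
// reuse reg(%n) instead of rewriting it away.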
3817
3818/// Split S into subexpressions which can be pulled out into separate
3819/// registers. If C is non-null, multiply each subexpression by C.
3820///
3821/// Return remainder expression after factoring the subexpressions captured by
3822/// Ops. If Ops is complete, return NULL.
3823static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3824 SmallVectorImpl<const SCEV *> &Ops,
3825 const Loop *L,
3826 ScalarEvolution &SE,
3827 unsigned Depth = 0) {
3828 // Arbitrarily cap recursion to protect compile time.
3829 if (Depth >= 3)
3830 return S;
3831
3832 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3833 // Break out add operands.
3834 for (const SCEV *S : Add->operands()) {
3835 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3836 if (Remainder)
3837 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3838 }
3839 return nullptr;
3840 }
3841 const SCEV *Start, *Step;
3842 const SCEVConstant *Op0;
3843 const SCEV *Op1;
3844 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3845 // Split a non-zero base out of an addrec.
3846 if (Start->isZero())
3847 return S;
3848
3849 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3850 // Split the non-zero AddRec unless it is part of a nested recurrence that
3851 // does not pertain to this loop.
3852 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3853 !isa<SCEVAddRecExpr>(Remainder))) {
3854 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3855 Remainder = nullptr;
3856 }
3857 if (Remainder != Start) {
3858 if (!Remainder)
3859 Remainder = SE.getConstant(S->getType(), 0);
3860 return SE.getAddRecExpr(Remainder, Step,
3861 cast<SCEVAddRecExpr>(S)->getLoop(),
3862 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3863 SCEV::FlagAnyWrap);
3864 }
3865 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3866 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3867 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3868 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3869 if (Remainder)
3870 Ops.push_back(SE.getMulExpr(C, Remainder));
3871 return nullptr;
3872 }
3873 return S;
3874}
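// Worked example (hypothetical SCEVs): CollectSubexprs on {(4 + %a),+,8}<%L>
// with C == nullptr recurses into the start (4 + %a), pushing 4 and %a onto
// Ops, and returns the rebuilt recurrence {0,+,8}<%L> as the remainder;
// GenerateReassociationsImpl can then treat %a, the constant 4 and
// {0,+,8}<%L> as separate candidate registers or foldable immediates.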
3875
3876/// Return true if the SCEV represents a value that may end up as a
3877/// post-increment operation.
3878 static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3879 LSRUse &LU, const SCEV *S, const Loop *L,
3880 ScalarEvolution &SE) {
3881 if (LU.Kind != LSRUse::Address ||
3882 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3883 return false;
3884 const SCEV *Start;
3885 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3886 return false;
3887 // Check if a post-indexed load/store can be used.
3888 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3889 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3890 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3891 return true;
3892 }
3893 return false;
3894}
3895
3896/// Helper function for LSRInstance::GenerateReassociations.
3897void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3898 const Formula &Base,
3899 unsigned Depth, size_t Idx,
3900 bool IsScaledReg) {
3901 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3902 // Don't generate reassociations for the base register of a value that
3903 // may generate a post-increment operator. The reason is that the
3904 // reassociations cause extra base+register formula to be created,
3905 // and possibly chosen, but the post-increment is more efficient.
3906 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3907 return;
3908 SmallVector<const SCEV *, 8> AddOps;
3909 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3910 if (Remainder)
3911 AddOps.push_back(Remainder);
3912
3913 if (AddOps.size() == 1)
3914 return;
3915
3916 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3917 JE = AddOps.end();
3918 J != JE; ++J) {
3919 // Loop-variant "unknown" values are uninteresting; we won't be able to
3920 // do anything meaningful with them.
3921 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3922 continue;
3923
3924 // Don't pull a constant into a register if the constant could be folded
3925 // into an immediate field.
3926 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3927 LU.AccessTy, *J, Base.getNumRegs() > 1))
3928 continue;
3929
3930 // Collect all operands except *J.
3931 SmallVector<const SCEV *, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3932 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3933
3934 // Don't leave just a constant behind in a register if the constant could
3935 // be folded into an immediate field.
3936 if (InnerAddOps.size() == 1 &&
3937 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3938 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3939 continue;
3940
3941 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3942 if (InnerSum->isZero())
3943 continue;
3944 Formula F = Base;
3945
3946 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3947 continue;
3948
3949 // Add the remaining pieces of the add back into the new formula.
3950 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3951 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3952 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3953 InnerSumSC->getValue()->getZExtValue())) {
3954 F.UnfoldedOffset =
3955 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3956 InnerSumSC->getValue()->getZExtValue());
3957 if (IsScaledReg) {
3958 F.ScaledReg = nullptr;
3959 F.Scale = 0;
3960 } else
3961 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3962 } else if (IsScaledReg)
3963 F.ScaledReg = InnerSum;
3964 else
3965 F.BaseRegs[Idx] = InnerSum;
3966
3967 // Add J as its own register, or an unfolded immediate.
3968 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3969 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3970 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3971 SC->getValue()->getZExtValue()))
3972 F.UnfoldedOffset =
3973 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3974 SC->getValue()->getZExtValue());
3975 else
3976 F.BaseRegs.push_back(*J);
3977 // We may have changed the number of registers in base regs, adjust the
3978 // formula accordingly.
3979 F.canonicalize(*L);
3980
3981 if (InsertFormula(LU, LUIdx, F))
3982 // If that formula hadn't been seen before, recurse to find more like
3983 // it.
3984 // Add a term based on Log16(AddOps.size()) - the same as
3985 // Log2_32(AddOps.size()) >> 2 - because Depth alone is not enough to
3986 // bound compile time. Every time AddOps.size() exceeds 16^x, we add x
3987 // to Depth.
3988 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
3989 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
3990 }
3991}
3992
3993/// Split out subexpressions from adds and the bases of addrecs.
3994void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3995 Formula Base, unsigned Depth) {
3996 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3997 // Arbitrarily cap recursion to protect compile time.
3998 if (Depth >= 3)
3999 return;
4000
4001 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4002 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4003
4004 if (Base.Scale == 1)
4005 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4006 /* Idx */ -1, /* IsScaledReg */ true);
4007}
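// Illustrative outcome (hypothetical use): starting from the one-register
// formula reg({(4 + %a),+,8}<%L>), reassociation can yield
//   reg({0,+,8}<%L>) + reg(%a) + 4
// where the constant 4 lands in UnfoldedOffset when the target's
// add-immediate range allows it; this gives the solver a cheaper alternative
// whenever %a is already live in a register for another use.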
4008
4009/// Generate a formula consisting of all of the loop-dominating registers added
4010/// into a single register.
4011void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4012 Formula Base) {
4013 // This method is only interesting on a plurality of registers.
4014 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4015 (Base.UnfoldedOffset.isNonZero()) <=
4016 1)
4017 return;
4018
4019 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4020 // processing the formula.
4021 Base.unscale();
4022 SmallVector<const SCEV *, 4> Ops;
4023 Formula NewBase = Base;
4024 NewBase.BaseRegs.clear();
4025 Type *CombinedIntegerType = nullptr;
4026 for (const SCEV *BaseReg : Base.BaseRegs) {
4027 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4028 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4029 if (!CombinedIntegerType)
4030 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4031 Ops.push_back(BaseReg);
4032 }
4033 else
4034 NewBase.BaseRegs.push_back(BaseReg);
4035 }
4036
4037 // If no register is relevant, we're done.
4038 if (Ops.size() == 0)
4039 return;
4040
4041 // Utility function for generating the required variants of the combined
4042 // registers.
4043 auto GenerateFormula = [&](const SCEV *Sum) {
4044 Formula F = NewBase;
4045
4046 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4047 // opportunity to fold something. For now, just ignore such cases
4048 // rather than proceed with zero in a register.
4049 if (Sum->isZero())
4050 return;
4051
4052 F.BaseRegs.push_back(Sum);
4053 F.canonicalize(*L);
4054 (void)InsertFormula(LU, LUIdx, F);
4055 };
4056
4057 // If we collected at least two registers, generate a formula combining them.
4058 if (Ops.size() > 1) {
4059 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4060 GenerateFormula(SE.getAddExpr(OpsCopy));
4061 }
4062
4063 // If we have an unfolded offset, generate a formula combining it with the
4064 // registers collected.
4065 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4066 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4067 Ops.push_back(SE.getConstant(CombinedIntegerType,
4068 NewBase.UnfoldedOffset.getFixedValue(), true));
4069 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4070 GenerateFormula(SE.getAddExpr(Ops));
4071 }
4072}
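// Illustrative outcome (hypothetical registers): given
// reg(%a) + reg(%b) + reg({0,+,8}<%L>) where %a and %b dominate the loop
// header and have no evolution in L, the two loop-invariant registers are
// summed in the preheader, producing the additional candidate
// reg((%a + %b)) + reg({0,+,8}<%L>), which needs one register fewer inside
// the loop.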
4073
4074/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4075void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4076 const Formula &Base, size_t Idx,
4077 bool IsScaledReg) {
4078 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4079 GlobalValue *GV = ExtractSymbol(G, SE);
4080 if (G->isZero() || !GV)
4081 return;
4082 Formula F = Base;
4083 F.BaseGV = GV;
4084 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4085 return;
4086 if (IsScaledReg)
4087 F.ScaledReg = G;
4088 else
4089 F.BaseRegs[Idx] = G;
4090 (void)InsertFormula(LU, LUIdx, F);
4091}
4092
4093/// Generate reuse formulae using symbolic offsets.
4094void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4095 Formula Base) {
4096 // We can't add a symbolic offset if the address already contains one.
4097 if (Base.BaseGV) return;
4098
4099 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4100 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4101 if (Base.Scale == 1)
4102 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4103 /* IsScaledReg */ true);
4104}
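// Illustrative outcome (hypothetical global): a base register of the form
// (@table + {0,+,4}<%L>) can be rewritten so that @table becomes the
// formula's BaseGV and {0,+,4}<%L> remains the register, which helps targets
// whose addressing modes accept a symbol-plus-register form.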
4105
4106/// Helper function for LSRInstance::GenerateConstantOffsets.
4107void LSRInstance::GenerateConstantOffsetsImpl(
4108 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4109 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4110
4111 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4112 Formula F = Base;
4113 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4114 return;
4115 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4116
4117 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4118 // Add the offset to the base register.
4119 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4120 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4121 // If it cancelled out, drop the base register, otherwise update it.
4122 if (NewG->isZero()) {
4123 if (IsScaledReg) {
4124 F.Scale = 0;
4125 F.ScaledReg = nullptr;
4126 } else
4127 F.deleteBaseReg(F.BaseRegs[Idx]);
4128 F.canonicalize(*L);
4129 } else if (IsScaledReg)
4130 F.ScaledReg = NewG;
4131 else
4132 F.BaseRegs[Idx] = NewG;
4133
4134 (void)InsertFormula(LU, LUIdx, F);
4135 }
4136 };
4137
4138 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4139
4140 // With constant offsets and constant steps, we can generate pre-inc
4141 // accesses by having the offset equal the step. So, for access #0 with a
4142 // step of 8, we generate a G - 8 base which would require the first access
4143 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4144 // for itself and hopefully becomes the base for other accesses. This
4145 // means that a single pre-indexed access can be generated to become the new
4146 // base pointer for each iteration of the loop, resulting in no extra add/sub
4147 // instructions for pointer updating.
4148 if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
4149 const APInt *StepInt;
4150 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4151 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4152 : StepInt->getZExtValue();
4153
4154 for (Immediate Offset : Worklist) {
4155 if (Offset.isFixed()) {
4156 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4157 GenerateOffset(G, Offset);
4158 }
4159 }
4160 }
4161 }
4162 for (Immediate Offset : Worklist)
4163 GenerateOffset(G, Offset);
4164
4165 Immediate Imm = ExtractImmediate(G, SE);
4166 if (G->isZero() || Imm.isZero() ||
4167 !Base.BaseOffset.isCompatibleImmediate(Imm))
4168 return;
4169 Formula F = Base;
4170 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4171 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4172 return;
4173 if (IsScaledReg) {
4174 F.ScaledReg = G;
4175 } else {
4176 F.BaseRegs[Idx] = G;
4177 // We may generate a non-canonical Formula if G is a recurrent expr reg
4178 // related to the current loop while F.ScaledReg is not.
4179 F.canonicalize(*L);
4180 }
4181 (void)InsertFormula(LU, LUIdx, F);
4182}
4183
4184/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4185void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4186 Formula Base) {
4187 // TODO: For now, just add the min and max offset, because it usually isn't
4188 // worthwhile looking at everything in between.
4189 SmallVector<Immediate, 2> Worklist;
4190 Worklist.push_back(LU.MinOffset);
4191 if (LU.MaxOffset != LU.MinOffset)
4192 Worklist.push_back(LU.MaxOffset);
4193
4194 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4195 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4196 if (Base.Scale == 1)
4197 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4198 /* IsScaledReg */ true);
4199}
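// Illustrative outcome (hypothetical values): for a base register
// G = {(16 + %p),+,8}<%L>, ExtractImmediate pulls out 16, leaving the
// register {%p,+,8}<%L> with BaseOffset 16 in the new formula; with
// AMK_PreIndexed and a step of 8, offsets shifted by the step are also
// tried so that the first access can become a pre-indexed update as
// described in the comment above.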
4200
4201/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4202/// == y -> x*c == y*c.
4203void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4204 Formula Base) {
4205 if (LU.Kind != LSRUse::ICmpZero) return;
4206
4207 // Determine the integer type for the base formula.
4208 Type *IntTy = Base.getType();
4209 if (!IntTy) return;
4210 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4211
4212 // Don't do this if there is more than one offset.
4213 if (LU.MinOffset != LU.MaxOffset) return;
4214
4215 // Check if the transformation is valid. It is illegal to multiply a pointer.
4216 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4217 return;
4218 for (const SCEV *BaseReg : Base.BaseRegs)
4219 if (BaseReg->getType()->isPointerTy())
4220 return;
4221 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4222
4223 // Check each interesting stride.
4224 for (int64_t Factor : Factors) {
4225 // Check that Factor can be represented by IntTy
4226 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4227 continue;
4228 // Check that the multiplication doesn't overflow.
4229 if (Base.BaseOffset.isMin() && Factor == -1)
4230 continue;
4231 // Not supporting scalable immediates.
4232 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4233 continue;
4234 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4235 assert(Factor != 0 && "Zero factor not expected!");
4236 if (NewBaseOffset.getFixedValue() / Factor !=
4237 Base.BaseOffset.getFixedValue())
4238 continue;
4239 // If the offset will be truncated at this use, check that it is in bounds.
4240 if (!IntTy->isPointerTy() &&
4241 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4242 continue;
4243
4244 // Check that multiplying with the use offset doesn't overflow.
4245 Immediate Offset = LU.MinOffset;
4246 if (Offset.isMin() && Factor == -1)
4247 continue;
4248 Offset = Offset.mulUnsigned(Factor);
4249 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4250 continue;
4251 // If the offset will be truncated at this use, check that it is in bounds.
4252 if (!IntTy->isPointerTy() &&
4253 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4254 continue;
4255
4256 Formula F = Base;
4257 F.BaseOffset = NewBaseOffset;
4258
4259 // Check that this scale is legal.
4260 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4261 continue;
4262
4263 // Compensate for the use having MinOffset built into it.
4264 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4265
4266 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4267
4268 // Check that multiplying with each base register doesn't overflow.
4269 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4270 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4271 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4272 goto next;
4273 }
4274
4275 // Check that multiplying with the scaled register doesn't overflow.
4276 if (F.ScaledReg) {
4277 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4278 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4279 continue;
4280 }
4281
4282 // Check that multiplying with the unfolded offset doesn't overflow.
4283 if (F.UnfoldedOffset.isNonZero()) {
4284 if (F.UnfoldedOffset.isMin() && Factor == -1)
4285 continue;
4286 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4287 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4288 Base.UnfoldedOffset.getFixedValue())
4289 continue;
4290 // If the offset will be truncated, check that it is in bounds.
4291 if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
4292 IntTy, F.UnfoldedOffset.getFixedValue()))
4293 continue;
4294 }
4295
4296 // If we make it here and it's legal, add it.
4297 (void)InsertFormula(LU, LUIdx, F);
4298 next:;
4299 }
4300}
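// Worked example (hypothetical use): an ICmpZero use representing the exit
// test i == n compares (%n - {0,+,1}<%L>) against zero; scaling every
// register and offset by Factor = 4 turns it into (4*%n - {0,+,4}<%L>) == 0,
// which can share the register that already holds 4*i (for example an index
// scaled for 4-byte elements), at the cost of one loop-invariant multiply of
// %n in the preheader.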
4301
4302/// Generate stride factor reuse formulae by making use of scaled-offset address
4303/// modes, for example.
4304void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4305 // Determine the integer type for the base formula.
4306 Type *IntTy = Base.getType();
4307 if (!IntTy) return;
4308
4309 // If this Formula already has a scaled register, we can't add another one.
4310 // Try to unscale the formula to generate a better scale.
4311 if (Base.Scale != 0 && !Base.unscale())
4312 return;
4313
4314 assert(Base.Scale == 0 && "unscale did not do its job!");
4315
4316 // Check each interesting stride.
4317 for (int64_t Factor : Factors) {
4318 Base.Scale = Factor;
4319 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4320 // Check whether this scale is going to be legal.
4321 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4322 Base)) {
4323 // As a special case, handle out-of-loop Basic users specially.
4324 // TODO: Reconsider this special case.
4325 if (LU.Kind == LSRUse::Basic &&
4326 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4327 LU.AccessTy, Base) &&
4328 LU.AllFixupsOutsideLoop)
4329 LU.Kind = LSRUse::Special;
4330 else
4331 continue;
4332 }
4333 // For an ICmpZero, negating a solitary base register won't lead to
4334 // new solutions.
4335 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4336 Base.BaseOffset.isZero() && !Base.BaseGV)
4337 continue;
4338 // For each addrec base reg, if its loop is current loop, apply the scale.
4339 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4340 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4341 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4342 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4343 if (FactorS->isZero())
4344 continue;
4345 // Divide out the factor, ignoring high bits, since we'll be
4346 // scaling the value back up in the end.
4347 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4348 if (!Quotient->isZero()) {
4349 // TODO: This could be optimized to avoid all the copying.
4350 Formula F = Base;
4351 F.ScaledReg = Quotient;
4352 F.deleteBaseReg(F.BaseRegs[i]);
4353 // The canonical representation of 1*reg is reg, which is already in
4354 // Base. In that case, do not try to insert the formula, it will be
4355 // rejected anyway.
4356 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4357 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4358 continue;
4359 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
4360 // non-canonical Formula whose ScaledReg's loop is not L.
4361 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4362 F.canonicalize(*L);
4363 (void)InsertFormula(LU, LUIdx, F);
4364 }
4365 }
4366 }
4367 }
4368}
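// Illustrative outcome (hypothetical use): with Factor = 4 in Factors, the
// base register {0,+,4}<%L> divides exactly by 4, so the formula
// reg({0,+,4}<%L>) gains the alternative 4*reg({0,+,1}<%L>); an address use
// can then fold the scale into a scaled-index addressing mode such as
// [base + 4*idx] when isLegalUse accepts it.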
4369
4370/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4371/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4372/// perform the extension/truncate and normalize again, as the normalized form
4373/// can result in folds that are not valid in the post-inc use contexts. The
4374/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4375static const SCEV *
4376 getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4377 const SCEV *Expr, Type *ToTy,
4378 ScalarEvolution &SE) {
4379 const SCEV *Result = nullptr;
4380 for (auto &L : Loops) {
4381 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4382 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4383 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4384 if (!New || (Result && New != Result))
4385 return nullptr;
4386 Result = New;
4387 }
4388
4389 assert(Result && "failed to create expression");
4390 return Result;
4391}
4392
4393/// Generate reuse formulae from different IV types.
4394void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4395 // Don't bother truncating symbolic values.
4396 if (Base.BaseGV) return;
4397
4398 // Determine the integer type for the base formula.
4399 Type *DstTy = Base.getType();
4400 if (!DstTy) return;
4401 if (DstTy->isPointerTy())
4402 return;
4403
4404 // It is invalid to extend a pointer type so exit early if ScaledReg or
4405 // any of the BaseRegs are pointers.
4406 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4407 return;
4408 if (any_of(Base.BaseRegs,
4409 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4410 return;
4411
4412 SmallVector<PostIncLoopSet> Loops;
4413 for (auto &LF : LU.Fixups)
4414 Loops.push_back(LF.PostIncLoops);
4415
4416 for (Type *SrcTy : Types) {
4417 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4418 Formula F = Base;
4419
4420 // Sometimes SCEV is able to prove zero during ext transform. It may
4421 // happen if SCEV did not do all possible transforms while creating the
4422 // initial node (maybe due to depth limitations), but it can do them while
4423 // taking ext.
4424 if (F.ScaledReg) {
4425 const SCEV *NewScaledReg =
4426 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4427 if (!NewScaledReg || NewScaledReg->isZero())
4428 continue;
4429 F.ScaledReg = NewScaledReg;
4430 }
4431 bool HasZeroBaseReg = false;
4432 for (const SCEV *&BaseReg : F.BaseRegs) {
4433 const SCEV *NewBaseReg =
4434 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4435 if (!NewBaseReg || NewBaseReg->isZero()) {
4436 HasZeroBaseReg = true;
4437 break;
4438 }
4439 BaseReg = NewBaseReg;
4440 }
4441 if (HasZeroBaseReg)
4442 continue;
4443
4444 // TODO: This assumes we've done basic processing on all uses and
4445 // have an idea what the register usage is.
4446 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4447 continue;
4448
4449 F.canonicalize(*L);
4450 (void)InsertFormula(LU, LUIdx, F);
4451 }
4452 }
4453}
4454
4455namespace {
4456
4457/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4458/// modifications so that the search phase doesn't have to worry about the data
4459/// structures moving underneath it.
4460struct WorkItem {
4461 size_t LUIdx;
4462 Immediate Imm;
4463 const SCEV *OrigReg;
4464
4465 WorkItem(size_t LI, Immediate I, const SCEV *R)
4466 : LUIdx(LI), Imm(I), OrigReg(R) {}
4467
4468 void print(raw_ostream &OS) const;
4469 void dump() const;
4470};
4471
4472} // end anonymous namespace
4473
4474#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4475void WorkItem::print(raw_ostream &OS) const {
4476 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4477 << " , add offset " << Imm;
4478}
4479
4480LLVM_DUMP_METHOD void WorkItem::dump() const {
4481 print(errs()); errs() << '\n';
4482}
4483#endif
4484
4485/// Look for registers which are a constant distance apart and try to form reuse
4486/// opportunities between them.
4487void LSRInstance::GenerateCrossUseConstantOffsets() {
4488 // Group the registers by their value without any added constant offset.
4489 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4490
4491 DenseMap<const SCEV *, ImmMapTy> Map;
4492 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4493 SmallVector<const SCEV *, 8> Sequence;
4494 for (const SCEV *Use : RegUses) {
4495 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4496 Immediate Imm = ExtractImmediate(Reg, SE);
4497 auto Pair = Map.try_emplace(Reg);
4498 if (Pair.second)
4499 Sequence.push_back(Reg);
4500 Pair.first->second.insert(std::make_pair(Imm, Use));
4501 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4502 }
4503
4504 // Now examine each set of registers with the same base value. Build up
4505 // a list of work to do and do the work in a separate step so that we're
4506 // not adding formulae and register counts while we're searching.
4507 SmallVector<WorkItem, 32> WorkItems;
4508 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4509 UniqueItems;
4510 for (const SCEV *Reg : Sequence) {
4511 const ImmMapTy &Imms = Map.find(Reg)->second;
4512
4513 // It's not worthwhile looking for reuse if there's only one offset.
4514 if (Imms.size() == 1)
4515 continue;
4516
4517 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4518 for (const auto &Entry
4519 : Imms) dbgs()
4520 << ' ' << Entry.first;
4521 dbgs() << '\n');
4522
4523 // Examine each offset.
4524 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4525 J != JE; ++J) {
4526 const SCEV *OrigReg = J->second;
4527
4528 Immediate JImm = J->first;
4529 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4530
4531 if (!isa<SCEVConstant>(OrigReg) &&
4532 UsedByIndicesMap[Reg].count() == 1) {
4533 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4534 << '\n');
4535 continue;
4536 }
4537
4538 // Conservatively examine offsets between this orig reg and a few selected
4539 // other orig regs.
4540 Immediate First = Imms.begin()->first;
4541 Immediate Last = std::prev(Imms.end())->first;
4542 if (!First.isCompatibleImmediate(Last)) {
4543 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4544 << "\n");
4545 continue;
4546 }
4547 // Only scalable if both terms are scalable, or if one is scalable and
4548 // the other is 0.
4549 bool Scalable = First.isScalable() || Last.isScalable();
4550 int64_t FI = First.getKnownMinValue();
4551 int64_t LI = Last.getKnownMinValue();
4552 // Compute (First + Last) / 2 without overflow using the fact that
4553 // First + Last = 2 * (First & Last) + (First ^ Last).
4554 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4555 // If the result is negative and FI is odd and LI even (or vice versa),
4556 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4557 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4558 ImmMapTy::const_iterator OtherImms[] = {
4559 Imms.begin(), std::prev(Imms.end()),
4560 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4561 for (const auto &M : OtherImms) {
4562 if (M == J || M == JE) continue;
4563 if (!JImm.isCompatibleImmediate(M->first))
4564 continue;
4565
4566 // Compute the difference between the two.
4567 Immediate Imm = JImm.subUnsigned(M->first);
4568 for (unsigned LUIdx : UsedByIndices.set_bits())
4569 // Make a memo of this use, offset, and register tuple.
4570 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4571 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4572 }
4573 }
4574 }
4575
4576 Map.clear();
4577 Sequence.clear();
4578 UsedByIndicesMap.clear();
4579 UniqueItems.clear();
4580
4581 // Now iterate through the worklist and add new formulae.
4582 for (const WorkItem &WI : WorkItems) {
4583 size_t LUIdx = WI.LUIdx;
4584 LSRUse &LU = Uses[LUIdx];
4585 Immediate Imm = WI.Imm;
4586 const SCEV *OrigReg = WI.OrigReg;
4587
4588 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4589 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4590 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4591
4592 // TODO: Use a more targeted data structure.
4593 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4594 Formula F = LU.Formulae[L];
4595 // FIXME: The code for the scaled and unscaled registers looks
4596 // very similar but slightly different. Investigate if they
4597 // could be merged. That way, we would not have to unscale the
4598 // Formula.
4599 F.unscale();
4600 // Use the immediate in the scaled register.
4601 if (F.ScaledReg == OrigReg) {
4602 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4603 continue;
4604 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4605 // Don't create 50 + reg(-50).
4606 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4607 if (F.referencesReg(S))
4608 continue;
4609 Formula NewF = F;
4610 NewF.BaseOffset = Offset;
4611 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4612 NewF))
4613 continue;
4614 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4615
4616 // If the new scale is a constant in a register, and adding the constant
4617 // value to the immediate would produce a value closer to zero than the
4618 // immediate itself, then the formula isn't worthwhile.
4619 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4620 // FIXME: Do we need to do something for scalable immediates here?
4621 // A scalable SCEV won't be constant, but we might still have
4622 // something in the offset? Bail out for now to be safe.
4623 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4624 continue;
4625 if (C->getValue()->isNegative() !=
4626 (NewF.BaseOffset.isLessThanZero()) &&
4627 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4628 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4629 continue;
4630 }
4631
4632 // OK, looks good.
4633 NewF.canonicalize(*this->L);
4634 (void)InsertFormula(LU, LUIdx, NewF);
4635 } else {
4636 // Use the immediate in a base register.
4637 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4638 const SCEV *BaseReg = F.BaseRegs[N];
4639 if (BaseReg != OrigReg)
4640 continue;
4641 Formula NewF = F;
4642 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4643 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4644 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4645 continue;
4646 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4647 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4648 LU.Kind, LU.AccessTy, NewF)) {
4649 if (AMK == TTI::AMK_PostIndexed &&
4650 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4651 continue;
4652 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4653 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4654 continue;
4655 NewF = F;
4656 NewF.UnfoldedOffset = NewUnfoldedOffset;
4657 }
4658 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4659
4660 // If the new formula has a constant in a register, and adding the
4661 // constant value to the immediate would produce a value closer to
4662 // zero than the immediate itself, then the formula isn't worthwhile.
4663 for (const SCEV *NewReg : NewF.BaseRegs)
4664 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4665 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4666 goto skip_formula;
4667 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4668 .abs()
4669 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4670 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4671 .countr_zero() >=
4672 (unsigned)llvm::countr_zero<uint64_t>(
4673 NewF.BaseOffset.getFixedValue()))
4674 goto skip_formula;
4675 }
4676
4677 // Ok, looks good.
4678 NewF.canonicalize(*this->L);
4679 (void)InsertFormula(LU, LUIdx, NewF);
4680 break;
4681 skip_formula:;
4682 }
4683 }
4684 }
4685 }
4686}
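// Editorial note (illustrative, not part of the original source): the effect of
// the worklist processing above is that if one use needs reg({A,+,4}<%L>) and
// another needs reg({A+4,+,4}<%L>), the second formula can be rewritten as
// reg({A,+,4}<%L>) with a base offset of 4, so both uses can share a single
// register provided the +4 offset is legal for the target's addressing modes.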
4687
4688/// Generate formulae for each use.
4689void
4690LSRInstance::GenerateAllReuseFormulae() {
4691 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4692 // queries are more precise.
4693 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4694 LSRUse &LU = Uses[LUIdx];
4695 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4696 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4697 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4698 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4699 }
4700 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4701 LSRUse &LU = Uses[LUIdx];
4702 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4703 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4704 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4705 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4706 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4707 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4708 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4709 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4710 }
4711 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4712 LSRUse &LU = Uses[LUIdx];
4713 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4714 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4715 }
4716
4717 GenerateCrossUseConstantOffsets();
4718
4719 LLVM_DEBUG(dbgs() << "\n"
4720 "After generating reuse formulae:\n";
4721 print_uses(dbgs()));
4722}
4723
4724/// If there are multiple formulae with the same set of registers used
4725/// by other uses, pick the best one and delete the others.
4726void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4727 DenseSet<const SCEV *> VisitedRegs;
4728 SmallPtrSet<const SCEV *, 16> Regs;
4729 SmallPtrSet<const SCEV *, 16> LoserRegs;
4730#ifndef NDEBUG
4731 bool ChangedFormulae = false;
4732#endif
4733
4734 // Collect the best formula for each unique set of shared registers. This
4735 // is reset for each use.
4736 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4737
4738 BestFormulaeTy BestFormulae;
4739
4740 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4741 LSRUse &LU = Uses[LUIdx];
4742 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4743 dbgs() << '\n');
4744
4745 bool Any = false;
4746 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4747 FIdx != NumForms; ++FIdx) {
4748 Formula &F = LU.Formulae[FIdx];
4749
4750 // Some formulas are instant losers. For example, they may depend on
4751 // nonexistent AddRecs from other loops. These need to be filtered
4752 // immediately; otherwise heuristics could choose them over others, leading
4753 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4754 // avoids the need to recompute this information across formulae using the
4755 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4756 // the corresponding bad register from the Regs set.
4757 Cost CostF(L, SE, TTI, AMK);
4758 Regs.clear();
4759 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4760 &LoserRegs);
4761 if (CostF.isLoser()) {
4762 // During initial formula generation, undesirable formulae are generated
4763 // by uses within other loops that have some non-trivial address mode or
4764 // use the postinc form of the IV. LSR needs to provide these formulae
4765 // as the basis of rediscovering the desired formula that uses an AddRec
4766 // corresponding to the existing phi. Once all formulae have been
4767 // generated, these initial losers may be pruned.
4768 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4769 dbgs() << "\n");
4770 }
4771 else {
4772 SmallVector<const SCEV *, 4> Key;
4773 for (const SCEV *Reg : F.BaseRegs) {
4774 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4775 Key.push_back(Reg);
4776 }
4777 if (F.ScaledReg &&
4778 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4779 Key.push_back(F.ScaledReg);
4780 // Unstable sort by host order ok, because this is only used for
4781 // uniquifying.
4782 llvm::sort(Key);
4783
4784 std::pair<BestFormulaeTy::const_iterator, bool> P =
4785 BestFormulae.insert(std::make_pair(Key, FIdx));
4786 if (P.second)
4787 continue;
4788
4789 Formula &Best = LU.Formulae[P.first->second];
4790
4791 Cost CostBest(L, SE, TTI, AMK);
4792 Regs.clear();
4793 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4794 HardwareLoopProfitable);
4795 if (CostF.isLess(CostBest))
4796 std::swap(F, Best);
4797 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4798 dbgs() << "\n"
4799 " in favor of formula ";
4800 Best.print(dbgs()); dbgs() << '\n');
4801 }
4802#ifndef NDEBUG
4803 ChangedFormulae = true;
4804#endif
4805 LU.DeleteFormula(F);
4806 --FIdx;
4807 --NumForms;
4808 Any = true;
4809 }
4810
4811 // Now that we've filtered out some formulae, recompute the Regs set.
4812 if (Any)
4813 LU.RecomputeRegs(LUIdx, RegUses);
4814
4815 // Reset this to prepare for the next use.
4816 BestFormulae.clear();
4817 }
4818
4819 LLVM_DEBUG(if (ChangedFormulae) {
4820 dbgs() << "\n"
4821 "After filtering out undesirable candidates:\n";
4822 print_uses(dbgs());
4823 });
4824}
4825
4826/// Estimate the worst-case number of solutions the solver might have to
4827 /// consider. It almost never considers this many solutions because it prunes the
4828/// search space, but the pruning isn't always sufficient.
4829size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4830 size_t Power = 1;
4831 for (const LSRUse &LU : Uses) {
4832 size_t FSize = LU.Formulae.size();
4833 if (FSize >= ComplexityLimit) {
4834 Power = ComplexityLimit;
4835 break;
4836 }
4837 Power *= FSize;
4838 if (Power >= ComplexityLimit)
4839 break;
4840 }
4841 return Power;
4842}
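// Editorial note (illustrative, not part of the original source): with three
// uses holding 3, 4 and 5 formulae respectively, the estimate is
// 3 * 4 * 5 = 60 candidate solutions. If any single use already has
// ComplexityLimit formulae, or the running product reaches ComplexityLimit,
// the loop stops early; callers only ever compare the result against
// ComplexityLimit, so the exact value beyond that point does not matter.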
4843
4844/// When one formula uses a superset of the registers of another formula, it
4845/// won't help reduce register pressure (though it may not necessarily hurt
4846/// register pressure); remove it to simplify the system.
4847void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4848 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4849 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4850
4851 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4852 "which use a superset of registers used by other "
4853 "formulae.\n");
4854
4855 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4856 LSRUse &LU = Uses[LUIdx];
4857 bool Any = false;
4858 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4859 Formula &F = LU.Formulae[i];
4860 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4861 continue;
4862 // Look for a formula with a constant or GV in a register. If the use
4863 // also has a formula with that same value in an immediate field,
4864 // delete the one that uses a register.
4865 for (SmallVectorImpl<const SCEV *>::const_iterator
4866 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4867 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4868 Formula NewF = F;
4869 //FIXME: Formulas should store bitwidth to do wrapping properly.
4870 // See PR41034.
4871 NewF.BaseOffset =
4872 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4873 (uint64_t)C->getValue()->getSExtValue());
4874 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4875 (I - F.BaseRegs.begin()));
4876 if (LU.HasFormulaWithSameRegs(NewF)) {
4877 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4878 dbgs() << '\n');
4879 LU.DeleteFormula(F);
4880 --i;
4881 --e;
4882 Any = true;
4883 break;
4884 }
4885 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4886 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4887 if (!F.BaseGV) {
4888 Formula NewF = F;
4889 NewF.BaseGV = GV;
4890 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4891 (I - F.BaseRegs.begin()));
4892 if (LU.HasFormulaWithSameRegs(NewF)) {
4893 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4894 dbgs() << '\n');
4895 LU.DeleteFormula(F);
4896 --i;
4897 --e;
4898 Any = true;
4899 break;
4900 }
4901 }
4902 }
4903 }
4904 }
4905 if (Any)
4906 LU.RecomputeRegs(LUIdx, RegUses);
4907 }
4908
4909 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4910 }
4911}
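// Editorial note (illustrative, not part of the original source): if a use has
// both the formula reg(A) + reg(5) and the formula reg(A) + 5 (with the
// constant folded into the immediate), the first formula's register set {A, 5}
// is a superset of {A}; the loop above folds the constant (or global) into
// BaseOffset (or BaseGV) and deletes the redundant superset formula.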
4912
4913/// When there are many registers for expressions like A, A+1, A+2, etc.,
4914/// allocate a single register for them.
4915void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4916 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4917 return;
4918
4919 LLVM_DEBUG(
4920 dbgs() << "The search space is too complex.\n"
4921 "Narrowing the search space by assuming that uses separated "
4922 "by a constant offset will use the same registers.\n");
4923
4924 // This is especially useful for unrolled loops.
4925
4926 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4927 LSRUse &LU = Uses[LUIdx];
4928 for (const Formula &F : LU.Formulae) {
4929 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4930 continue;
4931
4932 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4933 if (!LUThatHas)
4934 continue;
4935
4936 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4937 LU.Kind, LU.AccessTy))
4938 continue;
4939
4940 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4941
4942 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4943
4944 // Transfer the fixups of LU to LUThatHas.
4945 for (LSRFixup &Fixup : LU.Fixups) {
4946 Fixup.Offset += F.BaseOffset;
4947 LUThatHas->pushFixup(Fixup);
4948 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4949 }
4950
4951 // Delete formulae from the new use which are no longer legal.
4952 bool Any = false;
4953 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4954 Formula &F = LUThatHas->Formulae[i];
4955 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4956 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4957 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4958 LUThatHas->DeleteFormula(F);
4959 --i;
4960 --e;
4961 Any = true;
4962 }
4963 }
4964
4965 if (Any)
4966 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4967
4968 // Delete the old use.
4969 DeleteUse(LU, LUIdx);
4970 --LUIdx;
4971 --NumUses;
4972 break;
4973 }
4974 }
4975
4976 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4977}
4978
4979/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4980/// we've done more filtering, as it may be able to find more formulae to
4981/// eliminate.
4982void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4983 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4984 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4985
4986 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4987 "undesirable dedicated registers.\n");
4988
4989 FilterOutUndesirableDedicatedRegisters();
4990
4991 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4992 }
4993}
4994
4995 /// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
4996 /// pick the best one and delete the others.
4997 /// This narrowing heuristic keeps as many formulae with different
4998 /// Scale and ScaledReg pairs as possible while narrowing the search space.
4999 /// The benefit is that it is more likely to find a better solution
5000 /// from a formulae set with more Scale and ScaledReg variations than
5001 /// from a formulae set with the same Scale and ScaledReg. The winner-picking
5002 /// register heuristic will often keep the formulae with the same Scale and
5003 /// ScaledReg and filter out the others, and we want to avoid that if possible.
5004void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5005 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5006 return;
5007
5008 LLVM_DEBUG(
5009 dbgs() << "The search space is too complex.\n"
5010 "Narrowing the search space by choosing the best Formula "
5011 "from the Formulae with the same Scale and ScaledReg.\n");
5012
5013 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5014 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5015
5016 BestFormulaeTy BestFormulae;
5017#ifndef NDEBUG
5018 bool ChangedFormulae = false;
5019#endif
5020 DenseSet<const SCEV *> VisitedRegs;
5021 SmallPtrSet<const SCEV *, 16> Regs;
5022
5023 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5024 LSRUse &LU = Uses[LUIdx];
5025 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5026 dbgs() << '\n');
5027
5028 // Return true if Formula FA is better than Formula FB.
5029 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5030 // First we will try to choose the Formula with fewer new registers.
5031 // For a register used by current Formula, the more the register is
5032 // shared among LSRUses, the less we increase the register number
5033 // counter of the formula.
5034 size_t FARegNum = 0;
5035 for (const SCEV *Reg : FA.BaseRegs) {
5036 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5037 FARegNum += (NumUses - UsedByIndices.count() + 1);
5038 }
5039 size_t FBRegNum = 0;
5040 for (const SCEV *Reg : FB.BaseRegs) {
5041 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5042 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5043 }
5044 if (FARegNum != FBRegNum)
5045 return FARegNum < FBRegNum;
5046
5047 // If the new register numbers are the same, choose the Formula with
5048 // less Cost.
5049 Cost CostFA(L, SE, TTI, AMK);
5050 Cost CostFB(L, SE, TTI, AMK);
5051 Regs.clear();
5052 CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5053 Regs.clear();
5054 CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5055 return CostFA.isLess(CostFB);
5056 };
5057
5058 bool Any = false;
5059 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5060 ++FIdx) {
5061 Formula &F = LU.Formulae[FIdx];
5062 if (!F.ScaledReg)
5063 continue;
5064 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5065 if (P.second)
5066 continue;
5067
5068 Formula &Best = LU.Formulae[P.first->second];
5069 if (IsBetterThan(F, Best))
5070 std::swap(F, Best);
5071 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5072 dbgs() << "\n"
5073 " in favor of formula ";
5074 Best.print(dbgs()); dbgs() << '\n');
5075#ifndef NDEBUG
5076 ChangedFormulae = true;
5077#endif
5078 LU.DeleteFormula(F);
5079 --FIdx;
5080 --NumForms;
5081 Any = true;
5082 }
5083 if (Any)
5084 LU.RecomputeRegs(LUIdx, RegUses);
5085
5086 // Reset this to prepare for the next use.
5087 BestFormulae.clear();
5088 }
5089
5090 LLVM_DEBUG(if (ChangedFormulae) {
5091 dbgs() << "\n"
5092 "After filtering out undesirable candidates:\n";
5093 print_uses(dbgs());
5094 });
5095}
5096
5097 /// If we are over the complexity limit, filter out any post-inc preferring
5098 /// variables to only post-inc values.
5099void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5100 if (AMK != TTI::AMK_PostIndexed)
5101 return;
5102 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5103 return;
5104
5105 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5106 "Narrowing the search space by choosing the lowest "
5107 "register Formula for PostInc Uses.\n");
5108
5109 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5110 LSRUse &LU = Uses[LUIdx];
5111
5112 if (LU.Kind != LSRUse::Address)
5113 continue;
5114 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5115 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5116 continue;
5117
5118 size_t MinRegs = std::numeric_limits<size_t>::max();
5119 for (const Formula &F : LU.Formulae)
5120 MinRegs = std::min(F.getNumRegs(), MinRegs);
5121
5122 bool Any = false;
5123 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5124 ++FIdx) {
5125 Formula &F = LU.Formulae[FIdx];
5126 if (F.getNumRegs() > MinRegs) {
5127 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5128 dbgs() << "\n");
5129 LU.DeleteFormula(F);
5130 --FIdx;
5131 --NumForms;
5132 Any = true;
5133 }
5134 }
5135 if (Any)
5136 LU.RecomputeRegs(LUIdx, RegUses);
5137
5138 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5139 break;
5140 }
5141
5142 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5143}
5144
5145 /// This function deletes formulas with a high expected number of registers.
5146 /// Assuming we don't know the value of each formula (we have already deleted
5147 /// all inefficient ones), generate the probability of not selecting for each
5148 /// register.
5149/// For example,
5150/// Use1:
5151/// reg(a) + reg({0,+,1})
5152/// reg(a) + reg({-1,+,1}) + 1
5153/// reg({a,+,1})
5154/// Use2:
5155/// reg(b) + reg({0,+,1})
5156/// reg(b) + reg({-1,+,1}) + 1
5157/// reg({b,+,1})
5158/// Use3:
5159/// reg(c) + reg(b) + reg({0,+,1})
5160/// reg(c) + reg({b,+,1})
5161///
5162/// Probability of not selecting
5163/// Use1 Use2 Use3
5164/// reg(a) (1/3) * 1 * 1
5165/// reg(b) 1 * (1/3) * (1/2)
5166/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5167/// reg({-1,+,1}) (2/3) * (2/3) * 1
5168/// reg({a,+,1}) (2/3) * 1 * 1
5169/// reg({b,+,1}) 1 * (2/3) * (2/3)
5170/// reg(c) 1 * 1 * 0
5171///
5172 /// Now compute the expected number of registers for each formula:
5173 /// Note that for each use we exclude the probability of not selecting for that use.
5174 /// For example, for Use1 the probability for reg(a) would be just 1 * 1 (excluding
5175 /// the probability 1/3 of not selecting for Use1).
5176/// Use1:
5177/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5178/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5179/// reg({a,+,1}) 1
5180/// Use2:
5181/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5182/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5183/// reg({b,+,1}) 2/3
5184/// Use3:
5185/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5186/// reg(c) + reg({b,+,1}) 1 + 2/3
5187void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5188 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5189 return;
5190 // Ok, we have too many formulae on our hands to conveniently handle.
5191 // Use a rough heuristic to thin out the list.
5192
5193 // Set of Regs which will be 100% used in the final solution.
5194 // Used in each formula of a solution (in example above this is reg(c)).
5195 // We can skip them in calculations.
5196 SmallPtrSet<const SCEV *, 4> UniqRegs;
5197 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5198
5199 // Map each register to its probability of not being selected.
5200 DenseMap <const SCEV *, float> RegNumMap;
5201 for (const SCEV *Reg : RegUses) {
5202 if (UniqRegs.count(Reg))
5203 continue;
5204 float PNotSel = 1;
5205 for (const LSRUse &LU : Uses) {
5206 if (!LU.Regs.count(Reg))
5207 continue;
5208 float P = LU.getNotSelectedProbability(Reg);
5209 if (P != 0.0)
5210 PNotSel *= P;
5211 else
5212 UniqRegs.insert(Reg);
5213 }
5214 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5215 }
5216
5217 LLVM_DEBUG(
5218 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5219
5220 // Delete formulas where the expected number of registers is high.
5221 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5222 LSRUse &LU = Uses[LUIdx];
5223 // If nothing to delete - continue.
5224 if (LU.Formulae.size() < 2)
5225 continue;
5226 // This is a temporary solution to test performance. Float should be
5227 // replaced with a rounding-independent type (based on integers) to avoid
5228 // different results for different target builds.
5229 float FMinRegNum = LU.Formulae[0].getNumRegs();
5230 float FMinARegNum = LU.Formulae[0].getNumRegs();
5231 size_t MinIdx = 0;
5232 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5233 Formula &F = LU.Formulae[i];
5234 float FRegNum = 0;
5235 float FARegNum = 0;
5236 for (const SCEV *BaseReg : F.BaseRegs) {
5237 if (UniqRegs.count(BaseReg))
5238 continue;
5239 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5240 if (isa<SCEVAddRecExpr>(BaseReg))
5241 FARegNum +=
5242 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5243 }
5244 if (const SCEV *ScaledReg = F.ScaledReg) {
5245 if (!UniqRegs.count(ScaledReg)) {
5246 FRegNum +=
5247 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5248 if (isa<SCEVAddRecExpr>(ScaledReg))
5249 FARegNum +=
5250 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5251 }
5252 }
5253 if (FMinRegNum > FRegNum ||
5254 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5255 FMinRegNum = FRegNum;
5256 FMinARegNum = FARegNum;
5257 MinIdx = i;
5258 }
5259 }
5260 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5261 dbgs() << " with min reg num " << FMinRegNum << '\n');
5262 if (MinIdx != 0)
5263 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5264 while (LU.Formulae.size() != 1) {
5265 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5266 dbgs() << '\n');
5267 LU.Formulae.pop_back();
5268 }
5269 LU.RecomputeRegs(LUIdx, RegUses);
5270 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5271 Formula &F = LU.Formulae[0];
5272 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5273 // When we choose the formula, the regs become unique.
5274 UniqRegs.insert_range(F.BaseRegs);
5275 if (F.ScaledReg)
5276 UniqRegs.insert(F.ScaledReg);
5277 }
5278 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5279}
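// Editorial note (not part of the original source): tying the code above back
// to the worked example in the doc comment, for Use2 and reg(b) we get
// RegNumMap[reg(b)] = 1 * (1/3) * (1/2) = 1/6 and
// LU.getNotSelectedProbability(reg(b)) = 1/3, so reg(b) contributes
// (1/6) / (1/3) = 1/2; reg({0,+,1}) similarly contributes
// ((2/3) * (2/3) * (1/2)) / (2/3) = 1/3, matching the "1/2 + 1/3" expectation
// listed for reg(b) + reg({0,+,1}).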
5280
5281 // Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5282 // whether the addressing offset +C would be legal where the negative offset -C
5283 // is not.
5284 static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5285 ScalarEvolution &SE, const SCEV *Best,
5286 const SCEV *Reg,
5287 MemAccessTy AccessType) {
5288 if (Best->getType() != Reg->getType() ||
5289 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5290 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5291 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5292 return false;
5293 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5294 if (!Diff)
5295 return false;
5296
5297 return TTI.isLegalAddressingMode(
5298 AccessType.MemTy, /*BaseGV=*/nullptr,
5299 /*BaseOffset=*/Diff->getSExtValue(),
5300 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5301 !TTI.isLegalAddressingMode(
5302 AccessType.MemTy, /*BaseGV=*/nullptr,
5303 /*BaseOffset=*/-Diff->getSExtValue(),
5304 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5305}
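// Editorial note (illustrative, not part of the original source): assuming a
// target whose addressing mode accepts [base + imm] only for non-negative
// immediates, with Best = {x+16,+,1} and Reg = {x,+,1} the constant difference
// is +16; addresses formed from Best are reachable from Reg with the legal
// offset +16, while the reverse direction would need the illegal offset -16,
// so the helper returns true and the winner-picking heuristic below prefers
// the simpler base Reg.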
5306
5307/// Pick a register which seems likely to be profitable, and then in any use
5308/// which has any reference to that register, delete all formulae which do not
5309/// reference that register.
5310void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5311 // With all other options exhausted, loop until the system is simple
5312 // enough to handle.
5313 SmallPtrSet<const SCEV *, 4> Taken;
5314 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5315 // Ok, we have too many formulae on our hands to conveniently handle.
5316 // Use a rough heuristic to thin out the list.
5317 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5318
5319 // Pick the register which is used by the most LSRUses, which is likely
5320 // to be a good reuse register candidate.
5321 const SCEV *Best = nullptr;
5322 unsigned BestNum = 0;
5323 for (const SCEV *Reg : RegUses) {
5324 if (Taken.count(Reg))
5325 continue;
5326 if (!Best) {
5327 Best = Reg;
5328 BestNum = RegUses.getUsedByIndices(Reg).count();
5329 } else {
5330 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5331 if (Count > BestNum) {
5332 Best = Reg;
5333 BestNum = Count;
5334 }
5335
5336 // If the scores are the same, but the Reg is simpler for the target
5337 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5338 // handle +C but not -C), opt for the simpler formula.
5339 if (Count == BestNum) {
5340 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5341 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5342 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5343 Uses[LUIdx].AccessTy)) {
5344 Best = Reg;
5345 BestNum = Count;
5346 }
5347 }
5348 }
5349 }
5350 assert(Best && "Failed to find best LSRUse candidate");
5351
5352 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5353 << " will yield profitable reuse.\n");
5354 Taken.insert(Best);
5355
5356 // In any use with formulae which references this register, delete formulae
5357 // which don't reference it.
5358 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5359 LSRUse &LU = Uses[LUIdx];
5360 if (!LU.Regs.count(Best)) continue;
5361
5362 bool Any = false;
5363 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5364 Formula &F = LU.Formulae[i];
5365 if (!F.referencesReg(Best)) {
5366 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5367 LU.DeleteFormula(F);
5368 --e;
5369 --i;
5370 Any = true;
5371 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5372 continue;
5373 }
5374 }
5375
5376 if (Any)
5377 LU.RecomputeRegs(LUIdx, RegUses);
5378 }
5379
5380 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5381 }
5382}
5383
5384/// If there are an extraordinary number of formulae to choose from, use some
5385/// rough heuristics to prune down the number of formulae. This keeps the main
5386/// solver from taking an extraordinary amount of time in some worst-case
5387/// scenarios.
5388void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5389 NarrowSearchSpaceByDetectingSupersets();
5390 NarrowSearchSpaceByCollapsingUnrolledCode();
5391 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5392 if (FilterSameScaledReg)
5393 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5394 NarrowSearchSpaceByFilterPostInc();
5395 if (LSRExpNarrow)
5396 NarrowSearchSpaceByDeletingCostlyFormulas();
5397 else
5398 NarrowSearchSpaceByPickingWinnerRegs();
5399}
5400
5401/// This is the recursive solver.
5402void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5403 Cost &SolutionCost,
5404 SmallVectorImpl<const Formula *> &Workspace,
5405 const Cost &CurCost,
5406 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5407 DenseSet<const SCEV *> &VisitedRegs) const {
5408 // Some ideas:
5409 // - prune more:
5410 // - use more aggressive filtering
5411 // - sort the formula so that the most profitable solutions are found first
5412 // - sort the uses too
5413 // - search faster:
5414 // - don't compute a cost, and then compare. compare while computing a cost
5415 // and bail early.
5416 // - track register sets with SmallBitVector
5417
5418 const LSRUse &LU = Uses[Workspace.size()];
5419
5420 // If this use references any register that's already a part of the
5421 // in-progress solution, consider it a requirement that a formula must
5422 // reference that register in order to be considered. This prunes out
5423 // unprofitable searching.
5424 SmallSetVector<const SCEV *, 4> ReqRegs;
5425 for (const SCEV *S : CurRegs)
5426 if (LU.Regs.count(S))
5427 ReqRegs.insert(S);
5428
5429 SmallPtrSet<const SCEV *, 16> NewRegs;
5430 Cost NewCost(L, SE, TTI, AMK);
5431 for (const Formula &F : LU.Formulae) {
5432 // Ignore formulae which may not be ideal in terms of register reuse of
5433 // ReqRegs. The formula should use all required registers before
5434 // introducing new ones.
5435 // This can sometimes (notably when trying to favour postinc) lead to
5436 // sub-optimal decisions. In those cases it is best left to the cost
5437 // modelling to get right.
5438 if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
5439 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5440 for (const SCEV *Reg : ReqRegs) {
5441 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5442 is_contained(F.BaseRegs, Reg)) {
5443 --NumReqRegsToFind;
5444 if (NumReqRegsToFind == 0)
5445 break;
5446 }
5447 }
5448 if (NumReqRegsToFind != 0) {
5449 // If none of the formulae satisfied the required registers, then we could
5450 // clear ReqRegs and try again. Currently, we simply give up in this case.
5451 continue;
5452 }
5453 }
5454
5455 // Evaluate the cost of the current formula. If it's already worse than
5456 // the current best, prune the search at that point.
5457 NewCost = CurCost;
5458 NewRegs = CurRegs;
5459 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5460 if (NewCost.isLess(SolutionCost)) {
5461 Workspace.push_back(&F);
5462 if (Workspace.size() != Uses.size()) {
5463 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5464 NewRegs, VisitedRegs);
5465 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5466 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5467 } else {
5468 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5469 dbgs() << ".\nRegs:\n";
5470 for (const SCEV *S : NewRegs) dbgs()
5471 << "- " << *S << "\n";
5472 dbgs() << '\n');
5473
5474 SolutionCost = NewCost;
5475 Solution = Workspace;
5476 }
5477 Workspace.pop_back();
5478 }
5479 }
5480}
5481
5482/// Choose one formula from each use. Return the results in the given Solution
5483/// vector.
5484void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5485 SmallVector<const Formula *, 8> Workspace;
5486 Cost SolutionCost(L, SE, TTI, AMK);
5487 SolutionCost.Lose();
5488 Cost CurCost(L, SE, TTI, AMK);
5489 SmallPtrSet<const SCEV *, 16> CurRegs;
5490 DenseSet<const SCEV *> VisitedRegs;
5491 Workspace.reserve(Uses.size());
5492
5493 // SolveRecurse does all the work.
5494 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5495 CurRegs, VisitedRegs);
5496 if (Solution.empty()) {
5497 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5498 return;
5499 }
5500
5501 // Ok, we've now made all our decisions.
5502 LLVM_DEBUG(dbgs() << "\n"
5503 "The chosen solution requires ";
5504 SolutionCost.print(dbgs()); dbgs() << ":\n";
5505 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5506 dbgs() << " ";
5507 Uses[i].print(dbgs());
5508 dbgs() << "\n"
5509 " ";
5510 Solution[i]->print(dbgs());
5511 dbgs() << '\n';
5512 });
5513
5514 assert(Solution.size() == Uses.size() && "Malformed solution!");
5515
5516 const bool EnableDropUnprofitableSolution = [&] {
5517 switch (AllowDropSolutionIfLessProfitable) {
5518 case cl::BOU_TRUE:
5519 return true;
5520 case cl::BOU_FALSE:
5521 return false;
5522 case cl::BOU_UNSET:
5523 return TTI.shouldDropLSRSolutionIfLessProfitable();
5524 }
5525 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5526 }();
5527
5528 if (BaselineCost.isLess(SolutionCost)) {
5529 if (!EnableDropUnprofitableSolution)
5530 LLVM_DEBUG(
5531 dbgs() << "Baseline is more profitable than chosen solution, "
5532 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5533 else {
5534 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5535 "solution, dropping LSR solution.\n";);
5536 Solution.clear();
5537 }
5538 }
5539}
5540
5541 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5542 /// as we can go while still being dominated by the input positions. This helps
5543 /// canonicalize the insert position, which encourages sharing.
5544 BasicBlock::iterator
5545 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5546 const SmallVectorImpl<Instruction *> &Inputs)
5547 const {
5548 Instruction *Tentative = &*IP;
5549 while (true) {
5550 bool AllDominate = true;
5551 Instruction *BetterPos = nullptr;
5552 // Don't bother attempting to insert before a catchswitch; its basic block
5553 // cannot have other non-PHI instructions.
5554 if (isa<CatchSwitchInst>(Tentative))
5555 return IP;
5556
5557 for (Instruction *Inst : Inputs) {
5558 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5559 AllDominate = false;
5560 break;
5561 }
5562 // Attempt to find an insert position in the middle of the block,
5563 // instead of at the end, so that it can be used for other expansions.
5564 if (Tentative->getParent() == Inst->getParent() &&
5565 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5566 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5567 }
5568 if (!AllDominate)
5569 break;
5570 if (BetterPos)
5571 IP = BetterPos->getIterator();
5572 else
5573 IP = Tentative->getIterator();
5574
5575 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5576 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5577
5578 BasicBlock *IDom;
5579 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5580 if (!Rung) return IP;
5581 Rung = Rung->getIDom();
5582 if (!Rung) return IP;
5583 IDom = Rung->getBlock();
5584
5585 // Don't climb into a loop though.
5586 const Loop *IDomLoop = LI.getLoopFor(IDom);
5587 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5588 if (IDomDepth <= IPLoopDepth &&
5589 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5590 break;
5591 }
5592
5593 Tentative = IDom->getTerminator();
5594 }
5595
5596 return IP;
5597}
5598
5599/// Determine an input position which will be dominated by the operands and
5600/// which will dominate the result.
5601BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5602 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5603 // Collect some instructions which must be dominated by the
5604 // expanding replacement. These must be dominated by any operands that
5605 // will be required in the expansion.
5606 SmallVector<Instruction *, 4> Inputs;
5607 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5608 Inputs.push_back(I);
5609 if (LU.Kind == LSRUse::ICmpZero)
5610 if (Instruction *I =
5611 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5612 Inputs.push_back(I);
5613 if (LF.PostIncLoops.count(L)) {
5614 if (LF.isUseFullyOutsideLoop(L))
5615 Inputs.push_back(L->getLoopLatch()->getTerminator());
5616 else
5617 Inputs.push_back(IVIncInsertPos);
5618 }
5619 // The expansion must also be dominated by the increment positions of any
5620 // loops for which it is using post-inc mode.
5621 for (const Loop *PIL : LF.PostIncLoops) {
5622 if (PIL == L) continue;
5623
5624 // Be dominated by the loop exit.
5625 SmallVector<BasicBlock *, 4> ExitingBlocks;
5626 PIL->getExitingBlocks(ExitingBlocks);
5627 if (!ExitingBlocks.empty()) {
5628 BasicBlock *BB = ExitingBlocks[0];
5629 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5630 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5631 Inputs.push_back(BB->getTerminator());
5632 }
5633 }
5634
5635 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5636 "Insertion point must be a normal instruction");
5637
5638 // Then, climb up the immediate dominator tree as far as we can go while
5639 // still being dominated by the input positions.
5640 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5641
5642 // Don't insert instructions before PHI nodes.
5643 while (isa<PHINode>(IP)) ++IP;
5644
5645 // Ignore landingpad instructions.
5646 while (IP->isEHPad()) ++IP;
5647
5648 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5649 // IP consistent across expansions and allows the previously inserted
5650 // instructions to be reused by subsequent expansion.
5651 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5652 ++IP;
5653
5654 return IP;
5655}
5656
5657/// Emit instructions for the leading candidate expression for this LSRUse (this
5658/// is called "expanding").
5659Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5660 const Formula &F, BasicBlock::iterator IP,
5661 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5662 if (LU.RigidFormula)
5663 return LF.OperandValToReplace;
5664
5665 // Determine an input position which will be dominated by the operands and
5666 // which will dominate the result.
5667 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5668 Rewriter.setInsertPoint(&*IP);
5669
5670 // Inform the Rewriter if we have a post-increment use, so that it can
5671 // perform an advantageous expansion.
5672 Rewriter.setPostInc(LF.PostIncLoops);
5673
5674 // This is the type that the user actually needs.
5675 Type *OpTy = LF.OperandValToReplace->getType();
5676 // This will be the type that we'll initially expand to.
5677 Type *Ty = F.getType();
5678 if (!Ty)
5679 // No type known; just expand directly to the ultimate type.
5680 Ty = OpTy;
5681 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5682 // Expand directly to the ultimate type if it's the right size.
5683 Ty = OpTy;
5684 // This is the type to do integer arithmetic in.
5685 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5686
5687 // Build up a list of operands to add together to form the full base.
5688 SmallVector<const SCEV *, 8> Ops;
5689
5690 // Expand the BaseRegs portion.
5691 for (const SCEV *Reg : F.BaseRegs) {
5692 assert(!Reg->isZero() && "Zero allocated in a base register!");
5693
5694 // If we're expanding for a post-inc user, make the post-inc adjustment.
5695 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5696 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5697 }
5698
5699 // Expand the ScaledReg portion.
5700 Value *ICmpScaledV = nullptr;
5701 if (F.Scale != 0) {
5702 const SCEV *ScaledS = F.ScaledReg;
5703
5704 // If we're expanding for a post-inc user, make the post-inc adjustment.
5705 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5706 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5707
5708 if (LU.Kind == LSRUse::ICmpZero) {
5709 // Expand ScaleReg as if it was part of the base regs.
5710 if (F.Scale == 1)
5711 Ops.push_back(
5712 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5713 else {
5714 // An interesting way of "folding" with an icmp is to use a negated
5715 // scale, which we'll implement by inserting it into the other operand
5716 // of the icmp.
5717 assert(F.Scale == -1 &&
5718 "The only scale supported by ICmpZero uses is -1!");
5719 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5720 }
5721 } else {
5722 // Otherwise just expand the scaled register and an explicit scale,
5723 // which is expected to be matched as part of the address.
5724
5725 // Flush the operand list to suppress SCEVExpander hoisting address modes,
5726 // unless the addressing mode will not be folded.
5727 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5728 isAMCompletelyFolded(TTI, LU, F)) {
5729 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5730 Ops.clear();
5731 Ops.push_back(SE.getUnknown(FullV));
5732 }
5733 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5734 if (F.Scale != 1)
5735 ScaledS =
5736 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5737 Ops.push_back(ScaledS);
5738 }
5739 }
5740
5741 // Expand the GV portion.
5742 if (F.BaseGV) {
5743 // Flush the operand list to suppress SCEVExpander hoisting.
5744 if (!Ops.empty()) {
5745 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5746 Ops.clear();
5747 Ops.push_back(SE.getUnknown(FullV));
5748 }
5749 Ops.push_back(SE.getUnknown(F.BaseGV));
5750 }
5751
5752 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5753 // unfolded offsets. LSR assumes they both live next to their uses.
5754 if (!Ops.empty()) {
5755 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5756 Ops.clear();
5757 Ops.push_back(SE.getUnknown(FullV));
5758 }
5759
5760 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5761 // out at this point, or should we generate a SCEV adding together mixed
5762 // offsets?
5763 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5764 "Expanding mismatched offsets\n");
5765 // Expand the immediate portion.
5766 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5767 if (Offset.isNonZero()) {
5768 if (LU.Kind == LSRUse::ICmpZero) {
5769 // The other interesting way of "folding" with an ICmpZero is to use a
5770 // negated immediate.
5771 if (!ICmpScaledV)
5772 ICmpScaledV =
5773 ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5774 else {
5775 Ops.push_back(SE.getUnknown(ICmpScaledV));
5776 ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5777 }
5778 } else {
5779 // Just add the immediate values. These again are expected to be matched
5780 // as part of the address.
5781 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5782 }
5783 }
5784
5785 // Expand the unfolded offset portion.
5786 Immediate UnfoldedOffset = F.UnfoldedOffset;
5787 if (UnfoldedOffset.isNonZero()) {
5788 // Just add the immediate values.
5789 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5790 }
5791
5792 // Emit instructions summing all the operands.
5793 const SCEV *FullS = Ops.empty() ?
5794 SE.getConstant(IntTy, 0) :
5795 SE.getAddExpr(Ops);
5796 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5797
5798 // We're done expanding now, so reset the rewriter.
5799 Rewriter.clearPostInc();
5800
5801 // An ICmpZero Formula represents an ICmp which we're handling as a
5802 // comparison against zero. Now that we've expanded an expression for that
5803 // form, update the ICmp's other operand.
5804 if (LU.Kind == LSRUse::ICmpZero) {
5805 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5806 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5807 DeadInsts.emplace_back(OperandIsInstr);
5808 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5809 "a scale at the same time!");
5810 if (F.Scale == -1) {
5811 if (ICmpScaledV->getType() != OpTy) {
5812 Instruction *Cast = CastInst::Create(
5813 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5814 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5815 ICmpScaledV = Cast;
5816 }
5817 CI->setOperand(1, ICmpScaledV);
5818 } else {
5819 // A scale of 1 means that the scale has been expanded as part of the
5820 // base regs.
5821 assert((F.Scale == 0 || F.Scale == 1) &&
5822 "ICmp does not support folding a global value and "
5823 "a scale at the same time!");
5824 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5825 -(uint64_t)Offset.getFixedValue());
5826 if (C->getType() != OpTy) {
5827 C = ConstantFoldCastOperand(
5828 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5829 CI->getDataLayout());
5830 assert(C && "Cast of ConstantInt should have folded");
5831 }
5832
5833 CI->setOperand(1, C);
5834 }
5835 }
5836
5837 return FullV;
5838}
5839
5840/// Helper for Rewrite. PHI nodes are special because the use of their operands
5841/// effectively happens in their predecessor blocks, so the expression may need
5842/// to be expanded in multiple places.
5843void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5844 const LSRFixup &LF, const Formula &F,
5845 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5846 DenseMap<BasicBlock *, Value *> Inserted;
5847
5848 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5849 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5850 bool needUpdateFixups = false;
5851 BasicBlock *BB = PN->getIncomingBlock(i);
5852
5853 // If this is a critical edge, split the edge so that we do not insert
5854 // the code on all predecessor/successor paths. We do this unless this
5855 // is the canonical backedge for this loop, which complicates post-inc
5856 // users.
5857 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5858 !isa<IndirectBrInst>(BB->getTerminator()) &&
5859 !isa<CatchSwitchInst>(BB->getTerminator())) {
5860 BasicBlock *Parent = PN->getParent();
5861 Loop *PNLoop = LI.getLoopFor(Parent);
5862 if (!PNLoop || Parent != PNLoop->getHeader()) {
5863 // Split the critical edge.
5864 BasicBlock *NewBB = nullptr;
5865 if (!Parent->isLandingPad()) {
5866 NewBB =
5867 SplitCriticalEdge(BB, Parent,
5868 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5869 .setMergeIdenticalEdges()
5870 .setKeepOneInputPHIs());
5871 } else {
5872 SmallVector<BasicBlock*, 2> NewBBs;
5873 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5874 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5875 NewBB = NewBBs[0];
5876 }
5877 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5878 // phi predecessors are identical. The simple thing to do is skip
5879 // splitting in this case rather than complicate the API.
5880 if (NewBB) {
5881 // If PN is outside of the loop and BB is in the loop, we want to
5882 // move the block to be immediately before the PHI block, not
5883 // immediately after BB.
5884 if (L->contains(BB) && !L->contains(PN))
5885 NewBB->moveBefore(PN->getParent());
5886
5887 // Splitting the edge can reduce the number of PHI entries we have.
5888 e = PN->getNumIncomingValues();
5889 BB = NewBB;
5890 i = PN->getBasicBlockIndex(BB);
5891
5892 needUpdateFixups = true;
5893 }
5894 }
5895 }
5896
5897 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5898 Inserted.try_emplace(BB);
5899 if (!Pair.second)
5900 PN->setIncomingValue(i, Pair.first->second);
5901 else {
5902 Value *FullV =
5903 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5904
5905 // If this is reuse-by-noop-cast, insert the noop cast.
5906 Type *OpTy = LF.OperandValToReplace->getType();
5907 if (FullV->getType() != OpTy)
5908 FullV = CastInst::Create(
5909 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5910 LF.OperandValToReplace->getType(), "tmp",
5911 BB->getTerminator()->getIterator());
5912
5913 // If the incoming block for this value is not in the loop, it means the
5914 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5915 // the inserted value.
5916 if (auto *I = dyn_cast<Instruction>(FullV))
5917 if (L->contains(I) && !L->contains(BB))
5918 InsertedNonLCSSAInsts.insert(I);
5919
5920 PN->setIncomingValue(i, FullV);
5921 Pair.first->second = FullV;
5922 }
5923
5924 // If LSR splits a critical edge and the phi node has other pending
5925 // fixup operands, we need to update those pending fixups. Otherwise
5926 // formulae will not be implemented completely and some instructions
5927 // will not be eliminated.
5928 if (needUpdateFixups) {
5929 for (LSRUse &LU : Uses)
5930 for (LSRFixup &Fixup : LU.Fixups)
5931 // If fixup is supposed to rewrite some operand in the phi
5932 // that was just updated, it may be already moved to
5933 // another phi node. Such fixup requires update.
5934 if (Fixup.UserInst == PN) {
5935 // Check if the operand we try to replace still exists in the
5936 // original phi.
5937 bool foundInOriginalPHI = false;
5938 for (const auto &val : PN->incoming_values())
5939 if (val == Fixup.OperandValToReplace) {
5940 foundInOriginalPHI = true;
5941 break;
5942 }
5943
5944 // If fixup operand found in original PHI - nothing to do.
5945 if (foundInOriginalPHI)
5946 continue;
5947
5948 // Otherwise it might be moved to another PHI and requires update.
5949 // If fixup operand not found in any of the incoming blocks that
5950 // means we have already rewritten it - nothing to do.
5951 for (const auto &Block : PN->blocks())
5952 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5953 ++I) {
5954 PHINode *NewPN = cast<PHINode>(I);
5955 for (const auto &val : NewPN->incoming_values())
5956 if (val == Fixup.OperandValToReplace)
5957 Fixup.UserInst = NewPN;
5958 }
5959 }
5960 }
5961 }
5962}
5963
5964/// Emit instructions for the leading candidate expression for this LSRUse (this
5965/// is called "expanding"), and update the UserInst to reference the newly
5966/// expanded value.
5967void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5968 const Formula &F,
5969 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5970 // First, find an insertion point that dominates UserInst. For PHI nodes,
5971 // find the nearest block which dominates all the relevant uses.
5972 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5973 RewriteForPHI(PN, LU, LF, F, DeadInsts);
5974 } else {
5975 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
5976
5977 // If this is reuse-by-noop-cast, insert the noop cast.
5978 Type *OpTy = LF.OperandValToReplace->getType();
5979 if (FullV->getType() != OpTy) {
5980 Instruction *Cast =
5981 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
5982 FullV, OpTy, "tmp", LF.UserInst->getIterator());
5983 FullV = Cast;
5984 }
5985
5986 // Update the user. ICmpZero is handled specially here (for now) because
5987 // Expand may have updated one of the operands of the icmp already, and
5988 // its new value may happen to be equal to LF.OperandValToReplace, in
5989 // which case doing replaceUsesOfWith leads to replacing both operands
5990 // with the same value. TODO: Reorganize this.
5991 if (LU.Kind == LSRUse::ICmpZero)
5992 LF.UserInst->setOperand(0, FullV);
5993 else
5994 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
5995 }
5996
5997 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
5998 DeadInsts.emplace_back(OperandIsInstr);
5999}
6000
6001// Try to hoist the IVInc to the loop header if all IVInc users are in
6002// the loop header. This helps the backend generate post-index loads/stores
6003// when the latch block is different from the loop header block.
6004static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6005 const LSRUse &LU, Instruction *IVIncInsertPos,
6006 Loop *L) {
6007 if (LU.Kind != LSRUse::Address)
6008 return false;
6009
6010 // For now this code does the conservative optimization and only works for
6011 // the header block. Later we can hoist the IVInc to a block that
6012 // post-dominates all users.
6013 BasicBlock *LHeader = L->getHeader();
6014 if (IVIncInsertPos->getParent() == LHeader)
6015 return false;
6016
6017 if (!Fixup.OperandValToReplace ||
6018 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6019 Instruction *UI = cast<Instruction>(U);
6020 return UI->getParent() != LHeader;
6021 }))
6022 return false;
6023
6024 Instruction *I = Fixup.UserInst;
6025 Type *Ty = I->getType();
6026 return (isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6027 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty));
6028}
6029
6030/// Rewrite all the fixup locations with new values, following the chosen
6031/// solution.
6032void LSRInstance::ImplementSolution(
6033 const SmallVectorImpl<const Formula *> &Solution) {
6034 // Keep track of instructions we may have made dead, so that
6035 // we can remove them after we are done working.
6036 SmallVector<WeakTrackingVH, 16> DeadInsts;
6037
6038 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6039 for (const IVChain &Chain : IVChainVec) {
6040 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6041 Rewriter.setChainedPhi(PN);
6042 }
6043
6044 // Expand the new value definitions and update the users.
6045 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6046 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6047 Instruction *InsertPos =
6048 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6049 ? L->getHeader()->getTerminator()
6050 : IVIncInsertPos;
6051 Rewriter.setIVIncInsertPos(L, InsertPos);
6052 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6053 Changed = true;
6054 }
6055
6056 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6057 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6058
6059 for (const IVChain &Chain : IVChainVec) {
6060 GenerateIVChain(Chain, DeadInsts);
6061 Changed = true;
6062 }
6063
6064 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6065 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6066 ScalarEvolutionIVs.push_back(IV);
6067
6068 // Clean up after ourselves. This must be done before deleting any
6069 // instructions.
6070 Rewriter.clear();
6071
6072 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6073 &TLI, MSSAU);
6074
6075 // In our cost analysis above, we assume that each addrec consumes exactly
6076 // one register, and arrange to have increments inserted just before the
6077 // latch to maximize the chance this is true. However, if we reused
6078 // existing IVs, we now need to move the increments to match our
6079 // expectations. Otherwise, our cost modeling results in us having
6080 // chosen a non-optimal result for the actual schedule. (And yes, this
6081 // scheduling decision does impact later codegen.)
6082 for (PHINode &PN : L->getHeader()->phis()) {
6083 BinaryOperator *BO = nullptr;
6084 Value *Start = nullptr, *Step = nullptr;
6085 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6086 continue;
6087
6088 switch (BO->getOpcode()) {
6089 case Instruction::Sub:
6090 if (BO->getOperand(0) != &PN)
6091 // sub is non-commutative - match handling elsewhere in LSR
6092 continue;
6093 break;
6094 case Instruction::Add:
6095 break;
6096 default:
6097 continue;
6098 };
6099
6100 if (!isa<Constant>(Step))
6101 // If not a constant step, might increase register pressure
6102 // (We assume constants have been canonicalized to RHS)
6103 continue;
6104
6105 if (BO->getParent() == IVIncInsertPos->getParent())
6106 // Only bother moving across blocks. Isel can handle block local case.
6107 continue;
6108
6109 // Can we legally schedule inc at the desired point?
6110 if (!llvm::all_of(BO->uses(),
6111 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6112 continue;
6113 BO->moveBefore(IVIncInsertPos->getIterator());
6114 Changed = true;
6115 }
6116
6117
6118}
6119
6120LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6121 DominatorTree &DT, LoopInfo &LI,
6122 const TargetTransformInfo &TTI, AssumptionCache &AC,
6123 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6124 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6125 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6126 ? PreferredAddresingMode
6127 : TTI.getPreferredAddressingMode(L, &SE)),
6128 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6129 BaselineCost(L, SE, TTI, AMK) {
6130 // If LoopSimplify form is not available, stay out of trouble.
6131 if (!L->isLoopSimplifyForm())
6132 return;
6133
6134 // If there's no interesting work to be done, bail early.
6135 if (IU.empty()) return;
6136
6137 // If there's too much analysis to be done, bail early. We won't be able to
6138 // model the problem anyway.
6139 unsigned NumUsers = 0;
6140 for (const IVStrideUse &U : IU) {
6141 if (++NumUsers > MaxIVUsers) {
6142 (void)U;
6143 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6144 << "\n");
6145 return;
6146 }
6147 // Bail out if we have a PHI on an EHPad that gets a value from a
6148 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6149 // no good place to stick any instructions.
6150 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6151 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6152 if (isa<FuncletPadInst>(FirstNonPHI) ||
6153 isa<CatchSwitchInst>(FirstNonPHI))
6154 for (BasicBlock *PredBB : PN->blocks())
6155 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6156 return;
6157 }
6158 }
6159
6160 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6161 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6162 dbgs() << ":\n");
6163
6164 // Check if we expect this loop to use a hardware loop instruction, which will
6165 // be used when calculating the costs of formulas.
6166 HardwareLoopInfo HWLoopInfo(L);
6167 HardwareLoopProfitable =
6168 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6169
6170 // Configure SCEVExpander already now, so the correct mode is used for
6171 // isSafeToExpand() checks.
6172#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6173 Rewriter.setDebugType(DEBUG_TYPE);
6174#endif
6175 Rewriter.disableCanonicalMode();
6176 Rewriter.enableLSRMode();
6177
6178 // First, perform some low-level loop optimizations.
6179 OptimizeShadowIV();
6180 OptimizeLoopTermCond();
6181
6182 // If loop preparation eliminates all interesting IV users, bail.
6183 if (IU.empty()) return;
6184
6185 // Skip nested loops until we can model them better with formulae.
6186 if (!L->isInnermost()) {
6187 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6188 return;
6189 }
6190
6191 // Start collecting data and preparing for the solver.
6192 // If the number of registers is not the major cost, we cannot benefit from
6193 // the current profitable chain optimization, which is based on the number
6194 // of registers.
6195 // FIXME: add profitable chain optimization for other kinds of major cost,
6196 // for example the number of instructions.
6197 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6198 CollectChains();
6199 CollectInterestingTypesAndFactors();
6200 CollectFixupsAndInitialFormulae();
6201 CollectLoopInvariantFixupsAndFormulae();
6202
6203 if (Uses.empty())
6204 return;
6205
6206 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6207 print_uses(dbgs()));
6208 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6209 BaselineCost.print(dbgs()); dbgs() << "\n");
6210
6211 // Now use the reuse data to generate a bunch of interesting ways
6212 // to formulate the values needed for the uses.
6213 GenerateAllReuseFormulae();
6214
6215 FilterOutUndesirableDedicatedRegisters();
6216 NarrowSearchSpaceUsingHeuristics();
6217
6218 SmallVector<const Formula *, 8> Solution;
6219 Solve(Solution);
6220
6221 // Release memory that is no longer needed.
6222 Factors.clear();
6223 Types.clear();
6224 RegUses.clear();
6225
6226 if (Solution.empty())
6227 return;
6228
6229#ifndef NDEBUG
6230 // Formulae should be legal.
6231 for (const LSRUse &LU : Uses) {
6232 for (const Formula &F : LU.Formulae)
6233 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6234 F) && "Illegal formula generated!");
6235 };
6236#endif
6237
6238 // Now that we've decided what we want, make it so.
6239 ImplementSolution(Solution);
6240}
6241
6242#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6243void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6244 if (Factors.empty() && Types.empty()) return;
6245
6246 OS << "LSR has identified the following interesting factors and types: ";
6247 bool First = true;
6248
6249 for (int64_t Factor : Factors) {
6250 if (!First) OS << ", ";
6251 First = false;
6252 OS << '*' << Factor;
6253 }
6254
6255 for (Type *Ty : Types) {
6256 if (!First) OS << ", ";
6257 First = false;
6258 OS << '(' << *Ty << ')';
6259 }
6260 OS << '\n';
6261}
6262
6263void LSRInstance::print_fixups(raw_ostream &OS) const {
6264 OS << "LSR is examining the following fixup sites:\n";
6265 for (const LSRUse &LU : Uses)
6266 for (const LSRFixup &LF : LU.Fixups) {
6267 dbgs() << " ";
6268 LF.print(OS);
6269 OS << '\n';
6270 }
6271}
6272
6273void LSRInstance::print_uses(raw_ostream &OS) const {
6274 OS << "LSR is examining the following uses:\n";
6275 for (const LSRUse &LU : Uses) {
6276 dbgs() << " ";
6277 LU.print(OS);
6278 OS << '\n';
6279 for (const Formula &F : LU.Formulae) {
6280 OS << " ";
6281 F.print(OS);
6282 OS << '\n';
6283 }
6284 }
6285}
6286
6287void LSRInstance::print(raw_ostream &OS) const {
6288 print_factors_and_types(OS);
6289 print_fixups(OS);
6290 print_uses(OS);
6291}
6292
6293LLVM_DUMP_METHOD void LSRInstance::dump() const {
6294 print(errs()); errs() << '\n';
6295}
6296#endif
6297
6298namespace {
6299
6300class LoopStrengthReduce : public LoopPass {
6301public:
6302 static char ID; // Pass ID, replacement for typeid
6303
6304 LoopStrengthReduce();
6305
6306private:
6307 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6308 void getAnalysisUsage(AnalysisUsage &AU) const override;
6309};
6310
6311} // end anonymous namespace
6312
6313LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6315}
6316
6317void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6318 // We split critical edges, so we change the CFG. However, we do update
6319 // many analyses if they are around.
6320 AU.addPreservedID(LoopSimplifyID);
6321
6322 AU.addRequired<LoopInfoWrapperPass>();
6323 AU.addPreserved<LoopInfoWrapperPass>();
6324 AU.addRequiredID(LoopSimplifyID);
6325 AU.addRequired<DominatorTreeWrapperPass>();
6326 AU.addPreserved<DominatorTreeWrapperPass>();
6327 AU.addRequired<ScalarEvolutionWrapperPass>();
6328 AU.addPreserved<ScalarEvolutionWrapperPass>();
6329 AU.addRequired<AssumptionCacheTracker>();
6330 AU.addRequired<TargetLibraryInfoWrapperPass>();
6331 // Requiring LoopSimplify a second time here prevents IVUsers from running
6332 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6333 AU.addRequiredID(LoopSimplifyID);
6334 AU.addRequired<IVUsersWrapperPass>();
6335 AU.addPreserved<IVUsersWrapperPass>();
6336 AU.addRequired<TargetTransformInfoWrapperPass>();
6337 AU.addPreserved<MemorySSAWrapperPass>();
6338}
6339
6340namespace {
6341
6342/// Enables more convenient iteration over a DWARF expression vector.
6343 static llvm::iterator_range<llvm::DIExpression::expr_op_iterator>
6344 ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6345 llvm::DIExpression::expr_op_iterator Begin =
6346 llvm::DIExpression::expr_op_iterator(Expr.begin());
6347 llvm::DIExpression::expr_op_iterator End =
6348 llvm::DIExpression::expr_op_iterator(Expr.end());
6349 return {Begin, End};
6350}
6351
6352struct SCEVDbgValueBuilder {
6353 SCEVDbgValueBuilder() = default;
6354 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6355
6356 void clone(const SCEVDbgValueBuilder &Base) {
6357 LocationOps = Base.LocationOps;
6358 Expr = Base.Expr;
6359 }
6360
6361 void clear() {
6362 LocationOps.clear();
6363 Expr.clear();
6364 }
6365
6366 /// The DIExpression as we translate the SCEV.
6367 SmallVector<uint64_t, 2> Expr;
6368 /// The location ops of the DIExpression.
6369 SmallVector<Value *, 2> LocationOps;
6370
6371 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6372 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6373
6374 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6375 /// in the set of values referenced by the expression.
6376 void pushLocation(llvm::Value *V) {
6377 Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6378 auto *It = llvm::find(LocationOps, V);
6379 unsigned ArgIndex = 0;
6380 if (It != LocationOps.end()) {
6381 ArgIndex = std::distance(LocationOps.begin(), It);
6382 } else {
6383 ArgIndex = LocationOps.size();
6384 LocationOps.push_back(V);
6385 }
6386 Expr.push_back(ArgIndex);
6387 }
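// For example (values hypothetical): pushing %a, then %b, then %a again
// appends "DW_OP_LLVM_arg 0, DW_OP_LLVM_arg 1, DW_OP_LLVM_arg 0" to Expr,
// while LocationOps ends up as [%a, %b]; repeated values reuse their index.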
6388
6389 void pushValue(const SCEVUnknown *U) {
6390 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6391 pushLocation(V);
6392 }
6393
6394 bool pushConst(const SCEVConstant *C) {
6395 if (C->getAPInt().getSignificantBits() > 64)
6396 return false;
6397 Expr.push_back(llvm::dwarf::DW_OP_consts);
6398 Expr.push_back(C->getAPInt().getSExtValue());
6399 return true;
6400 }
6401
6402 // Iterating the expression as DWARF ops is convenient when updating
6403 // DWARF_OP_LLVM_args.
6404 llvm::iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6405 return ToDwarfOpIter(Expr);
6406 }
6407
6408 /// Several SCEV types are sequences of the same arithmetic operator applied
6409 /// to constants and values that may be extended or truncated.
6410 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6411 uint64_t DwarfOp) {
6412 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6413 "Expected arithmetic SCEV type");
6414 bool Success = true;
6415 unsigned EmitOperator = 0;
6416 for (const auto &Op : CommExpr->operands()) {
6417 Success &= pushSCEV(Op);
6418
6419 if (EmitOperator >= 1)
6420 pushOperator(DwarfOp);
6421 ++EmitOperator;
6422 }
6423 return Success;
6424 }
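// For example, a three-operand add SCEV (A + B + C) with DwarfOp set to
// DW_OP_plus is emitted in postfix form as:
//   <ops for A> <ops for B> DW_OP_plus <ops for C> DW_OP_plus
// which leaves a single summed value on the DWARF expression stack.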
6425
6426 // TODO: Identify and omit noop casts.
6427 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6428 const llvm::SCEV *Inner = C->getOperand(0);
6429 const llvm::Type *Type = C->getType();
6430 uint64_t ToWidth = Type->getIntegerBitWidth();
6431 bool Success = pushSCEV(Inner);
6432 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6433 IsSigned ? llvm::dwarf::DW_ATE_signed
6434 : llvm::dwarf::DW_ATE_unsigned};
6435 for (const auto &Op : CastOps)
6436 pushOperator(Op);
6437 return Success;
6438 }
6439
6440 // TODO: MinMax - although these haven't been encountered in the test suite.
6441 bool pushSCEV(const llvm::SCEV *S) {
6442 bool Success = true;
6443 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6444 Success &= pushConst(StartInt);
6445
6446 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6447 if (!U->getValue())
6448 return false;
6449 pushLocation(U->getValue());
6450
6451 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6452 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6453
6454 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6455 Success &= pushSCEV(UDiv->getLHS());
6456 Success &= pushSCEV(UDiv->getRHS());
6457 pushOperator(llvm::dwarf::DW_OP_div);
6458
6459 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6460 // Assert if a new and unknown SCEVCastExpr type is encountered.
6461 assert((isa<SCEVTruncateExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast) ||
6462 isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVPtrToIntExpr>(Cast)) &&
6463 "Unexpected cast type in SCEV.");
6464 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6465
6466 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6467 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6468
6469 } else if (isa<SCEVAddRecExpr>(S)) {
6470 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6471 // unsupported.
6472 return false;
6473
6474 } else {
6475 return false;
6476 }
6477 return Success;
6478 }
6479
6480 /// Return true if the combination of arithmetic operator and underlying
6481 /// SCEV constant value is an identity function.
6482 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6483 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6484 if (C->getAPInt().getSignificantBits() > 64)
6485 return false;
6486 int64_t I = C->getAPInt().getSExtValue();
6487 switch (Op) {
6488 case llvm::dwarf::DW_OP_plus:
6489 case llvm::dwarf::DW_OP_minus:
6490 return I == 0;
6491 case llvm::dwarf::DW_OP_mul:
6492 case llvm::dwarf::DW_OP_div:
6493 return I == 1;
6494 }
6495 }
6496 return false;
6497 }
6498
6499 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6500 /// builder's expression stack. The stack should already contain an
6501 /// expression for the iteration count, so that it can be multiplied by
6502 /// the stride and added to the start.
6503 /// Components of the expression are omitted if they are an identity function.
6504 /// Chain (non-affine) SCEVs are not supported.
6505 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6506 assert(SAR.isAffine() && "Expected affine SCEV");
6507 const SCEV *Start = SAR.getStart();
6508 const SCEV *Stride = SAR.getStepRecurrence(SE);
6509
6510 // Skip pushing arithmetic noops.
6511 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6512 if (!pushSCEV(Stride))
6513 return false;
6514 pushOperator(llvm::dwarf::DW_OP_mul);
6515 }
6516 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6517 if (!pushSCEV(Start))
6518 return false;
6519 pushOperator(llvm::dwarf::DW_OP_plus);
6520 }
6521 return true;
6522 }
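// Worked example: for a location whose SCEV is {%start,+,%stride} and a
// builder whose stack already evaluates to the iteration count "iter", the
// ops appended above compute  iter * %stride + %start , i.e. the value the
// variable held on the current iteration. When the stride is 1 or the start
// is 0, the corresponding multiply or add is skipped as an identity.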
6523
6524 /// Create an expression that is an offset from a value (usually the IV).
6525 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6526 pushLocation(OffsetValue);
6527 DIExpression::appendOffset(Expr, Offset);
6528 LLVM_DEBUG(
6529 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6530 << std::to_string(Offset) << "\n");
6531 }
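// For example, a variable that LSR proved to be "IV plus 16" is described by
// pushing the IV as location op 0 and then appending the DWARF ops that add
// the constant 16 (via DIExpression::appendOffset).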
6532
6533 /// Combine a translation of the SCEV and the IV to create an expression that
6534 /// recovers a location's value.
6535 /// returns true if an expression was created.
6536 bool createIterCountExpr(const SCEV *S,
6537 const SCEVDbgValueBuilder &IterationCount,
6538 ScalarEvolution &SE) {
6539 // SCEVs for SSA values are most frequently of the form
6540 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6541 // This is because %a is a PHI node that is not the IV. However, these
6542 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6543 // so it's not expected that this point will be reached.
6544 if (!isa<SCEVAddRecExpr>(S))
6545 return false;
6546
6547 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6548 << '\n');
6549
6550 const auto *Rec = cast<SCEVAddRecExpr>(S);
6551 if (!Rec->isAffine())
6552 return false;
6553
6554 if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6555 return false;
6556
6557 // Initialise a new builder with the iteration count expression. In
6558 // combination with the value's SCEV this enables recovery.
6559 clone(IterationCount);
6560 if (!SCEVToValueExpr(*Rec, SE))
6561 return false;
6562
6563 return true;
6564 }
6565
6566 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6567 /// builder's expression stack. The stack should already contain an
6568 /// expression for the iteration count, so that it can be multiplied by
6569 /// the stride and added to the start.
6570 /// Components of the expression are omitted if they are an identity function.
6571 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6572 ScalarEvolution &SE) {
6573 assert(SAR.isAffine() && "Expected affine SCEV");
6574 const SCEV *Start = SAR.getStart();
6575 const SCEV *Stride = SAR.getStepRecurrence(SE);
6576
6577 // Skip pushing arithmetic noops.
6578 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6579 if (!pushSCEV(Start))
6580 return false;
6581 pushOperator(llvm::dwarf::DW_OP_minus);
6582 }
6583 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6584 if (!pushSCEV(Stride))
6585 return false;
6586 pushOperator(llvm::dwarf::DW_OP_div);
6587 }
6588 return true;
6589 }
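// Worked example: if the post-LSR induction variable has the SCEV
// {%start,+,%stride}, the builder (which already holds the IV as location
// op 0) is extended to compute  (IV - %start) / %stride , i.e. the current
// iteration count; the subtraction is skipped when %start is 0 and the
// division when %stride is 1.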
6590
6591 // Append the current expression and locations to a location list and an
6592 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6593 // the locations already present in the destination list.
6594 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6595 SmallVectorImpl<Value *> &DestLocations) {
6596 assert(!DestLocations.empty() &&
6597 "Expected the locations vector to contain the IV");
6598 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6599 // modified to account for the locations already in the destination vector.
6600 // All builders contain the IV as the first location op.
6601 assert(!LocationOps.empty() &&
6602 "Expected the location ops to contain the IV.");
6603 // DestIndexMap[n] contains the index in DestLocations for the nth
6604 // location in this SCEVDbgValueBuilder.
6605 SmallVector<uint64_t, 2> DestIndexMap;
6606 for (const auto &Op : LocationOps) {
6607 auto It = find(DestLocations, Op);
6608 if (It != DestLocations.end()) {
6609 // Location already exists in DestLocations, reuse existing ArgIndex.
6610 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6611 continue;
6612 }
6613 // Location is not in DestLocations, add it.
6614 DestIndexMap.push_back(DestLocations.size());
6615 DestLocations.push_back(Op);
6616 }
6617
6618 for (const auto &Op : expr_ops()) {
6619 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6620 Op.appendToVector(DestExpr);
6621 continue;
6622 }
6623
6624 DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6625 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6626 // DestIndexMap[n] contains its new index in DestLocations.
6627 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6628 DestExpr.push_back(NewIndex);
6629 }
6630 }
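// Index-remapping example (values hypothetical): if this builder's
// LocationOps are [IV, %a] and DestLocations already holds [IV, %b], then %a
// is appended at destination index 2 and every "DW_OP_LLVM_arg 1" in this
// expression is rewritten as "DW_OP_LLVM_arg 2"; references to the IV keep
// index 0.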
6631};
6632
6633/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6634/// and DIExpression.
6635struct DVIRecoveryRec {
6636 DVIRecoveryRec(DbgVariableRecord *DVR)
6637 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6638
6639 DbgVariableRecord *DbgRef;
6640 DIExpression *Expr;
6641 bool HadLocationArgList;
6642 SmallVector<WeakVH, 2> LocationOps;
6643 SmallVector<const llvm::SCEV *, 2> SCEVs;
6644 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6645
6646 void clear() {
6647 for (auto &RE : RecoveryExprs)
6648 RE.reset();
6649 RecoveryExprs.clear();
6650 }
6651
6652 ~DVIRecoveryRec() { clear(); }
6653};
6654} // namespace
6655
6656/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6657/// This helps in determining if a DIArglist is necessary or can be omitted from
6658/// the dbg.value.
6659 static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6660 auto expr_ops = ToDwarfOpIter(Expr);
6661 unsigned Count = 0;
6662 for (auto Op : expr_ops)
6663 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6664 Count++;
6665 return Count;
6666}
6667
6668/// Overwrites DVI with the location and Ops as the DIExpression. This will
6669/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6670/// because a DIArglist is not created for the first argument of the dbg.value.
6671template <typename T>
6672static void updateDVIWithLocation(T &DbgVal, Value *Location,
6673 SmallVectorImpl<uint64_t> &Ops) {
6674 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6675 "contain any DW_OP_llvm_arg operands.");
6676 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6677 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6679}
6680
6681/// Overwrite DVI with locations placed into a DIArglist.
6682template <typename T>
6683static void updateDVIWithLocations(T &DbgVal,
6684 SmallVectorImpl<Value *> &Locations,
6685 SmallVectorImpl<uint64_t> &Ops) {
6686 assert(numLLVMArgOps(Ops) != 0 &&
6687 "Expected expression that references DIArglist locations using "
6688 "DW_OP_llvm_arg operands.");
6689 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6690 for (Value *V : Locations)
6691 MetadataLocs.push_back(ValueAsMetadata::get(V));
6692 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6693 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6694 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6695}
6696
6697/// Write the new expression and new location ops for the dbg.value. If possible
6698 /// reduce the size of the dbg.value by omitting the DIArglist. The DIArglist
6699 /// can be omitted if:
6700 /// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6701/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6702static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6703 SmallVectorImpl<Value *> &NewLocationOps,
6704 SmallVectorImpl<uint64_t> &NewExpr) {
6705 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6706 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6707 if (NumLLVMArgs == 0) {
6708 // Location assumed to be on the stack.
6709 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6710 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6711 // There is only a single DW_OP_llvm_arg at the start of the expression,
6712 // so it can be omitted along with DIArglist.
6713 assert(NewExpr[1] == 0 &&
6714 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6715 llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6716 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6717 } else {
6718 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6719 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6720 }
6721
6722 // If the DIExpression was previously empty then add the stack terminator.
6723 // Non-empty expressions have only had elements inserted into them and so
6724 // the terminator should already be present e.g. stack_value or fragment.
6725 DIExpression *SalvageExpr = DbgVal->getExpression();
6726 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6727 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6728 DbgVal->setExpression(SalvageExpr);
6729 }
6730}
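// For example (records hypothetical): a salvaged expression that is just
// "DW_OP_LLVM_arg 0, DW_OP_stack_value" with one location collapses to a
// plain location operand plus "DW_OP_stack_value", avoiding a DIArgList,
// whereas "DW_OP_LLVM_arg 0, DW_OP_LLVM_arg 1, DW_OP_plus, ..." keeps the
// DIArgList form.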
6731
6732/// Cached location ops may be erased during LSR, in which case a poison is
6733/// required when restoring from the cache. The type of that location is no
6734/// longer available, so just use int8. The poison will be replaced by one or
6735/// more locations later when a SCEVDbgValueBuilder selects alternative
6736/// locations to use for the salvage.
6737 static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6738 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6739}
6740
6741/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6742static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6743 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6744 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6745 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6746 assert(DVIRec.Expr && "Expected an expression");
6747 DbgVal->setExpression(DVIRec.Expr);
6748
6749 // Even a single location-op may be inside a DIArgList and referenced with
6750 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6751 if (!DVIRec.HadLocationArgList) {
6752 assert(DVIRec.LocationOps.size() == 1 &&
6753 "Unexpected number of location ops.");
6754 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6755 // this case was not present before, so force the location back to a
6756 // single uncontained Value.
6757 Value *CachedValue =
6758 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6759 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6760 } else {
6761 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6762 for (WeakVH VH : DVIRec.LocationOps) {
6763 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6764 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6765 }
6766 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6767 DbgVal->setRawLocation(
6768 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6769 }
6770 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6771}
6772
6773 static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6774 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6775 const SCEV *SCEVInductionVar,
6776 SCEVDbgValueBuilder IterCountExpr) {
6777
6778 if (!DVIRec.DbgRef->isKillLocation())
6779 return false;
6780
6781 // LSR may have caused several changes to the dbg.value in the failed salvage
6782 // attempt. So restore the DIExpression, the location ops and also the
6783 // location ops format, which is always DIArglist for multiple ops, but only
6784 // sometimes for a single op.
6785 restorePreTransformState(DVIRec);
6786
6787 // LocationOpIndexMap[i] will store the post-LSR location index of
6788 // the non-optimised out location at pre-LSR index i.
6789 SmallVector<int64_t, 2> LocationOpIndexMap;
6790 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6791 SmallVector<Value *, 2> NewLocationOps;
6792 NewLocationOps.push_back(LSRInductionVar);
6793
6794 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6795 WeakVH VH = DVIRec.LocationOps[i];
6796 // Place the locations not optimised out in the list first, avoiding
6797 // inserts later. The map is used to update the DIExpression's
6798 // DW_OP_LLVM_arg arguments as the expression is updated.
6799 if (VH && !isa<UndefValue>(VH)) {
6800 NewLocationOps.push_back(VH);
6801 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6802 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6803 << " now at index " << LocationOpIndexMap[i] << "\n");
6804 continue;
6805 }
6806
6807 // It's possible that a value referred to in the SCEV may have been
6808 // optimised out by LSR.
6809 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6810 SE.containsUndefs(DVIRec.SCEVs[i])) {
6811 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6812 << " refers to a location that is now undef or erased. "
6813 "Salvage abandoned.\n");
6814 return false;
6815 }
6816
6817 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6818 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6819
6820 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6821 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6822
6823 // Create an offset-based salvage expression if possible, as it requires
6824 // less DWARF ops than an iteration count-based expression.
6825 if (std::optional<APInt> Offset =
6826 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6827 if (Offset->getSignificantBits() <= 64)
6828 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6829 else
6830 return false;
6831 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6832 SE))
6833 return false;
6834 }
6835
6836 // Merge the DbgValueBuilder generated expressions and the original
6837 // DIExpression, place the result into a new vector.
6838 SmallVector<uint64_t, 64> NewExpr;
6839 if (DVIRec.Expr->getNumElements() == 0) {
6840 assert(DVIRec.RecoveryExprs.size() == 1 &&
6841 "Expected only a single recovery expression for an empty "
6842 "DIExpression.");
6843 assert(DVIRec.RecoveryExprs[0] &&
6844 "Expected a SCEVDbgSalvageBuilder for location 0");
6845 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6846 B->appendToVectors(NewExpr, NewLocationOps);
6847 }
6848 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6849 // Most Ops needn't be updated.
6850 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6851 Op.appendToVector(NewExpr);
6852 continue;
6853 }
6854
6855 uint64_t LocationArgIndex = Op.getArg(0);
6856 SCEVDbgValueBuilder *DbgBuilder =
6857 DVIRec.RecoveryExprs[LocationArgIndex].get();
6858 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6859 // optimise it away. So just translate the argument to the updated
6860 // location index.
6861 if (!DbgBuilder) {
6862 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6863 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6864 "Expected a positive index for the location-op position.");
6865 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6866 continue;
6867 }
6868 // The location has a recovery expression.
6869 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6870 }
6871
6872 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
6873 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
6874 return true;
6875}
6876
6877/// Obtain an expression for the iteration count, then attempt to salvage the
6878/// dbg.value intrinsics.
6879 static void DbgRewriteSalvageableDVIs(
6880 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6881 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6882 if (DVIToUpdate.empty())
6883 return;
6884
6885 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6886 assert(SCEVInductionVar &&
6887 "Anticipated a SCEV for the post-LSR induction variable");
6888
6889 if (const SCEVAddRecExpr *IVAddRec =
6890 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6891 if (!IVAddRec->isAffine())
6892 return;
6893
6894 // Prevent translation using excessive resources.
6895 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6896 return;
6897
6898 // The iteration count is required to recover location values.
6899 SCEVDbgValueBuilder IterCountExpr;
6900 IterCountExpr.pushLocation(LSRInductionVar);
6901 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6902 return;
6903
6904 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6905 << '\n');
6906
6907 for (auto &DVIRec : DVIToUpdate) {
6908 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6909 IterCountExpr);
6910 }
6911 }
6912}
6913
6914/// Identify and cache salvageable DVI locations and expressions along with the
6915/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6916 /// caching and salvaging.
6917 static void DbgGatherSalvagableDVI(
6918 Loop *L, ScalarEvolution &SE,
6919 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6920 for (const auto &B : L->getBlocks()) {
6921 for (auto &I : *B) {
6922 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6923 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6924 continue;
6925
6926 // Ensure that if any location op is undef, the dbg.value is not
6927 // cached.
6928 if (DbgVal.isKillLocation())
6929 continue;
6930
6931 // Check that the location op SCEVs are suitable for translation to
6932 // DIExpression.
6933 const auto &HasTranslatableLocationOps =
6934 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6935 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6936 if (!LocOp)
6937 return false;
6938
6939 if (!SE.isSCEVable(LocOp->getType()))
6940 return false;
6941
6942 const SCEV *S = SE.getSCEV(LocOp);
6943 if (SE.containsUndefs(S))
6944 return false;
6945 }
6946 return true;
6947 };
6948
6949 if (!HasTranslatableLocationOps(DbgVal))
6950 continue;
6951
6952 std::unique_ptr<DVIRecoveryRec> NewRec =
6953 std::make_unique<DVIRecoveryRec>(&DbgVal);
6954 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6955 // it. Pre-allocating a vector will enable quick lookups of the builder
6956 // later during the salvage.
6957 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
6958 for (const auto LocOp : DbgVal.location_ops()) {
6959 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6960 NewRec->LocationOps.push_back(LocOp);
6961 NewRec->HadLocationArgList = DbgVal.hasArgList();
6962 }
6963 SalvageableDVISCEVs.push_back(std::move(NewRec));
6964 }
6965 }
6966 }
6967}
6968
6969/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6970 /// any PHI from the loop header is usable, but may have less chance of
6971/// surviving subsequent transforms.
6972 static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
6973 const LSRInstance &LSR) {
6974
6975 auto IsSuitableIV = [&](PHINode *P) {
6976 if (!SE.isSCEVable(P->getType()))
6977 return false;
6978 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
6979 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
6980 return false;
6981 };
6982
6983 // For now, just pick the first IV that was generated and inserted by
6984 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
6985 // by subsequent transforms.
6986 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
6987 if (!IV)
6988 continue;
6989
6990 // There should only be PHI node IVs.
6991 PHINode *P = cast<PHINode>(&*IV);
6992
6993 if (IsSuitableIV(P))
6994 return P;
6995 }
6996
6997 for (PHINode &P : L.getHeader()->phis()) {
6998 if (IsSuitableIV(&P))
6999 return &P;
7000 }
7001 return nullptr;
7002}
7003
7004 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7005 DominatorTree &DT, LoopInfo &LI,
7006 const TargetTransformInfo &TTI,
7007 AssumptionCache &AC, TargetLibraryInfo &TLI,
7008 MemorySSA *MSSA) {
7009
7010 // Debug preservation - before we start removing anything identify which DVI
7011 // meet the salvageable criteria and store their DIExpression and SCEVs.
7012 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7013 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7014
7015 bool Changed = false;
7016 std::unique_ptr<MemorySSAUpdater> MSSAU;
7017 if (MSSA)
7018 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7019
7020 // Run the main LSR transformation.
7021 const LSRInstance &Reducer =
7022 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7023 Changed |= Reducer.getChanged();
7024
7025 // Remove any extra phis created by processing inner loops.
7026 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7027 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7028 SmallVector<WeakTrackingVH, 16> DeadInsts;
7029 const DataLayout &DL = L->getHeader()->getDataLayout();
7030 SCEVExpander Rewriter(SE, DL, "lsr", false);
7031#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7032 Rewriter.setDebugType(DEBUG_TYPE);
7033#endif
7034 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7035 Rewriter.clear();
7036 if (numFolded) {
7037 Changed = true;
7038 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7039 MSSAU.get());
7040 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7041 }
7042 }
7043 // LSR may at times remove all uses of an induction variable from a loop.
7044 // The only remaining use is the PHI in the exit block.
7045 // When this is the case, if the exit value of the IV can be calculated using
7046 // SCEV, we can replace the exit block PHI with the final value of the IV and
7047 // skip the updates in each loop iteration.
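// Sketch of that situation (IR names hypothetical): if the loop's IV is only
// read by an LCSSA phi such as
//   exit:
//     %last = phi i64 [ %iv, %latch ]
// and SCEV can compute the IV's exit value, rewriteLoopExitValues replaces
// %last with the expanded final value so the in-loop updates of %iv can die.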
7048 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7049 SmallVector<WeakTrackingVH, 16> DeadInsts;
7050 const DataLayout &DL = L->getHeader()->getDataLayout();
7051 SCEVExpander Rewriter(SE, DL, "lsr", true);
7052 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7053 UnusedIndVarInLoop, DeadInsts);
7054 Rewriter.clear();
7055 if (Rewrites) {
7056 Changed = true;
7057 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7058 MSSAU.get());
7059 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7060 }
7061 }
7062
7063 if (SalvageableDVIRecords.empty())
7064 return Changed;
7065
7066 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7067 // expressions composed using the derived iteration count.
7068 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7069 for (const auto &L : LI) {
7070 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7071 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7072 else {
7073 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7074 "could not be identified.\n");
7075 }
7076 }
7077
7078 for (auto &Rec : SalvageableDVIRecords)
7079 Rec->clear();
7080 SalvageableDVIRecords.clear();
7081 return Changed;
7082}
7083
7084bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7085 if (skipLoop(L))
7086 return false;
7087
7088 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7089 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7090 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7091 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7092 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7093 *L->getHeader()->getParent());
7094 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7095 *L->getHeader()->getParent());
7096 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7097 *L->getHeader()->getParent());
7098 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7099 MemorySSA *MSSA = nullptr;
7100 if (MSSAAnalysis)
7101 MSSA = &MSSAAnalysis->getMSSA();
7102 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7103}
7104
7105 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7106 LoopStandardAnalysisResults &AR,
7107 LPMUpdater &) {
7108 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7109 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7110 return PreservedAnalyses::all();
7111
7112 auto PA = getLoopPassPreservedAnalyses();
7113 if (AR.MSSA)
7114 PA.preserve<MemorySSAAnalysis>();
7115 return PA;
7116}
7117
7118char LoopStrengthReduce::ID = 0;
7119
7120INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7121 "Loop Strength Reduction", false, false)
7127INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7128INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7129 "Loop Strength Reduction", false, false)
7130
7131Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1644
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1736
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
bool isUnconditional() const
Value * getCondition() const
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:791
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:169
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:322
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:142
iterator end()
Definition IVUsers.h:144
iterator begin()
Definition IVUsers.h:143
bool empty() const
Definition IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1565
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:936
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
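For illustration, a minimal sketch of building a two-input PHI with the interface above, shaped like the induction-variable PHIs this pass manipulates; the block and value parameters are assumed to exist and the helper name is hypothetical.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
// Create %iv = phi [Start, Preheader], [Next, Latch] at the top of Header.
static llvm::PHINode *makeIVPhi(llvm::BasicBlock *Header,
                                llvm::BasicBlock *Preheader,
                                llvm::BasicBlock *Latch,
                                llvm::Value *Start, llvm::Value *Next) {
  llvm::PHINode *PN =
      llvm::PHINode::Create(Start->getType(), /*NumReservedValues=*/2, "iv",
                            Header->begin());
  PN->addIncoming(Start, Preheader);
  PN->addIncoming(Next, Latch);
  return PN;
}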
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
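A small sketch of the affine-recurrence queries above, assuming AR was produced by ScalarEvolution elsewhere; the helper is illustrative only.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
// Return the loop-invariant step of an affine {Start,+,Step} recurrence,
// or nullptr if the recurrence is not affine.
static const llvm::SCEV *getAffineStep(const llvm::SCEVAddRecExpr *AR,
                                       llvm::ScalarEvolution &SE) {
  if (!AR->isAffine())
    return nullptr;
  return AR->getStepRecurrence(SE);
}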
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< const SCEV * > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
LLVM_ABI ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
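To make the ScalarEvolution entries above concrete, a hedged example of a typical query sequence (not this pass's actual formula construction); it assumes A and B are SCEV-able values of the same effective type, and the helper name is illustrative.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"
// Return A - B as a SCEV if the difference is invariant in loop L, else nullptr.
static const llvm::SCEV *diffIfInvariant(llvm::ScalarEvolution &SE,
                                         llvm::Value *A, llvm::Value *B,
                                         const llvm::Loop *L) {
  if (!SE.isSCEVable(A->getType()) || !SE.isSCEVable(B->getType()))
    return nullptr;
  const llvm::SCEV *D = SE.getMinusSCEV(SE.getSCEV(A), SE.getSCEV(B));
  return SE.isLoopInvariant(D, L) ? D : nullptr;
}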
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:104
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:119
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:109
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:168
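As a quick illustration of the SetVector interface above (deterministic, de-duplicated collection, the property the worklists in this file rely on); the example values are arbitrary.
#include "llvm/ADT/SetVector.h"
// Duplicates are ignored; iteration order follows first insertion.
static unsigned collectUnique(llvm::SetVector<int> &SV) {
  SV.insert(1);
  SV.insert(2);
  SV.insert(1); // already present, not inserted again
  return SV.size(); // 2
}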
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
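A minimal, self-contained use of the bit-set queries above; the indices are purely illustrative and unrelated to this pass's register numbering.
#include "llvm/ADT/SmallBitVector.h"
// Count set bits by iterating set_bits(); the result matches BV.count().
static unsigned countSetBits(const llvm::SmallBitVector &BV) {
  unsigned N = 0;
  for (unsigned Idx : BV.set_bits()) {
    (void)Idx; // Idx is the index of a set bit
    ++N;
  }
  return N;
}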
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:42
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
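A hedged sketch of an addressing-mode legality query like the ones issued when costing formulae; the operand values are placeholders, and fitsBaseRegPlusScale is a hypothetical helper.
#include "llvm/Analysis/TargetTransformInfo.h"
// Ask whether [BaseReg + Scale*Index + Offset] is legal for accesses of AccessTy.
static bool fitsBaseRegPlusScale(const llvm::TargetTransformInfo &TTI,
                                 llvm::Type *AccessTy, int64_t Offset,
                                 int64_t Scale, unsigned AddrSpace) {
  return TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/nullptr, Offset,
                                   /*HasBaseReg=*/true, Scale, AddrSpace);
}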
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:236
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Use * op_iterator
Definition User.h:279
op_range operands()
Definition User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
op_iterator op_begin()
Definition User.h:284
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
op_iterator op_end()
Definition User.h:286
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:502
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
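For orientation, a common replace-and-clean-up idiom built from the Value/Instruction entries above; this is a generic sketch, not the rewrite path used by this pass.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
// Redirect all users of Old to New, then drop Old once it is unused.
static void replaceAndErase(llvm::Instruction *Old, llvm::Value *New) {
  Old->replaceAllUsesWith(New);
  if (Old->use_empty())
    Old->eraseFromParent();
}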
A Value handle that is allowed to be null.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:174
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:130
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t > m_scev_Mul(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const Loop > m_Loop()
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
class_match< const SCEV > m_SCEV()
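A hedged example of the SCEV pattern matchers listed above: test whether an expression is an affine add-recurrence with a constant step and bind that step. Assumes the matchers live in llvm::SCEVPatternMatch (ScalarEvolutionPatternMatch.h); the helper name is illustrative.
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
// On success, Step points at the constant stride of {anything,+,Step}.
static bool hasConstantStep(const llvm::SCEV *S, const llvm::APInt *&Step) {
  using namespace llvm::SCEVPatternMatch;
  return match(S, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(Step)));
}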
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Definition MathExtras.h:47
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:85
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1725
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2113
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
LLVM_ABI char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
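A small sketch showing the documented contract of matchSimpleRecurrence; PN is assumed to be a loop-header PHI, and the wrapper itself is hypothetical.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
// On success, Start and Step describe the cycle iv = phi [Start, ...]; iv.next = iv op Step.
static bool getStartAndStep(const llvm::PHINode *PN, llvm::Value *&Start,
                            llvm::Value *&Step) {
  llvm::BinaryOperator *BO = nullptr;
  return llvm::matchSimpleRecurrence(PN, BO, Start, Step);
}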
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1624
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
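A round-trip sketch of the normalization helpers above (normalizeForPostIncUse here, denormalizeForPostIncUse earlier in this list); Loops is assumed to be the set of loops for which S should be treated as post-increment.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
// Normalize, then undo it; normalization may return null when not invertible.
static const llvm::SCEV *normalizeRoundTrip(const llvm::SCEV *S,
                                            const llvm::PostIncLoopSet &Loops,
                                            llvm::ScalarEvolution &SE) {
  const llvm::SCEV *N = llvm::normalizeForPostIncUse(S, Loops, SE);
  if (!N)
    return nullptr;
  return llvm::denormalizeForPostIncUse(N, Loops, SE);
}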
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:548
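A minimal cleanup idiom using the permissive deletion helper above; DeadInsts is assumed to have been filled with weak handles while rewriting, as this pass does with its own worklist.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"
// Returns true if anything was deleted; entries that are not trivially
// dead instructions are simply skipped.
static bool cleanupDeadInsts(
    llvm::SmallVectorImpl<llvm::WeakTrackingVH> &DeadInsts) {
  return llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
}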
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
@ UnusedIndVarInLoop
Definition LoopUtils.h:520
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.