1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops
14// that have the loop induction variable as one or more of their components.
15// It rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Cleanup congruent phis after LSR phi expansion.
147static cl::opt<bool> EnablePhiElim(
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
152static cl::opt<bool> InsnsCost(
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
157static cl::opt<bool> LSRExpNarrow(
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
164static cl::opt<bool> FilterSameScaledReg(
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
169static cl::opt<TTI::AddressingModeKind> PreferredAddresingMode(
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
172 cl::values(clEnumValN(TTI::AMK_None,
173 "none",
174 "Don't prefer any addressing mode"),
175 clEnumValN(TTI::AMK_PreIndexed,
176 "preindexed",
177 "Prefer pre-indexed addressing mode"),
178 clEnumValN(TTI::AMK_PostIndexed,
179 "postindexed",
180 "Prefer post-indexed addressing mode")));
181
182static cl::opt<unsigned> ComplexityLimit(
183 "lsr-complexity-limit", cl::Hidden,
184 cl::init(std::numeric_limits<uint16_t>::max()),
185 cl::desc("LSR search space complexity limit"));
186
187static cl::opt<unsigned> SetupCostDepthLimit(
188 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
189 cl::desc("The limit on recursion depth for LSRs setup cost"));
190
191static cl::opt<bool> AllowDropSolutionIfLessProfitable(
192 "lsr-drop-solution", cl::Hidden,
193 cl::desc("Attempt to drop solution if it is less profitable"));
194
195static cl::opt<bool> EnableVScaleImmediates(
196 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
197 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
198
199static cl::opt<bool> DropScaledForVScale(
200 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
201 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
202
203#ifndef NDEBUG
204// Stress test IV chain generation.
205static cl::opt<bool> StressIVChain(
206 "stress-ivchain", cl::Hidden, cl::init(false),
207 cl::desc("Stress test LSR IV chains"));
208#else
209static bool StressIVChain = false;
210#endif
211
212namespace {
213
214struct MemAccessTy {
215 /// Used in situations where the accessed memory type is unknown.
216 static const unsigned UnknownAddressSpace =
217 std::numeric_limits<unsigned>::max();
218
219 Type *MemTy = nullptr;
220 unsigned AddrSpace = UnknownAddressSpace;
221
222 MemAccessTy() = default;
223 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
224
225 bool operator==(MemAccessTy Other) const {
226 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
227 }
228
229 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
230
231 static MemAccessTy getUnknown(LLVMContext &Ctx,
232 unsigned AS = UnknownAddressSpace) {
233 return MemAccessTy(Type::getVoidTy(Ctx), AS);
234 }
235
236 Type *getType() { return MemTy; }
237};
238
239/// This class holds data which is used to order reuse candidates.
240class RegSortData {
241public:
242 /// This represents the set of LSRUse indices which reference
243 /// a particular register.
244 SmallBitVector UsedByIndices;
245
246 void print(raw_ostream &OS) const;
247 void dump() const;
248};
249
250// An offset from an address that is either scalable or fixed. Used for
251// per-target optimizations of addressing modes.
252class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
253 constexpr Immediate(ScalarTy MinVal, bool Scalable)
254 : FixedOrScalableQuantity(MinVal, Scalable) {}
255
256 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
257 : FixedOrScalableQuantity(V) {}
258
259public:
260 constexpr Immediate() = delete;
261
262 static constexpr Immediate getFixed(ScalarTy MinVal) {
263 return {MinVal, false};
264 }
265 static constexpr Immediate getScalable(ScalarTy MinVal) {
266 return {MinVal, true};
267 }
268 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
269 return {MinVal, Scalable};
270 }
271 static constexpr Immediate getZero() { return {0, false}; }
272 static constexpr Immediate getFixedMin() {
273 return {std::numeric_limits<int64_t>::min(), false};
274 }
275 static constexpr Immediate getFixedMax() {
276 return {std::numeric_limits<int64_t>::max(), false};
277 }
278 static constexpr Immediate getScalableMin() {
279 return {std::numeric_limits<int64_t>::min(), true};
280 }
281 static constexpr Immediate getScalableMax() {
282 return {std::numeric_limits<int64_t>::max(), true};
283 }
284
285 constexpr bool isLessThanZero() const { return Quantity < 0; }
286
287 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
288
289 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
290 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
291 }
292
293 constexpr bool isMin() const {
294 return Quantity == std::numeric_limits<ScalarTy>::min();
295 }
296
297 constexpr bool isMax() const {
298 return Quantity == std::numeric_limits<ScalarTy>::max();
299 }
300
301 // Arithmetic 'operators' that cast to unsigned types first.
302 constexpr Immediate addUnsigned(const Immediate &RHS) const {
303 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
304 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
305 return {Value, Scalable || RHS.isScalable()};
306 }
307
308 constexpr Immediate subUnsigned(const Immediate &RHS) const {
309 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
310 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
311 return {Value, Scalable || RHS.isScalable()};
312 }
313
314 // Scale the quantity by a constant without caring about runtime scalability.
315 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
316 ScalarTy Value = (uint64_t)Quantity * RHS;
317 return {Value, Scalable};
318 }
319
320 // Helpers for generating SCEVs with vscale terms where needed.
321 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
322 const SCEV *S = SE.getConstant(Ty, Quantity);
323 if (Scalable)
324 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
325 return S;
326 }
327
328 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
329 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
330 if (Scalable)
331 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
332 return NegS;
333 }
334
335 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
336 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
337 if (Scalable)
338 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
339 return SU;
340 }
341};
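// Editorial illustration (not part of the original source): the two flavours
// of Immediate and the compatibility rule they obey; values are arbitrary.
//   Immediate Fixed = Immediate::getFixed(16);     // plain byte offset 16
//   Immediate Scaled = Immediate::getScalable(16); // models 16 * vscale
//   bool OK = Fixed.isCompatibleImmediate(Scaled); // false: neither is zero
//                                                  // and the flavours differ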
342
343// This is needed for the Compare type of std::map when Immediate is used
344// as a key. We don't need it to be fully correct against any value of vscale,
345// just to make sure that vscale-related terms in the map are considered against
346// each other rather than being mixed up and potentially missing opportunities.
347struct KeyOrderTargetImmediate {
348 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
349 if (LHS.isScalable() && !RHS.isScalable())
350 return false;
351 if (!LHS.isScalable() && RHS.isScalable())
352 return true;
353 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
354 }
355};
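// Editorial illustration (not part of the original source): when used as the
// Compare type of a std::map<Immediate, T, KeyOrderTargetImmediate>, all fixed
// offsets sort before all scalable ones, so getFixed(32) orders before
// getScalable(-8) even though -8 < 32 numerically; fixed and vscale-relative
// keys therefore never interleave.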
356
357// This would be nicer if we could be generic instead of directly using size_t,
358// but there doesn't seem to be a type trait for is_orderable or
359// is_lessthan_comparable or similar.
360struct KeyOrderSizeTAndImmediate {
361 bool operator()(const std::pair<size_t, Immediate> &LHS,
362 const std::pair<size_t, Immediate> &RHS) const {
363 size_t LSize = LHS.first;
364 size_t RSize = RHS.first;
365 if (LSize != RSize)
366 return LSize < RSize;
367 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
368 }
369};
370} // end anonymous namespace
371
372#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
373void RegSortData::print(raw_ostream &OS) const {
374 OS << "[NumUses=" << UsedByIndices.count() << ']';
375}
376
377LLVM_DUMP_METHOD void RegSortData::dump() const {
378 print(errs()); errs() << '\n';
379}
380#endif
381
382namespace {
383
384/// Map register candidates to information about how they are used.
385class RegUseTracker {
386 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
387
388 RegUsesTy RegUsesMap;
389 SmallVector<const SCEV *, 16> RegSequence;
 390
391public:
392 void countRegister(const SCEV *Reg, size_t LUIdx);
393 void dropRegister(const SCEV *Reg, size_t LUIdx);
394 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
395
396 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
397
398 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
399
400 void clear();
401
402 using iterator = SmallVectorImpl<const SCEV *>::iterator;
403 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
 404
405 iterator begin() { return RegSequence.begin(); }
406 iterator end() { return RegSequence.end(); }
407 const_iterator begin() const { return RegSequence.begin(); }
408 const_iterator end() const { return RegSequence.end(); }
409};
410
411} // end anonymous namespace
412
413void
414RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
415 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
416 RegSortData &RSD = Pair.first->second;
417 if (Pair.second)
418 RegSequence.push_back(Reg);
419 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
420 RSD.UsedByIndices.set(LUIdx);
421}
422
423void
424RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
425 RegUsesTy::iterator It = RegUsesMap.find(Reg);
426 assert(It != RegUsesMap.end());
427 RegSortData &RSD = It->second;
428 assert(RSD.UsedByIndices.size() > LUIdx);
429 RSD.UsedByIndices.reset(LUIdx);
430}
431
432void
433RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
434 assert(LUIdx <= LastLUIdx);
435
436 // Update RegUses. The data structure is not optimized for this purpose;
437 // we must iterate through it and update each of the bit vectors.
438 for (auto &Pair : RegUsesMap) {
439 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
440 if (LUIdx < UsedByIndices.size())
441 UsedByIndices[LUIdx] =
442 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
443 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
444 }
445}
446
447bool
448RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
449 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
450 if (I == RegUsesMap.end())
451 return false;
452 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
453 int i = UsedByIndices.find_first();
454 if (i == -1) return false;
455 if ((size_t)i != LUIdx) return true;
456 return UsedByIndices.find_next(i) != -1;
457}
458
459const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
460 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
461 assert(I != RegUsesMap.end() && "Unknown register!");
462 return I->second.UsedByIndices;
463}
464
465void RegUseTracker::clear() {
466 RegUsesMap.clear();
467 RegSequence.clear();
468}
469
470namespace {
471
472/// This class holds information that describes a formula for computing a
473/// value satisfying a use. It may include broken-out immediates and scaled registers.
474struct Formula {
475 /// Global base address used for complex addressing.
476 GlobalValue *BaseGV = nullptr;
477
478 /// Base offset for complex addressing.
479 Immediate BaseOffset = Immediate::getZero();
480
481 /// Whether any complex addressing has a base register.
482 bool HasBaseReg = false;
483
484 /// The scale of any complex addressing.
485 int64_t Scale = 0;
486
487 /// The list of "base" registers for this use. When this is non-empty, the
488 /// canonical representation of a formula is
489 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
490 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
491/// 3. The reg containing the recurrent expr related to the current loop in the
492 /// formula should be put in the ScaledReg.
493 /// #1 enforces that the scaled register is always used when at least two
494 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
495 /// #2 enforces that 1 * reg is reg.
496 /// #3 ensures invariant regs with respect to current loop can be combined
497 /// together in LSR codegen.
498 /// This invariant can be temporarily broken while building a formula.
499 /// However, every formula inserted into the LSRInstance must be in canonical
500 /// form.
501 SmallVector<const SCEV *, 4> BaseRegs;
 502
503 /// The 'scaled' register for this use. This should be non-null when Scale is
504 /// not zero.
505 const SCEV *ScaledReg = nullptr;
506
507 /// An additional constant offset which is added near the use. This requires a
508 /// temporary register, but the offset itself can live in an add immediate
509 /// field rather than a register.
510 Immediate UnfoldedOffset = Immediate::getZero();
511
512 Formula() = default;
513
514 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
515
516 bool isCanonical(const Loop &L) const;
517
518 void canonicalize(const Loop &L);
519
520 bool unscale();
521
522 bool hasZeroEnd() const;
523
524 bool countsDownToZero() const;
525
526 size_t getNumRegs() const;
527 Type *getType() const;
528
529 void deleteBaseReg(const SCEV *&S);
530
531 bool referencesReg(const SCEV *S) const;
532 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
533 const RegUseTracker &RegUses) const;
534
535 void print(raw_ostream &OS) const;
536 void dump() const;
537};
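// Editorial illustration (not part of the original source): for an address
// computed as %base + 4 * {0,+,1}<%L> + 16, a canonical Formula would hold
//   BaseRegs   = { %base }
//   ScaledReg  = {0,+,1}<%L>   // the addrec for the current loop (rule #3)
//   Scale      = 4
//   BaseOffset = 16
// and a plain two-register sum %a + %b is kept as BaseRegs = { %a },
// ScaledReg = %b, Scale = 1, so the second register always goes through the
// scaled slot (rule #1).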
538
539} // end anonymous namespace
540
541/// Recursion helper for initialMatch.
542static void DoInitialMatch(const SCEV *S, Loop *L,
543 SmallVectorImpl<const SCEV *> &Good,
544 SmallVectorImpl<const SCEV *> &Bad,
545 ScalarEvolution &SE) {
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands.
560 const SCEV *Start, *Step;
561 const Loop *ARLoop;
562 if (match(S,
563 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
564 !Start->isZero()) {
565 DoInitialMatch(Start, L, Good, Bad, SE);
566 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
567 // FIXME: AR->getNoWrapFlags()
568 ARLoop, SCEV::FlagAnyWrap),
569 L, Good, Bad, SE);
570 return;
571 }
572
573 // Handle a multiplication by -1 (negation) if it didn't fold.
574 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
575 if (Mul->getOperand(0)->isAllOnesValue()) {
576 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
577 const SCEV *NewMul = SE.getMulExpr(Ops);
578
579 SmallVector<const SCEV *, 4> MyGood;
580 SmallVector<const SCEV *, 4> MyBad;
581 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
582 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
583 SE.getEffectiveSCEVType(NewMul->getType())));
584 for (const SCEV *S : MyGood)
585 Good.push_back(SE.getMulExpr(NegOne, S));
586 for (const SCEV *S : MyBad)
587 Bad.push_back(SE.getMulExpr(NegOne, S));
588 return;
589 }
590
591 // Ok, we can't do anything interesting. Just stuff the whole thing into a
592 // register and hope for the best.
593 Bad.push_back(S);
594}
595
596/// Incorporate loop-variant parts of S into this Formula, attempting to keep
597/// all loop-invariant and loop-computable values in a single base register.
598void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
599 SmallVector<const SCEV *, 4> Good;
600 SmallVector<const SCEV *, 4> Bad;
601 DoInitialMatch(S, L, Good, Bad, SE);
602 if (!Good.empty()) {
603 const SCEV *Sum = SE.getAddExpr(Good);
604 if (!Sum->isZero())
605 BaseRegs.push_back(Sum);
606 HasBaseReg = true;
607 }
608 if (!Bad.empty()) {
609 const SCEV *Sum = SE.getAddExpr(Bad);
610 if (!Sum->isZero())
611 BaseRegs.push_back(Sum);
612 HasBaseReg = true;
613 }
614 canonicalize(*L);
615}
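// Editorial illustration (not part of the original source): for
// S = {%a,+,4}<%L> with %a defined outside the loop, DoInitialMatch puts %a in
// Good (it properly dominates the header) and {0,+,4}<%L> in Bad, and after
// canonicalization the formula carries %a as a base register and the addrec
// {0,+,4}<%L> as the ScaledReg with Scale = 1.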
616
617static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
618 return SCEVExprContains(S, [&L](const SCEV *S) {
619 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
620 });
621}
622
623/// Check whether or not this formula satisfies the canonical
624/// representation.
625/// \see Formula::BaseRegs.
626bool Formula::isCanonical(const Loop &L) const {
627 assert((Scale == 0 || ScaledReg) &&
628 "ScaledReg must be non-null if Scale is non-zero");
629
630 if (!ScaledReg)
631 return BaseRegs.size() <= 1;
632
633 if (Scale != 1)
634 return true;
635
636 if (Scale == 1 && BaseRegs.empty())
637 return false;
638
639 if (containsAddRecDependentOnLoop(ScaledReg, L))
640 return true;
641
642 // If ScaledReg is not a recurrent expr, or it is one but its loop is not the
643 // current loop, while BaseRegs contains a reg holding a recurrent expr related
644 // to the current loop, we want to swap that reg in BaseRegs with ScaledReg.
645 return none_of(BaseRegs, [&L](const SCEV *S) {
646 return containsAddRecDependentOnLoop(S, L);
647 });
648}
649
650/// Helper method to morph a formula into its canonical representation.
651/// \see Formula::BaseRegs.
652/// Every formula having more than one base register, must use the ScaledReg
653/// field. Otherwise, we would have to do special cases everywhere in LSR
654/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
655/// On the other hand, 1*reg should be canonicalized into reg.
656void Formula::canonicalize(const Loop &L) {
657 if (isCanonical(L))
658 return;
659
660 if (BaseRegs.empty()) {
661 // No base reg? Use scale reg with scale = 1 as such.
662 assert(ScaledReg && "Expected 1*reg => reg");
663 assert(Scale == 1 && "Expected 1*reg => reg");
664 BaseRegs.push_back(ScaledReg);
665 Scale = 0;
666 ScaledReg = nullptr;
667 return;
668 }
669
670 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
671 if (!ScaledReg) {
672 ScaledReg = BaseRegs.pop_back_val();
673 Scale = 1;
674 }
675
676 // If ScaledReg is an invariant with respect to L, find the reg from
677 // BaseRegs containing the recurrent expr related with Loop L. Swap the
678 // reg with ScaledReg.
679 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
680 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
681 return containsAddRecDependentOnLoop(S, L);
682 });
683 if (I != BaseRegs.end())
684 std::swap(ScaledReg, *I);
685 }
686 assert(isCanonical(L) && "Failed to canonicalize?");
687}
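// Editorial illustration (not part of the original source): a formula built
// with BaseRegs = { {0,+,1}<%L>, %inv } and no ScaledReg is canonicalized by
// popping %inv into ScaledReg with Scale = 1; since %inv is invariant in L
// while {0,+,1}<%L> is not, the two are then swapped so the addrec ends up in
// ScaledReg and %inv remains in BaseRegs.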
688
689/// Get rid of the scale in the formula.
690/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
691/// \return true if it was possible to get rid of the scale, false otherwise.
692/// \note After this operation the formula may not be in the canonical form.
693bool Formula::unscale() {
694 if (Scale != 1)
695 return false;
696 Scale = 0;
697 BaseRegs.push_back(ScaledReg);
698 ScaledReg = nullptr;
699 return true;
700}
701
702bool Formula::hasZeroEnd() const {
703 if (UnfoldedOffset || BaseOffset)
704 return false;
705 if (BaseRegs.size() != 1 || ScaledReg)
706 return false;
707 return true;
708}
709
710bool Formula::countsDownToZero() const {
711 if (!hasZeroEnd())
712 return false;
713 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714 const APInt *StepInt;
715 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716 return false;
717 return StepInt->isNegative();
718}
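// Editorial illustration (not part of the original source): a formula whose
// only register is {%n,+,-1}<%L>, with zero BaseOffset and UnfoldedOffset,
// satisfies hasZeroEnd() and, because the constant step is negative, also
// countsDownToZero().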
719
720/// Return the total number of register operands used by this formula. This does
721/// not include register uses implied by non-constant addrec strides.
722size_t Formula::getNumRegs() const {
723 return !!ScaledReg + BaseRegs.size();
724}
725
726/// Return the type of this formula, if it has one, or null otherwise. This type
727/// is meaningless except for the bit size.
728Type *Formula::getType() const {
729 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
730 ScaledReg ? ScaledReg->getType() :
731 BaseGV ? BaseGV->getType() :
732 nullptr;
733}
734
735/// Delete the given base reg from the BaseRegs list.
736void Formula::deleteBaseReg(const SCEV *&S) {
737 if (&S != &BaseRegs.back())
738 std::swap(S, BaseRegs.back());
739 BaseRegs.pop_back();
740}
741
742/// Test if this formula references the given register.
743bool Formula::referencesReg(const SCEV *S) const {
744 return S == ScaledReg || is_contained(BaseRegs, S);
745}
746
747/// Test whether this formula uses registers which are used by uses other than
748/// the use with the given index.
749bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
750 const RegUseTracker &RegUses) const {
751 if (ScaledReg)
752 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
753 return true;
754 for (const SCEV *BaseReg : BaseRegs)
755 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
756 return true;
757 return false;
758}
759
760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
761void Formula::print(raw_ostream &OS) const {
762 bool First = true;
763 if (BaseGV) {
764 if (!First) OS << " + "; else First = false;
765 BaseGV->printAsOperand(OS, /*PrintType=*/false);
766 }
767 if (BaseOffset.isNonZero()) {
768 if (!First) OS << " + "; else First = false;
769 OS << BaseOffset;
770 }
771 for (const SCEV *BaseReg : BaseRegs) {
772 if (!First) OS << " + "; else First = false;
773 OS << "reg(" << *BaseReg << ')';
774 }
775 if (HasBaseReg && BaseRegs.empty()) {
776 if (!First) OS << " + "; else First = false;
777 OS << "**error: HasBaseReg**";
778 } else if (!HasBaseReg && !BaseRegs.empty()) {
779 if (!First) OS << " + "; else First = false;
780 OS << "**error: !HasBaseReg**";
781 }
782 if (Scale != 0) {
783 if (!First) OS << " + "; else First = false;
784 OS << Scale << "*reg(";
785 if (ScaledReg)
786 OS << *ScaledReg;
787 else
788 OS << "<unknown>";
789 OS << ')';
790 }
791 if (UnfoldedOffset.isNonZero()) {
792 if (!First) OS << " + ";
793 OS << "imm(" << UnfoldedOffset << ')';
794 }
795}
796
797LLVM_DUMP_METHOD void Formula::dump() const {
798 print(errs()); errs() << '\n';
799}
800#endif
801
802/// Return true if the given addrec can be sign-extended without changing its
803/// value.
804static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
805 Type *WideTy =
806 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
807 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
808}
809
810/// Return true if the given add can be sign-extended without changing its
811/// value.
812static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
813 Type *WideTy =
814 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
815 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
816}
817
818/// Return true if the given mul can be sign-extended without changing its
819/// value.
820static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
821 Type *WideTy =
822 IntegerType::get(SE.getContext(),
823 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
824 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
825}
826
827/// Return an expression for LHS /s RHS, if it can be determined and if the
828/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
829/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
830/// the multiplication may overflow, which is useful when the result will be
831/// used in a context where the most significant bits are ignored.
832static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
833 ScalarEvolution &SE,
834 bool IgnoreSignificantBits = false) {
835 // Handle the trivial case, which works for any SCEV type.
836 if (LHS == RHS)
837 return SE.getConstant(LHS->getType(), 1);
838
839 // Handle a few RHS special cases.
840 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
841 if (RC) {
842 const APInt &RA = RC->getAPInt();
843 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
844 // some folding.
845 if (RA.isAllOnes()) {
846 if (LHS->getType()->isPointerTy())
847 return nullptr;
848 return SE.getMulExpr(LHS, RC);
849 }
850 // Handle x /s 1 as x.
851 if (RA == 1)
852 return LHS;
853 }
854
855 // Check for a division of a constant by a constant.
856 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
857 if (!RC)
858 return nullptr;
859 const APInt &LA = C->getAPInt();
860 const APInt &RA = RC->getAPInt();
861 if (LA.srem(RA) != 0)
862 return nullptr;
863 return SE.getConstant(LA.sdiv(RA));
864 }
865
866 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
867 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
868 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
869 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
870 IgnoreSignificantBits);
871 if (!Step) return nullptr;
872 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
873 IgnoreSignificantBits);
874 if (!Start) return nullptr;
875 // FlagNW is independent of the start value, step direction, and is
876 // preserved with smaller magnitude steps.
877 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
878 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
879 }
880 return nullptr;
881 }
882
883 // Distribute the sdiv over add operands, if the add doesn't overflow.
884 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
885 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
886 SmallVector<const SCEV *, 8> Ops;
887 for (const SCEV *S : Add->operands()) {
888 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
889 if (!Op) return nullptr;
890 Ops.push_back(Op);
891 }
892 return SE.getAddExpr(Ops);
893 }
894 return nullptr;
895 }
896
897 // Check for a multiply operand that we can pull RHS out of.
898 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
899 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
900 // Handle special case C1*X*Y /s C2*X*Y.
901 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
902 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
903 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
904 const SCEVConstant *RC =
905 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
906 if (LC && RC) {
907 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
908 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
909 if (LOps == ROps)
910 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
911 }
912 }
913 }
914
915 SmallVector<const SCEV *, 4> Ops;
916 bool Found = false;
917 for (const SCEV *S : Mul->operands()) {
918 if (!Found)
919 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
920 IgnoreSignificantBits)) {
921 S = Q;
922 Found = true;
923 }
924 Ops.push_back(S);
925 }
926 return Found ? SE.getMulExpr(Ops) : nullptr;
927 }
928 return nullptr;
929 }
930
931 // Otherwise we don't know.
932 return nullptr;
933}
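// Editorial illustration (not part of the original source), assuming the
// sign-extension checks pass (or IgnoreSignificantBits is set):
// ({0,+,6}<%L>) /s 3 yields {0,+,2}<%L> and (6 * %x) /s 3 yields (2 * %x),
// while (%x + 1) /s 2 yields null because the remainder is not known to be 0.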
934
935/// If S involves the addition of a constant integer value, return that integer
936/// value, and mutate S to point to a new SCEV with that value excluded.
937static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
938 const APInt *C;
939 if (match(S, m_scev_APInt(C))) {
940 if (C->getSignificantBits() <= 64) {
941 S = SE.getConstant(S->getType(), 0);
942 return Immediate::getFixed(C->getSExtValue());
943 }
944 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
945 SmallVector<const SCEV *, 8> NewOps(Add->operands());
946 Immediate Result = ExtractImmediate(NewOps.front(), SE);
947 if (Result.isNonZero())
948 S = SE.getAddExpr(NewOps);
949 return Result;
950 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
951 SmallVector<const SCEV *, 8> NewOps(AR->operands());
952 Immediate Result = ExtractImmediate(NewOps.front(), SE);
953 if (Result.isNonZero())
954 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
955 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
956 SCEV::FlagAnyWrap);
957 return Result;
958 } else if (EnableVScaleImmediates &&
959 match(S, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale()))) {
960 S = SE.getConstant(S->getType(), 0);
961 return Immediate::getScalable(C->getSExtValue());
962 }
963 return Immediate::getZero();
964}
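// Editorial illustration (not part of the original source): for
// S = (4 + %base), ExtractImmediate returns Immediate::getFixed(4) and
// rewrites S to %base, so the 4 can live in an addressing-mode immediate
// field; when no constant addend can be extracted it returns
// Immediate::getZero() and leaves S untouched.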
965
966/// If S involves the addition of a GlobalValue address, return that symbol, and
967/// mutate S to point to a new SCEV with that value excluded.
968static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
969 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
970 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
971 S = SE.getConstant(GV->getType(), 0);
972 return GV;
973 }
974 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
975 SmallVector<const SCEV *, 8> NewOps(Add->operands());
976 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
977 if (Result)
978 S = SE.getAddExpr(NewOps);
979 return Result;
980 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
981 SmallVector<const SCEV *, 8> NewOps(AR->operands());
982 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
983 if (Result)
984 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
985 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
986 SCEV::FlagAnyWrap);
987 return Result;
988 }
989 return nullptr;
990}
991
992/// Returns true if the specified instruction is using the specified value as an
993/// address.
994static bool isAddressUse(const TargetTransformInfo &TTI,
995 Instruction *Inst, Value *OperandVal) {
996 bool isAddress = isa<LoadInst>(Inst);
997 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
998 if (SI->getPointerOperand() == OperandVal)
999 isAddress = true;
1000 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1001 // Addressing modes can also be folded into prefetches and a variety
1002 // of intrinsics.
1003 switch (II->getIntrinsicID()) {
1004 case Intrinsic::memset:
1005 case Intrinsic::prefetch:
1006 case Intrinsic::masked_load:
1007 if (II->getArgOperand(0) == OperandVal)
1008 isAddress = true;
1009 break;
1010 case Intrinsic::masked_store:
1011 if (II->getArgOperand(1) == OperandVal)
1012 isAddress = true;
1013 break;
1014 case Intrinsic::memmove:
1015 case Intrinsic::memcpy:
1016 if (II->getArgOperand(0) == OperandVal ||
1017 II->getArgOperand(1) == OperandVal)
1018 isAddress = true;
1019 break;
1020 default: {
1021 MemIntrinsicInfo IntrInfo;
1022 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1023 if (IntrInfo.PtrVal == OperandVal)
1024 isAddress = true;
1025 }
1026 }
1027 }
1028 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1029 if (RMW->getPointerOperand() == OperandVal)
1030 isAddress = true;
1031 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1032 if (CmpX->getPointerOperand() == OperandVal)
1033 isAddress = true;
1034 }
1035 return isAddress;
1036}
1037
1038/// Return the type of the memory being accessed.
1039static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1040 Instruction *Inst, Value *OperandVal) {
1041 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1042
1043 // First get the type of memory being accessed.
1044 if (Type *Ty = Inst->getAccessType())
1045 AccessTy.MemTy = Ty;
1046
1047 // Then get the pointer address space.
1048 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1049 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1050 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1051 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1052 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1053 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1054 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1055 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1056 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1057 switch (II->getIntrinsicID()) {
1058 case Intrinsic::prefetch:
1059 case Intrinsic::memset:
1060 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1061 AccessTy.MemTy = OperandVal->getType();
1062 break;
1063 case Intrinsic::memmove:
1064 case Intrinsic::memcpy:
1065 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1066 AccessTy.MemTy = OperandVal->getType();
1067 break;
1068 case Intrinsic::masked_load:
1069 AccessTy.AddrSpace =
1070 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1071 break;
1072 case Intrinsic::masked_store:
1073 AccessTy.AddrSpace =
1074 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1075 break;
1076 default: {
1077 MemIntrinsicInfo IntrInfo;
1078 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1079 AccessTy.AddrSpace
1080 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1081 }
1082
1083 break;
1084 }
1085 }
1086 }
1087
1088 return AccessTy;
1089}
1090
1091/// Return true if this AddRec is already a phi in its loop.
1092static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1093 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1094 if (SE.isSCEVable(PN.getType()) &&
1095 (SE.getEffectiveSCEVType(PN.getType()) ==
1096 SE.getEffectiveSCEVType(AR->getType())) &&
1097 SE.getSCEV(&PN) == AR)
1098 return true;
1099 }
1100 return false;
1101}
1102
1103/// Check if expanding this expression is likely to incur significant cost. This
1104/// is tricky because SCEV doesn't track which expressions are actually computed
1105/// by the current IR.
1106///
1107/// We currently allow expansion of IV increments that involve adds,
1108/// multiplication by constants, and AddRecs from existing phis.
1109///
1110/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1111/// obvious multiple of the UDivExpr.
1112static bool isHighCostExpansion(const SCEV *S,
1113 SmallPtrSetImpl<const SCEV *> &Processed,
1114 ScalarEvolution &SE) {
1115 // Zero/One operand expressions
1116 switch (S->getSCEVType()) {
1117 case scUnknown:
1118 case scConstant:
1119 case scVScale:
1120 return false;
1121 case scTruncate:
1122 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1123 Processed, SE);
1124 case scZeroExtend:
1125 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1126 Processed, SE);
1127 case scSignExtend:
1128 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1129 Processed, SE);
1130 default:
1131 break;
1132 }
1133
1134 if (!Processed.insert(S).second)
1135 return false;
1136
1137 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1138 for (const SCEV *S : Add->operands()) {
1139 if (isHighCostExpansion(S, Processed, SE))
1140 return true;
1141 }
1142 return false;
1143 }
1144
1145 const SCEV *Op0, *Op1;
1146 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1147 // Multiplication by a constant is ok
1148 if (isa<SCEVConstant>(Op0))
1149 return isHighCostExpansion(Op1, Processed, SE);
1150
1151 // If we have the value of one operand, check if an existing
1152 // multiplication already generates this expression.
1153 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1154 Value *UVal = U->getValue();
1155 for (User *UR : UVal->users()) {
1156 // If U is a constant, it may be used by a ConstantExpr.
1157 Instruction *UI = dyn_cast<Instruction>(UR);
1158 if (UI && UI->getOpcode() == Instruction::Mul &&
1159 SE.isSCEVable(UI->getType())) {
1160 return SE.getSCEV(UI) == S;
1161 }
1162 }
1163 }
1164 }
1165
1166 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1167 if (isExistingPhi(AR, SE))
1168 return false;
1169 }
1170
1171 // For now, consider any other type of expression (div/mul/min/max) high cost.
1172 return true;
1173}
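// Editorial illustration (not part of the original source):
// (sext i32 {0,+,1}<%L> to i64) is considered cheap when the addrec is already
// a phi in the loop, and (4 * %x) is cheap because the multiplier is a
// constant, whereas a udiv, or a product of two unknowns with no existing mul
// instruction computing it, is treated as a high-cost expansion.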
1174
1175namespace {
1176
1177class LSRUse;
1178
1179} // end anonymous namespace
1180
1181/// Check if the addressing mode defined by \p F is completely
1182/// folded in \p LU at isel time.
1183/// This includes address-mode folding and special icmp tricks.
1184/// This function returns true if \p LU can accommodate what \p F
1185/// defines and up to 1 base + 1 scaled + offset.
1186/// In other words, if \p F has several base registers, this function may
1187/// still return true. Therefore, users still need to account for
1188/// additional base registers and/or unfolded offsets to derive an
1189/// accurate cost model.
1190static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1191 const LSRUse &LU, const Formula &F);
1192
1193// Get the cost of the scaling factor used in F for LU.
1194static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1195 const LSRUse &LU, const Formula &F,
1196 const Loop &L);
1197
1198namespace {
1199
1200/// This class is used to measure and compare candidate formulae.
1201class Cost {
1202 const Loop *L = nullptr;
1203 ScalarEvolution *SE = nullptr;
1204 const TargetTransformInfo *TTI = nullptr;
1205 TargetTransformInfo::LSRCost C;
1206 TTI::AddressingModeKind AMK = TTI::AMK_None;
 1207
1208public:
1209 Cost() = delete;
1210 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1211 TTI::AddressingModeKind AMK) :
1212 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1213 C.Insns = 0;
1214 C.NumRegs = 0;
1215 C.AddRecCost = 0;
1216 C.NumIVMuls = 0;
1217 C.NumBaseAdds = 0;
1218 C.ImmCost = 0;
1219 C.SetupCost = 0;
1220 C.ScaleCost = 0;
1221 }
1222
1223 bool isLess(const Cost &Other) const;
1224
1225 void Lose();
1226
1227#ifndef NDEBUG
1228 // Once any of the metrics loses, they must all remain losers.
1229 bool isValid() {
1230 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1231 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1232 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1233 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1234 }
1235#endif
1236
1237 bool isLoser() {
1238 assert(isValid() && "invalid cost");
1239 return C.NumRegs == ~0u;
1240 }
1241
1242 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1243 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1244 bool HardwareLoopProfitable,
1245 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1246
1247 void print(raw_ostream &OS) const;
1248 void dump() const;
1249
1250private:
1251 void RateRegister(const Formula &F, const SCEV *Reg,
1252 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1253 bool HardwareLoopProfitable);
1254 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1255 SmallPtrSetImpl<const SCEV *> &Regs,
1256 const LSRUse &LU, bool HardwareLoopProfitable,
1257 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1258};
1259
1260/// An operand value in an instruction which is to be replaced with some
1261/// equivalent, possibly strength-reduced, replacement.
1262struct LSRFixup {
1263 /// The instruction which will be updated.
1264 Instruction *UserInst = nullptr;
1265
1266 /// The operand of the instruction which will be replaced. The operand may be
1267 /// used more than once; every instance will be replaced.
1268 Value *OperandValToReplace = nullptr;
1269
1270 /// If this user is to use the post-incremented value of an induction
1271 /// variable, this set is non-empty and holds the loops associated with the
1272 /// induction variable.
1273 PostIncLoopSet PostIncLoops;
1274
1275 /// A constant offset to be added to the LSRUse expression. This allows
1276 /// multiple fixups to share the same LSRUse with different offsets, for
1277 /// example in an unrolled loop.
1278 Immediate Offset = Immediate::getZero();
1279
1280 LSRFixup() = default;
1281
1282 bool isUseFullyOutsideLoop(const Loop *L) const;
1283
1284 void print(raw_ostream &OS) const;
1285 void dump() const;
1286};
1287
1288/// This class holds the state that LSR keeps for each use in IVUsers, as well
1289/// as uses invented by LSR itself. It includes information about what kinds of
1290/// things can be folded into the user, information about the user itself, and
1291/// information about how the use may be satisfied. TODO: Represent multiple
1292/// users of the same expression in common?
1293class LSRUse {
1294 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
 1295
1296public:
1297 /// An enum for a kind of use, indicating what types of scaled and immediate
1298 /// operands it might support.
1299 enum KindType {
1300 Basic, ///< A normal use, with no folding.
1301 Special, ///< A special case of basic, allowing -1 scales.
1302 Address, ///< An address use; folding according to TargetLowering
1303 ICmpZero ///< An equality icmp with both operands folded into one.
1304 // TODO: Add a generic icmp too?
1305 };
1306
1307 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1308
1309 KindType Kind;
1310 MemAccessTy AccessTy;
1311
1312 /// The list of operands which are to be replaced.
1313 SmallVector<LSRFixup, 8> Fixups;
 1314
1315 /// Keep track of the min and max offsets of the fixups.
1316 Immediate MinOffset = Immediate::getFixedMax();
1317 Immediate MaxOffset = Immediate::getFixedMin();
1318
1319 /// This records whether all of the fixups using this LSRUse are outside of
1320 /// the loop, in which case some special-case heuristics may be used.
1321 bool AllFixupsOutsideLoop = true;
1322
1323 /// RigidFormula is set to true to guarantee that this use will be associated
1324 /// with a single formula--the one that initially matched. Some SCEV
1325 /// expressions cannot be expanded. This allows LSR to consider the registers
1326 /// used by those expressions without the need to expand them later after
1327 /// changing the formula.
1328 bool RigidFormula = false;
1329
1330 /// This records the widest use type for any fixup using this
1331 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1332 /// fixup widths to be equivalent, because the narrower one may be relying on
1333 /// the implicit truncation to truncate away bogus bits.
1334 Type *WidestFixupType = nullptr;
1335
1336 /// A list of ways to build a value that can satisfy this user. After the
1337 /// list is populated, one of these is selected heuristically and used to
1338 /// formulate a replacement for OperandValToReplace in UserInst.
1339 SmallVector<Formula, 12> Formulae;
1340
1341 /// The set of register candidates used by all formulae in this LSRUse.
1342 SmallPtrSet<const SCEV *, 4> Regs;
 1343
1344 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1345
1346 LSRFixup &getNewFixup() {
1347 Fixups.push_back(LSRFixup());
1348 return Fixups.back();
1349 }
1350
1351 void pushFixup(LSRFixup &f) {
1352 Fixups.push_back(f);
1353 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1354 MaxOffset = f.Offset;
1355 if (Immediate::isKnownLT(f.Offset, MinOffset))
1356 MinOffset = f.Offset;
1357 }
1358
1359 bool HasFormulaWithSameRegs(const Formula &F) const;
1360 float getNotSelectedProbability(const SCEV *Reg) const;
1361 bool InsertFormula(const Formula &F, const Loop &L);
1362 void DeleteFormula(Formula &F);
1363 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1364
1365 void print(raw_ostream &OS) const;
1366 void dump() const;
1367};
1368
1369} // end anonymous namespace
1370
1371static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1372 LSRUse::KindType Kind, MemAccessTy AccessTy,
1373 GlobalValue *BaseGV, Immediate BaseOffset,
1374 bool HasBaseReg, int64_t Scale,
1375 Instruction *Fixup = nullptr);
1376
1377static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1378 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1379 return 1;
1380 if (Depth == 0)
1381 return 0;
1382 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1383 return getSetupCost(S->getStart(), Depth - 1);
1384 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1385 return getSetupCost(S->getOperand(), Depth - 1);
1386 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1387 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1388 [&](unsigned i, const SCEV *Reg) {
1389 return i + getSetupCost(Reg, Depth - 1);
1390 });
1391 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1392 return getSetupCost(S->getLHS(), Depth - 1) +
1393 getSetupCost(S->getRHS(), Depth - 1);
1394 return 0;
1395}
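// Editorial illustration (not part of the original source): with enough
// remaining Depth, getSetupCost(%a + %b + 4) is 3 (one per SCEVUnknown or
// constant leaf), and for an addrec only the start value is charged, so
// {%a,+,1}<%L> costs 1; once Depth reaches 0 the remainder counts as free.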
1396
1397/// Tally up interesting quantities from the given register.
1398void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1399 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1400 bool HardwareLoopProfitable) {
1401 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1402 // If this is an addrec for another loop, it should be an invariant
1403 // with respect to L since L is the innermost loop (at least
1404 // for now LSR only handles innermost loops).
1405 if (AR->getLoop() != L) {
1406 // If the AddRec exists, consider its register free and leave it alone.
1407 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1408 return;
1409
1410 // It is bad to allow LSR for current loop to add induction variables
1411 // for its sibling loops.
1412 if (!AR->getLoop()->contains(L)) {
1413 Lose();
1414 return;
1415 }
1416
1417 // Otherwise, it will be an invariant with respect to Loop L.
1418 ++C.NumRegs;
1419 return;
1420 }
1421
1422 unsigned LoopCost = 1;
1423 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1424 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1425 const SCEV *Start;
1426 const SCEVConstant *Step;
1427 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
1428 // If the step size matches the base offset, we could use pre-indexed
1429 // addressing.
1430 if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
1431 Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
1432 (AMK == TTI::AMK_PostIndexed && !isa<SCEVConstant>(Start) &&
1433 SE->isLoopInvariant(Start, L)))
1434 LoopCost = 0;
1435 }
1436 // If the loop counts down to zero and we'll be using a hardware loop then
1437 // the addrec will be combined into the hardware loop instruction.
1438 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1439 HardwareLoopProfitable)
1440 LoopCost = 0;
1441 C.AddRecCost += LoopCost;
1442
1443 // Add the step value register, if it needs one.
1444 // TODO: The non-affine case isn't precisely modeled here.
1445 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1446 if (!Regs.count(AR->getOperand(1))) {
1447 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1448 if (isLoser())
1449 return;
1450 }
1451 }
1452 }
1453 ++C.NumRegs;
1454
1455 // Rough heuristic; favor registers which don't require extra setup
1456 // instructions in the preheader.
1457 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1458 // Ensure we don't, even with the recursion limit, produce invalid costs.
1459 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1460
1461 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1462 SE->hasComputableLoopEvolution(Reg, L);
1463}
1464
1465/// Record this register in the set. If we haven't seen it before, rate
1466/// it. Optional LoserRegs provides a way to declare any formula that refers to
1467/// one of those regs an instant loser.
1468void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1470 const LSRUse &LU, bool HardwareLoopProfitable,
1471 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1472 if (LoserRegs && LoserRegs->count(Reg)) {
1473 Lose();
1474 return;
1475 }
1476 if (Regs.insert(Reg).second) {
1477 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1478 if (LoserRegs && isLoser())
1479 LoserRegs->insert(Reg);
1480 }
1481}
1482
1483void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1484 const DenseSet<const SCEV *> &VisitedRegs,
1485 const LSRUse &LU, bool HardwareLoopProfitable,
1486 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1487 if (isLoser())
1488 return;
1489 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1490 // Tally up the registers.
1491 unsigned PrevAddRecCost = C.AddRecCost;
1492 unsigned PrevNumRegs = C.NumRegs;
1493 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1494 if (const SCEV *ScaledReg = F.ScaledReg) {
1495 if (VisitedRegs.count(ScaledReg)) {
1496 Lose();
1497 return;
1498 }
1499 RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1500 LoserRegs);
1501 if (isLoser())
1502 return;
1503 }
1504 for (const SCEV *BaseReg : F.BaseRegs) {
1505 if (VisitedRegs.count(BaseReg)) {
1506 Lose();
1507 return;
1508 }
1509 RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1510 LoserRegs);
1511 if (isLoser())
1512 return;
1513 }
1514
1515 // Determine how many (unfolded) adds we'll need inside the loop.
1516 size_t NumBaseParts = F.getNumRegs();
1517 if (NumBaseParts > 1)
1518 // Do not count the base and a possible second register if the target
1519 // allows folding 2 registers.
1520 C.NumBaseAdds +=
1521 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1522 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1523
1524 // Accumulate non-free scaling amounts.
1525 C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1526
1527 // Tally up the non-zero immediates.
1528 for (const LSRFixup &Fixup : LU.Fixups) {
1529 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1530 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1531 if (F.BaseGV)
1532 C.ImmCost += 64; // Handle symbolic values conservatively.
1533 // TODO: This should probably be the pointer size.
1534 else if (Offset.isNonZero())
1535 C.ImmCost +=
1536 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1537
1538 // Check with target if this offset with this instruction is
1539 // specifically not supported.
1540 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1541 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1542 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1543 C.NumBaseAdds++;
1544 } else {
1545 // Incompatible immediate type; increase the cost to avoid using it.
1546 C.ImmCost += 2048;
1547 }
1548 }
1549
1550 // If we don't count instruction cost, exit here.
1551 if (!InsnsCost) {
1552 assert(isValid() && "invalid cost");
1553 return;
1554 }
1555
1556 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1557 // additional instruction (at least fill).
1558 // TODO: Need to distinguish register classes?
1559 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1560 TTI->getRegisterClassForType(false, F.getType())) - 1;
1561 if (C.NumRegs > TTIRegNum) {
1562 // The cost already exceeded TTIRegNum, so only newly added registers can add
1563 // new instructions.
1564 if (PrevNumRegs > TTIRegNum)
1565 C.Insns += (C.NumRegs - PrevNumRegs);
1566 else
1567 C.Insns += (C.NumRegs - TTIRegNum);
1568 }
1569
1570 // If an ICmpZero formula does not end at 0, it cannot be replaced by
1571 // just an add or sub. We'll need to compare the final result of the AddRec.
1572 // That means we'll need an additional instruction. But if the target can
1573 // macro-fuse a compare with a branch, don't count this extra instruction.
1574 // For -10 + {0, +, 1}:
1575 // i = i + 1;
1576 // cmp i, 10
1577 //
1578 // For {-10, +, 1}:
1579 // i = i + 1;
1580 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1581 !TTI->canMacroFuseCmp())
1582 C.Insns++;
1583 // Each new AddRec adds 1 instruction to calculation.
1584 C.Insns += (C.AddRecCost - PrevAddRecCost);
1585
1586 // BaseAdds adds instructions for unfolded registers.
1587 if (LU.Kind != LSRUse::ICmpZero)
1588 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1589 assert(isValid() && "invalid cost");
1590}
1591
1592/// Set this cost to a losing value.
1593void Cost::Lose() {
1594 C.Insns = std::numeric_limits<unsigned>::max();
1595 C.NumRegs = std::numeric_limits<unsigned>::max();
1596 C.AddRecCost = std::numeric_limits<unsigned>::max();
1597 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1598 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1599 C.ImmCost = std::numeric_limits<unsigned>::max();
1600 C.SetupCost = std::numeric_limits<unsigned>::max();
1601 C.ScaleCost = std::numeric_limits<unsigned>::max();
1602}
1603
1604/// Choose the lower cost.
1605bool Cost::isLess(const Cost &Other) const {
1606 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1607 C.Insns != Other.C.Insns)
1608 return C.Insns < Other.C.Insns;
1609 return TTI->isLSRCostLess(C, Other.C);
1610}
1611
1612#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1613void Cost::print(raw_ostream &OS) const {
1614 if (InsnsCost)
1615 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1616 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1617 if (C.AddRecCost != 0)
1618 OS << ", with addrec cost " << C.AddRecCost;
1619 if (C.NumIVMuls != 0)
1620 OS << ", plus " << C.NumIVMuls << " IV mul"
1621 << (C.NumIVMuls == 1 ? "" : "s");
1622 if (C.NumBaseAdds != 0)
1623 OS << ", plus " << C.NumBaseAdds << " base add"
1624 << (C.NumBaseAdds == 1 ? "" : "s");
1625 if (C.ScaleCost != 0)
1626 OS << ", plus " << C.ScaleCost << " scale cost";
1627 if (C.ImmCost != 0)
1628 OS << ", plus " << C.ImmCost << " imm cost";
1629 if (C.SetupCost != 0)
1630 OS << ", plus " << C.SetupCost << " setup cost";
1631}
1632
1633LLVM_DUMP_METHOD void Cost::dump() const {
1634 print(errs()); errs() << '\n';
1635}
1636#endif
1637
1638/// Test whether this fixup always uses its value outside of the given loop.
1639bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1640 // PHI nodes use their value in their incoming blocks.
1641 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1642 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1643 if (PN->getIncomingValue(i) == OperandValToReplace &&
1644 L->contains(PN->getIncomingBlock(i)))
1645 return false;
1646 return true;
1647 }
1648
1649 return !L->contains(UserInst);
1650}
1651
1652#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1653void LSRFixup::print(raw_ostream &OS) const {
1654 OS << "UserInst=";
1655 // Store is common and interesting enough to be worth special-casing.
1656 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1657 OS << "store ";
1658 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1659 } else if (UserInst->getType()->isVoidTy())
1660 OS << UserInst->getOpcodeName();
1661 else
1662 UserInst->printAsOperand(OS, /*PrintType=*/false);
1663
1664 OS << ", OperandValToReplace=";
1665 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1666
1667 for (const Loop *PIL : PostIncLoops) {
1668 OS << ", PostIncLoop=";
1669 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1670 }
1671
1672 if (Offset.isNonZero())
1673 OS << ", Offset=" << Offset;
1674}
1675
1676LLVM_DUMP_METHOD void LSRFixup::dump() const {
1677 print(errs()); errs() << '\n';
1678}
1679#endif
1680
1681/// Test whether this use has a formula with the same registers as the given
1682/// formula.
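/// For example (hypothetical formulae): base registers {A, B} with scaled
/// register C and base registers {B, C} with scaled register A both reduce to
/// the same sorted key {A, B, C}, so they count as using the same registers.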
1683bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1684 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1685 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1686 // Unstable sort by host order ok, because this is only used for uniquifying.
1687 llvm::sort(Key);
1688 return Uniquifier.count(Key);
1689}
1690
1691/// Return the probability of selecting a formula that does not reference Reg.
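/// For example, if 2 of 5 formulae reference Reg, the returned probability is
/// (5 - 2) / 5 = 0.6.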
1692float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1693 unsigned FNum = 0;
1694 for (const Formula &F : Formulae)
1695 if (F.referencesReg(Reg))
1696 FNum++;
1697 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1698}
1699
1700/// If the given formula has not yet been inserted, add it to the list, and
1701/// return true. Return false otherwise. The formula must be in canonical form.
1702bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1703 assert(F.isCanonical(L) && "Invalid canonical representation");
1704
1705 if (!Formulae.empty() && RigidFormula)
1706 return false;
1707
1708 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1709 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1710 // Unstable sort by host order ok, because this is only used for uniquifying.
1711 llvm::sort(Key);
1712
1713 if (!Uniquifier.insert(Key).second)
1714 return false;
1715
1716 // Using a register to hold the value of 0 is not profitable.
1717 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1718 "Zero allocated in a scaled register!");
1719#ifndef NDEBUG
1720 for (const SCEV *BaseReg : F.BaseRegs)
1721 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1722#endif
1723
1724 // Add the formula to the list.
1725 Formulae.push_back(F);
1726
1727 // Record registers now being used by this use.
1728 Regs.insert_range(F.BaseRegs);
1729 if (F.ScaledReg)
1730 Regs.insert(F.ScaledReg);
1731
1732 return true;
1733}
1734
1735/// Remove the given formula from this use's list.
1736void LSRUse::DeleteFormula(Formula &F) {
1737 if (&F != &Formulae.back())
1738 std::swap(F, Formulae.back());
1739 Formulae.pop_back();
1740}
1741
1742/// Recompute the Regs field, and update RegUses.
1743void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1744 // Now that we've filtered out some formulae, recompute the Regs set.
1745 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1746 Regs.clear();
1747 for (const Formula &F : Formulae) {
1748 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1749 Regs.insert_range(F.BaseRegs);
1750 }
1751
1752 // Update the RegTracker.
1753 for (const SCEV *S : OldRegs)
1754 if (!Regs.count(S))
1755 RegUses.dropRegister(S, LUIdx);
1756}
1757
1758#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1759void LSRUse::print(raw_ostream &OS) const {
1760 OS << "LSR Use: Kind=";
1761 switch (Kind) {
1762 case Basic: OS << "Basic"; break;
1763 case Special: OS << "Special"; break;
1764 case ICmpZero: OS << "ICmpZero"; break;
1765 case Address:
1766 OS << "Address of ";
1767 if (AccessTy.MemTy->isPointerTy())
1768 OS << "pointer"; // the full pointer type could be really verbose
1769 else {
1770 OS << *AccessTy.MemTy;
1771 }
1772
1773 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1774 }
1775
1776 OS << ", Offsets={";
1777 bool NeedComma = false;
1778 for (const LSRFixup &Fixup : Fixups) {
1779 if (NeedComma) OS << ',';
1780 OS << Fixup.Offset;
1781 NeedComma = true;
1782 }
1783 OS << '}';
1784
1785 if (AllFixupsOutsideLoop)
1786 OS << ", all-fixups-outside-loop";
1787
1788 if (WidestFixupType)
1789 OS << ", widest fixup type: " << *WidestFixupType;
1790}
1791
1792LLVM_DUMP_METHOD void LSRUse::dump() const {
1793 print(errs()); errs() << '\n';
1794}
1795#endif
1796
1797static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1798 LSRUse::KindType Kind, MemAccessTy AccessTy,
1799 GlobalValue *BaseGV, Immediate BaseOffset,
1800 bool HasBaseReg, int64_t Scale,
1801 Instruction *Fixup /* = nullptr */) {
1802 switch (Kind) {
1803 case LSRUse::Address: {
1804 int64_t FixedOffset =
1805 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1806 int64_t ScalableOffset =
1807 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1808 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1809 HasBaseReg, Scale, AccessTy.AddrSpace,
1810 Fixup, ScalableOffset);
1811 }
1812 case LSRUse::ICmpZero:
1813 // There's not even a target hook for querying whether it would be legal to
1814 // fold a GV into an ICmp.
1815 if (BaseGV)
1816 return false;
1817
1818 // ICmp only has two operands; don't allow more than two non-trivial parts.
1819 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1820 return false;
1821
1822 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1823 // putting the scaled register in the other operand of the icmp.
1824 if (Scale != 0 && Scale != -1)
1825 return false;
1826
1827 // If we have low-level target information, ask the target if it can fold an
1828 // integer immediate on an icmp.
1829 if (BaseOffset.isNonZero()) {
1830 // We don't have an interface to query whether the target supports
1831 // icmpzero against scalable quantities yet.
1832 if (BaseOffset.isScalable())
1833 return false;
1834
1835 // We have one of:
1836 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1837 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1838 // Offs is the ICmp immediate.
1839 if (Scale == 0)
1840 // The cast does the right thing with
1841 // std::numeric_limits<int64_t>::min().
1842 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1843 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1844 }
1845
1846 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1847 return true;
1848
1849 case LSRUse::Basic:
1850 // Only handle single-register values.
1851 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1852
1853 case LSRUse::Special:
1854 // Special case Basic to handle -1 scales.
1855 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1856 }
1857
1858 llvm_unreachable("Invalid LSRUse Kind!");
1859}
1860
1861static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1862 Immediate MinOffset, Immediate MaxOffset,
1863 LSRUse::KindType Kind, MemAccessTy AccessTy,
1864 GlobalValue *BaseGV, Immediate BaseOffset,
1865 bool HasBaseReg, int64_t Scale) {
1866 if (BaseOffset.isNonZero() &&
1867 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1868 BaseOffset.isScalable() != MaxOffset.isScalable()))
1869 return false;
1870 // Check for overflow.
1871 int64_t Base = BaseOffset.getKnownMinValue();
1872 int64_t Min = MinOffset.getKnownMinValue();
1873 int64_t Max = MaxOffset.getKnownMinValue();
1874 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1875 return false;
1876 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1877 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1878 return false;
1879 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1880
1881 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1882 HasBaseReg, Scale) &&
1883 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1884 HasBaseReg, Scale);
1885}
1886
1887static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1888 Immediate MinOffset, Immediate MaxOffset,
1889 LSRUse::KindType Kind, MemAccessTy AccessTy,
1890 const Formula &F, const Loop &L) {
1891 // For the purpose of isAMCompletelyFolded either having a canonical formula
1892 // or a scale not equal to zero is correct.
1893 // Problems may arise from non-canonical formulae having a scale == 0.
1894 // Strictly speaking it would be best to just rely on canonical formulae.
1895 // However, when we generate the scaled formulae, we first check that the
1896 // scaling factor is profitable before computing the actual ScaledReg, for
1897 // the sake of compile time.
1898 assert((F.isCanonical(L) || F.Scale != 0));
1899 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1900 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1901}
1902
1903/// Test whether we know how to expand the current formula.
1904static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1905 Immediate MaxOffset, LSRUse::KindType Kind,
1906 MemAccessTy AccessTy, GlobalValue *BaseGV,
1907 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1908 // We know how to expand completely foldable formulae.
1909 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1910 BaseOffset, HasBaseReg, Scale) ||
1911 // Or formulae that use a base register produced by a sum of base
1912 // registers.
1913 (Scale == 1 &&
1914 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1915 BaseGV, BaseOffset, true, 0));
1916}
1917
1918static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1919 Immediate MaxOffset, LSRUse::KindType Kind,
1920 MemAccessTy AccessTy, const Formula &F) {
1921 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1922 F.BaseOffset, F.HasBaseReg, F.Scale);
1923}
1924
1925static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1926 Immediate Offset) {
1927 if (Offset.isScalable())
1928 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1929
1930 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1931}
1932
1933static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1934 const LSRUse &LU, const Formula &F) {
1935 // Target may want to look at the user instructions.
1936 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1937 for (const LSRFixup &Fixup : LU.Fixups)
1938 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1939 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1940 F.Scale, Fixup.UserInst))
1941 return false;
1942 return true;
1943 }
1944
1945 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1946 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1947 F.Scale);
1948}
1949
1950static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1951 const LSRUse &LU, const Formula &F,
1952 const Loop &L) {
1953 if (!F.Scale)
1954 return 0;
1955
1956 // If the use is not completely folded in that instruction, we will have to
1957 // pay an extra cost only for scale != 1.
1958 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1959 LU.AccessTy, F, L))
1960 return F.Scale != 1;
1961
1962 switch (LU.Kind) {
1963 case LSRUse::Address: {
1964 // Check the scaling factor cost with both the min and max offsets.
1965 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1966 if (F.BaseOffset.isScalable()) {
1967 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1968 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1969 } else {
1970 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1971 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1972 }
1973 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1974 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1975 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1976 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1977 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1978 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1979
1980 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1981 "Legal addressing mode has an illegal cost!");
1982 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1983 }
1984 case LSRUse::ICmpZero:
1985 case LSRUse::Basic:
1986 case LSRUse::Special:
1987 // The use is completely folded, i.e., everything is folded into the
1988 // instruction.
1989 return 0;
1990 }
1991
1992 llvm_unreachable("Invalid LSRUse Kind!");
1993}
1994
1995static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1996 LSRUse::KindType Kind, MemAccessTy AccessTy,
1997 GlobalValue *BaseGV, Immediate BaseOffset,
1998 bool HasBaseReg) {
1999 // Fast-path: zero is always foldable.
2000 if (BaseOffset.isZero() && !BaseGV)
2001 return true;
2002
2003 // Conservatively, create an address with an immediate and a
2004 // base and a scale.
2005 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2006
2007 // Canonicalize a scale of 1 to a base register if the formula doesn't
2008 // already have a base register.
2009 if (!HasBaseReg && Scale == 1) {
2010 Scale = 0;
2011 HasBaseReg = true;
2012 }
2013
2014 // FIXME: Try with + without a scale? Maybe based on TTI?
2015 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2016 // default for many architectures, not just AArch64 SVE. More investigation
2017 // needed later to determine if this should be used more widely than just
2018 // on scalable types.
2019 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2020 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2021 Scale = 0;
2022
2023 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2024 HasBaseReg, Scale);
2025}
2026
2027static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2028 ScalarEvolution &SE, Immediate MinOffset,
2029 Immediate MaxOffset, LSRUse::KindType Kind,
2030 MemAccessTy AccessTy, const SCEV *S,
2031 bool HasBaseReg) {
2032 // Fast-path: zero is always foldable.
2033 if (S->isZero()) return true;
2034
2035 // Conservatively, create an address with an immediate and a
2036 // base and a scale.
2037 Immediate BaseOffset = ExtractImmediate(S, SE);
2038 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2039
2040 // If there's anything else involved, it's not foldable.
2041 if (!S->isZero()) return false;
2042
2043 // Fast-path: zero is always foldable.
2044 if (BaseOffset.isZero() && !BaseGV)
2045 return true;
2046
2047 if (BaseOffset.isScalable())
2048 return false;
2049
2050 // Conservatively, create an address with an immediate and a
2051 // base and a scale.
2052 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2053
2054 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2055 BaseOffset, HasBaseReg, Scale);
2056}
2057
2058namespace {
2059
2060/// An individual increment in a Chain of IV increments. Relate an IV user to
2061/// an expression that computes the IV it uses from the IV used by the previous
2062/// link in the Chain.
2063///
2064/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2065/// original IVOperand. The head of the chain's IVOperand is only valid during
2066/// chain collection, before LSR replaces IV users. During chain generation,
2067/// IncExpr can be used to find the new IVOperand that computes the same
2068/// expression.
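/// An illustrative (hypothetical) link: if the previous link's IVOperand was
/// %p0 and this user is "%v = load i8, ptr %p1" with
/// %p1 = getelementptr i8, ptr %p0, i64 4, then UserInst is the load,
/// IVOperand is %p1, and IncExpr is the SCEV constant 4, i.e.
/// SCEV(%p1) - SCEV(%p0).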
2069struct IVInc {
2070 Instruction *UserInst;
2071 Value* IVOperand;
2072 const SCEV *IncExpr;
2073
2074 IVInc(Instruction *U, Value *O, const SCEV *E)
2075 : UserInst(U), IVOperand(O), IncExpr(E) {}
2076};
2077
2078// The list of IV increments in program order. We typically add the head of a
2079// chain without finding subsequent links.
2080struct IVChain {
2081 SmallVector<IVInc, 1> Incs;
2082 const SCEV *ExprBase = nullptr;
2083
2084 IVChain() = default;
2085 IVChain(const IVInc &Head, const SCEV *Base)
2086 : Incs(1, Head), ExprBase(Base) {}
2087
2088 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2089
2090 // Return the first increment in the chain.
2091 const_iterator begin() const {
2092 assert(!Incs.empty());
2093 return std::next(Incs.begin());
2094 }
2095 const_iterator end() const {
2096 return Incs.end();
2097 }
2098
2099 // Returns true if this chain contains any increments.
2100 bool hasIncs() const { return Incs.size() >= 2; }
2101
2102 // Add an IVInc to the end of this chain.
2103 void add(const IVInc &X) { Incs.push_back(X); }
2104
2105 // Returns the last UserInst in the chain.
2106 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2107
2108 // Returns true if IncExpr can be profitably added to this chain.
2109 bool isProfitableIncrement(const SCEV *OperExpr,
2110 const SCEV *IncExpr,
2111 ScalarEvolution &SE);
2112};
2113
2114/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2115/// between FarUsers that definitely cross IV increments and NearUsers that may
2116/// be used between IV increments.
2117struct ChainUsers {
2118 SmallPtrSet<Instruction*, 4> FarUsers;
2119 SmallPtrSet<Instruction*, 4> NearUsers;
2120};
2121
2122/// This class holds state for the main loop strength reduction logic.
2123class LSRInstance {
2124 IVUsers &IU;
2125 ScalarEvolution &SE;
2126 DominatorTree &DT;
2127 LoopInfo &LI;
2128 AssumptionCache &AC;
2129 TargetLibraryInfo &TLI;
2130 const TargetTransformInfo &TTI;
2131 Loop *const L;
2132 MemorySSAUpdater *MSSAU;
2134 mutable SCEVExpander Rewriter;
2135 bool Changed = false;
2136 bool HardwareLoopProfitable = false;
2137
2138 /// This is the insert position at which the current loop's induction variable
2139 /// increment should be placed. In simple loops, this is the latch block's
2140 /// terminator. But in more complicated cases, this is a position which will
2141 /// dominate all the in-loop post-increment users.
2142 Instruction *IVIncInsertPos = nullptr;
2143
2144 /// Interesting factors between use strides.
2145 ///
2146 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2147 /// default, a SmallDenseSet, because we need to use the full range of
2148 /// int64_ts, and there's currently no good way of doing that with
2149 /// SmallDenseSet.
2150 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2151
2152 /// The cost of the current SCEV; the best solution found by LSR will be
2153 /// dropped if that solution is not profitable.
2154 Cost BaselineCost;
2155
2156 /// Interesting use types, to facilitate truncation reuse.
2157 SmallSetVector<Type *, 4> Types;
2158
2159 /// The list of interesting uses.
2160 SmallVector<LSRUse, 16> Uses;
2161
2162 /// Track which uses use which register candidates.
2163 RegUseTracker RegUses;
2164
2165 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2166 // have more than a few IV increment chains in a loop. Missing a Chain falls
2167 // back to normal LSR behavior for those uses.
2168 static const unsigned MaxChains = 8;
2169
2170 /// IV users can form a chain of IV increments.
2171 SmallVector<IVChain, 8> IVChainVec;
2172
2173 /// IV users that belong to profitable IVChains.
2174 SmallPtrSet<Use*, MaxChains> IVIncSet;
2175
2176 /// Induction variables that were generated and inserted by the SCEV Expander.
2177 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2178
2179 // Inserting instructions in the loop and using them as PHI's input could
2180 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
2181 // corresponding incoming block is not loop exiting). So collect all such
2182 // instructions to form LCSSA for them later.
2183 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2184
2185 void OptimizeShadowIV();
2186 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2187 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2188 void OptimizeLoopTermCond();
2189
2190 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2191 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2192 void FinalizeChain(IVChain &Chain);
2193 void CollectChains();
2194 void GenerateIVChain(const IVChain &Chain,
2195 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2196
2197 void CollectInterestingTypesAndFactors();
2198 void CollectFixupsAndInitialFormulae();
2199
2200 // Support for sharing of LSRUses between LSRFixups.
2201 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2202 UseMapTy UseMap;
2203
2204 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2205 LSRUse::KindType Kind, MemAccessTy AccessTy);
2206
2207 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2208 MemAccessTy AccessTy);
2209
2210 void DeleteUse(LSRUse &LU, size_t LUIdx);
2211
2212 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2213
2214 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2215 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2216 void CountRegisters(const Formula &F, size_t LUIdx);
2217 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2218
2219 void CollectLoopInvariantFixupsAndFormulae();
2220
2221 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2222 unsigned Depth = 0);
2223
2224 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2225 const Formula &Base, unsigned Depth,
2226 size_t Idx, bool IsScaledReg = false);
2227 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2228 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2229 const Formula &Base, size_t Idx,
2230 bool IsScaledReg = false);
2231 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2232 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2233 const Formula &Base,
2234 const SmallVectorImpl<Immediate> &Worklist,
2235 size_t Idx, bool IsScaledReg = false);
2236 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2237 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2238 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2239 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2240 void GenerateCrossUseConstantOffsets();
2241 void GenerateAllReuseFormulae();
2242
2243 void FilterOutUndesirableDedicatedRegisters();
2244
2245 size_t EstimateSearchSpaceComplexity() const;
2246 void NarrowSearchSpaceByDetectingSupersets();
2247 void NarrowSearchSpaceByCollapsingUnrolledCode();
2248 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2249 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2250 void NarrowSearchSpaceByFilterPostInc();
2251 void NarrowSearchSpaceByDeletingCostlyFormulas();
2252 void NarrowSearchSpaceByPickingWinnerRegs();
2253 void NarrowSearchSpaceUsingHeuristics();
2254
2255 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2256 Cost &SolutionCost,
2257 SmallVectorImpl<const Formula *> &Workspace,
2258 const Cost &CurCost,
2259 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2260 DenseSet<const SCEV *> &VisitedRegs) const;
2261 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2262
2263 BasicBlock::iterator
2264 HoistInsertPosition(BasicBlock::iterator IP,
2265 const SmallVectorImpl<Instruction *> &Inputs) const;
2266 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2267 const LSRFixup &LF,
2268 const LSRUse &LU) const;
2269
2270 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2272 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2273 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2274 const Formula &F,
2276 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2278 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2279
2280public:
2281 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2282 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2283 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2284
2285 bool getChanged() const { return Changed; }
2286 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2287 return ScalarEvolutionIVs;
2288 }
2289
2290 void print_factors_and_types(raw_ostream &OS) const;
2291 void print_fixups(raw_ostream &OS) const;
2292 void print_uses(raw_ostream &OS) const;
2293 void print(raw_ostream &OS) const;
2294 void dump() const;
2295};
2296
2297} // end anonymous namespace
2298
2299/// If the IV is used in an int-to-float cast inside the loop then try to eliminate
2300/// the cast operation.
2301void LSRInstance::OptimizeShadowIV() {
2302 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2303 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2304 return;
2305
2306 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2307 UI != E; /* empty */) {
2308 IVUsers::const_iterator CandidateUI = UI;
2309 ++UI;
2310 Instruction *ShadowUse = CandidateUI->getUser();
2311 Type *DestTy = nullptr;
2312 bool IsSigned = false;
2313
2314 /* If shadow use is a int->float cast then insert a second IV
2315 to eliminate this cast.
2316
2317 for (unsigned i = 0; i < n; ++i)
2318 foo((double)i);
2319
2320 is transformed into
2321
2322 double d = 0.0;
2323 for (unsigned i = 0; i < n; ++i, ++d)
2324 foo(d);
2325 */
2326 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2327 IsSigned = false;
2328 DestTy = UCast->getDestTy();
2329 }
2330 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2331 IsSigned = true;
2332 DestTy = SCast->getDestTy();
2333 }
2334 if (!DestTy) continue;
2335
2336 // If target does not support DestTy natively then do not apply
2337 // this transformation.
2338 if (!TTI.isTypeLegal(DestTy)) continue;
2339
2340 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2341 if (!PH) continue;
2342 if (PH->getNumIncomingValues() != 2) continue;
2343
2344 // If the calculation in integers overflows, the result in the FP type will
2345 // differ. So we can only do this transformation if we are guaranteed not to
2346 // deal with overflowing values.
2347 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2348 if (!AR) continue;
2349 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2350 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2351
2352 Type *SrcTy = PH->getType();
2353 int Mantissa = DestTy->getFPMantissaWidth();
2354 if (Mantissa == -1) continue;
2355 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2356 continue;
2357
2358 unsigned Entry, Latch;
2359 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2360 Entry = 0;
2361 Latch = 1;
2362 } else {
2363 Entry = 1;
2364 Latch = 0;
2365 }
2366
2367 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2368 if (!Init) continue;
2369 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2370 (double)Init->getSExtValue() :
2371 (double)Init->getZExtValue());
2372
2373 BinaryOperator *Incr =
2374 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2375 if (!Incr) continue;
2376 if (Incr->getOpcode() != Instruction::Add
2377 && Incr->getOpcode() != Instruction::Sub)
2378 continue;
2379
2380 /* Initialize new IV, double d = 0.0 in above example. */
2381 ConstantInt *C = nullptr;
2382 if (Incr->getOperand(0) == PH)
2383 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2384 else if (Incr->getOperand(1) == PH)
2385 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2386 else
2387 continue;
2388
2389 if (!C) continue;
2390
2391 // Ignore negative constants, as the code below doesn't handle them
2392 // correctly. TODO: Remove this restriction.
2393 if (!C->getValue().isStrictlyPositive())
2394 continue;
2395
2396 /* Add new PHINode. */
2397 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2398 NewPH->setDebugLoc(PH->getDebugLoc());
2399
2400 /* create new increment. '++d' in above example. */
2401 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2402 BinaryOperator *NewIncr = BinaryOperator::Create(
2403 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2404 : Instruction::FSub,
2405 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2406 NewIncr->setDebugLoc(Incr->getDebugLoc());
2407
2408 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2409 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2410
2411 /* Remove cast operation */
2412 ShadowUse->replaceAllUsesWith(NewPH);
2413 ShadowUse->eraseFromParent();
2414 Changed = true;
2415 break;
2416 }
2417}
2418
2419/// If Cond has an operand that is an expression of an IV, set the IV user and
2420/// stride information and return true, otherwise return false.
2421bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2422 for (IVStrideUse &U : IU)
2423 if (U.getUser() == Cond) {
2424 // NOTE: we could handle setcc instructions with multiple uses here, but
2425 // InstCombine does it as well for simple uses, it's not clear that it
2426 // occurs enough in real life to handle.
2427 CondUse = &U;
2428 return true;
2429 }
2430 return false;
2431}
2432
2433/// Rewrite the loop's terminating condition if it uses a max computation.
2434///
2435/// This is a narrow solution to a specific, but acute, problem. For loops
2436/// like this:
2437///
2438/// i = 0;
2439/// do {
2440/// p[i] = 0.0;
2441/// } while (++i < n);
2442///
2443/// the trip count isn't just 'n', because 'n' might not be positive. And
2444/// unfortunately this can come up even for loops where the user didn't use
2445/// a C do-while loop. For example, seemingly well-behaved top-test loops
2446/// will commonly be lowered like this:
2447///
2448/// if (n > 0) {
2449/// i = 0;
2450/// do {
2451/// p[i] = 0.0;
2452/// } while (++i < n);
2453/// }
2454///
2455/// and then it's possible for subsequent optimization to obscure the if
2456/// test in such a way that indvars can't find it.
2457///
2458/// When indvars can't find the if test in loops like this, it creates a
2459/// max expression, which allows it to give the loop a canonical
2460/// induction variable:
2461///
2462/// i = 0;
2463/// max = n < 1 ? 1 : n;
2464/// do {
2465/// p[i] = 0.0;
2466/// } while (++i != max);
2467///
2468/// Canonical induction variables are necessary because the loop passes
2469/// are designed around them. The most obvious example of this is the
2470/// LoopInfo analysis, which doesn't remember trip count values. It
2471/// expects to be able to rediscover the trip count each time it is
2472/// needed, and it does this using a simple analysis that only succeeds if
2473/// the loop has a canonical induction variable.
2474///
2475/// However, when it comes time to generate code, the maximum operation
2476/// can be quite costly, especially if it's inside of an outer loop.
2477///
2478/// This function solves this problem by detecting this type of loop and
2479/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2480/// the instructions for the maximum computation.
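/// For the example above, the rewritten exit test is conceptually
///   } while (++i < n);
/// that is, the ICMP_NE against max becomes an ICMP_SLT against n, and the
/// now-dead max computation is deleted.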
2481ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2482 // Check that the loop matches the pattern we're looking for.
2483 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2484 Cond->getPredicate() != CmpInst::ICMP_NE)
2485 return Cond;
2486
2487 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2488 if (!Sel || !Sel->hasOneUse()) return Cond;
2489
2490 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2491 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2492 return Cond;
2493 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2494
2495 // Add one to the backedge-taken count to get the trip count.
2496 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2497 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2498
2499 // Check for a max calculation that matches the pattern. There's no check
2500 // for ICMP_ULE here because the comparison would be with zero, which
2501 // isn't interesting.
2502 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2503 const SCEVNAryExpr *Max = nullptr;
2504 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2505 Pred = ICmpInst::ICMP_SLE;
2506 Max = S;
2507 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2508 Pred = ICmpInst::ICMP_SLT;
2509 Max = S;
2510 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2511 Pred = ICmpInst::ICMP_ULT;
2512 Max = U;
2513 } else {
2514 // No match; bail.
2515 return Cond;
2516 }
2517
2518 // To handle a max with more than two operands, this optimization would
2519 // require additional checking and setup.
2520 if (Max->getNumOperands() != 2)
2521 return Cond;
2522
2523 const SCEV *MaxLHS = Max->getOperand(0);
2524 const SCEV *MaxRHS = Max->getOperand(1);
2525
2526 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2527 // for a comparison with 1. For <= and >=, a comparison with zero.
2528 if (!MaxLHS ||
2529 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2530 return Cond;
2531
2532 // Check the relevant induction variable for conformance to
2533 // the pattern.
2534 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2535 if (!match(IV,
2537 return Cond;
2538
2539 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2540 "Loop condition operand is an addrec in a different loop!");
2541
2542 // Check the right operand of the select, and remember it, as it will
2543 // be used in the new comparison instruction.
2544 Value *NewRHS = nullptr;
2545 if (ICmpInst::isTrueWhenEqual(Pred)) {
2546 // Look for n+1, and grab n.
2547 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2548 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2549 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2550 NewRHS = BO->getOperand(0);
2551 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2552 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2553 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2554 NewRHS = BO->getOperand(0);
2555 if (!NewRHS)
2556 return Cond;
2557 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2558 NewRHS = Sel->getOperand(1);
2559 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2560 NewRHS = Sel->getOperand(2);
2561 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2562 NewRHS = SU->getValue();
2563 else
2564 // Max doesn't match expected pattern.
2565 return Cond;
2566
2567 // Determine the new comparison opcode. It may be signed or unsigned,
2568 // and the original comparison may be either equality or inequality.
2569 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2570 Pred = CmpInst::getInversePredicate(Pred);
2571
2572 // Ok, everything looks ok to change the condition into an SLT or SGE and
2573 // delete the max calculation.
2574 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2575 Cond->getOperand(0), NewRHS, "scmp");
2576
2577 // Delete the max calculation instructions.
2578 NewCond->setDebugLoc(Cond->getDebugLoc());
2579 Cond->replaceAllUsesWith(NewCond);
2580 CondUse->setUser(NewCond);
2581 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2582 Cond->eraseFromParent();
2583 Sel->eraseFromParent();
2584 if (Cmp->use_empty()) {
2585 salvageDebugInfo(*Cmp);
2586 Cmp->eraseFromParent();
2587 }
2588 return NewCond;
2589}
2590
2591/// Change loop terminating condition to use the postinc iv when possible.
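/// For example (illustrative IR): with %i.next = add i64 %i, 1, an exit test
///   %c = icmp ne i64 %i, %end
/// can be expressed on the post-increment value as
///   %c = icmp ne i64 %i.next, %end.plus.1
/// (where %end.plus.1 is %end + 1); the constant adjustment folds into the
/// comparison operand, so %i itself need not stay live across the latch.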
2592void
2593LSRInstance::OptimizeLoopTermCond() {
2594 SmallPtrSet<Instruction *, 4> PostIncs;
2595
2596 // We need a different set of heuristics for rotated and non-rotated loops.
2597 // If a loop is rotated then the latch is also the backedge, so inserting
2598 // post-inc expressions just before the latch is ideal. To reduce live ranges
2599 // it also makes sense to rewrite terminating conditions to use post-inc
2600 // expressions.
2601 //
2602 // If the loop is not rotated then the latch is not a backedge; the latch
2603 // check is done in the loop head. Adding post-inc expressions before the
2604 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2605 // in the loop body. In this case we do *not* want to use post-inc expressions
2606 // in the latch check, and we want to insert post-inc expressions before
2607 // the backedge.
2608 BasicBlock *LatchBlock = L->getLoopLatch();
2609 SmallVector<BasicBlock*, 8> ExitingBlocks;
2610 L->getExitingBlocks(ExitingBlocks);
2611 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2612 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2613 IVIncInsertPos = LatchBlock->getTerminator();
2614 return;
2615 }
2616
2617 // Otherwise treat this as a rotated loop.
2618 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2619 // Get the terminating condition for the loop if possible. If we
2620 // can, we want to change it to use a post-incremented version of its
2621 // induction variable, to allow coalescing the live ranges for the IV into
2622 // one register value.
2623
2624 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2625 if (!TermBr)
2626 continue;
2627 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2628 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2629 continue;
2630
2631 // Search IVUsesByStride to find Cond's IVUse if there is one.
2632 IVStrideUse *CondUse = nullptr;
2633 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2634 if (!FindIVUserForCond(Cond, CondUse))
2635 continue;
2636
2637 // If the trip count is computed in terms of a max (due to ScalarEvolution
2638 // being unable to find a sufficient guard, for example), change the loop
2639 // comparison to use SLT or ULT instead of NE.
2640 // One consequence of doing this now is that it disrupts the count-down
2641 // optimization. That's not always a bad thing though, because in such
2642 // cases it may still be worthwhile to avoid a max.
2643 Cond = OptimizeMax(Cond, CondUse);
2644
2645 // If this exiting block dominates the latch block, it may also use
2646 // the post-inc value if it won't be shared with other uses.
2647 // Check for dominance.
2648 if (!DT.dominates(ExitingBlock, LatchBlock))
2649 continue;
2650
2651 // Conservatively avoid trying to use the post-inc value in non-latch
2652 // exits if there may be pre-inc users in intervening blocks.
2653 if (LatchBlock != ExitingBlock)
2654 for (const IVStrideUse &UI : IU)
2655 // Test if the use is reachable from the exiting block. This dominator
2656 // query is a conservative approximation of reachability.
2657 if (&UI != CondUse &&
2658 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2659 // Conservatively assume there may be reuse if the quotient of their
2660 // strides could be a legal scale.
2661 const SCEV *A = IU.getStride(*CondUse, L);
2662 const SCEV *B = IU.getStride(UI, L);
2663 if (!A || !B) continue;
2664 if (SE.getTypeSizeInBits(A->getType()) !=
2665 SE.getTypeSizeInBits(B->getType())) {
2666 if (SE.getTypeSizeInBits(A->getType()) >
2667 SE.getTypeSizeInBits(B->getType()))
2668 B = SE.getSignExtendExpr(B, A->getType());
2669 else
2670 A = SE.getSignExtendExpr(A, B->getType());
2671 }
2672 if (const SCEVConstant *D =
2673 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2674 const ConstantInt *C = D->getValue();
2675 // Stride of one or negative one can have reuse with non-addresses.
2676 if (C->isOne() || C->isMinusOne())
2677 goto decline_post_inc;
2678 // Avoid weird situations.
2679 if (C->getValue().getSignificantBits() >= 64 ||
2680 C->getValue().isMinSignedValue())
2681 goto decline_post_inc;
2682 // Check for possible scaled-address reuse.
2683 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2684 MemAccessTy AccessTy =
2685 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2686 int64_t Scale = C->getSExtValue();
2687 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2688 /*BaseOffset=*/0,
2689 /*HasBaseReg=*/true, Scale,
2690 AccessTy.AddrSpace))
2691 goto decline_post_inc;
2692 Scale = -Scale;
2693 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2694 /*BaseOffset=*/0,
2695 /*HasBaseReg=*/true, Scale,
2696 AccessTy.AddrSpace))
2697 goto decline_post_inc;
2698 }
2699 }
2700 }
2701
2702 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2703 << *Cond << '\n');
2704
2705 // It's possible for the setcc instruction to be anywhere in the loop, and
2706 // possible for it to have multiple users. If it is not immediately before
2707 // the exiting block branch, move it.
2708 if (Cond->getNextNode() != TermBr) {
2709 if (Cond->hasOneUse()) {
2710 Cond->moveBefore(TermBr->getIterator());
2711 } else {
2712 // Clone the terminating condition and insert into the loopend.
2713 ICmpInst *OldCond = Cond;
2714 Cond = cast<ICmpInst>(Cond->clone());
2715 Cond->setName(L->getHeader()->getName() + ".termcond");
2716 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2717
2718 // Clone the IVUse, as the old use still exists!
2719 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2720 TermBr->replaceUsesOfWith(OldCond, Cond);
2721 }
2722 }
2723
2724 // If we get to here, we know that we can transform the setcc instruction to
2725 // use the post-incremented version of the IV, allowing us to coalesce the
2726 // live ranges for the IV correctly.
2727 CondUse->transformToPostInc(L);
2728 Changed = true;
2729
2730 PostIncs.insert(Cond);
2731 decline_post_inc:;
2732 }
2733
2734 // Determine an insertion point for the loop induction variable increment. It
2735 // must dominate all the post-inc comparisons we just set up, and it must
2736 // dominate the loop latch edge.
2737 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2738 for (Instruction *Inst : PostIncs)
2739 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2740}
2741
2742/// Determine if the given use can accommodate a fixup at the given offset and
2743/// other details. If so, update the use and return true.
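/// For example (hypothetical numbers): if LU currently covers offsets
/// [4, 12] and a new fixup needs offset 16, the use is widened to [4, 16]
/// provided an immediate of 16 - 4 is still foldable for this kind and access
/// type; otherwise the caller must create a separate use.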
2744bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2745 bool HasBaseReg, LSRUse::KindType Kind,
2746 MemAccessTy AccessTy) {
2747 Immediate NewMinOffset = LU.MinOffset;
2748 Immediate NewMaxOffset = LU.MaxOffset;
2749 MemAccessTy NewAccessTy = AccessTy;
2750
2751 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2752 // something conservative, however this can pessimize in the case that one of
2753 // the uses will have all its uses outside the loop, for example.
2754 if (LU.Kind != Kind)
2755 return false;
2756
2757 // Check for a mismatched access type, and fall back conservatively as needed.
2758 // TODO: Be less conservative when the type is similar and can use the same
2759 // addressing modes.
2760 if (Kind == LSRUse::Address) {
2761 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2762 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2763 AccessTy.AddrSpace);
2764 }
2765 }
2766
2767 // Conservatively assume HasBaseReg is true for now.
2768 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2769 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2770 LU.MaxOffset - NewOffset, HasBaseReg))
2771 return false;
2772 NewMinOffset = NewOffset;
2773 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2774 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2775 NewOffset - LU.MinOffset, HasBaseReg))
2776 return false;
2777 NewMaxOffset = NewOffset;
2778 }
2779
2780 // FIXME: We should be able to handle some level of scalable offset support
2781 // for 'void', but in order to get basic support up and running this is
2782 // being left out.
2783 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2784 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2785 return false;
2786
2787 // Update the use.
2788 LU.MinOffset = NewMinOffset;
2789 LU.MaxOffset = NewMaxOffset;
2790 LU.AccessTy = NewAccessTy;
2791 return true;
2792}
2793
2794/// Return an LSRUse index and an offset value for a fixup which needs the given
2795/// expression, with the given kind and optional access type. Either reuse an
2796/// existing use or create a new one, as needed.
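/// For example (hypothetical SCEV): for the expression {(4 + %base),+,1} the
/// constant 4 is peeled off as the fixup offset, letting this fixup share an
/// LSRUse with other fixups whose expressions differ only by a foldable
/// constant.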
2797std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2798 LSRUse::KindType Kind,
2799 MemAccessTy AccessTy) {
2800 const SCEV *Copy = Expr;
2801 Immediate Offset = ExtractImmediate(Expr, SE);
2802
2803 // Basic uses can't accept any offset, for example.
2804 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2805 Offset, /*HasBaseReg=*/ true)) {
2806 Expr = Copy;
2807 Offset = Immediate::getFixed(0);
2808 }
2809
2810 std::pair<UseMapTy::iterator, bool> P =
2811 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2812 if (!P.second) {
2813 // A use already existed with this base.
2814 size_t LUIdx = P.first->second;
2815 LSRUse &LU = Uses[LUIdx];
2816 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2817 // Reuse this use.
2818 return std::make_pair(LUIdx, Offset);
2819 }
2820
2821 // Create a new use.
2822 size_t LUIdx = Uses.size();
2823 P.first->second = LUIdx;
2824 Uses.push_back(LSRUse(Kind, AccessTy));
2825 LSRUse &LU = Uses[LUIdx];
2826
2827 LU.MinOffset = Offset;
2828 LU.MaxOffset = Offset;
2829 return std::make_pair(LUIdx, Offset);
2830}
2831
2832/// Delete the given use from the Uses list.
2833void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2834 if (&LU != &Uses.back())
2835 std::swap(LU, Uses.back());
2836 Uses.pop_back();
2837
2838 // Update RegUses.
2839 RegUses.swapAndDropUse(LUIdx, Uses.size());
2840}
2841
2842/// Look for a use distinct from OrigLU which has a formula with the same
2843/// registers as the given formula.
2844LSRUse *
2845LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2846 const LSRUse &OrigLU) {
2847 // Search all uses for the formula. This could be more clever.
2848 for (LSRUse &LU : Uses) {
2849 // Check whether this use is close enough to OrigLU, to see whether it's
2850 // worthwhile looking through its formulae.
2851 // Ignore ICmpZero uses because they may contain formulae generated by
2852 // GenerateICmpZeroScales, in which case adding fixup offsets may
2853 // be invalid.
2854 if (&LU != &OrigLU &&
2855 LU.Kind != LSRUse::ICmpZero &&
2856 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2857 LU.WidestFixupType == OrigLU.WidestFixupType &&
2858 LU.HasFormulaWithSameRegs(OrigF)) {
2859 // Scan through this use's formulae.
2860 for (const Formula &F : LU.Formulae) {
2861 // Check to see if this formula has the same registers and symbols
2862 // as OrigF.
2863 if (F.BaseRegs == OrigF.BaseRegs &&
2864 F.ScaledReg == OrigF.ScaledReg &&
2865 F.BaseGV == OrigF.BaseGV &&
2866 F.Scale == OrigF.Scale &&
2867 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2868 if (F.BaseOffset.isZero())
2869 return &LU;
2870 // This is the formula where all the registers and symbols matched;
2871 // there aren't going to be any others. Since we declined it, we
2872 // can skip the rest of the formulae and proceed to the next LSRUse.
2873 break;
2874 }
2875 }
2876 }
2877 }
2878
2879 // Nothing looked good.
2880 return nullptr;
2881}
2882
2883void LSRInstance::CollectInterestingTypesAndFactors() {
2884 SmallSetVector<const SCEV *, 4> Strides;
2885
2886 // Collect interesting types and strides.
2887 SmallVector<const SCEV *, 4> Worklist;
2888 for (const IVStrideUse &U : IU) {
2889 const SCEV *Expr = IU.getExpr(U);
2890 if (!Expr)
2891 continue;
2892
2893 // Collect interesting types.
2894 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2895
2896 // Add strides for mentioned loops.
2897 Worklist.push_back(Expr);
2898 do {
2899 const SCEV *S = Worklist.pop_back_val();
2900 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2901 if (AR->getLoop() == L)
2902 Strides.insert(AR->getStepRecurrence(SE));
2903 Worklist.push_back(AR->getStart());
2904 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2905 append_range(Worklist, Add->operands());
2906 }
2907 } while (!Worklist.empty());
2908 }
2909
2910 // Compute interesting factors from the set of interesting strides.
2911 for (SmallSetVector<const SCEV *, 4>::const_iterator
2912 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2913 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2914 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2915 const SCEV *OldStride = *I;
2916 const SCEV *NewStride = *NewStrideIter;
2917
2918 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2919 SE.getTypeSizeInBits(NewStride->getType())) {
2920 if (SE.getTypeSizeInBits(OldStride->getType()) >
2921 SE.getTypeSizeInBits(NewStride->getType()))
2922 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2923 else
2924 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2925 }
2926 if (const SCEVConstant *Factor =
2927 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2928 SE, true))) {
2929 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2930 Factors.insert(Factor->getAPInt().getSExtValue());
2931 } else if (const SCEVConstant *Factor =
2932 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2933 NewStride,
2934 SE, true))) {
2935 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2936 Factors.insert(Factor->getAPInt().getSExtValue());
2937 }
2938 }
2939
2940 // If all uses use the same type, don't bother looking for truncation-based
2941 // reuse.
2942 if (Types.size() == 1)
2943 Types.clear();
2944
2945 LLVM_DEBUG(print_factors_and_types(dbgs()));
2946}
2947
2948/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2949/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2950/// IVStrideUses, we could partially skip this.
2951static User::op_iterator
2952findIVOperand(User::op_iterator OI, User::op_iterator OE,
2953 Loop *L, ScalarEvolution &SE) {
2954 for(; OI != OE; ++OI) {
2955 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2956 if (!SE.isSCEVable(Oper->getType()))
2957 continue;
2958
2959 if (const SCEVAddRecExpr *AR =
2960 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2961 if (AR->getLoop() == L)
2962 break;
2963 }
2964 }
2965 }
2966 return OI;
2967}
2968
2969/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2970/// a convenient helper.
2971static Value *getWideOperand(Value *Oper) {
2972 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2973 return Trunc->getOperand(0);
2974 return Oper;
2975}
2976
2977/// Return an approximation of this SCEV expression's "base", or NULL for any
2978/// constant. Returning the expression itself is conservative. Returning a
2979/// deeper subexpression is more precise and valid as long as it isn't less
2980/// complex than another subexpression. For expressions involving multiple
2981/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2982/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2983/// IVInc==b-a.
2984///
2985/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2986/// SCEVUnknown, we simply return the rightmost SCEV operand.
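/// For example (hypothetical expressions): the base of ((4 * %n) + %a) is %a,
/// and the base of {%a,+,%s} is likewise %a, so accesses into the same object
/// can be chained even when their scaled parts differ.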
2987static const SCEV *getExprBase(const SCEV *S) {
2988 switch (S->getSCEVType()) {
2989 default: // including scUnknown.
2990 return S;
2991 case scConstant:
2992 case scVScale:
2993 return nullptr;
2994 case scTruncate:
2995 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2996 case scZeroExtend:
2997 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2998 case scSignExtend:
2999 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3000 case scAddExpr: {
3001 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3002 // there's nothing more complex.
3003 // FIXME: not sure if we want to recognize negation.
3004 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3005 for (const SCEV *SubExpr : reverse(Add->operands())) {
3006 if (SubExpr->getSCEVType() == scAddExpr)
3007 return getExprBase(SubExpr);
3008
3009 if (SubExpr->getSCEVType() != scMulExpr)
3010 return SubExpr;
3011 }
3012 return S; // all operands are scaled, be conservative.
3013 }
3014 case scAddRecExpr:
3015 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3016 }
3017 llvm_unreachable("Unknown SCEV kind!");
3018}
3019
3020/// Return true if the chain increment is profitable to expand into a loop
3021/// invariant value, which may require its own register. A profitable chain
3022/// increment will be an offset relative to the same base. We allow such offsets
3023/// to potentially be used as a chain increment as long as it is not obviously
3024/// expensive to expand using real instructions.
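/// Roughly (hypothetical values): a constant increment such as +16 is normally
/// acceptable, whereas a non-constant increment such as (4 * %n) is only
/// accepted if the operand is not simply a constant offset from the chain head
/// and the increment is not a high-cost expansion.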
3025bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3026 const SCEV *IncExpr,
3027 ScalarEvolution &SE) {
3028 // Aggressively form chains when -stress-ivchain.
3029 if (StressIVChain)
3030 return true;
3031
3032 // Do not replace a constant offset from IV head with a nonconstant IV
3033 // increment.
3034 if (!isa<SCEVConstant>(IncExpr)) {
3035 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3036 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3037 return false;
3038 }
3039
3040 SmallPtrSet<const SCEV*, 8> Processed;
3041 return !isHighCostExpansion(IncExpr, Processed, SE);
3042}
3043
3044/// Return true if the number of registers needed for the chain is estimated to
3045/// be less than the number required for the individual IV users. First prohibit
3046/// any IV users that keep the IV live across increments (the Users set should
3047/// be empty). Next count the number and type of increments in the chain.
3048///
3049/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3050/// effectively use postinc addressing modes. Only consider it profitable if the
3051/// increments can be computed in fewer registers when chained.
3052///
3053/// TODO: Consider IVInc free if it's already used in another chain.
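/// As a rough illustration of the cost model below: the chain itself starts at
/// a cost of one register; each new non-constant increment adds one, each
/// increment that repeats the previous one subtracts one, and a chain that is
/// completed by the header phi subtracts one. So a phi-completed chain whose
/// three increments all use the same loop-invariant stride nets
/// 1 - 1 + 1 - 2 = -1, which is considered profitable.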
3054static bool isProfitableChain(IVChain &Chain,
3055 SmallPtrSetImpl<Instruction *> &Users,
3056 ScalarEvolution &SE,
3057 const TargetTransformInfo &TTI) {
3058 if (StressIVChain)
3059 return true;
3060
3061 if (!Chain.hasIncs())
3062 return false;
3063
3064 if (!Users.empty()) {
3065 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3066 for (Instruction *Inst
3067 : Users) { dbgs() << " " << *Inst << "\n"; });
3068 return false;
3069 }
3070 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3071
3072 // The chain itself may require a register, so initialize cost to 1.
3073 int cost = 1;
3074
3075 // A complete chain likely eliminates the need for keeping the original IV in
3076 // a register. LSR does not currently know how to form a complete chain unless
3077 // the header phi already exists.
3078 if (isa<PHINode>(Chain.tailUserInst())
3079 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3080 --cost;
3081 }
3082 const SCEV *LastIncExpr = nullptr;
3083 unsigned NumConstIncrements = 0;
3084 unsigned NumVarIncrements = 0;
3085 unsigned NumReusedIncrements = 0;
3086
3087 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3088 return true;
3089
3090 for (const IVInc &Inc : Chain) {
3091 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3092 return true;
3093 if (Inc.IncExpr->isZero())
3094 continue;
3095
3096 // Incrementing by zero or some constant is neutral. We assume constants can
3097 // be folded into an addressing mode or an add's immediate operand.
3098 if (isa<SCEVConstant>(Inc.IncExpr)) {
3099 ++NumConstIncrements;
3100 continue;
3101 }
3102
3103 if (Inc.IncExpr == LastIncExpr)
3104 ++NumReusedIncrements;
3105 else
3106 ++NumVarIncrements;
3107
3108 LastIncExpr = Inc.IncExpr;
3109 }
3110 // An IV chain with a single increment is handled by LSR's postinc
3111 // uses. However, a chain with multiple increments requires keeping the IV's
3112 // value live longer than it needs to be if chained.
3113 if (NumConstIncrements > 1)
3114 --cost;
3115
3116 // Materializing increment expressions in the preheader that didn't exist in
3117 // the original code may cost a register. For example, sign-extended array
3118 // indices can produce ridiculous increments like this:
3119 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3120 cost += NumVarIncrements;
3121
3122 // Reusing variable increments likely saves a register to hold the multiple of
3123 // the stride.
3124 cost -= NumReusedIncrements;
3125
3126 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3127 << "\n");
3128
3129 return cost < 0;
3130}
3131
3132/// Add this IV user to an existing chain or make it the head of a new chain.
3133void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3134 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3135 // When IVs are used as types of varying widths, they are generally converted
3136 // to a wider type with some uses remaining narrow under a (free) trunc.
3137 Value *const NextIV = getWideOperand(IVOper);
3138 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3139 const SCEV *const OperExprBase = getExprBase(OperExpr);
3140
3141 // Visit all existing chains. Check if its IVOper can be computed as a
3142 // profitable loop invariant increment from the last link in the Chain.
3143 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3144 const SCEV *LastIncExpr = nullptr;
3145 for (; ChainIdx < NChains; ++ChainIdx) {
3146 IVChain &Chain = IVChainVec[ChainIdx];
3147
3148 // Prune the solution space aggressively by checking that both IV operands
3149 // are expressions that operate on the same unscaled SCEVUnknown. This
3150 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3151 // first avoids creating extra SCEV expressions.
3152 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3153 continue;
3154
3155 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3156 if (PrevIV->getType() != NextIV->getType())
3157 continue;
3158
3159 // A phi node terminates a chain.
3160 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3161 continue;
3162
3163 // The increment must be loop-invariant so it can be kept in a register.
3164 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3165 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3166 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3167 continue;
3168
3169 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3170 LastIncExpr = IncExpr;
3171 break;
3172 }
3173 }
3174 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3175 // bother for phi nodes, because they must be last in the chain.
3176 if (ChainIdx == NChains) {
3177 if (isa<PHINode>(UserInst))
3178 return;
3179 if (NChains >= MaxChains && !StressIVChain) {
3180 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3181 return;
3182 }
3183 LastIncExpr = OperExpr;
3184 // IVUsers may have skipped over sign/zero extensions. We don't currently
3185 // attempt to form chains involving extensions unless they can be hoisted
3186 // into this loop's AddRec.
3187 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3188 return;
3189 ++NChains;
3190 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3191 OperExprBase));
3192 ChainUsersVec.resize(NChains);
3193 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3194 << ") IV=" << *LastIncExpr << "\n");
3195 } else {
3196 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3197 << ") IV+" << *LastIncExpr << "\n");
3198 // Add this IV user to the end of the chain.
3199 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3200 }
3201 IVChain &Chain = IVChainVec[ChainIdx];
3202
3203 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3204 // This chain's NearUsers become FarUsers.
3205 if (!LastIncExpr->isZero()) {
3206 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3207 NearUsers.clear();
3208 }
3209
3210 // All other uses of IVOperand become near uses of the chain.
 3211   // We currently ignore intermediate values within SCEV expressions, assuming
 3212   // they will eventually be used by the current chain, or can be computed
 3213   // from one of the chain increments. To be more precise we could
 3214   // transitively follow its users and only add leaf IV users to the set.
3215 for (User *U : IVOper->users()) {
3216 Instruction *OtherUse = dyn_cast<Instruction>(U);
3217 if (!OtherUse)
3218 continue;
3219 // Uses in the chain will no longer be uses if the chain is formed.
3220 // Include the head of the chain in this iteration (not Chain.begin()).
3221 IVChain::const_iterator IncIter = Chain.Incs.begin();
3222 IVChain::const_iterator IncEnd = Chain.Incs.end();
 3223     for (; IncIter != IncEnd; ++IncIter) {
3224 if (IncIter->UserInst == OtherUse)
3225 break;
3226 }
3227 if (IncIter != IncEnd)
3228 continue;
3229
3230 if (SE.isSCEVable(OtherUse->getType())
3231 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3232 && IU.isIVUserOrOperand(OtherUse)) {
3233 continue;
3234 }
3235 NearUsers.insert(OtherUse);
3236 }
3237
3238 // Since this user is part of the chain, it's no longer considered a use
3239 // of the chain.
3240 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3241}
3242
3243/// Populate the vector of Chains.
3244///
3245/// This decreases ILP at the architecture level. Targets with ample registers,
3246/// multiple memory ports, and no register renaming probably don't want
3247/// this. However, such targets should probably disable LSR altogether.
3248///
3249/// The job of LSR is to make a reasonable choice of induction variables across
3250/// the loop. Subsequent passes can easily "unchain" computation exposing more
3251/// ILP *within the loop* if the target wants it.
3252///
3253/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3254/// will not reorder memory operations, it will recognize this as a chain, but
3255/// will generate redundant IV increments. Ideally this would be corrected later
3256/// by a smart scheduler:
3257/// = A[i]
3258/// = A[i+x]
3259/// A[i] =
3260/// A[i+x] =
3261///
3262/// TODO: Walk the entire domtree within this loop, not just the path to the
3263/// loop latch. This will discover chains on side paths, but requires
3264/// maintaining multiple copies of the Chains state.
3265void LSRInstance::CollectChains() {
3266 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3267 SmallVector<ChainUsers, 8> ChainUsersVec;
3268
 3269   SmallVector<BasicBlock *, 8> LatchPath;
 3270   BasicBlock *LoopHeader = L->getHeader();
3271 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3272 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3273 LatchPath.push_back(Rung->getBlock());
3274 }
3275 LatchPath.push_back(LoopHeader);
3276
3277 // Walk the instruction stream from the loop header to the loop latch.
3278 for (BasicBlock *BB : reverse(LatchPath)) {
3279 for (Instruction &I : *BB) {
3280 // Skip instructions that weren't seen by IVUsers analysis.
3281 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3282 continue;
3283
3284 // Ignore users that are part of a SCEV expression. This way we only
3285 // consider leaf IV Users. This effectively rediscovers a portion of
3286 // IVUsers analysis but in program order this time.
3287 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3288 continue;
3289
3290 // Remove this instruction from any NearUsers set it may be in.
3291 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3292 ChainIdx < NChains; ++ChainIdx) {
3293 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3294 }
3295 // Search for operands that can be chained.
3296 SmallPtrSet<Instruction*, 4> UniqueOperands;
3297 User::op_iterator IVOpEnd = I.op_end();
3298 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3299 while (IVOpIter != IVOpEnd) {
3300 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3301 if (UniqueOperands.insert(IVOpInst).second)
3302 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3303 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3304 }
3305 } // Continue walking down the instructions.
3306 } // Continue walking down the domtree.
3307 // Visit phi backedges to determine if the chain can generate the IV postinc.
3308 for (PHINode &PN : L->getHeader()->phis()) {
3309 if (!SE.isSCEVable(PN.getType()))
3310 continue;
3311
3312 Instruction *IncV =
3313 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3314 if (IncV)
3315 ChainInstruction(&PN, IncV, ChainUsersVec);
3316 }
3317 // Remove any unprofitable chains.
3318 unsigned ChainIdx = 0;
3319 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3320 UsersIdx < NChains; ++UsersIdx) {
3321 if (!isProfitableChain(IVChainVec[UsersIdx],
3322 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3323 continue;
 3324     // Preserve the chain at UsersIdx.
3325 if (ChainIdx != UsersIdx)
3326 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3327 FinalizeChain(IVChainVec[ChainIdx]);
3328 ++ChainIdx;
3329 }
3330 IVChainVec.resize(ChainIdx);
3331}
3332
3333void LSRInstance::FinalizeChain(IVChain &Chain) {
3334 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3335 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3336
3337 for (const IVInc &Inc : Chain) {
3338 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3339 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3340 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3341 IVIncSet.insert(UseI);
3342 }
3343}
3344
3345/// Return true if the IVInc can be folded into an addressing mode.
3346static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3347 Value *Operand, const TargetTransformInfo &TTI) {
3348 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3349 Immediate IncOffset = Immediate::getZero();
3350 if (IncConst) {
 3351     if (IncConst->getAPInt().getSignificantBits() > 64)
3352 return false;
3353 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3354 } else {
3355 // Look for mul(vscale, constant), to detect a scalable offset.
3356 const APInt *C;
3357 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3358 C->getSignificantBits() > 64)
3359 return false;
3360 IncOffset = Immediate::getScalable(C->getSExtValue());
3361 }
3362
3363 if (!isAddressUse(TTI, UserInst, Operand))
3364 return false;
3365
3366 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3367 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3368 IncOffset, /*HasBaseReg=*/false))
3369 return false;
3370
3371 return true;
3372}
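// For illustration (offsets are hypothetical): a chained load whose IncExpr
// is the constant 16 yields Immediate::getFixed(16), which folds if the
// target accepts reg+16 addressing for this access; an IncExpr of
// (4 * vscale) yields Immediate::getScalable(4), covering scalable-vector
// strides.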
3373
3374/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3375/// user's operand from the previous IV user's operand.
3376void LSRInstance::GenerateIVChain(const IVChain &Chain,
 3377                                   SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
 3378   // Find the new IVOperand for the head of the chain. It may have been replaced
3379 // by LSR.
3380 const IVInc &Head = Chain.Incs[0];
3381 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3382 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3383 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3384 IVOpEnd, L, SE);
3385 Value *IVSrc = nullptr;
3386 while (IVOpIter != IVOpEnd) {
3387 IVSrc = getWideOperand(*IVOpIter);
3388
3389 // If this operand computes the expression that the chain needs, we may use
3390 // it. (Check this after setting IVSrc which is used below.)
3391 //
3392 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3393 // narrow for the chain, so we can no longer use it. We do allow using a
3394 // wider phi, assuming the LSR checked for free truncation. In that case we
3395 // should already have a truncate on this operand such that
3396 // getSCEV(IVSrc) == IncExpr.
3397 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3398 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3399 break;
3400 }
3401 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3402 }
3403 if (IVOpIter == IVOpEnd) {
3404 // Gracefully give up on this chain.
3405 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3406 return;
3407 }
3408 assert(IVSrc && "Failed to find IV chain source");
3409
3410 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3411 Type *IVTy = IVSrc->getType();
3412 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3413 const SCEV *LeftOverExpr = nullptr;
3414 const SCEV *Accum = SE.getZero(IntTy);
 3415   SmallVector<std::pair<const SCEV *, Value *>> Bases;
 3416   Bases.emplace_back(Accum, IVSrc);
3417
3418 for (const IVInc &Inc : Chain) {
3419 Instruction *InsertPt = Inc.UserInst;
3420 if (isa<PHINode>(InsertPt))
3421 InsertPt = L->getLoopLatch()->getTerminator();
3422
3423 // IVOper will replace the current IV User's operand. IVSrc is the IV
3424 // value currently held in a register.
3425 Value *IVOper = IVSrc;
3426 if (!Inc.IncExpr->isZero()) {
3427 // IncExpr was the result of subtraction of two narrow values, so must
3428 // be signed.
3429 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3430 Accum = SE.getAddExpr(Accum, IncExpr);
3431 LeftOverExpr = LeftOverExpr ?
3432 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3433 }
3434
3435 // Look through each base to see if any can produce a nice addressing mode.
3436 bool FoundBase = false;
3437 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3438 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3439 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3440 if (!Remainder->isZero()) {
3441 Rewriter.clearPostInc();
3442 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3443 const SCEV *IVOperExpr =
3444 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3445 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3446 } else {
3447 IVOper = MapIVOper;
3448 }
3449
3450 FoundBase = true;
3451 break;
3452 }
3453 }
3454 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3455 // Expand the IV increment.
3456 Rewriter.clearPostInc();
3457 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3458 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3459 SE.getUnknown(IncV));
3460 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3461
3462 // If an IV increment can't be folded, use it as the next IV value.
3463 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3464 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3465 Bases.emplace_back(Accum, IVOper);
3466 IVSrc = IVOper;
3467 LeftOverExpr = nullptr;
3468 }
3469 }
3470 Type *OperTy = Inc.IVOperand->getType();
3471 if (IVTy != OperTy) {
3472 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3473 "cannot extend a chained IV");
3474 IRBuilder<> Builder(InsertPt);
3475 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3476 }
3477 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3478 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3479 DeadInsts.emplace_back(OperandIsInstr);
3480 }
3481 // If LSR created a new, wider phi, we may also replace its postinc. We only
3482 // do this if we also found a wide value for the head of the chain.
3483 if (isa<PHINode>(Chain.tailUserInst())) {
3484 for (PHINode &Phi : L->getHeader()->phis()) {
3485 if (Phi.getType() != IVSrc->getType())
3486 continue;
3487 Instruction *PostIncV = dyn_cast<Instruction>(
3488 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3489 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3490 continue;
3491 Value *IVOper = IVSrc;
3492 Type *PostIncTy = PostIncV->getType();
3493 if (IVTy != PostIncTy) {
3494 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3495 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3496 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3497 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3498 }
3499 Phi.replaceUsesOfWith(PostIncV, IVOper);
3500 DeadInsts.emplace_back(PostIncV);
3501 }
3502 }
3503}
3504
3505void LSRInstance::CollectFixupsAndInitialFormulae() {
3506 BranchInst *ExitBranch = nullptr;
3507 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3508
3509 // For calculating baseline cost
 3510   SmallPtrSet<const SCEV *, 16> Regs;
 3511   DenseSet<const SCEV *> VisitedRegs;
3512 DenseSet<size_t> VisitedLSRUse;
3513
3514 for (const IVStrideUse &U : IU) {
3515 Instruction *UserInst = U.getUser();
3516 // Skip IV users that are part of profitable IV Chains.
3517 User::op_iterator UseI =
3518 find(UserInst->operands(), U.getOperandValToReplace());
3519 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3520 if (IVIncSet.count(UseI)) {
3521 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3522 continue;
3523 }
3524
3525 LSRUse::KindType Kind = LSRUse::Basic;
3526 MemAccessTy AccessTy;
3527 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3528 Kind = LSRUse::Address;
3529 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3530 }
3531
3532 const SCEV *S = IU.getExpr(U);
3533 if (!S)
3534 continue;
3535 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3536
3537 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3538 // (N - i == 0), and this allows (N - i) to be the expression that we work
3539 // with rather than just N or i, so we can consider the register
3540 // requirements for both N and i at the same time. Limiting this code to
3541 // equality icmps is not a problem because all interesting loops use
3542 // equality icmps, thanks to IndVarSimplify.
3543 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
 3544       // If CI can be saved in some target (e.g., replaced by a hardware loop
 3545       // on PowerPC), there is no need to generate initial formulae for it.
3546 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3547 continue;
3548 if (CI->isEquality()) {
3549 // Swap the operands if needed to put the OperandValToReplace on the
3550 // left, for consistency.
3551 Value *NV = CI->getOperand(1);
3552 if (NV == U.getOperandValToReplace()) {
3553 CI->setOperand(1, CI->getOperand(0));
3554 CI->setOperand(0, NV);
3555 NV = CI->getOperand(1);
3556 Changed = true;
3557 }
3558
3559 // x == y --> x - y == 0
3560 const SCEV *N = SE.getSCEV(NV);
3561 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3562 (!NV->getType()->isPointerTy() ||
3563 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3564 // S is normalized, so normalize N before folding it into S
3565 // to keep the result normalized.
3566 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3567 if (!N)
3568 continue;
3569 Kind = LSRUse::ICmpZero;
3570 S = SE.getMinusSCEV(N, S);
3571 } else if (L->isLoopInvariant(NV) &&
3572 (!isa<Instruction>(NV) ||
3573 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3574 !NV->getType()->isPointerTy()) {
3575 // If we can't generally expand the expression (e.g. it contains
3576 // a divide), but it is already at a loop invariant point before the
3577 // loop, wrap it in an unknown (to prevent the expander from trying
3578 // to re-expand in a potentially unsafe way.) The restriction to
3579 // integer types is required because the unknown hides the base, and
3580 // SCEV can't compute the difference of two unknown pointers.
3581 N = SE.getUnknown(NV);
3582 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3583 if (!N)
3584 continue;
3585 Kind = LSRUse::ICmpZero;
3586 S = SE.getMinusSCEV(N, S);
3587 assert(!isa<SCEVCouldNotCompute>(S));
3588 }
3589
3590 // -1 and the negations of all interesting strides (except the negation
3591 // of -1) are now also interesting.
3592 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3593 if (Factors[i] != -1)
3594 Factors.insert(-(uint64_t)Factors[i]);
3595 Factors.insert(-1);
3596 }
3597 }
3598
3599 // Get or create an LSRUse.
3600 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3601 size_t LUIdx = P.first;
3602 Immediate Offset = P.second;
3603 LSRUse &LU = Uses[LUIdx];
3604
3605 // Record the fixup.
3606 LSRFixup &LF = LU.getNewFixup();
3607 LF.UserInst = UserInst;
3608 LF.OperandValToReplace = U.getOperandValToReplace();
3609 LF.PostIncLoops = TmpPostIncLoops;
3610 LF.Offset = Offset;
3611 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3612
3613 // Create SCEV as Formula for calculating baseline cost
3614 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3615 Formula F;
3616 F.initialMatch(S, L, SE);
3617 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3618 HardwareLoopProfitable);
3619 VisitedLSRUse.insert(LUIdx);
3620 }
3621
3622 if (!LU.WidestFixupType ||
3623 SE.getTypeSizeInBits(LU.WidestFixupType) <
3624 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3625 LU.WidestFixupType = LF.OperandValToReplace->getType();
3626
3627 // If this is the first use of this LSRUse, give it a formula.
3628 if (LU.Formulae.empty()) {
3629 InsertInitialFormula(S, LU, LUIdx);
3630 CountRegisters(LU.Formulae.back(), LUIdx);
3631 }
3632 }
3633
3634 LLVM_DEBUG(print_fixups(dbgs()));
3635}
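// As a concrete sketch of the ICmpZero rewrite above: for
//   %c = icmp eq i64 %i.next, %n
// where %i.next has SCEV {1,+,1}<%L> and %n is loop-invariant, the use is
// recorded as an ICmpZero use of (%n - {1,+,1}<%L>), so the register needs of
// %n and the induction variable are costed together.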
3636
3637/// Insert a formula for the given expression into the given use, separating out
3638/// loop-variant portions from loop-invariant and loop-computable portions.
3639void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3640 size_t LUIdx) {
3641 // Mark uses whose expressions cannot be expanded.
3642 if (!Rewriter.isSafeToExpand(S))
3643 LU.RigidFormula = true;
3644
3645 Formula F;
3646 F.initialMatch(S, L, SE);
3647 bool Inserted = InsertFormula(LU, LUIdx, F);
3648 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3649}
3650
3651/// Insert a simple single-register formula for the given expression into the
3652/// given use.
3653void
3654LSRInstance::InsertSupplementalFormula(const SCEV *S,
3655 LSRUse &LU, size_t LUIdx) {
3656 Formula F;
3657 F.BaseRegs.push_back(S);
3658 F.HasBaseReg = true;
3659 bool Inserted = InsertFormula(LU, LUIdx, F);
3660 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3661}
3662
3663/// Note which registers are used by the given formula, updating RegUses.
3664void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3665 if (F.ScaledReg)
3666 RegUses.countRegister(F.ScaledReg, LUIdx);
3667 for (const SCEV *BaseReg : F.BaseRegs)
3668 RegUses.countRegister(BaseReg, LUIdx);
3669}
3670
3671/// If the given formula has not yet been inserted, add it to the list, and
3672/// return true. Return false otherwise.
3673bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3674 // Do not insert formula that we will not be able to expand.
3675 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3676 "Formula is illegal");
3677
3678 if (!LU.InsertFormula(F, *L))
3679 return false;
3680
3681 CountRegisters(F, LUIdx);
3682 return true;
3683}
3684
3685/// Check for other uses of loop-invariant values which we're tracking. These
3686/// other uses will pin these values in registers, making them less profitable
3687/// for elimination.
3688/// TODO: This currently misses non-constant addrec step registers.
3689/// TODO: Should this give more weight to users inside the loop?
3690void
3691LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3692 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
 3693   SmallPtrSet<const SCEV *, 32> Visited;
 3694
3695 // Don't collect outside uses if we are favoring postinc - the instructions in
3696 // the loop are more important than the ones outside of it.
3697 if (AMK == TTI::AMK_PostIndexed)
3698 return;
3699
3700 while (!Worklist.empty()) {
3701 const SCEV *S = Worklist.pop_back_val();
3702
3703 // Don't process the same SCEV twice
3704 if (!Visited.insert(S).second)
3705 continue;
3706
3707 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3708 append_range(Worklist, N->operands());
3709 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3710 Worklist.push_back(C->getOperand());
3711 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3712 Worklist.push_back(D->getLHS());
3713 Worklist.push_back(D->getRHS());
3714 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3715 const Value *V = US->getValue();
3716 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3717 // Look for instructions defined outside the loop.
3718 if (L->contains(Inst)) continue;
3719 } else if (isa<Constant>(V))
3720 // Constants can be re-materialized.
3721 continue;
3722 for (const Use &U : V->uses()) {
3723 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3724 // Ignore non-instructions.
3725 if (!UserInst)
3726 continue;
3727 // Don't bother if the instruction is an EHPad.
3728 if (UserInst->isEHPad())
3729 continue;
3730 // Ignore instructions in other functions (as can happen with
3731 // Constants).
3732 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3733 continue;
3734 // Ignore instructions not dominated by the loop.
3735 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3736 UserInst->getParent() :
3737 cast<PHINode>(UserInst)->getIncomingBlock(
 3738             PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
 3739         if (!DT.dominates(L->getHeader(), UseBB))
3740 continue;
3741 // Don't bother if the instruction is in a BB which ends in an EHPad.
3742 if (UseBB->getTerminator()->isEHPad())
3743 continue;
3744
3745 // Ignore cases in which the currently-examined value could come from
3746 // a basic block terminated with an EHPad. This checks all incoming
3747 // blocks of the phi node since it is possible that the same incoming
3748 // value comes from multiple basic blocks, only some of which may end
3749 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3750 // pass would try to insert instructions into an EHPad, hitting an
3751 // assertion.
3752 if (isa<PHINode>(UserInst)) {
3753 const auto *PhiNode = cast<PHINode>(UserInst);
3754 bool HasIncompatibleEHPTerminatedBlock = false;
3755 llvm::Value *ExpectedValue = U;
3756 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3757 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3758 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3759 HasIncompatibleEHPTerminatedBlock = true;
3760 break;
3761 }
3762 }
3763 }
3764 if (HasIncompatibleEHPTerminatedBlock) {
3765 continue;
3766 }
3767 }
3768
3769 // Don't bother rewriting PHIs in catchswitch blocks.
3770 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3771 continue;
3772 // Ignore uses which are part of other SCEV expressions, to avoid
3773 // analyzing them multiple times.
3774 if (SE.isSCEVable(UserInst->getType())) {
3775 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3776 // If the user is a no-op, look through to its uses.
3777 if (!isa<SCEVUnknown>(UserS))
3778 continue;
3779 if (UserS == US) {
3780 Worklist.push_back(
3781 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3782 continue;
3783 }
3784 }
3785 // Ignore icmp instructions which are already being analyzed.
3786 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3787 unsigned OtherIdx = !U.getOperandNo();
3788 Value *OtherOp = ICI->getOperand(OtherIdx);
3789 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3790 continue;
3791 }
3792
3793 // Do not consider uses inside lifetime intrinsics. These are not
3794 // actually materialized.
3795 if (UserInst->isLifetimeStartOrEnd())
3796 continue;
3797
3798 std::pair<size_t, Immediate> P =
3799 getUse(S, LSRUse::Basic, MemAccessTy());
3800 size_t LUIdx = P.first;
3801 Immediate Offset = P.second;
3802 LSRUse &LU = Uses[LUIdx];
3803 LSRFixup &LF = LU.getNewFixup();
3804 LF.UserInst = const_cast<Instruction *>(UserInst);
3805 LF.OperandValToReplace = U;
3806 LF.Offset = Offset;
3807 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3808 if (!LU.WidestFixupType ||
3809 SE.getTypeSizeInBits(LU.WidestFixupType) <
3810 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3811 LU.WidestFixupType = LF.OperandValToReplace->getType();
3812 InsertSupplementalFormula(US, LU, LUIdx);
3813 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3814 break;
3815 }
3816 }
3817 }
3818}
3819
3820/// Split S into subexpressions which can be pulled out into separate
3821/// registers. If C is non-null, multiply each subexpression by C.
3822///
3823/// Return remainder expression after factoring the subexpressions captured by
3824/// Ops. If Ops is complete, return NULL.
3825static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
 3826                                    SmallVectorImpl<const SCEV *> &Ops,
 3827                                    const Loop *L,
3828 ScalarEvolution &SE,
3829 unsigned Depth = 0) {
3830 // Arbitrarily cap recursion to protect compile time.
3831 if (Depth >= 3)
3832 return S;
3833
3834 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3835 // Break out add operands.
3836 for (const SCEV *S : Add->operands()) {
3837 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3838 if (Remainder)
3839 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3840 }
3841 return nullptr;
3842 }
3843 const SCEV *Start, *Step;
3844 const SCEVConstant *Op0;
3845 const SCEV *Op1;
3846 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3847 // Split a non-zero base out of an addrec.
3848 if (Start->isZero())
3849 return S;
3850
3851 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3852 // Split the non-zero AddRec unless it is part of a nested recurrence that
3853 // does not pertain to this loop.
3854 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3855 !isa<SCEVAddRecExpr>(Remainder))) {
3856 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3857 Remainder = nullptr;
3858 }
3859 if (Remainder != Start) {
3860 if (!Remainder)
3861 Remainder = SE.getConstant(S->getType(), 0);
3862 return SE.getAddRecExpr(Remainder, Step,
3863 cast<SCEVAddRecExpr>(S)->getLoop(),
3864 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 3865                               SCEV::FlagAnyWrap);
 3866     }
3867 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3868 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3869 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3870 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3871 if (Remainder)
3872 Ops.push_back(SE.getMulExpr(C, Remainder));
3873 return nullptr;
3874 }
3875 return S;
3876}
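// For example, given S = {(%a + 4),+,8}<%L> with C == nullptr, the operands
// %a and 4 of the start value are pushed onto Ops as separate subexpressions
// and the recurrence {0,+,8}<%L> is returned as the remainder.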
3877
3878/// Return true if the SCEV represents a value that may end up as a
3879/// post-increment operation.
 3880 static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
 3881                               LSRUse &LU, const SCEV *S, const Loop *L,
3882 ScalarEvolution &SE) {
3883 if (LU.Kind != LSRUse::Address ||
3884 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3885 return false;
3886 const SCEV *Start;
3887 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3888 return false;
3889 // Check if a post-indexed load/store can be used.
 3890   if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
 3891       TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
 3892     if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3893 return true;
3894 }
3895 return false;
3896}
3897
3898/// Helper function for LSRInstance::GenerateReassociations.
3899void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3900 const Formula &Base,
3901 unsigned Depth, size_t Idx,
3902 bool IsScaledReg) {
3903 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3904 // Don't generate reassociations for the base register of a value that
3905 // may generate a post-increment operator. The reason is that the
 3906   // reassociations cause extra base+register formulae to be created,
3907 // and possibly chosen, but the post-increment is more efficient.
3908 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3909 return;
 3910   SmallVector<const SCEV *, 8> AddOps;
 3911   const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3912 if (Remainder)
3913 AddOps.push_back(Remainder);
3914
3915 if (AddOps.size() == 1)
3916 return;
3917
 3918   for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
 3919                                                      JE = AddOps.end();
3920 J != JE; ++J) {
3921 // Loop-variant "unknown" values are uninteresting; we won't be able to
3922 // do anything meaningful with them.
3923 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3924 continue;
3925
3926 // Don't pull a constant into a register if the constant could be folded
3927 // into an immediate field.
3928 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3929 LU.AccessTy, *J, Base.getNumRegs() > 1))
3930 continue;
3931
3932 // Collect all operands except *J.
3933 SmallVector<const SCEV *, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3934 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3935
3936 // Don't leave just a constant behind in a register if the constant could
3937 // be folded into an immediate field.
3938 if (InnerAddOps.size() == 1 &&
3939 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3940 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3941 continue;
3942
3943 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3944 if (InnerSum->isZero())
3945 continue;
3946 Formula F = Base;
3947
3948 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3949 continue;
3950
3951 // Add the remaining pieces of the add back into the new formula.
3952 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3953 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3954 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3955 InnerSumSC->getValue()->getZExtValue())) {
3956 F.UnfoldedOffset =
3957 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3958 InnerSumSC->getValue()->getZExtValue());
3959 if (IsScaledReg) {
3960 F.ScaledReg = nullptr;
3961 F.Scale = 0;
3962 } else
3963 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3964 } else if (IsScaledReg)
3965 F.ScaledReg = InnerSum;
3966 else
3967 F.BaseRegs[Idx] = InnerSum;
3968
3969 // Add J as its own register, or an unfolded immediate.
3970 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3971 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3972 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3973 SC->getValue()->getZExtValue()))
3974 F.UnfoldedOffset =
3975 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3976 SC->getValue()->getZExtValue());
3977 else
3978 F.BaseRegs.push_back(*J);
 3979     // We may have changed the number of registers in the base regs; adjust
 3980     // the formula accordingly.
3981 F.canonicalize(*L);
3982
3983 if (InsertFormula(LU, LUIdx, F))
3984 // If that formula hadn't been seen before, recurse to find more like
3985 // it.
 3986       // Add a term based on Log16(AddOps.size()), i.e. Log2_32(AddOps.size()) >> 2,
 3987       // because Depth alone is not enough to bound compile time.
 3988       // This means that every time AddOps.size() exceeds 16^x we add x to
 3989       // Depth.
3990 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
3991 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
3992 }
3993}
3994
3995/// Split out subexpressions from adds and the bases of addrecs.
3996void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3997 Formula Base, unsigned Depth) {
3998 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3999 // Arbitrarily cap recursion to protect compile time.
4000 if (Depth >= 3)
4001 return;
4002
4003 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4004 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4005
4006 if (Base.Scale == 1)
4007 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4008 /* Idx */ -1, /* IsScaledReg */ true);
4009}
4010
4011/// Generate a formula consisting of all of the loop-dominating registers added
4012/// into a single register.
4013void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4014 Formula Base) {
4015 // This method is only interesting on a plurality of registers.
4016 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4017 (Base.UnfoldedOffset.isNonZero()) <=
4018 1)
4019 return;
4020
4021 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4022 // processing the formula.
4023 Base.unscale();
 4024   SmallVector<const SCEV *, 4> Ops;
 4025   Formula NewBase = Base;
4026 NewBase.BaseRegs.clear();
4027 Type *CombinedIntegerType = nullptr;
4028 for (const SCEV *BaseReg : Base.BaseRegs) {
4029 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4030 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4031 if (!CombinedIntegerType)
4032 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4033 Ops.push_back(BaseReg);
4034 }
4035 else
4036 NewBase.BaseRegs.push_back(BaseReg);
4037 }
4038
4039 // If no register is relevant, we're done.
4040 if (Ops.size() == 0)
4041 return;
4042
4043 // Utility function for generating the required variants of the combined
4044 // registers.
4045 auto GenerateFormula = [&](const SCEV *Sum) {
4046 Formula F = NewBase;
4047
4048 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4049 // opportunity to fold something. For now, just ignore such cases
4050 // rather than proceed with zero in a register.
4051 if (Sum->isZero())
4052 return;
4053
4054 F.BaseRegs.push_back(Sum);
4055 F.canonicalize(*L);
4056 (void)InsertFormula(LU, LUIdx, F);
4057 };
4058
4059 // If we collected at least two registers, generate a formula combining them.
4060 if (Ops.size() > 1) {
4061 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4062 GenerateFormula(SE.getAddExpr(OpsCopy));
4063 }
4064
4065 // If we have an unfolded offset, generate a formula combining it with the
4066 // registers collected.
4067 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4068 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4069 Ops.push_back(SE.getConstant(CombinedIntegerType,
4070 NewBase.UnfoldedOffset.getFixedValue(), true));
4071 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4072 GenerateFormula(SE.getAddExpr(Ops));
4073 }
4074}
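// For example (register names are illustrative): from a formula
//   reg(%a) + reg(%b) + reg({0,+,8}<%L>)
// where %a and %b are loop-invariant and dominate the header, this generates
//   reg(%a + %b) + reg({0,+,8}<%L>)
// so the two invariant values can be pre-added outside the loop and held in a
// single register.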
4075
4076/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4077void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4078 const Formula &Base, size_t Idx,
4079 bool IsScaledReg) {
4080 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4081 GlobalValue *GV = ExtractSymbol(G, SE);
4082 if (G->isZero() || !GV)
4083 return;
4084 Formula F = Base;
4085 F.BaseGV = GV;
4086 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4087 return;
4088 if (IsScaledReg)
4089 F.ScaledReg = G;
4090 else
4091 F.BaseRegs[Idx] = G;
4092 (void)InsertFormula(LU, LUIdx, F);
4093}
4094
4095/// Generate reuse formulae using symbolic offsets.
4096void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4097 Formula Base) {
4098 // We can't add a symbolic offset if the address already contains one.
4099 if (Base.BaseGV) return;
4100
4101 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4102 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4103 if (Base.Scale == 1)
4104 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4105 /* IsScaledReg */ true);
4106}
4107
4108/// Helper function for LSRInstance::GenerateConstantOffsets.
4109void LSRInstance::GenerateConstantOffsetsImpl(
4110 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4111 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4112
4113 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4114 Formula F = Base;
4115 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4116 return;
4117 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4118
4119 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4120 // Add the offset to the base register.
4121 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4122 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4123 // If it cancelled out, drop the base register, otherwise update it.
4124 if (NewG->isZero()) {
4125 if (IsScaledReg) {
4126 F.Scale = 0;
4127 F.ScaledReg = nullptr;
4128 } else
4129 F.deleteBaseReg(F.BaseRegs[Idx]);
4130 F.canonicalize(*L);
4131 } else if (IsScaledReg)
4132 F.ScaledReg = NewG;
4133 else
4134 F.BaseRegs[Idx] = NewG;
4135
4136 (void)InsertFormula(LU, LUIdx, F);
4137 }
4138 };
4139
4140 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4141
4142 // With constant offsets and constant steps, we can generate pre-inc
4143 // accesses by having the offset equal the step. So, for access #0 with a
4144 // step of 8, we generate a G - 8 base which would require the first access
4145 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
 4146   // for itself and hopefully becomes the base for other accesses. This means
 4147   // that a single pre-indexed access can be generated to become the new
4148 // base pointer for each iteration of the loop, resulting in no extra add/sub
4149 // instructions for pointer updating.
4150 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4151 const APInt *StepInt;
4152 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4153 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4154 : StepInt->getZExtValue();
4155
4156 for (Immediate Offset : Worklist) {
4157 if (Offset.isFixed()) {
4158 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4159 GenerateOffset(G, Offset);
4160 }
4161 }
4162 }
4163 }
4164 for (Immediate Offset : Worklist)
4165 GenerateOffset(G, Offset);
4166
4167 Immediate Imm = ExtractImmediate(G, SE);
4168 if (G->isZero() || Imm.isZero() ||
4169 !Base.BaseOffset.isCompatibleImmediate(Imm))
4170 return;
4171 Formula F = Base;
4172 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4173 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4174 return;
4175 if (IsScaledReg) {
4176 F.ScaledReg = G;
4177 } else {
4178 F.BaseRegs[Idx] = G;
4179 // We may generate non canonical Formula if G is a recurrent expr reg
4180 // related with current loop while F.ScaledReg is not.
4181 F.canonicalize(*L);
4182 }
4183 (void)InsertFormula(LU, LUIdx, F);
4184}
4185
 4186 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4187void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4188 Formula Base) {
4189 // TODO: For now, just add the min and max offset, because it usually isn't
 4190   // worthwhile looking at everything in between.
 4191   SmallVector<Immediate, 2> Worklist;
 4192   Worklist.push_back(LU.MinOffset);
4193 if (LU.MaxOffset != LU.MinOffset)
4194 Worklist.push_back(LU.MaxOffset);
4195
4196 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4197 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4198 if (Base.Scale == 1)
4199 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4200 /* IsScaledReg */ true);
4201}
4202
4203/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4204/// == y -> x*c == y*c.
4205void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4206 Formula Base) {
4207 if (LU.Kind != LSRUse::ICmpZero) return;
4208
4209 // Determine the integer type for the base formula.
4210 Type *IntTy = Base.getType();
4211 if (!IntTy) return;
4212 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4213
4214 // Don't do this if there is more than one offset.
4215 if (LU.MinOffset != LU.MaxOffset) return;
4216
 4217   // Check if the transformation is valid. It is illegal to multiply a pointer.
4218 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4219 return;
4220 for (const SCEV *BaseReg : Base.BaseRegs)
4221 if (BaseReg->getType()->isPointerTy())
4222 return;
4223 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4224
4225 // Check each interesting stride.
4226 for (int64_t Factor : Factors) {
4227 // Check that Factor can be represented by IntTy
4228 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4229 continue;
4230 // Check that the multiplication doesn't overflow.
4231 if (Base.BaseOffset.isMin() && Factor == -1)
4232 continue;
4233 // Not supporting scalable immediates.
4234 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4235 continue;
4236 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4237 assert(Factor != 0 && "Zero factor not expected!");
4238 if (NewBaseOffset.getFixedValue() / Factor !=
4239 Base.BaseOffset.getFixedValue())
4240 continue;
4241 // If the offset will be truncated at this use, check that it is in bounds.
4242 if (!IntTy->isPointerTy() &&
4243 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4244 continue;
4245
4246 // Check that multiplying with the use offset doesn't overflow.
4247 Immediate Offset = LU.MinOffset;
4248 if (Offset.isMin() && Factor == -1)
4249 continue;
4250 Offset = Offset.mulUnsigned(Factor);
4251 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4252 continue;
4253 // If the offset will be truncated at this use, check that it is in bounds.
4254 if (!IntTy->isPointerTy() &&
4255 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4256 continue;
4257
4258 Formula F = Base;
4259 F.BaseOffset = NewBaseOffset;
4260
4261 // Check that this scale is legal.
4262 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4263 continue;
4264
4265 // Compensate for the use having MinOffset built into it.
4266 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4267
4268 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4269
4270 // Check that multiplying with each base register doesn't overflow.
4271 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4272 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4273 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4274 goto next;
4275 }
4276
4277 // Check that multiplying with the scaled register doesn't overflow.
4278 if (F.ScaledReg) {
4279 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4280 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4281 continue;
4282 }
4283
4284 // Check that multiplying with the unfolded offset doesn't overflow.
4285 if (F.UnfoldedOffset.isNonZero()) {
4286 if (F.UnfoldedOffset.isMin() && Factor == -1)
4287 continue;
4288 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4289 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4290 Base.UnfoldedOffset.getFixedValue())
4291 continue;
4292 // If the offset will be truncated, check that it is in bounds.
 4293       if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
 4294                                        IntTy, F.UnfoldedOffset.getFixedValue()))
4295 continue;
4296 }
4297
4298 // If we make it here and it's legal, add it.
4299 (void)InsertFormula(LU, LUIdx, F);
4300 next:;
4301 }
4302}
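// For example (assuming a zero use offset): with Factor == 4, an ICmpZero
// formula reg({0,+,1}<%L>) + -16 can be scaled to reg({0,+,4}<%L>) + -64,
// letting the exit comparison share a stride-4 IV that other uses may already
// require.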
4303
4304/// Generate stride factor reuse formulae by making use of scaled-offset address
4305/// modes, for example.
4306void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4307 // Determine the integer type for the base formula.
4308 Type *IntTy = Base.getType();
4309 if (!IntTy) return;
4310
4311 // If this Formula already has a scaled register, we can't add another one.
4312 // Try to unscale the formula to generate a better scale.
4313 if (Base.Scale != 0 && !Base.unscale())
4314 return;
4315
 4316   assert(Base.Scale == 0 && "unscale did not do its job!");
4317
4318 // Check each interesting stride.
4319 for (int64_t Factor : Factors) {
4320 Base.Scale = Factor;
4321 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4322 // Check whether this scale is going to be legal.
4323 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4324 Base)) {
 4325       // As a special case, handle out-of-loop Basic users specially.
4326 // TODO: Reconsider this special case.
4327 if (LU.Kind == LSRUse::Basic &&
4328 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4329 LU.AccessTy, Base) &&
4330 LU.AllFixupsOutsideLoop)
4331 LU.Kind = LSRUse::Special;
4332 else
4333 continue;
4334 }
4335 // For an ICmpZero, negating a solitary base register won't lead to
4336 // new solutions.
4337 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4338 Base.BaseOffset.isZero() && !Base.BaseGV)
4339 continue;
4340 // For each addrec base reg, if its loop is current loop, apply the scale.
4341 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4342 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4343 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4344 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4345 if (FactorS->isZero())
4346 continue;
4347 // Divide out the factor, ignoring high bits, since we'll be
4348 // scaling the value back up in the end.
4349 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4350 if (!Quotient->isZero()) {
4351 // TODO: This could be optimized to avoid all the copying.
4352 Formula F = Base;
4353 F.ScaledReg = Quotient;
4354 F.deleteBaseReg(F.BaseRegs[i]);
4355 // The canonical representation of 1*reg is reg, which is already in
4356 // Base. In that case, do not try to insert the formula, it will be
4357 // rejected anyway.
4358 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4359 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4360 continue;
4361 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4362 // non canonical Formula with ScaledReg's loop not being L.
4363 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4364 F.canonicalize(*L);
4365 (void)InsertFormula(LU, LUIdx, F);
4366 }
4367 }
4368 }
4369 }
4370}
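// For example, with Factor == 4 a formula reg({0,+,4}<%L>) can become
// 4*reg({0,+,1}<%L>), which maps onto a scaled-index addressing mode such as
// [base + reg*4] on targets that support one.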
4371
4372/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4373/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4374/// perform the extension/truncate and normalize again, as the normalized form
4375/// can result in folds that are not valid in the post-inc use contexts. The
4376/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4377static const SCEV *
 4378 getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
 4379                                    const SCEV *Expr, Type *ToTy,
4380 ScalarEvolution &SE) {
4381 const SCEV *Result = nullptr;
4382 for (auto &L : Loops) {
4383 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4384 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4385 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4386 if (!New || (Result && New != Result))
4387 return nullptr;
4388 Result = New;
4389 }
4390
4391 assert(Result && "failed to create expression");
4392 return Result;
4393}
4394
4395/// Generate reuse formulae from different IV types.
4396void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4397 // Don't bother truncating symbolic values.
4398 if (Base.BaseGV) return;
4399
4400 // Determine the integer type for the base formula.
4401 Type *DstTy = Base.getType();
4402 if (!DstTy) return;
4403 if (DstTy->isPointerTy())
4404 return;
4405
4406 // It is invalid to extend a pointer type so exit early if ScaledReg or
4407 // any of the BaseRegs are pointers.
4408 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4409 return;
4410 if (any_of(Base.BaseRegs,
4411 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4412 return;
4413
 4414   SmallVector<PostIncLoopSet> Loops;
 4415   for (auto &LF : LU.Fixups)
4416 Loops.push_back(LF.PostIncLoops);
4417
4418 for (Type *SrcTy : Types) {
4419 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4420 Formula F = Base;
4421
4422 // Sometimes SCEV is able to prove zero during ext transform. It may
4423 // happen if SCEV did not do all possible transforms while creating the
4424 // initial node (maybe due to depth limitations), but it can do them while
4425 // taking ext.
4426 if (F.ScaledReg) {
4427 const SCEV *NewScaledReg =
4428 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4429 if (!NewScaledReg || NewScaledReg->isZero())
4430 continue;
4431 F.ScaledReg = NewScaledReg;
4432 }
4433 bool HasZeroBaseReg = false;
4434 for (const SCEV *&BaseReg : F.BaseRegs) {
4435 const SCEV *NewBaseReg =
4436 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4437 if (!NewBaseReg || NewBaseReg->isZero()) {
4438 HasZeroBaseReg = true;
4439 break;
4440 }
4441 BaseReg = NewBaseReg;
4442 }
4443 if (HasZeroBaseReg)
4444 continue;
4445
4446 // TODO: This assumes we've done basic processing on all uses and
4447 // have an idea what the register usage is.
4448 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4449 continue;
4450
4451 F.canonicalize(*L);
4452 (void)InsertFormula(LU, LUIdx, F);
4453 }
4454 }
4455}
4456
4457namespace {
4458
4459/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4460/// modifications so that the search phase doesn't have to worry about the data
4461/// structures moving underneath it.
4462struct WorkItem {
4463 size_t LUIdx;
4464 Immediate Imm;
4465 const SCEV *OrigReg;
4466
4467 WorkItem(size_t LI, Immediate I, const SCEV *R)
4468 : LUIdx(LI), Imm(I), OrigReg(R) {}
4469
4470 void print(raw_ostream &OS) const;
4471 void dump() const;
4472};
4473
4474} // end anonymous namespace
4475
4476#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4477void WorkItem::print(raw_ostream &OS) const {
4478 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4479 << " , add offset " << Imm;
4480}
4481
4482LLVM_DUMP_METHOD void WorkItem::dump() const {
4483 print(errs()); errs() << '\n';
4484}
4485#endif
4486
4487/// Look for registers which are a constant distance apart and try to form reuse
4488/// opportunities between them.
4489void LSRInstance::GenerateCrossUseConstantOffsets() {
4490 // Group the registers by their value without any added constant offset.
4491 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4492
 4493   DenseMap<const SCEV *, ImmMapTy> Map;
 4494   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
 4495   SmallVector<const SCEV *, 8> Sequence;
 4496   for (const SCEV *Use : RegUses) {
4497 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4498 Immediate Imm = ExtractImmediate(Reg, SE);
4499 auto Pair = Map.try_emplace(Reg);
4500 if (Pair.second)
4501 Sequence.push_back(Reg);
4502 Pair.first->second.insert(std::make_pair(Imm, Use));
4503 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4504 }
4505
4506 // Now examine each set of registers with the same base value. Build up
4507 // a list of work to do and do the work in a separate step so that we're
4508 // not adding formulae and register counts while we're searching.
4509 SmallVector<WorkItem, 32> WorkItems;
4510 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4511 UniqueItems;
4512 for (const SCEV *Reg : Sequence) {
4513 const ImmMapTy &Imms = Map.find(Reg)->second;
4514
4515 // It's not worthwhile looking for reuse if there's only one offset.
4516 if (Imms.size() == 1)
4517 continue;
4518
4519 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4520 for (const auto &Entry
4521 : Imms) dbgs()
4522 << ' ' << Entry.first;
4523 dbgs() << '\n');
4524
4525 // Examine each offset.
4526 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4527 J != JE; ++J) {
4528 const SCEV *OrigReg = J->second;
4529
4530 Immediate JImm = J->first;
4531 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4532
4533 if (!isa<SCEVConstant>(OrigReg) &&
4534 UsedByIndicesMap[Reg].count() == 1) {
4535 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4536 << '\n');
4537 continue;
4538 }
4539
 4540       // Conservatively examine offsets between this orig reg and a few
 4541       // selected other orig regs.
4542 Immediate First = Imms.begin()->first;
4543 Immediate Last = std::prev(Imms.end())->first;
4544 if (!First.isCompatibleImmediate(Last)) {
4545 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4546 << "\n");
4547 continue;
4548 }
4549 // Only scalable if both terms are scalable, or if one is scalable and
4550 // the other is 0.
4551 bool Scalable = First.isScalable() || Last.isScalable();
4552 int64_t FI = First.getKnownMinValue();
4553 int64_t LI = Last.getKnownMinValue();
4554 // Compute (First + Last) / 2 without overflow using the fact that
 4555       // First + Last = 2 * (First & Last) + (First ^ Last).
4556 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4557 // If the result is negative and FI is odd and LI even (or vice versa),
4558 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4559 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4560 ImmMapTy::const_iterator OtherImms[] = {
4561 Imms.begin(), std::prev(Imms.end()),
4562 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4563 for (const auto &M : OtherImms) {
4564 if (M == J || M == JE) continue;
4565 if (!JImm.isCompatibleImmediate(M->first))
4566 continue;
4567
4568 // Compute the difference between the two.
4569 Immediate Imm = JImm.subUnsigned(M->first);
4570 for (unsigned LUIdx : UsedByIndices.set_bits())
4571 // Make a memo of this use, offset, and register tuple.
4572 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4573 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4574 }
4575 }
4576 }
4577
4578 Map.clear();
4579 Sequence.clear();
4580 UsedByIndicesMap.clear();
4581 UniqueItems.clear();
4582
4583 // Now iterate through the worklist and add new formulae.
4584 for (const WorkItem &WI : WorkItems) {
4585 size_t LUIdx = WI.LUIdx;
4586 LSRUse &LU = Uses[LUIdx];
4587 Immediate Imm = WI.Imm;
4588 const SCEV *OrigReg = WI.OrigReg;
4589
4590 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4591 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4592 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4593
4594 // TODO: Use a more targeted data structure.
4595 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4596 Formula F = LU.Formulae[L];
4597 // FIXME: The code for the scaled and unscaled registers looks
4598 // very similar but slightly different. Investigate if they
4599 // could be merged. That way, we would not have to unscale the
4600 // Formula.
4601 F.unscale();
4602 // Use the immediate in the scaled register.
4603 if (F.ScaledReg == OrigReg) {
4604 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4605 continue;
4606 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4607 // Don't create 50 + reg(-50).
4608 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4609 if (F.referencesReg(S))
4610 continue;
4611 Formula NewF = F;
4612 NewF.BaseOffset = Offset;
4613 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4614 NewF))
4615 continue;
4616 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4617
4618 // If the new scale is a constant in a register, and adding the constant
4619 // value to the immediate would produce a value closer to zero than the
4620 // immediate itself, then the formula isn't worthwhile.
4621 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4622 // FIXME: Do we need to do something for scalable immediates here?
4623 // A scalable SCEV won't be constant, but we might still have
4624 // something in the offset? Bail out for now to be safe.
4625 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4626 continue;
4627 if (C->getValue()->isNegative() !=
4628 (NewF.BaseOffset.isLessThanZero()) &&
4629 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4630 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4631 continue;
4632 }
4633
4634 // OK, looks good.
4635 NewF.canonicalize(*this->L);
4636 (void)InsertFormula(LU, LUIdx, NewF);
4637 } else {
4638 // Use the immediate in a base register.
4639 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4640 const SCEV *BaseReg = F.BaseRegs[N];
4641 if (BaseReg != OrigReg)
4642 continue;
4643 Formula NewF = F;
4644 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4645 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4646 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4647 continue;
4648 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4649 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4650 LU.Kind, LU.AccessTy, NewF)) {
4651 if (AMK == TTI::AMK_PostIndexed &&
4652 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4653 continue;
4654 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4655 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4656 continue;
4657 NewF = F;
4658 NewF.UnfoldedOffset = NewUnfoldedOffset;
4659 }
4660 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4661
4662 // If the new formula has a constant in a register, and adding the
4663 // constant value to the immediate would produce a value closer to
4664 // zero than the immediate itself, then the formula isn't worthwhile.
4665 for (const SCEV *NewReg : NewF.BaseRegs)
4666 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4667 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4668 goto skip_formula;
4669 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4670 .abs()
4671 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4672 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4673 .countr_zero() >=
4674 (unsigned)llvm::countr_zero<uint64_t>(
4675 NewF.BaseOffset.getFixedValue()))
4676 goto skip_formula;
4677 }
4678
4679 // Ok, looks good.
4680 NewF.canonicalize(*this->L);
4681 (void)InsertFormula(LU, LUIdx, NewF);
4682 break;
4683 skip_formula:;
4684 }
4685 }
4686 }
4687 }
4688}
4689
4690/// Generate formulae for each use.
4691void
4692LSRInstance::GenerateAllReuseFormulae() {
4693 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4694 // queries are more precise.
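 // In other words, each generation phase runs across all uses before the next
 // phase starts, so later phases see the registers introduced by earlier
 // phases for every use, not just for the uses already visited.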
4695 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4696 LSRUse &LU = Uses[LUIdx];
4697 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4698 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4699 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4700 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4701 }
4702 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4703 LSRUse &LU = Uses[LUIdx];
4704 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4705 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4706 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4707 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4708 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4709 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4710 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4711 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4712 }
4713 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4714 LSRUse &LU = Uses[LUIdx];
4715 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4716 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4717 }
4718
4719 GenerateCrossUseConstantOffsets();
4720
4721 LLVM_DEBUG(dbgs() << "\n"
4722 "After generating reuse formulae:\n";
4723 print_uses(dbgs()));
4724}
4725
4726/// If there are multiple formulae with the same set of registers used
4727/// by other uses, pick the best one and delete the others.
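/// Only registers that are also used by other LSRUses form the key here, so
/// two formulae that differ only in registers private to this use compete for
/// the same key and the cheaper of them survives.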
4728void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4729 DenseSet<const SCEV *> VisitedRegs;
4730 SmallPtrSet<const SCEV *, 16> Regs;
4731 SmallPtrSet<const SCEV *, 16> LoserRegs;
4732#ifndef NDEBUG
4733 bool ChangedFormulae = false;
4734#endif
4735
4736 // Collect the best formula for each unique set of shared registers. This
4737 // is reset for each use.
4738 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4739
4740 BestFormulaeTy BestFormulae;
4741
4742 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4743 LSRUse &LU = Uses[LUIdx];
4744 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4745 dbgs() << '\n');
4746
4747 bool Any = false;
4748 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4749 FIdx != NumForms; ++FIdx) {
4750 Formula &F = LU.Formulae[FIdx];
4751
4752 // Some formulas are instant losers. For example, they may depend on
4753 // nonexistent AddRecs from other loops. These need to be filtered
4754 // immediately, otherwise heuristics could choose them over others leading
4755 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4756 // avoids the need to recompute this information across formulae using the
4757 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4758 // the corresponding bad register from the Regs set.
4759 Cost CostF(L, SE, TTI, AMK);
4760 Regs.clear();
4761 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4762 &LoserRegs);
4763 if (CostF.isLoser()) {
4764 // During initial formula generation, undesirable formulae are generated
4765 // by uses within other loops that have some non-trivial address mode or
4766 // use the postinc form of the IV. LSR needs to provide these formulae
4767 // as the basis of rediscovering the desired formula that uses an AddRec
4768 // corresponding to the existing phi. Once all formulae have been
4769 // generated, these initial losers may be pruned.
4770 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4771 dbgs() << "\n");
4772 }
4773 else {
4774 SmallVector<const SCEV *, 4> Key;
4775 for (const SCEV *Reg : F.BaseRegs) {
4776 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4777 Key.push_back(Reg);
4778 }
4779 if (F.ScaledReg &&
4780 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4781 Key.push_back(F.ScaledReg);
4782 // Unstable sort by host order ok, because this is only used for
4783 // uniquifying.
4784 llvm::sort(Key);
4785
4786 std::pair<BestFormulaeTy::const_iterator, bool> P =
4787 BestFormulae.insert(std::make_pair(Key, FIdx));
4788 if (P.second)
4789 continue;
4790
4791 Formula &Best = LU.Formulae[P.first->second];
4792
4793 Cost CostBest(L, SE, TTI, AMK);
4794 Regs.clear();
4795 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4796 HardwareLoopProfitable);
4797 if (CostF.isLess(CostBest))
4798 std::swap(F, Best);
4799 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4800 dbgs() << "\n"
4801 " in favor of formula ";
4802 Best.print(dbgs()); dbgs() << '\n');
4803 }
4804#ifndef NDEBUG
4805 ChangedFormulae = true;
4806#endif
4807 LU.DeleteFormula(F);
4808 --FIdx;
4809 --NumForms;
4810 Any = true;
4811 }
4812
4813 // Now that we've filtered out some formulae, recompute the Regs set.
4814 if (Any)
4815 LU.RecomputeRegs(LUIdx, RegUses);
4816
4817 // Reset this to prepare for the next use.
4818 BestFormulae.clear();
4819 }
4820
4821 LLVM_DEBUG(if (ChangedFormulae) {
4822 dbgs() << "\n"
4823 "After filtering out undesirable candidates:\n";
4824 print_uses(dbgs());
4825 });
4826}
4827
4828/// Estimate the worst-case number of solutions the solver might have to
4829/// consider. It almost never considers this many solutions because it prunes the
4830/// search space, but the pruning isn't always sufficient.
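/// For example, uses with 3, 4, and 5 formulae give an estimate of
/// 3 * 4 * 5 = 60 candidate solutions; the running product is clamped as soon
/// as it reaches ComplexityLimit.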
4831size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4832 size_t Power = 1;
4833 for (const LSRUse &LU : Uses) {
4834 size_t FSize = LU.Formulae.size();
4835 if (FSize >= ComplexityLimit) {
4836 Power = ComplexityLimit;
4837 break;
4838 }
4839 Power *= FSize;
4840 if (Power >= ComplexityLimit)
4841 break;
4842 }
4843 return Power;
4844}
4845
4846/// When one formula uses a superset of the registers of another formula, it
4847/// won't help reduce register pressure (though it may not necessarily hurt
4848/// register pressure); remove it to simplify the system.
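/// For example, a formula whose base registers are {reg(A), reg(7)} is
/// deleted when the use already has a formula over just {reg(A)}, since the
/// constant 7 can be carried in the immediate instead of a register.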
4849void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4850 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4851 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4852
4853 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4854 "which use a superset of registers used by other "
4855 "formulae.\n");
4856
4857 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4858 LSRUse &LU = Uses[LUIdx];
4859 bool Any = false;
4860 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4861 Formula &F = LU.Formulae[i];
4862 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4863 continue;
4864 // Look for a formula with a constant or GV in a register. If the use
4865 // also has a formula with that same value in an immediate field,
4866 // delete the one that uses a register.
4867 for (SmallVectorImpl<const SCEV *>::const_iterator
4868 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4869 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4870 Formula NewF = F;
4871 //FIXME: Formulas should store bitwidth to do wrapping properly.
4872 // See PR41034.
4873 NewF.BaseOffset =
4874 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4875 (uint64_t)C->getValue()->getSExtValue());
4876 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4877 (I - F.BaseRegs.begin()));
4878 if (LU.HasFormulaWithSameRegs(NewF)) {
4879 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4880 dbgs() << '\n');
4881 LU.DeleteFormula(F);
4882 --i;
4883 --e;
4884 Any = true;
4885 break;
4886 }
4887 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4888 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4889 if (!F.BaseGV) {
4890 Formula NewF = F;
4891 NewF.BaseGV = GV;
4892 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4893 (I - F.BaseRegs.begin()));
4894 if (LU.HasFormulaWithSameRegs(NewF)) {
4895 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4896 dbgs() << '\n');
4897 LU.DeleteFormula(F);
4898 --i;
4899 --e;
4900 Any = true;
4901 break;
4902 }
4903 }
4904 }
4905 }
4906 }
4907 if (Any)
4908 LU.RecomputeRegs(LUIdx, RegUses);
4909 }
4910
4911 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4912 }
4913}
4914
4915/// When there are many registers for expressions like A, A+1, A+2, etc.,
4916/// allocate a single register for them.
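/// For example, after unrolling, uses of A, A+4, and A+8 can be folded into a
/// single use of A, with 4 and 8 carried as per-fixup offsets.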
4917void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4918 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4919 return;
4920
4921 LLVM_DEBUG(
4922 dbgs() << "The search space is too complex.\n"
4923 "Narrowing the search space by assuming that uses separated "
4924 "by a constant offset will use the same registers.\n");
4925
4926 // This is especially useful for unrolled loops.
4927
4928 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4929 LSRUse &LU = Uses[LUIdx];
4930 for (const Formula &F : LU.Formulae) {
4931 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4932 continue;
4933
4934 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4935 if (!LUThatHas)
4936 continue;
4937
4938 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4939 LU.Kind, LU.AccessTy))
4940 continue;
4941
4942 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4943
4944 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4945
4946 // Transfer the fixups of LU to LUThatHas.
4947 for (LSRFixup &Fixup : LU.Fixups) {
4948 Fixup.Offset += F.BaseOffset;
4949 LUThatHas->pushFixup(Fixup);
4950 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4951 }
4952
4953 // Delete formulae from the new use which are no longer legal.
4954 bool Any = false;
4955 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4956 Formula &F = LUThatHas->Formulae[i];
4957 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4958 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4959 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4960 LUThatHas->DeleteFormula(F);
4961 --i;
4962 --e;
4963 Any = true;
4964 }
4965 }
4966
4967 if (Any)
4968 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4969
4970 // Delete the old use.
4971 DeleteUse(LU, LUIdx);
4972 --LUIdx;
4973 --NumUses;
4974 break;
4975 }
4976 }
4977
4978 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4979}
4980
4981/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4982/// we've done more filtering, as it may be able to find more formulae to
4983/// eliminate.
4984void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4985 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4986 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4987
4988 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4989 "undesirable dedicated registers.\n");
4990
4991 FilterOutUndesirableDedicatedRegisters();
4992
4993 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4994 }
4995}
4996
4997/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
4998/// pick the best one and delete the others.
4999/// This narrowing heuristic keeps as many formulae with different
5000/// Scale and ScaledReg pairs as possible while narrowing the search space.
5001/// The benefit is that a better solution is more likely to be found in
5002/// a formulae set with more Scale and ScaledReg variations than in
5003/// a set where they are all the same. The winner-reg-picking
5004/// heuristic tends to keep formulae with the same Scale and
5005/// ScaledReg and filter out the others, and we want to avoid that if possible.
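/// For example, of several formulae sharing ScaledReg = {0,+,4} and Scale = 2,
/// only the one needing the fewest (or cheapest) additional registers is kept;
/// formulae with other (ScaledReg, Scale) pairs are left untouched.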
5006void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5007 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5008 return;
5009
5010 LLVM_DEBUG(
5011 dbgs() << "The search space is too complex.\n"
5012 "Narrowing the search space by choosing the best Formula "
5013 "from the Formulae with the same Scale and ScaledReg.\n");
5014
5015 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5016 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5017
5018 BestFormulaeTy BestFormulae;
5019#ifndef NDEBUG
5020 bool ChangedFormulae = false;
5021#endif
5022 DenseSet<const SCEV *> VisitedRegs;
5023 SmallPtrSet<const SCEV *, 16> Regs;
5024
5025 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5026 LSRUse &LU = Uses[LUIdx];
5027 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5028 dbgs() << '\n');
5029
5030 // Return true if Formula FA is better than Formula FB.
5031 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5032 // First we will try to choose the Formula with fewer new registers.
5033 // For a register used by current Formula, the more the register is
5034 // shared among LSRUses, the less we increase the register number
5035 // counter of the formula.
5036 size_t FARegNum = 0;
5037 for (const SCEV *Reg : FA.BaseRegs) {
5038 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5039 FARegNum += (NumUses - UsedByIndices.count() + 1);
5040 }
5041 size_t FBRegNum = 0;
5042 for (const SCEV *Reg : FB.BaseRegs) {
5043 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5044 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5045 }
5046 if (FARegNum != FBRegNum)
5047 return FARegNum < FBRegNum;
5048
5049 // If the new register numbers are the same, choose the Formula with
5050 // less Cost.
5051 Cost CostFA(L, SE, TTI, AMK);
5052 Cost CostFB(L, SE, TTI, AMK);
5053 Regs.clear();
5054 CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5055 Regs.clear();
5056 CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5057 return CostFA.isLess(CostFB);
5058 };
5059
5060 bool Any = false;
5061 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5062 ++FIdx) {
5063 Formula &F = LU.Formulae[FIdx];
5064 if (!F.ScaledReg)
5065 continue;
5066 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5067 if (P.second)
5068 continue;
5069
5070 Formula &Best = LU.Formulae[P.first->second];
5071 if (IsBetterThan(F, Best))
5072 std::swap(F, Best);
5073 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5074 dbgs() << "\n"
5075 " in favor of formula ";
5076 Best.print(dbgs()); dbgs() << '\n');
5077#ifndef NDEBUG
5078 ChangedFormulae = true;
5079#endif
5080 LU.DeleteFormula(F);
5081 --FIdx;
5082 --NumForms;
5083 Any = true;
5084 }
5085 if (Any)
5086 LU.RecomputeRegs(LUIdx, RegUses);
5087
5088 // Reset this to prepare for the next use.
5089 BestFormulae.clear();
5090 }
5091
5092 LLVM_DEBUG(if (ChangedFormulae) {
5093 dbgs() << "\n"
5094 "After filtering out undesirable candidates:\n";
5095 print_uses(dbgs());
5096 });
5097}
5098
5099/// If we are over the complexity limit, filter out any post-inc preferring
5100/// variables to keep only post-inc values.
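/// Concretely, for each address use where the target supports post-indexed
/// accesses, only the formulae with the minimum register count are kept.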
5101void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5102 if (AMK != TTI::AMK_PostIndexed)
5103 return;
5104 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5105 return;
5106
5107 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5108 "Narrowing the search space by choosing the lowest "
5109 "register Formula for PostInc Uses.\n");
5110
5111 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5112 LSRUse &LU = Uses[LUIdx];
5113
5114 if (LU.Kind != LSRUse::Address)
5115 continue;
5116 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5117 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5118 continue;
5119
5120 size_t MinRegs = std::numeric_limits<size_t>::max();
5121 for (const Formula &F : LU.Formulae)
5122 MinRegs = std::min(F.getNumRegs(), MinRegs);
5123
5124 bool Any = false;
5125 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5126 ++FIdx) {
5127 Formula &F = LU.Formulae[FIdx];
5128 if (F.getNumRegs() > MinRegs) {
5129 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5130 dbgs() << "\n");
5131 LU.DeleteFormula(F);
5132 --FIdx;
5133 --NumForms;
5134 Any = true;
5135 }
5136 }
5137 if (Any)
5138 LU.RecomputeRegs(LUIdx, RegUses);
5139
5140 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5141 break;
5142 }
5143
5144 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5145}
5146
5147/// This function deletes formulas with a high expected number of registers.
5148/// Assuming we don't know the value of each formula (all inefficient ones
5149/// have already been deleted), compute the probability of not selecting each
5150/// register.
5151/// For example,
5152/// Use1:
5153/// reg(a) + reg({0,+,1})
5154/// reg(a) + reg({-1,+,1}) + 1
5155/// reg({a,+,1})
5156/// Use2:
5157/// reg(b) + reg({0,+,1})
5158/// reg(b) + reg({-1,+,1}) + 1
5159/// reg({b,+,1})
5160/// Use3:
5161/// reg(c) + reg(b) + reg({0,+,1})
5162/// reg(c) + reg({b,+,1})
5163///
5164/// Probability of not selecting
5165/// Use1 Use2 Use3
5166/// reg(a) (1/3) * 1 * 1
5167/// reg(b) 1 * (1/3) * (1/2)
5168/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5169/// reg({-1,+,1}) (2/3) * (2/3) * 1
5170/// reg({a,+,1}) (2/3) * 1 * 1
5171/// reg({b,+,1}) 1 * (2/3) * (2/3)
5172/// reg(c) 1 * 1 * 0
5173///
5174/// Now compute the mathematical expectation of the register count for each
5175/// formula. Note that for each use we exclude the probability of not selecting
5176/// for that use. For example, for Use1 the probability for reg(a) would be just
5177/// 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
5178/// Use1:
5179/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5180/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5181/// reg({a,+,1}) 1
5182/// Use2:
5183/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5184/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5185/// reg({b,+,1}) 2/3
5186/// Use3:
5187/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5188/// reg(c) + reg({b,+,1}) 1 + 2/3
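/// Here a register's per-use probability of not being selected is the fraction
/// of that use's formulae which do not reference it (see
/// LSRUse::getNotSelectedProbability).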
5189void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5190 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5191 return;
5192 // Ok, we have too many formulae on our hands to conveniently handle.
5193 // Use a rough heuristic to thin out the list.
5194
5195 // Set of Regs which will be 100% used in the final solution.
5196 // Used in each formula of a solution (in example above this is reg(c)).
5197 // We can skip them in calculations.
5198 SmallPtrSet<const SCEV *, 4> UniqRegs;
5199 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5200
5201 // Map each register to probability of not selecting
5202 DenseMap <const SCEV *, float> RegNumMap;
5203 for (const SCEV *Reg : RegUses) {
5204 if (UniqRegs.count(Reg))
5205 continue;
5206 float PNotSel = 1;
5207 for (const LSRUse &LU : Uses) {
5208 if (!LU.Regs.count(Reg))
5209 continue;
5210 float P = LU.getNotSelectedProbability(Reg);
5211 if (P != 0.0)
5212 PNotSel *= P;
5213 else
5214 UniqRegs.insert(Reg);
5215 }
5216 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5217 }
5218
5219 LLVM_DEBUG(
5220 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5221
5222 // Delete formulas whose expected register count is high.
5223 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5224 LSRUse &LU = Uses[LUIdx];
5225 // If nothing to delete - continue.
5226 if (LU.Formulae.size() < 2)
5227 continue;
5228 // This is a temporary solution to test performance. Float should be
5229 // replaced with a rounding-independent type (based on integers) to avoid
5230 // different results for different target builds.
5231 float FMinRegNum = LU.Formulae[0].getNumRegs();
5232 float FMinARegNum = LU.Formulae[0].getNumRegs();
5233 size_t MinIdx = 0;
5234 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5235 Formula &F = LU.Formulae[i];
5236 float FRegNum = 0;
5237 float FARegNum = 0;
5238 for (const SCEV *BaseReg : F.BaseRegs) {
5239 if (UniqRegs.count(BaseReg))
5240 continue;
5241 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5242 if (isa<SCEVAddRecExpr>(BaseReg))
5243 FARegNum +=
5244 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5245 }
5246 if (const SCEV *ScaledReg = F.ScaledReg) {
5247 if (!UniqRegs.count(ScaledReg)) {
5248 FRegNum +=
5249 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5250 if (isa<SCEVAddRecExpr>(ScaledReg))
5251 FARegNum +=
5252 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5253 }
5254 }
5255 if (FMinRegNum > FRegNum ||
5256 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5257 FMinRegNum = FRegNum;
5258 FMinARegNum = FARegNum;
5259 MinIdx = i;
5260 }
5261 }
5262 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5263 dbgs() << " with min reg num " << FMinRegNum << '\n');
5264 if (MinIdx != 0)
5265 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5266 while (LU.Formulae.size() != 1) {
5267 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5268 dbgs() << '\n');
5269 LU.Formulae.pop_back();
5270 }
5271 LU.RecomputeRegs(LUIdx, RegUses);
5272 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5273 Formula &F = LU.Formulae[0];
5274 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5275 // When we choose the formula, the regs become unique.
5276 UniqRegs.insert_range(F.BaseRegs);
5277 if (F.ScaledReg)
5278 UniqRegs.insert(F.ScaledReg);
5279 }
5280 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5281}
5282
5283// Check if Best and Reg are SCEVs separated by a constant amount C, and if so,
5284// whether the addressing offset +C would be legal where the negative offset -C
5285// is not.
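// For example, with Best = {x+4,+,1} and Reg = {x,+,1} the difference is 4;
// if the target can fold a +4 offset but not a -4 offset into its addressing
// mode, Reg is the simpler base to standardize on.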
5286static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5287 ScalarEvolution &SE, const SCEV *Best,
5288 const SCEV *Reg,
5289 MemAccessTy AccessType) {
5290 if (Best->getType() != Reg->getType() ||
5291 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5292 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5293 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5294 return false;
5295 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5296 if (!Diff)
5297 return false;
5298
5299 return TTI.isLegalAddressingMode(
5300 AccessType.MemTy, /*BaseGV=*/nullptr,
5301 /*BaseOffset=*/Diff->getSExtValue(),
5302 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5303 !TTI.isLegalAddressingMode(
5304 AccessType.MemTy, /*BaseGV=*/nullptr,
5305 /*BaseOffset=*/-Diff->getSExtValue(),
5306 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5307}
5308
5309/// Pick a register which seems likely to be profitable, and then in any use
5310/// which has any reference to that register, delete all formulae which do not
5311/// reference that register.
5312void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5313 // With all other options exhausted, loop until the system is simple
5314 // enough to handle.
5315 SmallPtrSet<const SCEV *, 4> Taken;
5316 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5317 // Ok, we have too many formulae on our hands to conveniently handle.
5318 // Use a rough heuristic to thin out the list.
5319 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5320
5321 // Pick the register which is used by the most LSRUses, which is likely
5322 // to be a good reuse register candidate.
5323 const SCEV *Best = nullptr;
5324 unsigned BestNum = 0;
5325 for (const SCEV *Reg : RegUses) {
5326 if (Taken.count(Reg))
5327 continue;
5328 if (!Best) {
5329 Best = Reg;
5330 BestNum = RegUses.getUsedByIndices(Reg).count();
5331 } else {
5332 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5333 if (Count > BestNum) {
5334 Best = Reg;
5335 BestNum = Count;
5336 }
5337
5338 // If the scores are the same, but the Reg is simpler for the target
5339 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5340 // handle +C but not -C), opt for the simpler formula.
5341 if (Count == BestNum) {
5342 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5343 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5344 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5345 Uses[LUIdx].AccessTy)) {
5346 Best = Reg;
5347 BestNum = Count;
5348 }
5349 }
5350 }
5351 }
5352 assert(Best && "Failed to find best LSRUse candidate");
5353
5354 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5355 << " will yield profitable reuse.\n");
5356 Taken.insert(Best);
5357
5358 // In any use with formulae which references this register, delete formulae
5359 // which don't reference it.
5360 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5361 LSRUse &LU = Uses[LUIdx];
5362 if (!LU.Regs.count(Best)) continue;
5363
5364 bool Any = false;
5365 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5366 Formula &F = LU.Formulae[i];
5367 if (!F.referencesReg(Best)) {
5368 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5369 LU.DeleteFormula(F);
5370 --e;
5371 --i;
5372 Any = true;
5373 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5374 continue;
5375 }
5376 }
5377
5378 if (Any)
5379 LU.RecomputeRegs(LUIdx, RegUses);
5380 }
5381
5382 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5383 }
5384}
5385
5386/// If there are an extraordinary number of formulae to choose from, use some
5387/// rough heuristics to prune down the number of formulae. This keeps the main
5388/// solver from taking an extraordinary amount of time in some worst-case
5389/// scenarios.
5390void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5391 NarrowSearchSpaceByDetectingSupersets();
5392 NarrowSearchSpaceByCollapsingUnrolledCode();
5393 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5394 if (FilterSameScaledReg)
5395 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5396 NarrowSearchSpaceByFilterPostInc();
5397 if (LSRExpNarrow)
5398 NarrowSearchSpaceByDeletingCostlyFormulas();
5399 else
5400 NarrowSearchSpaceByPickingWinnerRegs();
5401}
5402
5403/// This is the recursive solver.
5404void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5405 Cost &SolutionCost,
5406 SmallVectorImpl<const Formula *> &Workspace,
5407 const Cost &CurCost,
5408 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5409 DenseSet<const SCEV *> &VisitedRegs) const {
5410 // Some ideas:
5411 // - prune more:
5412 // - use more aggressive filtering
5413 // - sort the formula so that the most profitable solutions are found first
5414 // - sort the uses too
5415 // - search faster:
5416 // - don't compute a cost, and then compare. compare while computing a cost
5417 // and bail early.
5418 // - track register sets with SmallBitVector
5419
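 // Each level of the recursion commits a formula for exactly one use:
 // Workspace holds the formulae chosen so far, so its size indexes the use
 // being solved at this level.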
5420 const LSRUse &LU = Uses[Workspace.size()];
5421
5422 // If this use references any register that's already a part of the
5423 // in-progress solution, consider it a requirement that a formula must
5424 // reference that register in order to be considered. This prunes out
5425 // unprofitable searching.
5426 SmallSetVector<const SCEV *, 4> ReqRegs;
5427 for (const SCEV *S : CurRegs)
5428 if (LU.Regs.count(S))
5429 ReqRegs.insert(S);
5430
5431 SmallPtrSet<const SCEV *, 16> NewRegs;
5432 Cost NewCost(L, SE, TTI, AMK);
5433 for (const Formula &F : LU.Formulae) {
5434 // Ignore formulae which may not be ideal in terms of register reuse of
5435 // ReqRegs. The formula should use all required registers before
5436 // introducing new ones.
5437 // This can sometimes (notably when trying to favour postinc) lead to
5438 // sub-optimal decisions. In those cases it is best left to the cost modelling
5439 // to get right.
5440 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5441 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5442 for (const SCEV *Reg : ReqRegs) {
5443 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5444 is_contained(F.BaseRegs, Reg)) {
5445 --NumReqRegsToFind;
5446 if (NumReqRegsToFind == 0)
5447 break;
5448 }
5449 }
5450 if (NumReqRegsToFind != 0) {
5451 // If none of the formulae satisfied the required registers, then we could
5452 // clear ReqRegs and try again. Currently, we simply give up in this case.
5453 continue;
5454 }
5455 }
5456
5457 // Evaluate the cost of the current formula. If it's already worse than
5458 // the current best, prune the search at that point.
5459 NewCost = CurCost;
5460 NewRegs = CurRegs;
5461 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5462 if (NewCost.isLess(SolutionCost)) {
5463 Workspace.push_back(&F);
5464 if (Workspace.size() != Uses.size()) {
5465 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5466 NewRegs, VisitedRegs);
5467 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5468 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5469 } else {
5470 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5471 dbgs() << ".\nRegs:\n";
5472 for (const SCEV *S : NewRegs) dbgs()
5473 << "- " << *S << "\n";
5474 dbgs() << '\n');
5475
5476 SolutionCost = NewCost;
5477 Solution = Workspace;
5478 }
5479 Workspace.pop_back();
5480 }
5481 }
5482}
5483
5484/// Choose one formula from each use. Return the results in the given Solution
5485/// vector.
5486void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5487 SmallVector<const Formula *, 8> Workspace;
5488 Cost SolutionCost(L, SE, TTI, AMK);
5489 SolutionCost.Lose();
5490 Cost CurCost(L, SE, TTI, AMK);
5491 SmallPtrSet<const SCEV *, 16> CurRegs;
5492 DenseSet<const SCEV *> VisitedRegs;
5493 Workspace.reserve(Uses.size());
5494
5495 // SolveRecurse does all the work.
5496 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5497 CurRegs, VisitedRegs);
5498 if (Solution.empty()) {
5499 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5500 return;
5501 }
5502
5503 // Ok, we've now made all our decisions.
5504 LLVM_DEBUG(dbgs() << "\n"
5505 "The chosen solution requires ";
5506 SolutionCost.print(dbgs()); dbgs() << ":\n";
5507 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5508 dbgs() << " ";
5509 Uses[i].print(dbgs());
5510 dbgs() << "\n"
5511 " ";
5512 Solution[i]->print(dbgs());
5513 dbgs() << '\n';
5514 });
5515
5516 assert(Solution.size() == Uses.size() && "Malformed solution!");
5517
5518 const bool EnableDropUnprofitableSolution = [&] {
5519 switch (AllowDropSolutionIfLessProfitable) {
5520 case cl::BOU_TRUE:
5521 return true;
5522 case cl::BOU_FALSE:
5523 return false;
5524 case cl::BOU_UNSET:
5525 return TTI.shouldDropLSRSolutionIfLessProfitable();
5526 }
5527 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5528 }();
5529
5530 if (BaselineCost.isLess(SolutionCost)) {
5531 if (!EnableDropUnprofitableSolution)
5532 LLVM_DEBUG(
5533 dbgs() << "Baseline is more profitable than chosen solution, "
5534 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5535 else {
5536 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5537 "solution, dropping LSR solution.\n";);
5538 Solution.clear();
5539 }
5540 }
5541}
5542
5543/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far as
5544/// we can go while still being dominated by the input positions. This helps
5545/// canonicalize the insert position, which encourages sharing.
5546BasicBlock::iterator
5547LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5548 const SmallVectorImpl<Instruction *> &Inputs)
5549 const {
5550 Instruction *Tentative = &*IP;
5551 while (true) {
5552 bool AllDominate = true;
5553 Instruction *BetterPos = nullptr;
5554 // Don't bother attempting to insert before a catchswitch; its basic block
5555 // cannot have other non-PHI instructions.
5556 if (isa<CatchSwitchInst>(Tentative))
5557 return IP;
5558
5559 for (Instruction *Inst : Inputs) {
5560 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5561 AllDominate = false;
5562 break;
5563 }
5564 // Attempt to find an insert position in the middle of the block,
5565 // instead of at the end, so that it can be used for other expansions.
5566 if (Tentative->getParent() == Inst->getParent() &&
5567 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5568 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5569 }
5570 if (!AllDominate)
5571 break;
5572 if (BetterPos)
5573 IP = BetterPos->getIterator();
5574 else
5575 IP = Tentative->getIterator();
5576
5577 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5578 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5579
5580 BasicBlock *IDom;
5581 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5582 if (!Rung) return IP;
5583 Rung = Rung->getIDom();
5584 if (!Rung) return IP;
5585 IDom = Rung->getBlock();
5586
5587 // Don't climb into a loop though.
5588 const Loop *IDomLoop = LI.getLoopFor(IDom);
5589 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5590 if (IDomDepth <= IPLoopDepth &&
5591 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5592 break;
5593 }
5594
5595 Tentative = IDom->getTerminator();
5596 }
5597
5598 return IP;
5599}
5600
5601/// Determine an input position which will be dominated by the operands and
5602/// which will dominate the result.
5603BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5604 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5605 // Collect some instructions which must be dominated by the
5606 // expanding replacement. These must be dominated by any operands that
5607 // will be required in the expansion.
5608 SmallVector<Instruction *, 4> Inputs;
5609 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5610 Inputs.push_back(I);
5611 if (LU.Kind == LSRUse::ICmpZero)
5612 if (Instruction *I =
5613 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5614 Inputs.push_back(I);
5615 if (LF.PostIncLoops.count(L)) {
5616 if (LF.isUseFullyOutsideLoop(L))
5617 Inputs.push_back(L->getLoopLatch()->getTerminator());
5618 else
5619 Inputs.push_back(IVIncInsertPos);
5620 }
5621 // The expansion must also be dominated by the increment positions of any
5622 // loops for which it is using post-inc mode.
5623 for (const Loop *PIL : LF.PostIncLoops) {
5624 if (PIL == L) continue;
5625
5626 // Be dominated by the loop exit.
5627 SmallVector<BasicBlock *, 4> ExitingBlocks;
5628 PIL->getExitingBlocks(ExitingBlocks);
5629 if (!ExitingBlocks.empty()) {
5630 BasicBlock *BB = ExitingBlocks[0];
5631 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5632 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5633 Inputs.push_back(BB->getTerminator());
5634 }
5635 }
5636
5637 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5638 "Insertion point must be a normal instruction");
5639
5640 // Then, climb up the immediate dominator tree as far as we can go while
5641 // still being dominated by the input positions.
5642 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5643
5644 // Don't insert instructions before PHI nodes.
5645 while (isa<PHINode>(IP)) ++IP;
5646
5647 // Ignore landingpad instructions.
5648 while (IP->isEHPad()) ++IP;
5649
5650 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5651 // IP consistent across expansions and allows the previously inserted
5652 // instructions to be reused by subsequent expansion.
5653 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5654 ++IP;
5655
5656 return IP;
5657}
5658
5659/// Emit instructions for the leading candidate expression for this LSRUse (this
5660/// is called "expanding").
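/// The formula is materialized via SCEVExpander as the sum of its base
/// registers, scaled register, base global value, and immediate offsets.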
5661Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5662 const Formula &F, BasicBlock::iterator IP,
5663 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5664 if (LU.RigidFormula)
5665 return LF.OperandValToReplace;
5666
5667 // Determine an input position which will be dominated by the operands and
5668 // which will dominate the result.
5669 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5670 Rewriter.setInsertPoint(&*IP);
5671
5672 // Inform the Rewriter if we have a post-increment use, so that it can
5673 // perform an advantageous expansion.
5674 Rewriter.setPostInc(LF.PostIncLoops);
5675
5676 // This is the type that the user actually needs.
5677 Type *OpTy = LF.OperandValToReplace->getType();
5678 // This will be the type that we'll initially expand to.
5679 Type *Ty = F.getType();
5680 if (!Ty)
5681 // No type known; just expand directly to the ultimate type.
5682 Ty = OpTy;
5683 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5684 // Expand directly to the ultimate type if it's the right size.
5685 Ty = OpTy;
5686 // This is the type to do integer arithmetic in.
5687 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5688
5689 // Build up a list of operands to add together to form the full base.
5690 SmallVector<const SCEV *, 8> Ops;
5691
5692 // Expand the BaseRegs portion.
5693 for (const SCEV *Reg : F.BaseRegs) {
5694 assert(!Reg->isZero() && "Zero allocated in a base register!");
5695
5696 // If we're expanding for a post-inc user, make the post-inc adjustment.
5697 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5698 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5699 }
5700
5701 // Expand the ScaledReg portion.
5702 Value *ICmpScaledV = nullptr;
5703 if (F.Scale != 0) {
5704 const SCEV *ScaledS = F.ScaledReg;
5705
5706 // If we're expanding for a post-inc user, make the post-inc adjustment.
5707 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5708 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5709
5710 if (LU.Kind == LSRUse::ICmpZero) {
5711 // Expand ScaleReg as if it was part of the base regs.
5712 if (F.Scale == 1)
5713 Ops.push_back(
5714 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5715 else {
5716 // An interesting way of "folding" with an icmp is to use a negated
5717 // scale, which we'll implement by inserting it into the other operand
5718 // of the icmp.
5719 assert(F.Scale == -1 &&
5720 "The only scale supported by ICmpZero uses is -1!");
5721 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5722 }
5723 } else {
5724 // Otherwise just expand the scaled register and an explicit scale,
5725 // which is expected to be matched as part of the address.
5726
5727 // Flush the operand list to suppress SCEVExpander hoisting address modes,
5728 // unless the addressing mode will not be folded.
5729 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5730 isAMCompletelyFolded(TTI, LU, F)) {
5731 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5732 Ops.clear();
5733 Ops.push_back(SE.getUnknown(FullV));
5734 }
5735 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5736 if (F.Scale != 1)
5737 ScaledS =
5738 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5739 Ops.push_back(ScaledS);
5740 }
5741 }
5742
5743 // Expand the GV portion.
5744 if (F.BaseGV) {
5745 // Flush the operand list to suppress SCEVExpander hoisting.
5746 if (!Ops.empty()) {
5747 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5748 Ops.clear();
5749 Ops.push_back(SE.getUnknown(FullV));
5750 }
5751 Ops.push_back(SE.getUnknown(F.BaseGV));
5752 }
5753
5754 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5755 // unfolded offsets. LSR assumes they both live next to their uses.
5756 if (!Ops.empty()) {
5757 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5758 Ops.clear();
5759 Ops.push_back(SE.getUnknown(FullV));
5760 }
5761
5762 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5763 // out at this point, or should we generate a SCEV adding together mixed
5764 // offsets?
5765 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5766 "Expanding mismatched offsets\n");
5767 // Expand the immediate portion.
5768 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5769 if (Offset.isNonZero()) {
5770 if (LU.Kind == LSRUse::ICmpZero) {
5771 // The other interesting way of "folding" with an ICmpZero is to use a
5772 // negated immediate.
5773 if (!ICmpScaledV)
5774 ICmpScaledV =
5775 ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5776 else {
5777 Ops.push_back(SE.getUnknown(ICmpScaledV));
5778 ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5779 }
5780 } else {
5781 // Just add the immediate values. These again are expected to be matched
5782 // as part of the address.
5783 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5784 }
5785 }
5786
5787 // Expand the unfolded offset portion.
5788 Immediate UnfoldedOffset = F.UnfoldedOffset;
5789 if (UnfoldedOffset.isNonZero()) {
5790 // Just add the immediate values.
5791 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5792 }
5793
5794 // Emit instructions summing all the operands.
5795 const SCEV *FullS = Ops.empty() ?
5796 SE.getConstant(IntTy, 0) :
5797 SE.getAddExpr(Ops);
5798 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5799
5800 // We're done expanding now, so reset the rewriter.
5801 Rewriter.clearPostInc();
5802
5803 // An ICmpZero Formula represents an ICmp which we're handling as a
5804 // comparison against zero. Now that we've expanded an expression for that
5805 // form, update the ICmp's other operand.
5806 if (LU.Kind == LSRUse::ICmpZero) {
5807 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5808 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5809 DeadInsts.emplace_back(OperandIsInstr);
5810 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5811 "a scale at the same time!");
5812 if (F.Scale == -1) {
5813 if (ICmpScaledV->getType() != OpTy) {
5814 Instruction *Cast = CastInst::Create(
5815 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5816 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5817 ICmpScaledV = Cast;
5818 }
5819 CI->setOperand(1, ICmpScaledV);
5820 } else {
5821 // A scale of 1 means that the scale has been expanded as part of the
5822 // base regs.
5823 assert((F.Scale == 0 || F.Scale == 1) &&
5824 "ICmp does not support folding a global value and "
5825 "a scale at the same time!");
5826 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5827 -(uint64_t)Offset.getFixedValue());
5828 if (C->getType() != OpTy) {
5829 C = ConstantFoldCastOperand(
5830 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5831 CI->getDataLayout());
5832 assert(C && "Cast of ConstantInt should have folded");
5833 }
5834
5835 CI->setOperand(1, C);
5836 }
5837 }
5838
5839 return FullV;
5840}
5841
5842/// Helper for Rewrite. PHI nodes are special because the use of their operands
5843/// effectively happens in their predecessor blocks, so the expression may need
5844/// to be expanded in multiple places.
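/// The expansion for each predecessor block is cached, so a value feeding the
/// PHI along several edges is only materialized once per block.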
5845void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5846 const LSRFixup &LF, const Formula &F,
5847 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5848 DenseMap<BasicBlock *, Value *> Inserted;
5849
5850 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5851 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5852 bool needUpdateFixups = false;
5853 BasicBlock *BB = PN->getIncomingBlock(i);
5854
5855 // If this is a critical edge, split the edge so that we do not insert
5856 // the code on all predecessor/successor paths. We do this unless this
5857 // is the canonical backedge for this loop, which complicates post-inc
5858 // users.
5859 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5860 !isa<IndirectBrInst>(BB->getTerminator()) &&
5861 !isa<CatchSwitchInst>(BB->getTerminator())) {
5862 BasicBlock *Parent = PN->getParent();
5863 Loop *PNLoop = LI.getLoopFor(Parent);
5864 if (!PNLoop || Parent != PNLoop->getHeader()) {
5865 // Split the critical edge.
5866 BasicBlock *NewBB = nullptr;
5867 if (!Parent->isLandingPad()) {
5868 NewBB =
5869 SplitCriticalEdge(BB, Parent,
5870 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5871 .setMergeIdenticalEdges()
5872 .setKeepOneInputPHIs());
5873 } else {
5874 SmallVector<BasicBlock*, 2> NewBBs;
5875 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5876 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5877 NewBB = NewBBs[0];
5878 }
5879 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5880 // phi predecessors are identical. The simple thing to do is skip
5881 // splitting in this case rather than complicate the API.
5882 if (NewBB) {
5883 // If PN is outside of the loop and BB is in the loop, we want to
5884 // move the block to be immediately before the PHI block, not
5885 // immediately after BB.
5886 if (L->contains(BB) && !L->contains(PN))
5887 NewBB->moveBefore(PN->getParent());
5888
5889 // Splitting the edge can reduce the number of PHI entries we have.
5890 e = PN->getNumIncomingValues();
5891 BB = NewBB;
5892 i = PN->getBasicBlockIndex(BB);
5893
5894 needUpdateFixups = true;
5895 }
5896 }
5897 }
5898
5899 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5900 Inserted.try_emplace(BB);
5901 if (!Pair.second)
5902 PN->setIncomingValue(i, Pair.first->second);
5903 else {
5904 Value *FullV =
5905 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5906
5907 // If this is reuse-by-noop-cast, insert the noop cast.
5908 Type *OpTy = LF.OperandValToReplace->getType();
5909 if (FullV->getType() != OpTy)
5910 FullV = CastInst::Create(
5911 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5912 LF.OperandValToReplace->getType(), "tmp",
5913 BB->getTerminator()->getIterator());
5914
5915 // If the incoming block for this value is not in the loop, it means the
5916 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5917 // the inserted value.
5918 if (auto *I = dyn_cast<Instruction>(FullV))
5919 if (L->contains(I) && !L->contains(BB))
5920 InsertedNonLCSSAInsts.insert(I);
5921
5922 PN->setIncomingValue(i, FullV);
5923 Pair.first->second = FullV;
5924 }
5925
5926 // If LSR splits a critical edge and the phi node has other pending
5927 // fixup operands, we need to update those pending fixups. Otherwise
5928 // formulae will not be implemented completely and some instructions
5929 // will not be eliminated.
5930 if (needUpdateFixups) {
5931 for (LSRUse &LU : Uses)
5932 for (LSRFixup &Fixup : LU.Fixups)
5933 // If fixup is supposed to rewrite some operand in the phi
5934 // that was just updated, it may be already moved to
5935 // another phi node. Such fixup requires update.
5936 if (Fixup.UserInst == PN) {
5937 // Check if the operand we try to replace still exists in the
5938 // original phi.
5939 bool foundInOriginalPHI = false;
5940 for (const auto &val : PN->incoming_values())
5941 if (val == Fixup.OperandValToReplace) {
5942 foundInOriginalPHI = true;
5943 break;
5944 }
5945
5946 // If fixup operand found in original PHI - nothing to do.
5947 if (foundInOriginalPHI)
5948 continue;
5949
5950 // Otherwise it might have been moved to another PHI and require an update.
5951 // If the fixup operand is not found in any of the incoming blocks, that
5952 // means we have already rewritten it - nothing to do.
5953 for (const auto &Block : PN->blocks())
5954 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5955 ++I) {
5956 PHINode *NewPN = cast<PHINode>(I);
5957 for (const auto &val : NewPN->incoming_values())
5958 if (val == Fixup.OperandValToReplace)
5959 Fixup.UserInst = NewPN;
5960 }
5961 }
5962 }
5963 }
5964}
5965
5966/// Emit instructions for the leading candidate expression for this LSRUse (this
5967/// is called "expanding"), and update the UserInst to reference the newly
5968/// expanded value.
5969void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5970 const Formula &F,
5971 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5972 // First, find an insertion point that dominates UserInst. For PHI nodes,
5973 // find the nearest block which dominates all the relevant uses.
5974 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5975 RewriteForPHI(PN, LU, LF, F, DeadInsts);
5976 } else {
5977 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
5978
5979 // If this is reuse-by-noop-cast, insert the noop cast.
5980 Type *OpTy = LF.OperandValToReplace->getType();
5981 if (FullV->getType() != OpTy) {
5982 Instruction *Cast =
5983 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
5984 FullV, OpTy, "tmp", LF.UserInst->getIterator());
5985 FullV = Cast;
5986 }
5987
5988 // Update the user. ICmpZero is handled specially here (for now) because
5989 // Expand may have updated one of the operands of the icmp already, and
5990 // its new value may happen to be equal to LF.OperandValToReplace, in
5991 // which case doing replaceUsesOfWith leads to replacing both operands
5992 // with the same value. TODO: Reorganize this.
5993 if (LU.Kind == LSRUse::ICmpZero)
5994 LF.UserInst->setOperand(0, FullV);
5995 else
5996 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
5997 }
5998
5999 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6000 DeadInsts.emplace_back(OperandIsInstr);
6001}
6002
6003// Try to hoist the IVInc to the loop header if all IVInc users are in
6004// the loop header. This helps the backend generate post-index loads/stores
6005// when the latch block is different from the loop header block.
6006static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6007 const LSRUse &LU, Instruction *IVIncInsertPos,
6008 Loop *L) {
6009 if (LU.Kind != LSRUse::Address)
6010 return false;
6011
6012 // For now this code does the conservative optimization and only works for
6013 // the header block. Later we can hoist the IVInc to a block that
6014 // post-dominates all users.
6015 BasicBlock *LHeader = L->getHeader();
6016 if (IVIncInsertPos->getParent() == LHeader)
6017 return false;
6018
6019 if (!Fixup.OperandValToReplace ||
6020 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6021 Instruction *UI = cast<Instruction>(U);
6022 return UI->getParent() != LHeader;
6023 }))
6024 return false;
6025
6026 Instruction *I = Fixup.UserInst;
6027 Type *Ty = I->getType();
6028 return (isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6029 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty));
6030}
6031
6032/// Rewrite all the fixup locations with new values, following the chosen
6033/// solution.
6034void LSRInstance::ImplementSolution(
6035 const SmallVectorImpl<const Formula *> &Solution) {
6036 // Keep track of instructions we may have made dead, so that
6037 // we can remove them after we are done working.
6038 SmallVector<WeakTrackingVH, 16> DeadInsts;
6039
6040 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6041 for (const IVChain &Chain : IVChainVec) {
6042 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6043 Rewriter.setChainedPhi(PN);
6044 }
6045
6046 // Expand the new value definitions and update the users.
6047 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6048 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6049 Instruction *InsertPos =
6050 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6051 ? L->getHeader()->getTerminator()
6052 : IVIncInsertPos;
6053 Rewriter.setIVIncInsertPos(L, InsertPos);
6054 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6055 Changed = true;
6056 }
6057
6058 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6059 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6060
6061 for (const IVChain &Chain : IVChainVec) {
6062 GenerateIVChain(Chain, DeadInsts);
6063 Changed = true;
6064 }
6065
6066 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6067 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6068 ScalarEvolutionIVs.push_back(IV);
6069
6070 // Clean up after ourselves. This must be done before deleting any
6071 // instructions.
6072 Rewriter.clear();
6073
6074 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6075 &TLI, MSSAU);
6076
6077 // In our cost analysis above, we assume that each addrec consumes exactly
6078 // one register, and arrange to have increments inserted just before the
6079 // latch to maximize the chance this is true. However, if we reused
6080 // existing IVs, we now need to move the increments to match our
6081 // expectations. Otherwise, our cost modeling results in us having
6082 // chosen a non-optimal result for the actual schedule. (And yes, this
6083 // scheduling decision does impact later codegen.)
6084 for (PHINode &PN : L->getHeader()->phis()) {
6085 BinaryOperator *BO = nullptr;
6086 Value *Start = nullptr, *Step = nullptr;
6087 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6088 continue;
6089
6090 switch (BO->getOpcode()) {
6091 case Instruction::Sub:
6092 if (BO->getOperand(0) != &PN)
6093 // sub is non-commutative - match handling elsewhere in LSR
6094 continue;
6095 break;
6096 case Instruction::Add:
6097 break;
6098 default:
6099 continue;
6100 };
6101
6102 if (!isa<Constant>(Step))
6103 // If not a constant step, might increase register pressure
6104 // (We assume constants have been canonicalized to RHS)
6105 continue;
6106
6107 if (BO->getParent() == IVIncInsertPos->getParent())
6108 // Only bother moving across blocks. Isel can handle block local case.
6109 continue;
6110
6111 // Can we legally schedule inc at the desired point?
6112 if (!llvm::all_of(BO->uses(),
6113 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6114 continue;
6115 BO->moveBefore(IVIncInsertPos->getIterator());
6116 Changed = true;
6117 }
6118
6119
6120}
6121
6122LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6123 DominatorTree &DT, LoopInfo &LI,
6124 const TargetTransformInfo &TTI, AssumptionCache &AC,
6125 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6126 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6127 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6128 ? PreferredAddresingMode
6129 : TTI.getPreferredAddressingMode(L, &SE)),
6130 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6131 BaselineCost(L, SE, TTI, AMK) {
6132 // If LoopSimplify form is not available, stay out of trouble.
6133 if (!L->isLoopSimplifyForm())
6134 return;
6135
6136 // If there's no interesting work to be done, bail early.
6137 if (IU.empty()) return;
6138
6139 // If there's too much analysis to be done, bail early. We won't be able to
6140 // model the problem anyway.
6141 unsigned NumUsers = 0;
6142 for (const IVStrideUse &U : IU) {
6143 if (++NumUsers > MaxIVUsers) {
6144 (void)U;
6145 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6146 << "\n");
6147 return;
6148 }
6149 // Bail out if we have a PHI on an EHPad that gets a value from a
6150 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6151 // no good place to stick any instructions.
6152 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6153 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6154 if (isa<FuncletPadInst>(FirstNonPHI) ||
6155 isa<CatchSwitchInst>(FirstNonPHI))
6156 for (BasicBlock *PredBB : PN->blocks())
6157 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6158 return;
6159 }
6160 }
6161
6162 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6163 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6164 dbgs() << ":\n");
6165
6166 // Check if we expect this loop to use a hardware loop instruction, which will
6167 // be used when calculating the costs of formulas.
6168 HardwareLoopInfo HWLoopInfo(L);
6169 HardwareLoopProfitable =
6170 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6171
6172 // Configure SCEVExpander already now, so the correct mode is used for
6173 // isSafeToExpand() checks.
6174#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6175 Rewriter.setDebugType(DEBUG_TYPE);
6176#endif
6177 Rewriter.disableCanonicalMode();
6178 Rewriter.enableLSRMode();
6179
6180 // First, perform some low-level loop optimizations.
6181 OptimizeShadowIV();
6182 OptimizeLoopTermCond();
6183
6184 // If loop preparation eliminates all interesting IV users, bail.
6185 if (IU.empty()) return;
6186
6187 // Skip nested loops until we can model them better with formulae.
6188 if (!L->isInnermost()) {
6189 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6190 return;
6191 }
6192
6193 // Start collecting data and preparing for the solver.
6194 // If the number of registers is not the major cost, we cannot benefit from
6195 // the current profitable chain optimization, which is based on the number
6196 // of registers.
6197 // FIXME: add profitable chain optimization for other kinds major cost, for
6198 // example number of instructions.
6199 if (TTI.isNumRegsMajorCostOfLSR())
6200 CollectChains();
6201 CollectInterestingTypesAndFactors();
6202 CollectFixupsAndInitialFormulae();
6203 CollectLoopInvariantFixupsAndFormulae();
6204
6205 if (Uses.empty())
6206 return;
6207
6208 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6209 print_uses(dbgs()));
6210 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6211 BaselineCost.print(dbgs()); dbgs() << "\n");
6212
6213 // Now use the reuse data to generate a bunch of interesting ways
6214 // to formulate the values needed for the uses.
6215 GenerateAllReuseFormulae();
6216
6217 FilterOutUndesirableDedicatedRegisters();
6218 NarrowSearchSpaceUsingHeuristics();
6219
6220 SmallVector<const Formula *, 8> Solution;
6221 Solve(Solution);
6222
6223 // Release memory that is no longer needed.
6224 Factors.clear();
6225 Types.clear();
6226 RegUses.clear();
6227
6228 if (Solution.empty())
6229 return;
6230
6231#ifndef NDEBUG
6232 // Formulae should be legal.
6233 for (const LSRUse &LU : Uses) {
6234 for (const Formula &F : LU.Formulae)
6235 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6236 F) && "Illegal formula generated!");
6237 };
6238#endif
6239
6240 // Now that we've decided what we want, make it so.
6241 ImplementSolution(Solution);
6242}
6243
6244#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6245void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6246 if (Factors.empty() && Types.empty()) return;
6247
6248 OS << "LSR has identified the following interesting factors and types: ";
6249 bool First = true;
6250
6251 for (int64_t Factor : Factors) {
6252 if (!First) OS << ", ";
6253 First = false;
6254 OS << '*' << Factor;
6255 }
6256
6257 for (Type *Ty : Types) {
6258 if (!First) OS << ", ";
6259 First = false;
6260 OS << '(' << *Ty << ')';
6261 }
6262 OS << '\n';
6263}
6264
6265void LSRInstance::print_fixups(raw_ostream &OS) const {
6266 OS << "LSR is examining the following fixup sites:\n";
6267 for (const LSRUse &LU : Uses)
6268 for (const LSRFixup &LF : LU.Fixups) {
6269 dbgs() << " ";
6270 LF.print(OS);
6271 OS << '\n';
6272 }
6273}
6274
6275void LSRInstance::print_uses(raw_ostream &OS) const {
6276 OS << "LSR is examining the following uses:\n";
6277 for (const LSRUse &LU : Uses) {
6278 dbgs() << " ";
6279 LU.print(OS);
6280 OS << '\n';
6281 for (const Formula &F : LU.Formulae) {
6282 OS << " ";
6283 F.print(OS);
6284 OS << '\n';
6285 }
6286 }
6287}
6288
6289void LSRInstance::print(raw_ostream &OS) const {
6290 print_factors_and_types(OS);
6291 print_fixups(OS);
6292 print_uses(OS);
6293}
6294
6295LLVM_DUMP_METHOD void LSRInstance::dump() const {
6296 print(errs()); errs() << '\n';
6297}
6298#endif
6299
6300namespace {
6301
6302class LoopStrengthReduce : public LoopPass {
6303public:
6304 static char ID; // Pass ID, replacement for typeid
6305
6306 LoopStrengthReduce();
6307
6308private:
6309 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6310 void getAnalysisUsage(AnalysisUsage &AU) const override;
6311};
6312
6313} // end anonymous namespace
6314
6315LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6316 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6317}
6318
6319void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6320 // We split critical edges, so we change the CFG. However, we do update
6321 // many analyses if they are around.
6322 AU.addPreservedID(LoopSimplifyID);
6323
6324 AU.addRequired<LoopInfoWrapperPass>();
6325 AU.addPreserved<LoopInfoWrapperPass>();
6326 AU.addRequiredID(LoopSimplifyID);
6327 AU.addRequired<DominatorTreeWrapperPass>();
6328 AU.addPreserved<DominatorTreeWrapperPass>();
6329 AU.addRequired<ScalarEvolutionWrapperPass>();
6330 AU.addPreserved<ScalarEvolutionWrapperPass>();
6331 AU.addRequired<AssumptionCacheTracker>();
6332 AU.addRequired<TargetLibraryInfoWrapperPass>();
6333 // Requiring LoopSimplify a second time here prevents IVUsers from running
6334 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6335 AU.addRequiredID(LoopSimplifyID);
6336 AU.addRequired<IVUsersWrapperPass>();
6337 AU.addPreserved<IVUsersWrapperPass>();
6338 AU.addRequired<TargetTransformInfoWrapperPass>();
6339 AU.addPreserved<MemorySSAWrapperPass>();
6340}
6341
6342namespace {
6343
6344/// Enables more convenient iteration over a DWARF expression vector.
6345static iterator_range<llvm::DIExpression::expr_op_iterator>
6346ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6347 llvm::DIExpression::expr_op_iterator Begin =
6348 llvm::DIExpression::expr_op_iterator(Expr.begin());
6349 llvm::DIExpression::expr_op_iterator End =
6350 llvm::DIExpression::expr_op_iterator(Expr.end());
6351 return {Begin, End};
6352}
6353
6354struct SCEVDbgValueBuilder {
6355 SCEVDbgValueBuilder() = default;
6356 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6357
6358 void clone(const SCEVDbgValueBuilder &Base) {
6359 LocationOps = Base.LocationOps;
6360 Expr = Base.Expr;
6361 }
6362
6363 void clear() {
6364 LocationOps.clear();
6365 Expr.clear();
6366 }
6367
6368 /// The DIExpression as we translate the SCEV.
6369 SmallVector<uint64_t, 2> Expr;
6370 /// The location ops of the DIExpression.
6371 SmallVector<Value *, 2> LocationOps;
6372
6373 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6374 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6375
6376 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6377 /// in the set of values referenced by the expression.
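  /// For example, the first distinct value pushed becomes DW_OP_LLVM_arg 0,
  /// the next one DW_OP_LLVM_arg 1, and a value seen before reuses its index.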
6378 void pushLocation(llvm::Value *V) {
6379 Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6380 auto *It = llvm::find(LocationOps, V);
6381 unsigned ArgIndex = 0;
6382 if (It != LocationOps.end()) {
6383 ArgIndex = std::distance(LocationOps.begin(), It);
6384 } else {
6385 ArgIndex = LocationOps.size();
6386 LocationOps.push_back(V);
6387 }
6388 Expr.push_back(ArgIndex);
6389 }
6390
6391 void pushValue(const SCEVUnknown *U) {
6392 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6393 pushLocation(V);
6394 }
6395
6396 bool pushConst(const SCEVConstant *C) {
6397 if (C->getAPInt().getSignificantBits() > 64)
6398 return false;
6399 Expr.push_back(llvm::dwarf::DW_OP_consts);
6400 Expr.push_back(C->getAPInt().getSExtValue());
6401 return true;
6402 }
6403
6404 // Iterating the expression as DWARF ops is convenient when updating
6405 // DW_OP_LLVM_arg operands.
6406 iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6407 return ToDwarfOpIter(Expr);
6408 }
6409
6410 /// Several SCEV types are sequences of the same arithmetic operator applied
6411 /// to constants and values that may be extended or truncated.
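  /// For example, (A + B + C) is emitted in postfix order as:
  /// A B DW_OP_plus C DW_OP_plus.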
6412 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6413 uint64_t DwarfOp) {
6414 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6415 "Expected arithmetic SCEV type");
6416 bool Success = true;
6417 unsigned EmitOperator = 0;
6418 for (const auto &Op : CommExpr->operands()) {
6419 Success &= pushSCEV(Op);
6420
6421 if (EmitOperator >= 1)
6422 pushOperator(DwarfOp);
6423 ++EmitOperator;
6424 }
6425 return Success;
6426 }
6427
6428 // TODO: Identify and omit noop casts.
6429 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6430 const llvm::SCEV *Inner = C->getOperand(0);
6431 const llvm::Type *Type = C->getType();
6432 uint64_t ToWidth = Type->getIntegerBitWidth();
6433 bool Success = pushSCEV(Inner);
6434 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6435 IsSigned ? llvm::dwarf::DW_ATE_signed
6436 : llvm::dwarf::DW_ATE_unsigned};
6437 for (const auto &Op : CastOps)
6438 pushOperator(Op);
6439 return Success;
6440 }
6441
6442 // TODO: MinMax - although these haven't been encountered in the test suite.
6443 bool pushSCEV(const llvm::SCEV *S) {
6444 bool Success = true;
6445 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6446 Success &= pushConst(StartInt);
6447
6448 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6449 if (!U->getValue())
6450 return false;
6451 pushLocation(U->getValue());
6452
6453 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6454 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6455
6456 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6457 Success &= pushSCEV(UDiv->getLHS());
6458 Success &= pushSCEV(UDiv->getRHS());
6459 pushOperator(llvm::dwarf::DW_OP_div);
6460
6461 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6462 // Assert if a new and unknown SCEVCastExpr type is encountered.
6463 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6464 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6465 "Unexpected cast type in SCEV.");
6466 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6467
6468 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6469 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6470
6471 } else if (isa<SCEVAddRecExpr>(S)) {
6472 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6473 // unsupported.
6474 return false;
6475
6476 } else {
6477 return false;
6478 }
6479 return Success;
6480 }
6481
6482 /// Return true if the combination of arithmetic operator and underlying
6483 /// SCEV constant value is an identity function.
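  /// For example, (x + 0), (x - 0), (x * 1) and (x / 1) all reduce to x, so
  /// the operand and operator need not be emitted.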
6484 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6485 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6486 if (C->getAPInt().getSignificantBits() > 64)
6487 return false;
6488 int64_t I = C->getAPInt().getSExtValue();
6489 switch (Op) {
6490 case llvm::dwarf::DW_OP_plus:
6491 case llvm::dwarf::DW_OP_minus:
6492 return I == 0;
6493 case llvm::dwarf::DW_OP_mul:
6494 case llvm::dwarf::DW_OP_div:
6495 return I == 1;
6496 }
6497 }
6498 return false;
6499 }
6500
6501 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6502 /// builder's expression stack. The stack should already contain an
6503 /// expression for the iteration count, so that it can be multiplied by
6504 /// the stride and added to the start.
6505 /// Components of the expression are omitted if they are an identity function.
6506 /// Chain (non-affine) SCEVs are not supported.
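  /// For an affine SCEV {start,+,stride}, with the iteration count already on
  /// the stack, the emitted operators compute (itercount * stride) + start.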
6507 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6508 assert(SAR.isAffine() && "Expected affine SCEV");
6509 const SCEV *Start = SAR.getStart();
6510 const SCEV *Stride = SAR.getStepRecurrence(SE);
6511
6512 // Skip pushing arithmetic noops.
6513 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6514 if (!pushSCEV(Stride))
6515 return false;
6516 pushOperator(llvm::dwarf::DW_OP_mul);
6517 }
6518 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6519 if (!pushSCEV(Start))
6520 return false;
6521 pushOperator(llvm::dwarf::DW_OP_plus);
6522 }
6523 return true;
6524 }
6525
6526 /// Create an expression that is an offset from a value (usually the IV).
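  /// For example, a location whose SCEV differs from the IV's SCEV only by a
  /// constant C is recovered as (IV + C).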
6527 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6528 pushLocation(OffsetValue);
6529 DIExpression::appendOffset(Expr, Offset);
6530 LLVM_DEBUG(
6531 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6532 << std::to_string(Offset) << "\n");
6533 }
6534
6535 /// Combine a translation of the SCEV and the IV to create an expression that
6536 /// recovers a location's value.
6537 /// returns true if an expression was created.
6538 bool createIterCountExpr(const SCEV *S,
6539 const SCEVDbgValueBuilder &IterationCount,
6540 ScalarEvolution &SE) {
6541 // SCEVs for SSA values are most frequently of the form
6542 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6543 // This is because %a is a PHI node that is not the IV. However, these
6544 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6545 // so it's not expected this point will be reached.
6546 if (!isa<SCEVAddRecExpr>(S))
6547 return false;
6548
6549 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6550 << '\n');
6551
6552 const auto *Rec = cast<SCEVAddRecExpr>(S);
6553 if (!Rec->isAffine())
6554 return false;
6555
6556 if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6557 return false;
6558
6559 // Initialise a new builder with the iteration count expression. In
6560 // combination with the value's SCEV this enables recovery.
6561 clone(IterationCount);
6562 if (!SCEVToValueExpr(*Rec, SE))
6563 return false;
6564
6565 return true;
6566 }
6567
6568 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6569 /// builder's expression stack. The stack should already contain an
6570 /// expression for the iteration count, so that it can be multiplied by
6571 /// the stride and added to the start.
6572 /// Components of the expression are omitted if they are an identity function.
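  /// Given the IV location already on the stack, an affine SCEV
  /// {start,+,stride} yields operators computing (IV - start) / stride,
  /// i.e. the current iteration count.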
6573 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6574 ScalarEvolution &SE) {
6575 assert(SAR.isAffine() && "Expected affine SCEV");
6576 const SCEV *Start = SAR.getStart();
6577 const SCEV *Stride = SAR.getStepRecurrence(SE);
6578
6579 // Skip pushing arithmetic noops.
6580 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6581 if (!pushSCEV(Start))
6582 return false;
6583 pushOperator(llvm::dwarf::DW_OP_minus);
6584 }
6585 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6586 if (!pushSCEV(Stride))
6587 return false;
6588 pushOperator(llvm::dwarf::DW_OP_div);
6589 }
6590 return true;
6591 }
6592
6593 // Append the current expression and locations to a location list and an
6594 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6595 // the locations already present in the destination list.
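  // For example, appending a builder whose ops are
  // [DW_OP_LLVM_arg 0, DW_OP_consts 4, DW_OP_plus] remaps arg 0 to its
  // (possibly new) index in DestLocations while the ops are copied across.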
6596 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6597 SmallVectorImpl<Value *> &DestLocations) {
6598 assert(!DestLocations.empty() &&
6599 "Expected the locations vector to contain the IV");
6600 // The DW_OP_LLVM_arg arguments of the expression being appended must be
6601 // modified to account for the locations already in the destination vector.
6602 // All builders contain the IV as the first location op.
6603 assert(!LocationOps.empty() &&
6604 "Expected the location ops to contain the IV.");
6605 // DestIndexMap[n] contains the index in DestLocations for the nth
6606 // location in this SCEVDbgValueBuilder.
6607 SmallVector<uint64_t, 2> DestIndexMap;
6608 for (const auto &Op : LocationOps) {
6609 auto It = find(DestLocations, Op);
6610 if (It != DestLocations.end()) {
6611 // Location already exists in DestLocations, reuse existing ArgIndex.
6612 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6613 continue;
6614 }
6615 // Location is not in DestLocations, add it.
6616 DestIndexMap.push_back(DestLocations.size());
6617 DestLocations.push_back(Op);
6618 }
6619
6620 for (const auto &Op : expr_ops()) {
6621 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6622 Op.appendToVector(DestExpr);
6623 continue;
6624 }
6625
6626 DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6627 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6628 // DestIndexMap[n] contains its new index in DestLocations.
6629 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6630 DestExpr.push_back(NewIndex);
6631 }
6632 }
6633};
6634
6635/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6636/// and DIExpression.
6637struct DVIRecoveryRec {
6638 DVIRecoveryRec(DbgVariableRecord *DVR)
6639 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6640
6641 DbgVariableRecord *DbgRef;
6642 DIExpression *Expr;
6643 bool HadLocationArgList;
6644 SmallVector<WeakVH, 2> LocationOps;
6645 SmallVector<const llvm::SCEV *, 2> SCEVs;
6646 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6647
6648 void clear() {
6649 for (auto &RE : RecoveryExprs)
6650 RE.reset();
6651 RecoveryExprs.clear();
6652 }
6653
6654 ~DVIRecoveryRec() { clear(); }
6655};
6656} // namespace
6657
6658/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6659/// This helps in determining if a DIArglist is necessary or can be omitted from
6660/// the dbg.value.
6661static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6662 auto expr_ops = ToDwarfOpIter(Expr);
6663 unsigned Count = 0;
6664 for (auto Op : expr_ops)
6665 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6666 Count++;
6667 return Count;
6668}
6669
6670/// Overwrites DVI with the location and Ops as the DIExpression. This will
6671/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6672/// because a DIArglist is not created for the first argument of the dbg.value.
6673template <typename T>
6674static void updateDVIWithLocation(T &DbgVal, Value *Location,
6675 SmallVectorImpl<uint64_t> &Ops) {
6676 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6677 "contain any DW_OP_llvm_arg operands.");
6678 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6679 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6681}
6682
6683/// Overwrite DVI with locations placed into a DIArglist.
6684template <typename T>
6685static void updateDVIWithLocations(T &DbgVal,
6686 SmallVectorImpl<Value *> &Locations,
6687 SmallVectorImpl<uint64_t> &Ops) {
6688 assert(numLLVMArgOps(Ops) != 0 &&
6689 "Expected expression that references DIArglist locations using "
6690 "DW_OP_llvm_arg operands.");
6691 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6692 for (Value *V : Locations)
6693 MetadataLocs.push_back(ValueAsMetadata::get(V));
6694 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6695 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6696 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6697}
6698
6699/// Write the new expression and new location ops for the dbg.value. If possible
6700/// reduce the size of the dbg.value by omitting DIArglist. This
6701/// can be omitted if:
6702/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6703/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
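/// In that case an expression [DW_OP_LLVM_arg, 0, <ops...>] collapses to
/// <ops...> and the single value is used as a direct location.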
6704static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6705 SmallVectorImpl<Value *> &NewLocationOps,
6706 SmallVectorImpl<uint64_t> &NewExpr) {
6707 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6708 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6709 if (NumLLVMArgs == 0) {
6710 // Location assumed to be on the stack.
6711 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6712 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6713 // There is only a single DW_OP_llvm_arg at the start of the expression,
6714 // so it can be omitted along with DIArglist.
6715 assert(NewExpr[1] == 0 &&
6716 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6717 llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6718 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6719 } else {
6720 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6721 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6722 }
6723
6724 // If the DIExpression was previously empty then add the stack terminator.
6725 // Non-empty expressions have only had elements inserted into them and so
6726 // the terminator should already be present e.g. stack_value or fragment.
6727 DIExpression *SalvageExpr = DbgVal->getExpression();
6728 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6729 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6730 DbgVal->setExpression(SalvageExpr);
6731 }
6732}
6733
6734/// Cached location ops may be erased during LSR, in which case a poison is
6735/// required when restoring from the cache. The type of that location is no
6736/// longer available, so just use int8. The poison will be replaced by one or
6737/// more locations later when a SCEVDbgValueBuilder selects alternative
6738/// locations to use for the salvage.
6739static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6740 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6741}
6742
6743/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6744static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6745 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6746 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6747 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6748 assert(DVIRec.Expr && "Expected an expression");
6749 DbgVal->setExpression(DVIRec.Expr);
6750
6751 // Even a single location-op may be inside a DIArgList and referenced with
6752 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6753 if (!DVIRec.HadLocationArgList) {
6754 assert(DVIRec.LocationOps.size() == 1 &&
6755 "Unexpected number of location ops.");
6756 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6757 // this case was not present before, so force the location back to a
6758 // single uncontained Value.
6759 Value *CachedValue =
6760 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6761 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6762 } else {
6763 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6764 for (WeakVH VH : DVIRec.LocationOps) {
6765 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6766 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6767 }
6768 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6769 DbgVal->setRawLocation(
6770 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6771 }
6772 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6773}
6774
6775static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6776 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6777 const SCEV *SCEVInductionVar,
6778 SCEVDbgValueBuilder IterCountExpr) {
6779
6780 if (!DVIRec.DbgRef->isKillLocation())
6781 return false;
6782
6783 // LSR may have caused several changes to the dbg.value in the failed salvage
6784 // attempt. So restore the DIExpression, the location ops and also the
6785 // location ops format, which is always DIArglist for multiple ops, but only
6786 // sometimes for a single op.
6787 restorePreTransformState(DVIRec);
6788
6789 // LocationOpIndexMap[i] will store the post-LSR location index of
6790 // the non-optimised out location at pre-LSR index i.
6791 SmallVector<int64_t, 2> LocationOpIndexMap;
6792 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6793 SmallVector<Value *, 2> NewLocationOps;
6794 NewLocationOps.push_back(LSRInductionVar);
6795
6796 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6797 WeakVH VH = DVIRec.LocationOps[i];
6798 // Place the locations not optimised out in the list first, avoiding
6799 // inserts later. The map is used to update the DIExpression's
6800 // DW_OP_LLVM_arg arguments as the expression is updated.
6801 if (VH && !isa<UndefValue>(VH)) {
6802 NewLocationOps.push_back(VH);
6803 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6804 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6805 << " now at index " << LocationOpIndexMap[i] << "\n");
6806 continue;
6807 }
6808
6809 // It's possible that a value referred to in the SCEV may have been
6810 // optimised out by LSR.
6811 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6812 SE.containsUndefs(DVIRec.SCEVs[i])) {
6813 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6814 << " refers to a location that is now undef or erased. "
6815 "Salvage abandoned.\n");
6816 return false;
6817 }
6818
6819 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6820 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6821
6822 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6823 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6824
6825 // Create an offset-based salvage expression if possible, as it requires
6826 // fewer DWARF ops than an iteration count-based expression.
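  // For example, if the location's SCEV and the IV's SCEV differ only by a
  // constant 4, the location is salvaged simply as (IV + 4).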
6827 if (std::optional<APInt> Offset =
6828 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6829 if (Offset->getSignificantBits() <= 64)
6830 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6831 else
6832 return false;
6833 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6834 SE))
6835 return false;
6836 }
6837
6838 // Merge the DbgValueBuilder generated expressions and the original
6839 // DIExpression, place the result into a new vector.
6840 SmallVector<uint64_t, 64> NewExpr;
6841 if (DVIRec.Expr->getNumElements() == 0) {
6842 assert(DVIRec.RecoveryExprs.size() == 1 &&
6843 "Expected only a single recovery expression for an empty "
6844 "DIExpression.");
6845 assert(DVIRec.RecoveryExprs[0] &&
6846 "Expected a SCEVDbgSalvageBuilder for location 0");
6847 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6848 B->appendToVectors(NewExpr, NewLocationOps);
6849 }
6850 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6851 // Most Ops needn't be updated.
6852 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6853 Op.appendToVector(NewExpr);
6854 continue;
6855 }
6856
6857 uint64_t LocationArgIndex = Op.getArg(0);
6858 SCEVDbgValueBuilder *DbgBuilder =
6859 DVIRec.RecoveryExprs[LocationArgIndex].get();
6860 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6861 // optimise it away. So just translate the argument to the updated
6862 // location index.
6863 if (!DbgBuilder) {
6864 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6865 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6866 "Expected a positive index for the location-op position.");
6867 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6868 continue;
6869 }
6870 // The location has a recovery expression.
6871 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6872 }
6873
6874 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
6875 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
6876 return true;
6877}
6878
6879/// Obtain an expression for the iteration count, then attempt to salvage the
6880/// dbg.value intrinsics.
6881static void DbgRewriteSalvageableDVIs(
6882 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6883 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6884 if (DVIToUpdate.empty())
6885 return;
6886
6887 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6888 assert(SCEVInductionVar &&
6889 "Anticipated a SCEV for the post-LSR induction variable");
6890
6891 if (const SCEVAddRecExpr *IVAddRec =
6892 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6893 if (!IVAddRec->isAffine())
6894 return;
6895
6896 // Prevent translation using excessive resources.
6897 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6898 return;
6899
6900 // The iteration count is required to recover location values.
6901 SCEVDbgValueBuilder IterCountExpr;
6902 IterCountExpr.pushLocation(LSRInductionVar);
6903 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6904 return;
6905
6906 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6907 << '\n');
6908
6909 for (auto &DVIRec : DVIToUpdate) {
6910 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6911 IterCountExpr);
6912 }
6913 }
6914}
6915
6916/// Identify and cache salvageable DVI locations and expressions along with the
6917/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6918/// caching and salvaging.
6919static void DbgGatherSalvagableDVI(
6920 Loop *L, ScalarEvolution &SE,
6921 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6922 for (const auto &B : L->getBlocks()) {
6923 for (auto &I : *B) {
6924 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6925 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6926 continue;
6927
6928 // Ensure that if any location op is undef the dbg.value is not
6929 // cached.
6930 if (DbgVal.isKillLocation())
6931 continue;
6932
6933 // Check that the location op SCEVs are suitable for translation to
6934 // DIExpression.
6935 const auto &HasTranslatableLocationOps =
6936 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6937 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6938 if (!LocOp)
6939 return false;
6940
6941 if (!SE.isSCEVable(LocOp->getType()))
6942 return false;
6943
6944 const SCEV *S = SE.getSCEV(LocOp);
6945 if (SE.containsUndefs(S))
6946 return false;
6947 }
6948 return true;
6949 };
6950
6951 if (!HasTranslatableLocationOps(DbgVal))
6952 continue;
6953
6954 std::unique_ptr<DVIRecoveryRec> NewRec =
6955 std::make_unique<DVIRecoveryRec>(&DbgVal);
6956 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6957 // it. Pre-allocating a vector will enable quick lookups of the builder
6958 // later during the salvage.
6959 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
6960 for (const auto LocOp : DbgVal.location_ops()) {
6961 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6962 NewRec->LocationOps.push_back(LocOp);
6963 NewRec->HadLocationArgList = DbgVal.hasArgList();
6964 }
6965 SalvageableDVISCEVs.push_back(std::move(NewRec));
6966 }
6967 }
6968 }
6969}
6970
6971/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6972/// any PHI from the loop header is usable, but may have less chance of
6973/// surviving subsequent transforms.
6974static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
6975 const LSRInstance &LSR) {
6976
6977 auto IsSuitableIV = [&](PHINode *P) {
6978 if (!SE.isSCEVable(P->getType()))
6979 return false;
6980 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
6981 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
6982 return false;
6983 };
6984
6985 // For now, just pick the first IV that was generated and inserted by
6986 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
6987 // by subsequent transforms.
6988 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
6989 if (!IV)
6990 continue;
6991
6992 // There should only be PHI node IVs.
6993 PHINode *P = cast<PHINode>(&*IV);
6994
6995 if (IsSuitableIV(P))
6996 return P;
6997 }
6998
6999 for (PHINode &P : L.getHeader()->phis()) {
7000 if (IsSuitableIV(&P))
7001 return &P;
7002 }
7003 return nullptr;
7004}
7005
7006static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7007 DominatorTree &DT, LoopInfo &LI,
7008 const TargetTransformInfo &TTI,
7009 AssumptionCache &AC, TargetLibraryInfo &TLI,
7010 MemorySSA *MSSA) {
7011
7012 // Debug preservation - before we start removing anything identify which DVI
7013 // meet the salvageable criteria and store their DIExpression and SCEVs.
7014 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7015 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7016
7017 bool Changed = false;
7018 std::unique_ptr<MemorySSAUpdater> MSSAU;
7019 if (MSSA)
7020 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7021
7022 // Run the main LSR transformation.
7023 const LSRInstance &Reducer =
7024 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7025 Changed |= Reducer.getChanged();
7026
7027 // Remove any extra phis created by processing inner loops.
7028 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7029 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7030 SmallVector<WeakTrackingVH, 16> DeadInsts;
7031 const DataLayout &DL = L->getHeader()->getDataLayout();
7032 SCEVExpander Rewriter(SE, DL, "lsr", false);
7033#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7034 Rewriter.setDebugType(DEBUG_TYPE);
7035#endif
7036 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7037 Rewriter.clear();
7038 if (numFolded) {
7039 Changed = true;
7040 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7041 MSSAU.get());
7042 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7043 }
7044 }
7045 // LSR may at times remove all uses of an induction variable from a loop.
7046 // The only remaining use is the PHI in the exit block.
7047 // When this is the case, if the exit value of the IV can be calculated using
7048 // SCEV, we can replace the exit block PHI with the final value of the IV and
7049 // skip the updates in each loop iteration.
7050 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7051 SmallVector<WeakTrackingVH, 16> DeadInsts;
7052 const DataLayout &DL = L->getHeader()->getDataLayout();
7053 SCEVExpander Rewriter(SE, DL, "lsr", true);
7054 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7055 UnusedIndVarInLoop, DeadInsts);
7056 Rewriter.clear();
7057 if (Rewrites) {
7058 Changed = true;
7059 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7060 MSSAU.get());
7061 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7062 }
7063 }
7064
7065 if (SalvageableDVIRecords.empty())
7066 return Changed;
7067
7068 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7069 // expressions composed using the derived iteration count.
7070 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7071 for (const auto &L : LI) {
7072 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7073 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7074 else {
7075 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7076 "could not be identified.\n");
7077 }
7078 }
7079
7080 for (auto &Rec : SalvageableDVIRecords)
7081 Rec->clear();
7082 SalvageableDVIRecords.clear();
7083 return Changed;
7084}
7085
7086bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7087 if (skipLoop(L))
7088 return false;
7089
7090 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7091 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7092 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7093 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7094 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7095 *L->getHeader()->getParent());
7096 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7097 *L->getHeader()->getParent());
7098 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7099 *L->getHeader()->getParent());
7100 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7101 MemorySSA *MSSA = nullptr;
7102 if (MSSAAnalysis)
7103 MSSA = &MSSAAnalysis->getMSSA();
7104 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7105}
7106
7107PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7108 LoopStandardAnalysisResults &AR,
7109 LPMUpdater &) {
7110 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7111 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7112 return PreservedAnalyses::all();
7113
7114 auto PA = getLoopPassPreservedAnalyses();
7115 if (AR.MSSA)
7116 PA.preserve<MemorySSAAnalysis>();
7117 return PA;
7118}
7119
7120char LoopStrengthReduce::ID = 0;
7121
7122INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7123 "Loop Strength Reduction", false, false)
7124INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7125INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7126INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7127INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
7128INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7129INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7130INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7131 "Loop Strength Reduction", false, false)
7132
7133Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:687
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1328
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:546
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode")))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
loop reduce
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
#define DEBUG_TYPE
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:269
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1644
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1531
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1736
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: Any.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:528
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:386
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:661
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
BinaryOps getOpcode() const
Definition: InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Conditional or Unconditional Branch instruction.
bool isUnconditional() const
Value * getCondition() const
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1602
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:131
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:169
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:163
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
An iterator for expression operands.
DWARF expression.
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:322
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
Definition: Dominators.cpp:357
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:135
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition: IVUsers.h:142
bool empty() const
Definition: IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:879
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:510
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:86
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:180
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:597
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:936
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:993
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:99
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:112
PointerIntPair - This class implements a pair of a pointer and small integer.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1885
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This is the base class for unary cast operator classes.
This node is the base class for n'ary commutative operators.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information from scalar evolution analysis to rewrite expressions in canonical form.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
This node is a base class providing common functionality for n'ary operators.
ArrayRef< const SCEV * > operands() const
This class represents a signed maximum selection.
This class represents a binary unsigned division operation.
This class represents an unsigned maximum selection.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
LLVM_ABI ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
This class represents a cast from signed integer to floating point.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if the elements that make up the given SCEV properly dominate the specified basic block.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimized out and is now a nullptr.
LLVMContext & getContext() const
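A sketch of how several of the ScalarEvolution entry points above combine when forming a candidate recurrence; SE, L, Ty and StepV are assumed to be in scope, and the no-wrap flags are purely illustrative.
  // Form the affine recurrence {0,+,Step}<L> for a candidate IV, if the step
  // is analyzable and loop-invariant.
  if (SE.isSCEVable(Ty) && SE.isLoopInvariant(SE.getSCEV(StepV), L)) {
    const SCEV *Start = SE.getZero(Ty);
    const SCEV *Step = SE.getNoopOrSignExtend(SE.getSCEV(StepV), Ty);
    const SCEV *AR = SE.getAddRecExpr(Start, Step, L, SCEV::FlagAnyWrap);
    dbgs() << "candidate recurrence: " << *AR << "\n";
  }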
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:119
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:109
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
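A sketch of the usual iteration idiom for the bit-array interface above; NumFormulae and the chosen indices are illustrative.
  SmallBitVector Chosen(NumFormulae);  // all bits start cleared
  Chosen.set(3);
  Chosen.set(7);
  for (int I = Chosen.find_first(); I != -1; I = Chosen.find_next(I))
    dbgs() << "formula " << I << " selected\n";
  // Equivalent, using the range interface:
  for (unsigned I : Chosen.set_bits())
    dbgs() << "formula " << I << " selected\n";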
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
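A sketch of the membership-filter idiom behind the SmallPtrSet interface above; Visited, Worklist and I are assumed names.
  SmallPtrSet<Instruction *, 8> Visited;
  SmallVector<Instruction *, 8> Worklist;
  if (Visited.insert(I).second)   // .second is true only on first insertion
    Worklist.push_back(I);
  if (Visited.count(I))           // membership query: returns 1 or 0
    dbgs() << "instruction already visited\n";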
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
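A sketch showing why SetVector (and its small variant above) is preferred for register lists: deterministic, insertion-ordered iteration with duplicate rejection. Regs, RegA and RegB are assumed names.
  SmallSetVector<const SCEV *, 4> Regs;
  Regs.insert(RegA);                 // true: newly inserted
  Regs.insert(RegB);
  Regs.insert(RegA);                 // false: duplicate, order unchanged
  for (const SCEV *R : Regs)         // visits RegA then RegB
    dbgs() << *R << "\n";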
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
void clear()
Definition: SmallSet.h:209
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
iterator erase(const_iterator CI)
Definition: SmallVector.h:738
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:579
typename SuperClass::iterator iterator
Definition: SmallVector.h:578
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
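A sketch of the SmallVector-based worklist pattern used throughout this pass; S is an assumed root const SCEV *.
  SmallVector<const SCEV *, 8> Worklist;
  Worklist.reserve(32);
  Worklist.push_back(S);
  while (!Worklist.empty()) {
    const SCEV *Cur = Worklist.pop_back_val();  // take the last element
    append_range(Worklist, Cur->operands());    // enqueue its operands
    // (a production traversal would also track visited nodes)
  }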
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition: TypeSize.h:45
An instruction for storing to memory.
Definition: Instructions.h:296
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isProfitableLSRChainElement(Instruction *I) const
LLVM_ABI bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
Query the target whether it would be profitable to convert the given loop into a hardware loop.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool isLegalAddScalableImmediate(int64_t Imm) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
LLVM_ABI bool isNumRegsMajorCostOfLSR() const
Return true if LSR major cost is number of registers.
@ MIM_PostInc
Post-incrementing.
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
LLVM_ABI InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
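A sketch of the legality queries LSR issues when costing a formula against the hooks above; TTI, AccessTy and AddrSpace are assumed, and the base/scale values are purely illustrative.
  // Can the target fold "base + 2*index + 4" into a single addressing mode?
  bool Folds = TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/nullptr,
                                         /*BaseOffset=*/4, /*HasBaseReg=*/true,
                                         /*Scale=*/2, AddrSpace);
  if (!Folds && TTI.isLegalAddImmediate(4))
    dbgs() << "offset must be materialized by a separate add\n";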
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVM_ABI unsigned getIntegerBitWidth() const
This class represents a cast unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
op_range operands()
Definition: User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
void setOperand(unsigned i, Value *Val)
Definition: User.h:237
Value * getOperand(unsigned i) const
Definition: User.h:232
op_iterator op_end()
Definition: User.h:286
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition: Metadata.cpp:502
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1098
iterator_range< use_iterator > uses()
Definition: Value.h:380
A Value handle that may be null.
Definition: ValueHandle.h:145
int getNumOccurrences() const
Definition: CommandLine.h:400
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:174
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Key
PAL metadata keys.
@ Entry
Definition: COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t > m_scev_Mul(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const Loop > m_Loop()
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
class_match< const SCEV > m_SCEV()
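A sketch of the SCEV pattern matchers above (a using-directive for llvm::SCEVPatternMatch is assumed); S is an arbitrary const SCEV * and the bound constants are only read back for illustration.
  const APInt *StartC = nullptr, *StepC = nullptr;
  if (match(S, m_scev_AffineAddRec(m_scev_APInt(StartC), m_scev_APInt(StepC))))
    dbgs() << "affine IV: start " << *StartC << ", step " << *StepC << "\n";
  if (match(S, m_scev_Mul(m_SCEVConstant(), m_SCEVVScale())))
    dbgs() << "constant multiple of vscale\n";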
Reg
All possible values of the reg field in the ModR/M byte.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:712
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition: Dwarf.h:148
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition: Dwarf.h:144
constexpr double e
Definition: MathExtras.h:47
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition: PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition: SFrame.h:77
LLVM_ABI const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
LLVM_ABI const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
Definition: Path.cpp:235
unsigned KindType
For isa, dyn_cast, etc. operations on TelemetryInfo.
Definition: Telemetry.h:85
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:477
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1723
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2113
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
LLVM_ABI char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0s from the least significant bit up to the most significant bit, stopping at the first 1.
Definition: bit.h:157
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
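A sketch of the recurrence matcher above; PN is an assumed PHINode * in a loop header.
  BinaryOperator *BO = nullptr;
  Value *Start = nullptr, *Step = nullptr;
  if (matchSimpleRecurrence(PN, BO, Start, Step))
    dbgs() << "recurrence uses " << BO->getOpcodeName()
           << " with step " << *Step << "\n";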
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
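A sketch relating the two bit-math helpers above for a power-of-two stride; Stride is an assumed uint32_t known to be a non-zero power of two.
  unsigned ShiftAmt = Log2_32(Stride);          // floor(log2(Stride))
  unsigned TrailingZeros = countr_zero(Stride); // trailing zero count
  assert(ShiftAmt == TrailingZeros &&
         "for a power of two, both recover the same shift amount");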
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition: Local.cpp:548
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:308
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
Definition: LoopUtils.cpp:1574
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
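A sketch combining several of the range helpers above; BaseRegs, Regs, SE, L and S are assumed names.
  bool AllInvariant = all_of(BaseRegs, [&](const SCEV *R) {
    return SE.isLoopInvariant(R, L);
  });
  if (AllInvariant && none_of(Regs, [&](const SCEV *R) { return R == S; }))
    Regs.push_back(S);
  // is_contained(Regs, S) expresses the same membership test more directly.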
@ UnusedIndVarInLoop
Definition: LoopUtils.h:495
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
Option class for critical edge splitting.
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.