InlineCost.cpp
1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
18#include "llvm/ADT/Statistic.h"
33#include "llvm/Config/llvm-config.h"
35#include "llvm/IR/CallingConv.h"
36#include "llvm/IR/DataLayout.h"
37#include "llvm/IR/Dominators.h"
39#include "llvm/IR/GlobalAlias.h"
40#include "llvm/IR/InlineAsm.h"
41#include "llvm/IR/InstVisitor.h"
43#include "llvm/IR/Operator.h"
46#include "llvm/Support/Debug.h"
49#include <climits>
50#include <limits>
51#include <optional>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "inline-cost"
56
57STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
58
59static cl::opt<int>
60 DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
61 cl::desc("Default amount of inlining to perform"));
62
63// We introduce this option since there is a minor compile-time win from
64// avoiding the addition of TTI attributes (target-features in particular) to
65// inline candidates when they are guaranteed to be the same as the top-level
66// methods in some use cases. If we avoid adding the attribute, we need an
67// option to avoid checking these attributes.
69 "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
70 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
71 "during inline cost calculation"));
72
74 "print-instruction-comments", cl::Hidden, cl::init(false),
75 cl::desc("Prints comments for instruction based on inline cost analysis"));
76
78 "inline-threshold", cl::Hidden, cl::init(225),
79 cl::desc("Control the amount of inlining to perform (default = 225)"));
80
82 "inlinehint-threshold", cl::Hidden, cl::init(325),
83 cl::desc("Threshold for inlining functions with inline hint"));
84
85static cl::opt<int>
86 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
87 cl::init(45),
88 cl::desc("Threshold for inlining cold callsites"));
89
91 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
92 cl::desc("Enable the cost-benefit analysis for the inliner"));
93
94// InlineSavingsMultiplier overrides the per-TTI multiplier iff it is
95// specified explicitly on the command line. This option is exposed
96// for tuning and testing.
98 "inline-savings-multiplier", cl::Hidden, cl::init(8),
99 cl::desc("Multiplier to multiply cycle savings by during inlining"));
100
101// InlineSavingsProfitableMultiplier overrides the per-TTI multiplier iff it
102// is specified explicitly on the command line. This option is exposed
103// for tuning and testing.
105 "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
106 cl::desc("A multiplier on top of cycle savings to decide whether the "
107 "savings won't justify the cost"));
108
109static cl::opt<int>
110 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
111 cl::desc("The maximum size of a callee that get's "
112 "inlined without sufficient cycle savings"));
113
114// We introduce this threshold to help the performance of instrumentation-based
115// PGO before we actually hook up the inliner with analysis passes such as BPI
116// and BFI.
118 "inlinecold-threshold", cl::Hidden, cl::init(45),
119 cl::desc("Threshold for inlining functions with cold attribute"));
120
121static cl::opt<int>
122 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
123 cl::desc("Threshold for hot callsites "));
124
126 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
127 cl::desc("Threshold for locally hot callsites "));
128
130 "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
131 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
132 "entry frequency, for a callsite to be cold in the absence of "
133 "profile information."));
134
136 "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
137 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
138 "entry frequency, for a callsite to be hot in the absence of "
139 "profile information."));
140
141static cl::opt<int>
142 InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
143 cl::desc("Cost of a single instruction when inlining"));
144
146 "inline-asm-instr-cost", cl::Hidden, cl::init(0),
147 cl::desc("Cost of a single inline asm instruction when inlining"));
148
149static cl::opt<int>
150 MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
151 cl::desc("Cost of load/store instruction when inlining"));
152
154 "inline-call-penalty", cl::Hidden, cl::init(25),
155 cl::desc("Call penalty that is applied per callsite when inlining"));
156
157static cl::opt<size_t>
158 StackSizeThreshold("inline-max-stacksize", cl::Hidden,
159 cl::init(std::numeric_limits<size_t>::max()),
160 cl::desc("Do not inline functions with a stack size "
161 "that exceeds the specified limit"));
162
164 "recursive-inline-max-stacksize", cl::Hidden,
166 cl::desc("Do not inline recursive functions with a stack "
167 "size that exceeds the specified limit"));
168
170 "inline-cost-full", cl::Hidden,
171 cl::desc("Compute the full inline cost of a call site even when the cost "
172 "exceeds the threshold."));
173
175 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
176 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
177 "attributes."));
178
180 "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
181 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
182
184 "inline-all-viable-calls", cl::Hidden, cl::init(false),
185 cl::desc("Inline all viable calls, even if they exceed the inlining "
186 "threshold"));
187namespace llvm {
188std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
189 if (Attr.isValid()) {
190 int AttrValue = 0;
191 if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
192 return AttrValue;
193 }
194 return std::nullopt;
195}
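// For example, a call-site string attribute such as "function-inline-cost"="42"
// parses to 42 here; a missing attribute or a non-integer value yields
// std::nullopt, since getAsInteger returns true on failure.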
196
197std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
198 return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
199}
200
201std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
202 return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
203}
204
205namespace InlineConstants {
206int getInstrCost() { return InstrCost; }
207
208} // namespace InlineConstants
209
210} // namespace llvm
211
212namespace {
213class InlineCostCallAnalyzer;
214
215// This struct is used to store information about the inline cost of a
216// particular instruction.
217struct InstructionCostDetail {
218 int CostBefore = 0;
219 int CostAfter = 0;
220 int ThresholdBefore = 0;
221 int ThresholdAfter = 0;
222
223 int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
224
225 int getCostDelta() const { return CostAfter - CostBefore; }
226
227 bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
228};
229
230class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
231private:
232 InlineCostCallAnalyzer *const ICCA;
233
234public:
235 InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
237 formatted_raw_ostream &OS) override;
238};
239
240/// Carry out call site analysis, in order to evaluate inlinability.
241/// NOTE: the type is currently used as implementation detail of functions such
242/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
243/// expectation is that they come from the outer scope, from the wrapper
244/// functions. If we want to support constructing CallAnalyzer objects where
245/// lambdas are provided inline at construction, or where the object needs to
246/// otherwise survive past the scope of the provided functions, we need to
247/// revisit the argument types.
248class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
250 friend class InstVisitor<CallAnalyzer, bool>;
251
252protected:
253 virtual ~CallAnalyzer() = default;
254 /// The TargetTransformInfo available for this compilation.
256
257 /// Getter for the cache of @llvm.assume intrinsics.
258 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
259
260 /// Getter for BlockFrequencyInfo
262
263 /// Getter for TargetLibraryInfo
264 function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
265
266 /// Profile summary information.
268
269 /// The called function.
270 Function &F;
271
272 // Cache the DataLayout since we use it a lot.
273 const DataLayout &DL;
274
275 /// The OptimizationRemarkEmitter available for this compilation.
277
278 /// The candidate callsite being analyzed. Please do not use this to do
279 /// analysis in the caller function; we want the inline cost query to be
280 /// easily cacheable. Instead, use the cover function paramHasAttr.
281 CallBase &CandidateCall;
282
283 /// Getter for the cache of ephemeral values.
284 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr;
285
286 /// Extension points for handling callsite features.
287 /// Called before a basic block is analyzed.
288 virtual void onBlockStart(const BasicBlock *BB) {}
289
290 /// Called after a basic block was analyzed.
291 virtual void onBlockAnalyzed(const BasicBlock *BB) {}
292
293 /// Called before an instruction is analyzed.
294 virtual void onInstructionAnalysisStart(const Instruction *I) {}
295
296 /// Called after an instruction has been analyzed.
297 virtual void onInstructionAnalysisFinish(const Instruction *I) {}
298
299 /// Called at the end of the analysis of the callsite. Return the outcome of
300 /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
301 /// the reason it can't.
302 virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
303 /// Called when we're about to start processing a basic block, and every time
304 /// we are done processing an instruction. Return true if there is no point in
305 /// continuing the analysis (e.g. we've already determined the call site is
306 /// too expensive to inline).
307 virtual bool shouldStop() { return false; }
308
309 /// Called before the analysis of the callee body starts (with callsite
310 /// contexts propagated). It checks callsite-specific information. Return a
311 /// reason analysis can't continue if that's the case, or 'true' if it may
312 /// continue.
313 virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
314 /// Called if the analysis engine decides SROA cannot be done for the given
315 /// alloca.
316 virtual void onDisableSROA(AllocaInst *Arg) {}
317
318 /// Called when the analysis engine determines load elimination won't happen.
319 virtual void onDisableLoadElimination() {}
320
321 /// Called when we visit a CallBase, before the analysis starts. Return false
322 /// to stop further processing of the instruction.
323 virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
324
325 /// Called to account for a call.
326 virtual void onCallPenalty() {}
327
328 /// Called to account for a load or store.
329 virtual void onMemAccess() {}
330
331 /// Called to account for the expectation the inlining would result in a load
332 /// elimination.
333 virtual void onLoadEliminationOpportunity() {}
334
335 /// Called to account for the cost of argument setup for the Call in the
336 /// callee's body (not the callsite currently under analysis).
337 virtual void onCallArgumentSetup(const CallBase &Call) {}
338
339 /// Called to account for a load relative intrinsic.
340 virtual void onLoadRelativeIntrinsic() {}
341
342 /// Called to account for a lowered call.
343 virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
344 }
345
346 /// Account for a jump table of the given size. Return false to stop further
347 /// processing of the switch instruction.
348 virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
349
350 /// Account for a case cluster of given size. Return false to stop further
351 /// processing of the instruction.
352 virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
353
354 /// Called at the end of processing a switch instruction, with the given
355 /// number of case clusters.
356 virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
357 bool DefaultDestUnreachable) {}
358
359 /// Called to account for any other instruction not specifically accounted
360 /// for.
361 virtual void onMissedSimplification() {}
362
363 /// Account for inline assembly instructions.
364 virtual void onInlineAsm(const InlineAsm &Arg) {}
365
366 /// Start accounting potential benefits due to SROA for the given alloca.
367 virtual void onInitializeSROAArg(AllocaInst *Arg) {}
368
369 /// Account SROA savings for the AllocaInst value.
370 virtual void onAggregateSROAUse(AllocaInst *V) {}
371
372 bool handleSROA(Value *V, bool DoNotDisable) {
373 // Check for SROA candidates in comparisons.
374 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
375 if (DoNotDisable) {
376 onAggregateSROAUse(SROAArg);
377 return true;
378 }
379 disableSROAForArg(SROAArg);
380 }
381 return false;
382 }
383
384 bool IsCallerRecursive = false;
385 bool IsRecursiveCall = false;
386 bool ExposesReturnsTwice = false;
387 bool HasDynamicAlloca = false;
388 bool ContainsNoDuplicateCall = false;
389 bool HasReturn = false;
390 bool HasIndirectBr = false;
391 bool HasUninlineableIntrinsic = false;
392 bool InitsVargArgs = false;
393
394 /// Number of bytes allocated statically by the callee.
395 uint64_t AllocatedSize = 0;
396 unsigned NumInstructions = 0;
397 unsigned NumInlineAsmInstructions = 0;
398 unsigned NumVectorInstructions = 0;
399
400 /// While we walk the potentially-inlined instructions, we build up and
401 /// maintain a mapping of simplified values specific to this callsite. The
402 /// idea is to propagate any special information we have about arguments to
403 /// this call through the inlinable section of the function, and account for
404 /// likely simplifications post-inlining. The most important aspect we track
405 /// is CFG altering simplifications -- when we prove a basic block dead, that
406 /// can cause dramatic shifts in the cost of inlining a function.
407 /// Note: The simplified Value may be owned by the caller function.
408 DenseMap<Value *, Value *> SimplifiedValues;
409
410 /// Keep track of the values which map back (through function arguments) to
411 /// allocas on the caller stack which could be simplified through SROA.
413
414 /// Keep track of Allocas for which we believe we may get SROA optimization.
415 DenseSet<AllocaInst *> EnabledSROAAllocas;
416
417 /// Keep track of values which map to a pointer base and constant offset.
419
420 /// Keep track of dead blocks due to the constant arguments.
422
423 /// The mapping of the blocks to their known unique successors due to the
424 /// constant arguments.
426
427 /// Model the elimination of repeated loads, which is expected to happen
428 /// whenever we simplify away the stores that would otherwise force them to
429 /// remain loads.
430 bool EnableLoadElimination = true;
431
432 /// Whether we allow inlining for recursive call.
433 bool AllowRecursiveCall = false;
434
435 SmallPtrSet<Value *, 16> LoadAddrSet;
436
437 AllocaInst *getSROAArgForValueOrNull(Value *V) const {
438 auto It = SROAArgValues.find(V);
439 if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
440 return nullptr;
441 return It->second;
442 }
443
444 /// Use a value in its given form directly if possible, otherwise try looking
445 /// for it in SimplifiedValues.
446 template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
447 if (auto *Direct = dyn_cast<T>(V))
448 return Direct;
449 return getSimplifiedValue<T>(V);
450 }
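// For example, a GEP index that is literally a ConstantInt is returned
// directly, while an index that is a callee argument mapped to a ConstantInt
// in SimplifiedValues for this call site is returned via the lookup instead.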
451
452 // Custom simplification helper routines.
453 bool isAllocaDerivedArg(Value *V);
454 void disableSROAForArg(AllocaInst *SROAArg);
455 void disableSROA(Value *V);
456 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
457 void disableLoadElimination();
458 bool isGEPFree(GetElementPtrInst &GEP);
459 bool canFoldInboundsGEP(GetElementPtrInst &I);
460 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
461 bool simplifyCallSite(Function *F, CallBase &Call);
462 bool simplifyCmpInstForRecCall(CmpInst &Cmp);
464 bool simplifyIntrinsicCallIsConstant(CallBase &CB);
465 bool simplifyIntrinsicCallObjectSize(CallBase &CB);
466 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
467 bool isLoweredToCall(Function *F, CallBase &Call);
468
469 /// Return true if the given argument to the function being considered for
470 /// inlining has the given attribute set either at the call site or the
471 /// function declaration. Primarily used to inspect call site specific
472 /// attributes since these can be more precise than the ones on the callee
473 /// itself.
474 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
475
476 /// Return true if the given value is known non null within the callee if
477 /// inlined through this particular callsite.
478 bool isKnownNonNullInCallee(Value *V);
479
480 /// Return true if size growth is allowed when inlining the callee at \p Call.
481 bool allowSizeGrowth(CallBase &Call);
482
483 // Custom analysis routines.
484 InlineResult analyzeBlock(BasicBlock *BB,
485 const SmallPtrSetImpl<const Value *> &EphValues);
486
487 // Disable several entry points to the visitor so we don't accidentally use
488 // them by declaring but not defining them here.
489 void visit(Module *);
490 void visit(Module &);
491 void visit(Function *);
492 void visit(Function &);
493 void visit(BasicBlock *);
494 void visit(BasicBlock &);
495
496 // Provide base case for our instruction visit.
498
499 // Our visit overrides.
500 bool visitAlloca(AllocaInst &I);
501 bool visitPHI(PHINode &I);
502 bool visitGetElementPtr(GetElementPtrInst &I);
503 bool visitBitCast(BitCastInst &I);
504 bool visitPtrToInt(PtrToIntInst &I);
505 bool visitIntToPtr(IntToPtrInst &I);
506 bool visitCastInst(CastInst &I);
507 bool visitCmpInst(CmpInst &I);
508 bool visitSub(BinaryOperator &I);
510 bool visitFNeg(UnaryOperator &I);
511 bool visitLoad(LoadInst &I);
512 bool visitStore(StoreInst &I);
513 bool visitExtractValue(ExtractValueInst &I);
514 bool visitInsertValue(InsertValueInst &I);
515 bool visitCallBase(CallBase &Call);
516 bool visitReturnInst(ReturnInst &RI);
517 bool visitBranchInst(BranchInst &BI);
518 bool visitSelectInst(SelectInst &SI);
519 bool visitSwitchInst(SwitchInst &SI);
521 bool visitResumeInst(ResumeInst &RI);
525
526public:
527 CallAnalyzer(
528 Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
529 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
530 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
531 function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
532 ProfileSummaryInfo *PSI = nullptr,
533 OptimizationRemarkEmitter *ORE = nullptr,
534 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
535 nullptr)
536 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
537 GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
538 CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {}
539
540 InlineResult analyze();
541
542 /// Lookup simplified Value. May return a value owned by the caller.
543 Value *getSimplifiedValueUnchecked(Value *V) const {
544 return SimplifiedValues.lookup(V);
545 }
546
547 /// Lookup simplified Value, but return nullptr if the simplified value is
548 /// owned by the caller.
549 template <typename T> T *getSimplifiedValue(Value *V) const {
550 Value *SimpleV = SimplifiedValues.lookup(V);
551 if (!SimpleV)
552 return nullptr;
553
554 // Skip checks if we know T is a global. This has a small, but measurable
555 // impact on compile-time.
556 if constexpr (std::is_base_of_v<Constant, T>)
557 return dyn_cast<T>(SimpleV);
558
559 // Make sure the simplified Value is owned by this function
560 if (auto *I = dyn_cast<Instruction>(SimpleV)) {
561 if (I->getFunction() != &F)
562 return nullptr;
563 } else if (auto *Arg = dyn_cast<Argument>(SimpleV)) {
564 if (Arg->getParent() != &F)
565 return nullptr;
566 } else if (!isa<Constant>(SimpleV))
567 return nullptr;
568 return dyn_cast<T>(SimpleV);
569 }
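// For example, if an argument was simplified to an instruction that lives in
// the caller, getSimplifiedValue<Instruction> returns nullptr so the analysis
// never reasons about caller-owned IR, whereas getSimplifiedValueUnchecked
// still exposes it for diagnostics such as the cost annotation writer.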
570
571 // Keep a bunch of stats about the cost savings found so we can print them
572 // out when debugging.
573 unsigned NumConstantArgs = 0;
574 unsigned NumConstantOffsetPtrArgs = 0;
575 unsigned NumAllocaArgs = 0;
576 unsigned NumConstantPtrCmps = 0;
577 unsigned NumConstantPtrDiffs = 0;
578 unsigned NumInstructionsSimplified = 0;
579
580 void dump();
581};
582
583// When considering lowering to a binary search, we want to find the number
584// of nodes in the tree, which equals the number of comparisons once lowered.
585// For a given number of clusters, n, we can define a recursive function,
586// f(n), to find the number of nodes in the tree. The recursion is:
587// f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
588// and f(n) = n, when n <= 3.
589// This leads to a binary tree where each leaf is either f(2) or f(3) when
590// n > 3. So the number of comparisons from leaves is n, while the number of
591// comparisons from non-leaf nodes is:
592// 2^(log2(n) - 1) - 1
593// = 2^log2(n) * 2^-1 - 1
594// = n / 2 - 1.
595// Combining comparisons from leaf and non-leaf nodes, we can estimate the
596// number of comparisons in a simple closed form:
597// n + n / 2 - 1 = n * 3 / 2 - 1
598int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
599 return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
600}
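// For example, with NumCaseCluster = 8 the recursion gives
// f(8) = 1 + f(4) + f(4) = 1 + 2 * (1 + f(2) + f(2)) = 11 nodes, matching the
// closed form 3 * 8 / 2 - 1 = 11 expected comparisons.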
601
602/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
603/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
604class InlineCostCallAnalyzer final : public CallAnalyzer {
605 const bool ComputeFullInlineCost;
606 int LoadEliminationCost = 0;
607 /// Bonus to be applied when percentage of vector instructions in callee is
608 /// high (see more details in updateThreshold).
609 int VectorBonus = 0;
610 /// Bonus to be applied when the callee has only one reachable basic block.
611 int SingleBBBonus = 0;
612
613 /// Tunable parameters that control the analysis.
614 const InlineParams &Params;
615
616 // This DenseMap stores the delta change in cost and threshold after
617 // accounting for the given instruction. The map is filled only when the
618 // flag PrintInstructionComments is on.
620
621 /// Upper bound for the inlining cost. Bonuses are being applied to account
622 /// for speculative "expected profit" of the inlining decision.
623 int Threshold = 0;
624
625 /// The amount of StaticBonus applied.
626 int StaticBonusApplied = 0;
627
628 /// Attempt to evaluate indirect calls to boost their inline cost.
629 const bool BoostIndirectCalls;
630
631 /// Ignore the threshold when finalizing analysis.
632 const bool IgnoreThreshold;
633
634 // True if the cost-benefit-analysis-based inliner is enabled.
635 const bool CostBenefitAnalysisEnabled;
636
637 /// Inlining cost measured in abstract units; it accounts for all the
638 /// instructions expected to be executed for a given function invocation.
639 /// Instructions that are statically proven to be dead based on call-site
640 /// arguments are not counted here.
641 int Cost = 0;
642
643 // The cumulative cost at the beginning of the basic block being analyzed. At
644 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
645 // the size of that basic block.
646 int CostAtBBStart = 0;
647
648 // The static size of live but cold basic blocks. This is "static" in the
649 // sense that it's not weighted by profile counts at all.
650 int ColdSize = 0;
651
652 // Whether inlining is decided by cost-threshold analysis.
653 bool DecidedByCostThreshold = false;
654
655 // Whether inlining is decided by cost-benefit analysis.
656 bool DecidedByCostBenefit = false;
657
658 // The cost-benefit pair computed by cost-benefit analysis.
659 std::optional<CostBenefitPair> CostBenefit;
660
661 bool SingleBB = true;
662
663 unsigned SROACostSavings = 0;
664 unsigned SROACostSavingsLost = 0;
665
666 /// The mapping of caller Alloca values to their accumulated cost savings. If
667 /// we have to disable SROA for one of the allocas, this tells us how much
668 /// cost must be added.
669 DenseMap<AllocaInst *, int> SROAArgCosts;
670
671 /// Return true if \p Call is a cold callsite.
672 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
673
674 /// Update Threshold based on callsite properties such as callee
675 /// attributes and callee hotness for PGO builds. The Callee is explicitly
676 /// passed to support analyzing indirect calls whose target is inferred by
677 /// analysis.
678 void updateThreshold(CallBase &Call, Function &Callee);
679 /// Return a higher threshold if \p Call is a hot callsite.
680 std::optional<int> getHotCallSiteThreshold(CallBase &Call,
681 BlockFrequencyInfo *CallerBFI);
682
683 /// Handle a capped 'int' increment for Cost.
684 void addCost(int64_t Inc) {
685 Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
686 Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
687 }
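// Note that both the increment and the running total saturate: e.g. two
// successive addCost(INT_MAX) calls leave Cost pinned at INT_MAX rather than
// wrapping around.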
688
689 void onDisableSROA(AllocaInst *Arg) override {
690 auto CostIt = SROAArgCosts.find(Arg);
691 if (CostIt == SROAArgCosts.end())
692 return;
693 addCost(CostIt->second);
694 SROACostSavings -= CostIt->second;
695 SROACostSavingsLost += CostIt->second;
696 SROAArgCosts.erase(CostIt);
697 }
698
699 void onDisableLoadElimination() override {
700 addCost(LoadEliminationCost);
701 LoadEliminationCost = 0;
702 }
703
704 bool onCallBaseVisitStart(CallBase &Call) override {
705 if (std::optional<int> AttrCallThresholdBonus =
706 getStringFnAttrAsInt(Call, "call-threshold-bonus"))
707 Threshold += *AttrCallThresholdBonus;
708
709 if (std::optional<int> AttrCallCost =
710 getStringFnAttrAsInt(Call, "call-inline-cost")) {
711 addCost(*AttrCallCost);
712 // Prevent further processing of the call since we want to override its
713 // inline cost, not just add to it.
714 return false;
715 }
716 return true;
717 }
718
719 void onCallPenalty() override { addCost(CallPenalty); }
720
721 void onMemAccess() override { addCost(MemAccessCost); }
722
723 void onCallArgumentSetup(const CallBase &Call) override {
724 // Pay the price of the argument setup. We account for the average 1
725 // instruction per call argument setup here.
726 addCost(Call.arg_size() * InstrCost);
727 }
728 void onLoadRelativeIntrinsic() override {
729 // This is normally lowered to 4 LLVM instructions.
730 addCost(3 * InstrCost);
731 }
732 void onLoweredCall(Function *F, CallBase &Call,
733 bool IsIndirectCall) override {
734 // We account for the average 1 instruction per call argument setup here.
735 addCost(Call.arg_size() * InstrCost);
736
737 // If we have a constant that we are calling as a function, we can peer
738 // through it and see the function target. This happens not infrequently
739 // during devirtualization and so we want to give it a hefty bonus for
740 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
741 // Pretend to inline the function, with a custom threshold.
742 if (IsIndirectCall && BoostIndirectCalls) {
743 auto IndirectCallParams = Params;
744 IndirectCallParams.DefaultThreshold =
746 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
747 /// to instantiate the derived class.
748 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
749 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
750 false);
751 if (CA.analyze().isSuccess()) {
752 // We were able to inline the indirect call! Subtract the cost from the
753 // threshold to get the bonus we want to apply, but don't go below zero.
754 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
755 }
756 } else
757 // Otherwise simply add the cost for merely making the call.
758 addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
759 CallPenalty));
760 }
761
762 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
763 bool DefaultDestUnreachable) override {
764 // If suitable for a jump table, consider the cost for the table size and
765 // branch to destination.
766 // Maximum valid cost increased in this function.
767 if (JumpTableSize) {
768 // Suppose a default branch includes one compare and one conditional
769 // branch if it's reachable.
770 if (!DefaultDestUnreachable)
771 addCost(2 * InstrCost);
772 // Suppose a jump table requires one load and one jump instruction.
773 int64_t JTCost =
774 static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
775 addCost(JTCost);
776 return;
777 }
778
779 if (NumCaseCluster <= 3) {
780 // Suppose a comparison includes one compare and one conditional branch.
781 // We can drop one comparison's worth of instructions if the default
782 // destination is unreachable.
783 addCost((NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost);
784 return;
785 }
786
787 int64_t ExpectedNumberOfCompare =
788 getExpectedNumberOfCompare(NumCaseCluster);
789 int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
790
791 addCost(SwitchCost);
792 }
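// For example, with the default InstrCost of 5, a 16-entry jump table with a
// reachable default destination costs 2 * 5 + (16 * 5 + 2 * 5) = 100 units,
// while a 3-cluster switch with an unreachable default costs
// (3 - 1) * 2 * 5 = 20 units.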
793
794 // Parses the inline assembly argument to account for its cost. Inline
795 // assembly instructions incur higher costs for inlining since they cannot be
796 // analyzed and optimized.
797 void onInlineAsm(const InlineAsm &Arg) override {
799 return;
801 Arg.collectAsmStrs(AsmStrs);
802 int SectionLevel = 0;
803 int InlineAsmInstrCount = 0;
804 for (StringRef AsmStr : AsmStrs) {
805 // Trim whitespaces and comments.
806 StringRef Trimmed = AsmStr.trim();
807 size_t hashPos = Trimmed.find('#');
808 if (hashPos != StringRef::npos)
809 Trimmed = Trimmed.substr(0, hashPos);
810 // Ignore comments.
811 if (Trimmed.empty())
812 continue;
813 // Filter out the outlined assembly instructions from the cost by keeping
814 // track of the section level and only accounting for instructions at
815 // section level zero. Note there will be duplication in outlined
816 // sections too, but it is not accounted for in the inlining cost model.
817 if (Trimmed.starts_with(".pushsection")) {
818 ++SectionLevel;
819 continue;
820 }
821 if (Trimmed.starts_with(".popsection")) {
822 --SectionLevel;
823 continue;
824 }
825 // Ignore directives and labels.
826 if (Trimmed.starts_with(".") || Trimmed.contains(":"))
827 continue;
828 if (SectionLevel == 0)
829 ++InlineAsmInstrCount;
830 }
831 NumInlineAsmInstructions += InlineAsmInstrCount;
832 addCost(InlineAsmInstrCount * InlineAsmInstrCost);
833 }
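// As a rough illustration, given the asm string
//   "mov x0, x1\n.pushsection .text.cold\nstr x0, [sp]\n.popsection\nlbl:\nadd x0, x0, #1"
// only "mov" and "add" are counted: "str" sits at section level 1, and the
// directive and label lines are skipped entirely.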
834
835 void onMissedSimplification() override { addCost(InstrCost); }
836
837 void onInitializeSROAArg(AllocaInst *Arg) override {
838 assert(Arg != nullptr &&
839 "Should not initialize SROA costs for null value.");
840 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
841 SROACostSavings += SROAArgCost;
842 SROAArgCosts[Arg] = SROAArgCost;
843 }
844
845 void onAggregateSROAUse(AllocaInst *SROAArg) override {
846 auto CostIt = SROAArgCosts.find(SROAArg);
847 assert(CostIt != SROAArgCosts.end() &&
848 "expected this argument to have a cost");
849 CostIt->second += InstrCost;
850 SROACostSavings += InstrCost;
851 }
852
853 void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
854
855 void onBlockAnalyzed(const BasicBlock *BB) override {
856 if (CostBenefitAnalysisEnabled) {
857 // Keep track of the static size of live but cold basic blocks. For now,
858 // we define a cold basic block to be one that's never executed.
859 assert(GetBFI && "GetBFI must be available");
860 BlockFrequencyInfo *BFI = &(GetBFI(F));
861 assert(BFI && "BFI must be available");
862 auto ProfileCount = BFI->getBlockProfileCount(BB);
863 if (*ProfileCount == 0)
864 ColdSize += Cost - CostAtBBStart;
865 }
866
867 auto *TI = BB->getTerminator();
868 // If we had any successors at this point, then post-inlining is likely to
869 // have them as well. Note that we assume any basic blocks which existed
870 // due to branches or switches which folded above will also fold after
871 // inlining.
872 if (SingleBB && TI->getNumSuccessors() > 1) {
873 // Take off the bonus we applied to the threshold.
874 Threshold -= SingleBBBonus;
875 SingleBB = false;
876 }
877 }
878
879 void onInstructionAnalysisStart(const Instruction *I) override {
880 // This function is called to store the initial cost of inlining before
881 // the given instruction was assessed.
883 return;
884 auto &CostDetail = InstructionCostDetailMap[I];
885 CostDetail.CostBefore = Cost;
886 CostDetail.ThresholdBefore = Threshold;
887 }
888
889 void onInstructionAnalysisFinish(const Instruction *I) override {
890 // This function is called to find new values of cost and threshold after
891 // the instruction has been assessed.
893 return;
894 auto &CostDetail = InstructionCostDetailMap[I];
895 CostDetail.CostAfter = Cost;
896 CostDetail.ThresholdAfter = Threshold;
897 }
898
899 bool isCostBenefitAnalysisEnabled() {
900 if (!PSI || !PSI->hasProfileSummary())
901 return false;
902
903 if (!GetBFI)
904 return false;
905
907 // Honor the explicit request from the user.
909 return false;
910 } else {
911 // Otherwise, require instrumentation profile.
912 if (!PSI->hasInstrumentationProfile())
913 return false;
914 }
915
916 auto *Caller = CandidateCall.getParent()->getParent();
917 if (!Caller->getEntryCount())
918 return false;
919
920 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
921 if (!CallerBFI)
922 return false;
923
924 // For now, limit to hot call site.
925 if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
926 return false;
927
928 // Make sure we have a nonzero entry count.
929 auto EntryCount = F.getEntryCount();
930 if (!EntryCount || !EntryCount->getCount())
931 return false;
932
933 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
934 if (!CalleeBFI)
935 return false;
936
937 return true;
938 }
939
940 // A helper function to choose between command line override and default.
941 unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
942 if (InlineSavingsMultiplier.getNumOccurrences())
943 return InlineSavingsMultiplier;
944 return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
945 }
946
947 // A helper function to choose between command line override and default.
948 unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
949 if (InlineSavingsProfitableMultiplier.getNumOccurrences())
950 return InlineSavingsProfitableMultiplier;
951 return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
952 }
953
954 void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
955 if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
956 CandidateCall, "inline-cycle-savings-for-test")) {
957 CycleSavings = *AttrCycleSavings;
958 }
959
960 if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
961 CandidateCall, "inline-runtime-cost-for-test")) {
962 Size = *AttrRuntimeCost;
963 }
964 }
965
966 // Determine whether we should inline the given call site, taking into account
967 // both the size cost and the cycle savings. Return std::nullopt if we don't
968 // have sufficient profiling information to decide.
969 std::optional<bool> costBenefitAnalysis() {
970 if (!CostBenefitAnalysisEnabled)
971 return std::nullopt;
972
973 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
974 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
975 // falling back to the cost-based metric.
976 // TODO: Improve this hacky condition.
977 if (Threshold == 0)
978 return std::nullopt;
979
980 assert(GetBFI);
981 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
982 assert(CalleeBFI);
983
984 // The cycle savings are expressed as the sum of InstrCost
985 // multiplied by the estimated dynamic count of each instruction we can
986 // avoid. Savings come from the call site cost, such as argument setup and
987 // the call instruction, as well as the instructions that are folded.
988 //
989 // We use 128-bit APInt here to avoid potential overflow. This variable
990 // should stay well below 10^24 (or 2^80) in practice. This "worst" case
991 // assumes that we can avoid or fold a billion instructions, each with a
992 // profile count of 10^15 -- roughly the number of cycles for a 24-hour
993 // period on a 4GHz machine.
994 APInt CycleSavings(128, 0);
995
996 for (auto &BB : F) {
997 APInt CurrentSavings(128, 0);
998 for (auto &I : BB) {
999 if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
1000 // Count a conditional branch as savings if it becomes unconditional.
1001 if (BI->isConditional() &&
1002 getSimplifiedValue<ConstantInt>(BI->getCondition())) {
1003 CurrentSavings += InstrCost;
1004 }
1005 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
1006 if (getSimplifiedValue<ConstantInt>(SI->getCondition()))
1007 CurrentSavings += InstrCost;
1008 } else if (Value *V = dyn_cast<Value>(&I)) {
1009 // Count an instruction as savings if we can fold it.
1010 if (SimplifiedValues.count(V)) {
1011 CurrentSavings += InstrCost;
1012 }
1013 }
1014 }
1015
1016 auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
1017 CurrentSavings *= *ProfileCount;
1018 CycleSavings += CurrentSavings;
1019 }
1020
1021 // Compute the cycle savings per call.
1022 auto EntryProfileCount = F.getEntryCount();
1023 assert(EntryProfileCount && EntryProfileCount->getCount());
1024 auto EntryCount = EntryProfileCount->getCount();
1025 CycleSavings += EntryCount / 2;
1026 CycleSavings = CycleSavings.udiv(EntryCount);
1027
1028 // Compute the total savings for the call site.
1029 auto *CallerBB = CandidateCall.getParent();
1030 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
1031 CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
1032 CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
1033
1034 // Remove the cost of the cold basic blocks to model the runtime cost more
1035 // accurately. Both machine block placement and function splitting could
1036 // place cold blocks further from hot blocks.
1037 int Size = Cost - ColdSize;
1038
1039 // Allow tiny callees to be inlined regardless of whether they meet the
1040 // savings threshold.
1042
1043 OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
1044 CostBenefit.emplace(APInt(128, Size), CycleSavings);
1045
1046 // Let R be the ratio of CycleSavings to Size. We accept the inlining
1047 // opportunity if R is really high and reject if R is really low. If R is
1048 // somewhere in the middle, we fall back to the cost-based analysis.
1049 //
1050 // Specifically, let R = CycleSavings / Size, we accept the inlining
1051 // opportunity if:
1052 //
1053 // PSI->getOrCompHotCountThreshold()
1054 // R > -------------------------------------------------
1055 // getInliningCostBenefitAnalysisSavingsMultiplier()
1056 //
1057 // and reject the inlining opportunity if:
1058 //
1059 // PSI->getOrCompHotCountThreshold()
1060 // R <= ----------------------------------------------------
1061 // getInliningCostBenefitAnalysisProfitableMultiplier()
1062 //
1063 // Otherwise, we fall back to the cost-based analysis.
1064 //
1065 // Implementation-wise, use multiplication (CycleSavings * Multiplier,
1066 // HotCountThreshold * Size) rather than division to avoid precision loss.
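// For example, with the default savings multiplier of 8, a hot-count
// threshold of 4000, Size = 10 and CycleSavings = 6000 per call, inlining is
// accepted because 6000 * 8 = 48000 >= 4000 * 10 = 40000 (equivalently,
// R = 6000 / 10 = 600 > 4000 / 8 = 500).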
1067 APInt Threshold(128, PSI->getOrCompHotCountThreshold());
1068 Threshold *= Size;
1069
1070 APInt UpperBoundCycleSavings = CycleSavings;
1071 UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
1072 if (UpperBoundCycleSavings.uge(Threshold))
1073 return true;
1074
1075 APInt LowerBoundCycleSavings = CycleSavings;
1076 LowerBoundCycleSavings *=
1077 getInliningCostBenefitAnalysisProfitableMultiplier();
1078 if (LowerBoundCycleSavings.ult(Threshold))
1079 return false;
1080
1081 // Otherwise, fall back to the cost-based analysis.
1082 return std::nullopt;
1083 }
1084
1085 InlineResult finalizeAnalysis() override {
1086 // Loops generally act a lot like calls in that they act like barriers to
1087 // movement, require a certain amount of setup, etc. So when optimising for
1088 // size, we penalise any call site whose callee contains loops. We do this after all
1089 // other costs here, so will likely only be dealing with relatively small
1090 // functions (and hence DT and LI will hopefully be cheap).
1091 auto *Caller = CandidateCall.getFunction();
1092 if (Caller->hasMinSize()) {
1093 DominatorTree DT(F);
1094 LoopInfo LI(DT);
1095 int NumLoops = 0;
1096 for (Loop *L : LI) {
1097 // Ignore loops that will not be executed
1098 if (DeadBlocks.count(L->getHeader()))
1099 continue;
1100 NumLoops++;
1101 }
1102 addCost(NumLoops * InlineConstants::LoopPenalty);
1103 }
1104
1105 // We applied the maximum possible vector bonus at the beginning. Now,
1106 // subtract the excess bonus, if any, from the Threshold before
1107 // comparing against Cost.
1108 if (NumVectorInstructions <= NumInstructions / 10)
1109 Threshold -= VectorBonus;
1110 else if (NumVectorInstructions <= NumInstructions / 2)
1111 Threshold -= VectorBonus / 2;
1112
1113 if (std::optional<int> AttrCost =
1114 getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
1115 Cost = *AttrCost;
1116
1117 if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1118 CandidateCall,
1120 Cost *= *AttrCostMult;
1121
1122 if (std::optional<int> AttrThreshold =
1123 getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
1124 Threshold = *AttrThreshold;
1125
1126 if (auto Result = costBenefitAnalysis()) {
1127 DecidedByCostBenefit = true;
1128 if (*Result)
1129 return InlineResult::success();
1130 else
1131 return InlineResult::failure("Cost over threshold.");
1132 }
1133
1134 if (IgnoreThreshold)
1135 return InlineResult::success();
1136
1137 DecidedByCostThreshold = true;
1138 return Cost < std::max(1, Threshold)
1139 ? InlineResult::success()
1140 : InlineResult::failure("Cost over threshold.");
1141 }
1142
1143 bool shouldStop() override {
1144 if (IgnoreThreshold || ComputeFullInlineCost)
1145 return false;
1146 // Bail out the moment we cross the threshold. This means we'll under-count
1147 // the cost, but only when undercounting doesn't matter.
1148 if (Cost < Threshold)
1149 return false;
1150 DecidedByCostThreshold = true;
1151 return true;
1152 }
1153
1154 void onLoadEliminationOpportunity() override {
1155 LoadEliminationCost += InstrCost;
1156 }
1157
1158 InlineResult onAnalysisStart() override {
1159 // Perform some tweaks to the cost and threshold based on the direct
1160 // callsite information.
1161
1162 // We want to more aggressively inline vector-dense kernels, so up the
1163 // threshold, and we'll lower it if the % of vector instructions gets too
1164 // low. Note that these bonuses are somewhat arbitrary and evolved over
1165 // time by accident as much as because they are principled bonuses.
1166 //
1167 // FIXME: It would be nice to remove all such bonuses. At least it would be
1168 // nice to base the bonus values on something more scientific.
1169 assert(NumInstructions == 0);
1170 assert(NumVectorInstructions == 0);
1171
1172 // Update the threshold based on callsite properties
1173 updateThreshold(CandidateCall, F);
1174
1175 // While Threshold depends on commandline options that can take negative
1176 // values, we want to enforce the invariant that the computed threshold and
1177 // bonuses are non-negative.
1178 assert(Threshold >= 0);
1179 assert(SingleBBBonus >= 0);
1180 assert(VectorBonus >= 0);
1181
1182 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1183 // this Threshold any time, and cost cannot decrease, we can stop processing
1184 // the rest of the function body.
1185 Threshold += (SingleBBBonus + VectorBonus);
1186
1187 // Give out bonuses for the callsite, as the instructions setting them up
1188 // will be gone after inlining.
1189 addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
1190
1191 // If this function uses the coldcc calling convention, prefer not to inline
1192 // it.
1193 if (F.getCallingConv() == CallingConv::Cold)
1195
1196 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1197
1198 // Check if we're done. This can happen due to bonuses and penalties.
1199 if (Cost >= Threshold && !ComputeFullInlineCost)
1200 return InlineResult::failure("high cost");
1201
1202 return InlineResult::success();
1203 }
1204
1205public:
1206 InlineCostCallAnalyzer(
1207 Function &Callee, CallBase &Call, const InlineParams &Params,
1208 const TargetTransformInfo &TTI,
1209 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1210 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1211 function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
1212 ProfileSummaryInfo *PSI = nullptr,
1213 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1214 bool IgnoreThreshold = false,
1215 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
1216 nullptr)
1217 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
1218 ORE, GetEphValuesCache),
1219 ComputeFullInlineCost(OptComputeFullInlineCost ||
1220 Params.ComputeFullInlineCost || ORE ||
1221 isCostBenefitAnalysisEnabled()),
1222 Params(Params), Threshold(Params.DefaultThreshold),
1223 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1224 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1225 Writer(this) {
1226 AllowRecursiveCall = *Params.AllowRecursiveCall;
1227 }
1228
1229 /// Annotation Writer for instruction details
1230 InlineCostAnnotationWriter Writer;
1231
1232 void dump();
1233
1234 // Prints the same analysis as dump(), but its definition is not dependent
1235 // on the build.
1236 void print(raw_ostream &OS);
1237
1238 std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1239 auto It = InstructionCostDetailMap.find(I);
1240 if (It != InstructionCostDetailMap.end())
1241 return It->second;
1242 return std::nullopt;
1243 }
1244
1245 virtual ~InlineCostCallAnalyzer() = default;
1246 int getThreshold() const { return Threshold; }
1247 int getCost() const { return Cost; }
1248 int getStaticBonusApplied() const { return StaticBonusApplied; }
1249 std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1250 bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1251 bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1252};
1253
1254// Return true if CB is the sole call to local function Callee.
1255static bool isSoleCallToLocalFunction(const CallBase &CB,
1256 const Function &Callee) {
1257 return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1258 &Callee == CB.getCalledFunction();
1259}
1260
1261class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1262private:
1264
1265 // FIXME: These constants are taken from the heuristic-based cost visitor.
1266 // These should be removed entirely in a later revision to avoid reliance on
1267 // heuristics in the ML inliner.
1268 static constexpr int JTCostMultiplier = 2;
1269 static constexpr int CaseClusterCostMultiplier = 2;
1270 static constexpr int SwitchDefaultDestCostMultiplier = 2;
1271 static constexpr int SwitchCostMultiplier = 2;
1272
1273 // FIXME: These are taken from the heuristic-based cost visitor: we should
1274 // eventually abstract these to the CallAnalyzer to avoid duplication.
1275 unsigned SROACostSavingOpportunities = 0;
1276 int VectorBonus = 0;
1277 int SingleBBBonus = 0;
1278 int Threshold = 5;
1279
1281
1282 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1283 Cost[static_cast<size_t>(Feature)] += Delta;
1284 }
1285
1286 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1287 Cost[static_cast<size_t>(Feature)] = Value;
1288 }
1289
1290 void onDisableSROA(AllocaInst *Arg) override {
1291 auto CostIt = SROACosts.find(Arg);
1292 if (CostIt == SROACosts.end())
1293 return;
1294
1295 increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1296 SROACostSavingOpportunities -= CostIt->second;
1297 SROACosts.erase(CostIt);
1298 }
1299
1300 void onDisableLoadElimination() override {
1301 set(InlineCostFeatureIndex::load_elimination, 1);
1302 }
1303
1304 void onCallPenalty() override {
1305 increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1306 }
1307
1308 void onCallArgumentSetup(const CallBase &Call) override {
1309 increment(InlineCostFeatureIndex::call_argument_setup,
1310 Call.arg_size() * InstrCost);
1311 }
1312
1313 void onLoadRelativeIntrinsic() override {
1314 increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1315 }
1316
1317 void onLoweredCall(Function *F, CallBase &Call,
1318 bool IsIndirectCall) override {
1319 increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1320 Call.arg_size() * InstrCost);
1321
1322 if (IsIndirectCall) {
1323 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1324 /*HintThreshold*/ {},
1325 /*ColdThreshold*/ {},
1326 /*OptSizeThreshold*/ {},
1327 /*OptMinSizeThreshold*/ {},
1328 /*HotCallSiteThreshold*/ {},
1329 /*LocallyHotCallSiteThreshold*/ {},
1330 /*ColdCallSiteThreshold*/ {},
1331 /*ComputeFullInlineCost*/ true,
1332 /*EnableDeferral*/ true};
1333 IndirectCallParams.DefaultThreshold =
1335
1336 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1337 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
1338 false, true);
1339 if (CA.analyze().isSuccess()) {
1340 increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1341 CA.getCost());
1342 increment(InlineCostFeatureIndex::nested_inlines, 1);
1343 }
1344 } else {
1345 onCallPenalty();
1346 }
1347 }
1348
1349 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1350 bool DefaultDestUnreachable) override {
1351 if (JumpTableSize) {
1352 if (!DefaultDestUnreachable)
1353 increment(InlineCostFeatureIndex::switch_default_dest_penalty,
1354 SwitchDefaultDestCostMultiplier * InstrCost);
1355 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1356 JTCostMultiplier * InstrCost;
1357 increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1358 return;
1359 }
1360
1361 if (NumCaseCluster <= 3) {
1362 increment(InlineCostFeatureIndex::case_cluster_penalty,
1363 (NumCaseCluster - DefaultDestUnreachable) *
1364 CaseClusterCostMultiplier * InstrCost);
1365 return;
1366 }
1367
1368 int64_t ExpectedNumberOfCompare =
1369 getExpectedNumberOfCompare(NumCaseCluster);
1370
1371 int64_t SwitchCost =
1372 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1373 increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1374 }
1375
1376 void onMissedSimplification() override {
1377 increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1378 InstrCost);
1379 }
1380
1381 void onInitializeSROAArg(AllocaInst *Arg) override {
1382 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1383 SROACosts[Arg] = SROAArgCost;
1384 SROACostSavingOpportunities += SROAArgCost;
1385 }
1386
1387 void onAggregateSROAUse(AllocaInst *Arg) override {
1388 SROACosts.find(Arg)->second += InstrCost;
1389 SROACostSavingOpportunities += InstrCost;
1390 }
1391
1392 void onBlockAnalyzed(const BasicBlock *BB) override {
1393 if (BB->getTerminator()->getNumSuccessors() > 1)
1394 set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1395 Threshold -= SingleBBBonus;
1396 }
1397
1398 InlineResult finalizeAnalysis() override {
1399 auto *Caller = CandidateCall.getFunction();
1400 if (Caller->hasMinSize()) {
1401 DominatorTree DT(F);
1402 LoopInfo LI(DT);
1403 for (Loop *L : LI) {
1404 // Ignore loops that will not be executed
1405 if (DeadBlocks.count(L->getHeader()))
1406 continue;
1407 increment(InlineCostFeatureIndex::num_loops,
1409 }
1410 }
1411 set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1412 set(InlineCostFeatureIndex::simplified_instructions,
1413 NumInstructionsSimplified);
1414 set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1415 set(InlineCostFeatureIndex::constant_offset_ptr_args,
1416 NumConstantOffsetPtrArgs);
1417 set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1418
1419 if (NumVectorInstructions <= NumInstructions / 10)
1420 Threshold -= VectorBonus;
1421 else if (NumVectorInstructions <= NumInstructions / 2)
1422 Threshold -= VectorBonus / 2;
1423
1424 set(InlineCostFeatureIndex::threshold, Threshold);
1425
1426 return InlineResult::success();
1427 }
1428
1429 bool shouldStop() override { return false; }
1430
1431 void onLoadEliminationOpportunity() override {
1432 increment(InlineCostFeatureIndex::load_elimination, 1);
1433 }
1434
1435 InlineResult onAnalysisStart() override {
1436 increment(InlineCostFeatureIndex::callsite_cost,
1437 -1 * getCallsiteCost(TTI, this->CandidateCall, DL));
1438
1439 set(InlineCostFeatureIndex::cold_cc_penalty,
1440 (F.getCallingConv() == CallingConv::Cold));
1441
1442 set(InlineCostFeatureIndex::last_call_to_static_bonus,
1443 isSoleCallToLocalFunction(CandidateCall, F));
1444
1445 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1446 // analyzer - instead, we should abstract it to a common method in the
1447 // CallAnalyzer
1448 int SingleBBBonusPercent = 50;
1449 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1450 Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1451 Threshold *= TTI.getInliningThresholdMultiplier();
1452 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1453 VectorBonus = Threshold * VectorBonusPercent / 100;
1454 Threshold += (SingleBBBonus + VectorBonus);
1455
1456 return InlineResult::success();
1457 }
1458
1459public:
1460 InlineCostFeaturesAnalyzer(
1461 const TargetTransformInfo &TTI,
1462 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1464 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
1466 CallBase &Call)
1467 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
1468 PSI) {}
1469
1470 const InlineCostFeatures &features() const { return Cost; }
1471};
1472
1473} // namespace
1474
1475/// Test whether the given value is an Alloca-derived function argument.
1476bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1477 return SROAArgValues.count(V);
1478}
1479
1480void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1481 onDisableSROA(SROAArg);
1482 EnabledSROAAllocas.erase(SROAArg);
1483 disableLoadElimination();
1484}
1485
1486void InlineCostAnnotationWriter::emitInstructionAnnot(
1488 // The cost of inlining the given instruction is always printed.
1489 // The threshold delta is printed only when it is non-zero, which happens
1490 // when we decided to give a bonus at a particular instruction.
1491 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1492 if (!Record)
1493 OS << "; No analysis for the instruction";
1494 else {
1495 OS << "; cost before = " << Record->CostBefore
1496 << ", cost after = " << Record->CostAfter
1497 << ", threshold before = " << Record->ThresholdBefore
1498 << ", threshold after = " << Record->ThresholdAfter << ", ";
1499 OS << "cost delta = " << Record->getCostDelta();
1500 if (Record->hasThresholdChanged())
1501 OS << ", threshold delta = " << Record->getThresholdDelta();
1502 }
1503 auto *V = ICCA->getSimplifiedValueUnchecked(const_cast<Instruction *>(I));
1504 if (V) {
1505 OS << ", simplified to ";
1506 V->print(OS, true);
1507 if (auto *VI = dyn_cast<Instruction>(V)) {
1508 if (VI->getFunction() != I->getFunction())
1509 OS << " (caller instruction)";
1510 } else if (auto *VArg = dyn_cast<Argument>(V)) {
1511 if (VArg->getParent() != I->getFunction())
1512 OS << " (caller argument)";
1513 }
1514 }
1515 OS << "\n";
1516}
1517
1518/// If 'V' maps to a SROA candidate, disable SROA for it.
1519void CallAnalyzer::disableSROA(Value *V) {
1520 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1521 disableSROAForArg(SROAArg);
1522 }
1523}
1524
1525void CallAnalyzer::disableLoadElimination() {
1526 if (EnableLoadElimination) {
1527 onDisableLoadElimination();
1528 EnableLoadElimination = false;
1529 }
1530}
1531
1532/// Accumulate a constant GEP offset into an APInt if possible.
1533///
1534/// Returns false if unable to compute the offset for any reason. Respects any
1535/// simplified values known during the analysis of this callsite.
1536bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1537 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1538 assert(IntPtrWidth == Offset.getBitWidth());
1539
1541 GTI != GTE; ++GTI) {
1542 ConstantInt *OpC =
1543 getDirectOrSimplifiedValue<ConstantInt>(GTI.getOperand());
1544 if (!OpC)
1545 return false;
1546 if (OpC->isZero())
1547 continue;
1548
1549 // Handle a struct index, which adds its field offset to the pointer.
1550 if (StructType *STy = GTI.getStructTypeOrNull()) {
1551 unsigned ElementIdx = OpC->getZExtValue();
1552 const StructLayout *SL = DL.getStructLayout(STy);
1553 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1554 continue;
1555 }
1556
1557 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1558 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1559 }
1560 return true;
1561}
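// Illustrative example (assuming a 64-bit index type and natural struct
// layout): for
//   %p = getelementptr inbounds {i32, i64}, ptr %base, i64 1, i32 1
// the first index walks one whole {i32, i64} (16 bytes including padding) and
// the second adds the field offset of element 1 (8 bytes), so Offset ends up
// as 24 and the function returns true.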
1562
1563/// Use TTI to check whether a GEP is free.
1564///
1565/// Respects any simplified values known during the analysis of this callsite.
1566bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1567 SmallVector<Value *, 4> Operands;
1568 Operands.push_back(GEP.getOperand(0));
1569 for (const Use &Op : GEP.indices())
1570 if (Constant *SimpleOp = getSimplifiedValue<Constant>(Op))
1571 Operands.push_back(SimpleOp);
1572 else
1573 Operands.push_back(Op);
1574 return TTI.getInstructionCost(&GEP, Operands,
1575 TargetTransformInfo::TCK_SizeAndLatency) ==
1576 TargetTransformInfo::TCC_Free;
1577}
1578
1579bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1580 disableSROA(I.getOperand(0));
1581
1582 // Check whether inlining will turn a dynamic alloca into a static
1583 // alloca and handle that case.
1584 if (I.isArrayAllocation()) {
1585 Constant *Size = getSimplifiedValue<Constant>(I.getArraySize());
1586 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1587 // Sometimes a dynamic alloca could be converted into a static alloca
1588 // after this constant prop, and become a huge static alloca on an
1589 // unconditional CFG path. Avoid inlining if this is going to happen above
1590 // a threshold.
1591 // FIXME: If the threshold is removed or lowered too much, we could end up
1592 // being too pessimistic and prevent inlining non-problematic code. This
1593 // could result in unintended perf regressions. A better overall strategy
1594 // is needed to track stack usage during inlining.
1595 Type *Ty = I.getAllocatedType();
1596 AllocatedSize = SaturatingMultiplyAdd(
1597 AllocSize->getLimitedValue(),
1598 DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1599 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1600 HasDynamicAlloca = true;
1601 return false;
1602 }
1603 }
1604
1605 // Accumulate the allocated size.
1606 if (I.isStaticAlloca()) {
1607 Type *Ty = I.getAllocatedType();
1608 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1609 AllocatedSize);
1610 }
1611
1612 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1613 // a variety of reasons, and so we would like to not inline them into
1614 // functions which don't currently have a dynamic alloca. This simply
1615 // disables inlining altogether in the presence of a dynamic alloca.
1616 if (!I.isStaticAlloca())
1617 HasDynamicAlloca = true;
1618
1619 return false;
1620}
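// Sketch of the accounting above: a static "%a = alloca [64 x i32]" adds 256
// bytes to AllocatedSize, while "%b = alloca i32, i32 %n" is only accounted
// the same way if %n simplified to a constant during this analysis (and the
// running total stays under the simplified-dynamic-alloca limit); otherwise
// HasDynamicAlloca is set and the analysis later fails with "dynamic alloca".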
1621
1622bool CallAnalyzer::visitPHI(PHINode &I) {
1623 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1624 // though we don't want to propagate its bonuses. The idea is to disable
1625 // SROA if it *might* be used in an inappropriate manner.
1626
1627 // Phi nodes are always zero-cost.
1628 // FIXME: Pointer sizes may differ between different address spaces, so do we
1629 // need to use correct address space in the call to getPointerSizeInBits here?
1630 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1631 // see the ZeroOffset is used as a dummy value, so we can probably use any
1632 // bit width for the ZeroOffset?
1633 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1634 bool CheckSROA = I.getType()->isPointerTy();
1635
1636 // Track the constant or pointer with constant offset we've seen so far.
1637 Constant *FirstC = nullptr;
1638 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1639 Value *FirstV = nullptr;
1640
1641 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1642 BasicBlock *Pred = I.getIncomingBlock(i);
1643 // If the incoming block is dead, skip the incoming block.
1644 if (DeadBlocks.count(Pred))
1645 continue;
1646 // If the parent block of phi is not the known successor of the incoming
1647 // block, skip the incoming block.
1648 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1649 if (KnownSuccessor && KnownSuccessor != I.getParent())
1650 continue;
1651
1652 Value *V = I.getIncomingValue(i);
1653 // If the incoming value is this phi itself, skip the incoming value.
1654 if (&I == V)
1655 continue;
1656
1657 Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1658
1659 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1660 if (!C && CheckSROA)
1661 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1662
1663 if (!C && !BaseAndOffset.first)
1664 // The incoming value is neither a constant nor a pointer with constant
1665 // offset, exit early.
1666 return true;
1667
1668 if (FirstC) {
1669 if (FirstC == C)
1670 // If we've seen a constant incoming value before and it is the same
1671 // constant we see this time, continue checking the next incoming value.
1672 continue;
1673 // Otherwise early exit because we either see a different constant or saw
1674 // a constant before but we have a pointer with constant offset this time.
1675 return true;
1676 }
1677
1678 if (FirstV) {
1679 // The same logic as above, but check pointer with constant offset here.
1680 if (FirstBaseAndOffset == BaseAndOffset)
1681 continue;
1682 return true;
1683 }
1684
1685 if (C) {
1686 // This is the 1st time we've seen a constant, record it.
1687 FirstC = C;
1688 continue;
1689 }
1690
1691 // The remaining case is that this is the 1st time we've seen a pointer with
1692 // constant offset, record it.
1693 FirstV = V;
1694 FirstBaseAndOffset = BaseAndOffset;
1695 }
1696
1697 // Check if we can map phi to a constant.
1698 if (FirstC) {
1699 SimplifiedValues[&I] = FirstC;
1700 return true;
1701 }
1702
1703 // Check if we can map phi to a pointer with constant offset.
1704 if (FirstBaseAndOffset.first) {
1705 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1706
1707 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1708 SROAArgValues[&I] = SROAArg;
1709 }
1710
1711 return true;
1712}
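// Illustrative example: for
//   %v = phi i32 [ 7, %bb1 ], [ 7, %bb2 ]
// every live incoming value is the same constant, so SimplifiedValues[%v]
// becomes i32 7. If instead both incoming values were the same base pointer
// at the same constant offset, %v would be added to ConstantOffsetPtrs (and
// to SROAArgValues when the base is an SROA candidate) rather than folded.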
1713
1714/// Check we can fold GEPs of constant-offset call site argument pointers.
1715/// This requires target data and inbounds GEPs.
1716///
1717/// \return true if the specified GEP can be folded.
1718bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1719 // Check if we have a base + offset for the pointer.
1720 std::pair<Value *, APInt> BaseAndOffset =
1721 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1722 if (!BaseAndOffset.first)
1723 return false;
1724
1725 // Check if the offset of this GEP is constant, and if so accumulate it
1726 // into Offset.
1727 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1728 return false;
1729
1730 // Add the result as a new mapping to Base + Offset.
1731 ConstantOffsetPtrs[&I] = BaseAndOffset;
1732
1733 return true;
1734}
1735
1736bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1737 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1738
1739 // Lambda to check whether a GEP's indices are all constant.
1740 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1741 for (const Use &Op : GEP.indices())
1742 if (!getDirectOrSimplifiedValue<Constant>(Op))
1743 return false;
1744 return true;
1745 };
1746
1747 if (!DisableGEPConstOperand)
1748 if (simplifyInstruction(I))
1749 return true;
1750
1751 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1752 if (SROAArg)
1753 SROAArgValues[&I] = SROAArg;
1754
1755 // Constant GEPs are modeled as free.
1756 return true;
1757 }
1758
1759 // Variable GEPs will require math and will disable SROA.
1760 if (SROAArg)
1761 disableSROAForArg(SROAArg);
1762 return isGEPFree(I);
1763}
1764
1765// Simplify \p Cmp if the RHS is constant and we can value-track the LHS.
1766// This only handles the case where the Cmp guards a recursive call and the
1767// call's argument makes the Cmp succeed or fail for the recursive invocation.
1768bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) {
1769 // Bail out if LHS is not a function argument or RHS is NOT const:
1770 if (!isa<Argument>(Cmp.getOperand(0)) || !isa<Constant>(Cmp.getOperand(1)))
1771 return false;
1772 auto *CmpOp = Cmp.getOperand(0);
1773 // Make sure that the callsite is recursive:
1774 if (CandidateCall.getCaller() != &F)
1775 return false;
1776 // Only handle the case when the callsite has a single predecessor:
1777 auto *CallBB = CandidateCall.getParent();
1778 auto *Predecessor = CallBB->getSinglePredecessor();
1779 if (!Predecessor)
1780 return false;
1781 // Check if the callsite is guarded by the same Cmp instruction:
1782 auto *Br = dyn_cast<BranchInst>(Predecessor->getTerminator());
1783 if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp)
1784 return false;
1785
1786 // Check whether any argument of the recursive callsite affects the cmp
1787 // instruction:
1788 bool ArgFound = false;
1789 Value *FuncArg = nullptr, *CallArg = nullptr;
1790 for (unsigned ArgNum = 0;
1791 ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) {
1792 FuncArg = F.getArg(ArgNum);
1793 CallArg = CandidateCall.getArgOperand(ArgNum);
1794 if (FuncArg == CmpOp && CallArg != CmpOp) {
1795 ArgFound = true;
1796 break;
1797 }
1798 }
1799 if (!ArgFound)
1800 return false;
1801
1802 // Now we have a recursive call that is guarded by a cmp instruction.
1803 // Check if this cmp can be simplified:
1804 SimplifyQuery SQ(DL, dyn_cast<Instruction>(CallArg));
1805 CondContext CC(&Cmp);
1806 CC.Invert = (CallBB != Br->getSuccessor(0));
1807 SQ.CC = &CC;
1808 CC.AffectedValues.insert(FuncArg);
1809 Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands(
1810 cast<CmpInst>(&Cmp), {CallArg, Cmp.getOperand(1)}, SQ);
1811 if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(SimplifiedInstruction)) {
1812 // Make sure that the BB of the recursive call is NOT the true successor
1813 // of the icmp. In other words, make sure that the recursion depth is 1.
1814 if ((ConstVal->isOne() && CC.Invert) ||
1815 (ConstVal->isZero() && !CC.Invert)) {
1816 SimplifiedValues[&Cmp] = ConstVal;
1817 return true;
1818 }
1819 }
1820 return false;
1821}
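// Illustrative example of the guarded-recursion pattern handled above
// (hypothetical IR, not taken from any test):
//   define i32 @f(i32 %n) {
//   entry:
//     %cmp = icmp eq i32 %n, 0
//     br i1 %cmp, label %exit, label %rec
//   rec:
//     %r = call i32 @f(i32 0)
//     ...
//   }
// When this recursive callsite is analyzed, substituting the argument 0 for
// %n folds %cmp to true in the inlined body, so the recursion is known to
// stop at depth one and the folded comparison is recorded in SimplifiedValues.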
1822
1823/// Simplify \p I if its operands are constants and update SimplifiedValues.
1824bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1825 SmallVector<Constant *> COps;
1826 for (Value *Op : I.operands()) {
1827 Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);
1828 if (!COp)
1829 return false;
1830 COps.push_back(COp);
1831 }
1832 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1833 if (!C)
1834 return false;
1835 SimplifiedValues[&I] = C;
1836 return true;
1837}
1838
1839/// Try to simplify a call to llvm.is.constant.
1840///
1841/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1842/// we expect calls of this specific intrinsic to be infrequent.
1843///
1844/// FIXME: Given that we know CB's parent (F) caller
1845/// (CandidateCall->getParent()->getParent()), we might be able to determine
1846/// whether inlining F into F's caller would change how the call to
1847/// llvm.is.constant would evaluate.
1848bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1849 Value *Arg = CB.getArgOperand(0);
1850 auto *C = getDirectOrSimplifiedValue<Constant>(Arg);
1851
1852 Type *RT = CB.getFunctionType()->getReturnType();
1853 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1854 return true;
1855}
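// Sketch: for "%c = call i1 @llvm.is.constant.i32(i32 %x)", the call folds to
// i1 true if %x was mapped to a constant during this analysis (e.g. a literal
// passed at the candidate call site) and conservatively to i1 false otherwise.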
1856
1857bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1858 // As per the langref, "The fourth argument to llvm.objectsize determines if
1859 // the value should be evaluated at runtime."
1860 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1861 return false;
1862
1863 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1864 /*MustSucceed=*/true);
1865 Constant *C = dyn_cast_or_null<Constant>(V);
1866 if (C)
1867 SimplifiedValues[&CB] = C;
1868 return C;
1869}
1870
1871bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1872 // Propagate constants through bitcasts.
1873 if (simplifyInstruction(I))
1874 return true;
1875
1876 // Track base/offsets through casts
1877 std::pair<Value *, APInt> BaseAndOffset =
1878 ConstantOffsetPtrs.lookup(I.getOperand(0));
1879 // Casts don't change the offset, just wrap it up.
1880 if (BaseAndOffset.first)
1881 ConstantOffsetPtrs[&I] = BaseAndOffset;
1882
1883 // Also look for SROA candidates here.
1884 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1885 SROAArgValues[&I] = SROAArg;
1886
1887 // Bitcasts are always zero cost.
1888 return true;
1889}
1890
1891bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1892 // Propagate constants through ptrtoint.
1893 if (simplifyInstruction(I))
1894 return true;
1895
1896 // Track base/offset pairs when converted to a plain integer provided the
1897 // integer is large enough to represent the pointer.
1898 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1899 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1900 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1901 std::pair<Value *, APInt> BaseAndOffset =
1902 ConstantOffsetPtrs.lookup(I.getOperand(0));
1903 if (BaseAndOffset.first)
1904 ConstantOffsetPtrs[&I] = BaseAndOffset;
1905 }
1906
1907 // This is really weird. Technically, ptrtoint will disable SROA. However,
1908 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1909 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1910 // would block SROA would also block SROA if applied directly to a pointer,
1911 // and so we can just add the integer in here. The only places where SROA is
1912 // preserved either cannot fire on an integer, or won't in-and-of themselves
1913 // disable SROA (ext) w/o some later use that we would see and disable.
1914 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1915 SROAArgValues[&I] = SROAArg;
1916
1917 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1918 TargetTransformInfo::TCC_Free;
1919}
1920
1921bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1922 // Propagate constants through inttoptr.
1923 if (simplifyInstruction(I))
1924 return true;
1925
1926 // Track base/offset pairs when round-tripped through a pointer without
1927 // modifications provided the integer is not too large.
1928 Value *Op = I.getOperand(0);
1929 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1930 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1931 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1932 if (BaseAndOffset.first)
1933 ConstantOffsetPtrs[&I] = BaseAndOffset;
1934 }
1935
1936 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1937 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1938 SROAArgValues[&I] = SROAArg;
1939
1940 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1941 TargetTransformInfo::TCC_Free;
1942}
1943
1944bool CallAnalyzer::visitCastInst(CastInst &I) {
1945 // Propagate constants through casts.
1946 if (simplifyInstruction(I))
1947 return true;
1948
1949 // Disable SROA in the face of arbitrary casts we don't explicitly list
1950 // elsewhere.
1951 disableSROA(I.getOperand(0));
1952
1953 // If this is a floating-point cast, and the target says this operation
1954 // is expensive, this may eventually become a library call. Treat the cost
1955 // as such.
1956 switch (I.getOpcode()) {
1957 case Instruction::FPTrunc:
1958 case Instruction::FPExt:
1959 case Instruction::UIToFP:
1960 case Instruction::SIToFP:
1961 case Instruction::FPToUI:
1962 case Instruction::FPToSI:
1963 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1964 onCallPenalty();
1965 break;
1966 default:
1967 break;
1968 }
1969
1970 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1971 TargetTransformInfo::TCC_Free;
1972}
1973
1974bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1975 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1976}
1977
1978bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1979 // Does the *call site* have the NonNull attribute set on an argument? We
1980 // use the attribute on the call site to memoize any analysis done in the
1981 // caller. This will also trip if the callee function has a non-null
1982 // parameter attribute, but that's a less interesting case because hopefully
1983 // the callee would already have been simplified based on that.
1984 if (Argument *A = dyn_cast<Argument>(V))
1985 if (paramHasAttr(A, Attribute::NonNull))
1986 return true;
1987
1988 // Is this an alloca in the caller? This is distinct from the attribute case
1989 // above because attributes aren't updated within the inliner itself and we
1990 // always want to catch the alloca derived case.
1991 if (isAllocaDerivedArg(V))
1992 // We can actually predict the result of comparisons between an
1993 // alloca-derived value and null. Note that this fires regardless of
1994 // SROA firing.
1995 return true;
1996
1997 return false;
1998}
1999
2000bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
2001 // If the normal destination of the invoke or the parent block of the call
2002 // site is unreachable-terminated, there is little point in inlining this
2003 // unless there is literally zero cost.
2004 // FIXME: Note that it is possible that an unreachable-terminated block has a
2005 // hot entry. For example, in the scenario below inlining hot_call_X() may be
2006 // beneficial:
2007 // main() {
2008 // hot_call_1();
2009 // ...
2010 // hot_call_N()
2011 // exit(0);
2012 // }
2013 // For now, we are not handling this corner case here as it is rare in real
2014 // code. In the future, we should elaborate this based on BPI and BFI in more
2015 // general threshold adjusting heuristics in updateThreshold().
2016 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
2017 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
2018 return false;
2019 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
2020 return false;
2021
2022 return true;
2023}
2024
2025bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
2026 BlockFrequencyInfo *CallerBFI) {
2027 // If global profile summary is available, then callsite's coldness is
2028 // determined based on that.
2029 if (PSI && PSI->hasProfileSummary())
2030 return PSI->isColdCallSite(Call, CallerBFI);
2031
2032 // Otherwise we need BFI to be available.
2033 if (!CallerBFI)
2034 return false;
2035
2036 // Determine if the callsite is cold relative to caller's entry. We could
2037 // potentially cache the computation of scaled entry frequency, but the added
2038 // complexity is not worth it unless this scaling shows up high in the
2039 // profiles.
2040 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
2041 auto CallSiteBB = Call.getParent();
2042 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
2043 auto CallerEntryFreq =
2044 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
2045 return CallSiteFreq < CallerEntryFreq * ColdProb;
2046}
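// Worked example (assuming ColdCallSiteRelFreq is 2, i.e. 2% of the entry
// frequency): with a caller entry frequency of 1000, a callsite block with
// frequency 15 satisfies 15 < 1000 * 2% = 20 and is treated as cold, while a
// block with frequency 25 is not.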
2047
2048std::optional<int>
2049InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
2050 BlockFrequencyInfo *CallerBFI) {
2051
2052 // If global profile summary is available, then callsite's hotness is
2053 // determined based on that.
2054 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
2055 return Params.HotCallSiteThreshold;
2056
2057 // Otherwise we need BFI to be available and to have a locally hot callsite
2058 // threshold.
2059 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
2060 return std::nullopt;
2061
2062 // Determine if the callsite is hot relative to caller's entry. We could
2063 // potentially cache the computation of scaled entry frequency, but the added
2064 // complexity is not worth it unless this scaling shows up high in the
2065 // profiles.
2066 const BasicBlock *CallSiteBB = Call.getParent();
2067 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
2068 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
2069 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
2070 if (Limit && CallSiteFreq >= *Limit)
2071 return Params.LocallyHotCallSiteThreshold;
2072
2073 // Otherwise treat it normally.
2074 return std::nullopt;
2075}
2076
2077void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
2078 // If no size growth is allowed for this inlining, set Threshold to 0.
2079 if (!allowSizeGrowth(Call)) {
2080 Threshold = 0;
2081 return;
2082 }
2083
2084 Function *Caller = Call.getCaller();
2085
2086 // return min(A, B) if B is valid.
2087 auto MinIfValid = [](int A, std::optional<int> B) {
2088 return B ? std::min(A, *B) : A;
2089 };
2090
2091 // return max(A, B) if B is valid.
2092 auto MaxIfValid = [](int A, std::optional<int> B) {
2093 return B ? std::max(A, *B) : A;
2094 };
2095
2096 // Various bonus percentages. These are multiplied by Threshold to get the
2097 // bonus values.
2098 // SingleBBBonus: This bonus is applied if the callee has a single reachable
2099 // basic block at the given callsite context. This is speculatively applied
2100 // and withdrawn if more than one basic block is seen.
2101 //
2102 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
2103 // of the last call to a static function as inlining such functions is
2104 // guaranteed to reduce code size.
2105 //
2106 // These bonus percentages may be set to 0 based on properties of the caller
2107 // and the callsite.
2108 int SingleBBBonusPercent = 50;
2109 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
2110 int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
2111
2112 // Lambda to set all the above bonus and bonus percentages to 0.
2113 auto DisallowAllBonuses = [&]() {
2114 SingleBBBonusPercent = 0;
2115 VectorBonusPercent = 0;
2116 LastCallToStaticBonus = 0;
2117 };
2118
2119 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
2120 // and reduce the threshold if the caller has the necessary attribute.
2121 if (Caller->hasMinSize()) {
2122 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
2123 // For minsize, we want to disable the single BB bonus and the vector
2124 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
2125 // a static function will, at the minimum, eliminate the parameter setup and
2126 // call/return instructions.
2127 SingleBBBonusPercent = 0;
2128 VectorBonusPercent = 0;
2129 } else if (Caller->hasOptSize())
2130 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
2131
2132 // Adjust the threshold based on inlinehint attribute and profile based
2133 // hotness information if the caller does not have MinSize attribute.
2134 if (!Caller->hasMinSize()) {
2135 if (Callee.hasFnAttribute(Attribute::InlineHint))
2136 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2137
2138 // FIXME: After switching to the new passmanager, simplify the logic below
2139 // by checking only the callsite hotness/coldness as we will reliably
2140 // have local profile information.
2141 //
2142 // Callsite hotness and coldness can be determined if sample profile is
2143 // used (which adds hotness metadata to calls) or if caller's
2144 // BlockFrequencyInfo is available.
2145 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
2146 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
2147 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
2148 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
2149 // FIXME: This should update the threshold only if it exceeds the
2150 // current threshold, but AutoFDO + ThinLTO currently relies on this
2151 // behavior to prevent inlining of hot callsites during ThinLTO
2152 // compile phase.
2153 Threshold = *HotCallSiteThreshold;
2154 } else if (isColdCallSite(Call, CallerBFI)) {
2155 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2156 // Do not apply bonuses for a cold callsite including the
2157 // LastCallToStatic bonus. While this bonus might result in code size
2158 // reduction, it can cause the size of a non-cold caller to increase
2159 // preventing it from being inlined.
2160 DisallowAllBonuses();
2161 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2162 } else if (PSI) {
2163 // Use callee's global profile information only if we have no way of
2164 // determining this via callsite information.
2165 if (PSI->isFunctionEntryHot(&Callee)) {
2166 LLVM_DEBUG(dbgs() << "Hot callee.\n");
2167 // If callsite hotness can not be determined, we may still know
2168 // that the callee is hot and treat it as a weaker hint for threshold
2169 // increase.
2170 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2171 } else if (PSI->isFunctionEntryCold(&Callee)) {
2172 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2173 // Do not apply bonuses for a cold callee including the
2174 // LastCallToStatic bonus. While this bonus might result in code size
2175 // reduction, it can cause the size of a non-cold caller to increase
2176 // preventing it from being inlined.
2177 DisallowAllBonuses();
2178 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2179 }
2180 }
2181 }
2182
2183 Threshold += TTI.adjustInliningThreshold(&Call);
2184
2185 // Finally, take the target-specific inlining threshold multiplier into
2186 // account.
2187 Threshold *= TTI.getInliningThresholdMultiplier();
2188
2189 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2190 VectorBonus = Threshold * VectorBonusPercent / 100;
2191
2192 // If there is only one call of the function, and it has internal linkage,
2193 // the cost of inlining it drops dramatically. It may seem odd to update
2194 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2195 if (isSoleCallToLocalFunction(Call, F)) {
2196 Cost -= LastCallToStaticBonus;
2197 StaticBonusApplied = LastCallToStaticBonus;
2198 }
2199}
2200
2201bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2202 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2203 // First try to handle simplified comparisons.
2204 if (simplifyInstruction(I))
2205 return true;
2206
2207 // Try to handle comparison that can be simplified using ValueTracking.
2208 if (simplifyCmpInstForRecCall(I))
2209 return true;
2210
2211 if (I.getOpcode() == Instruction::FCmp)
2212 return false;
2213
2214 // Otherwise look for a comparison between constant offset pointers with
2215 // a common base.
2216 Value *LHSBase, *RHSBase;
2217 APInt LHSOffset, RHSOffset;
2218 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2219 if (LHSBase) {
2220 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2221 if (RHSBase && LHSBase == RHSBase) {
2222 // We have common bases, fold the icmp to a constant based on the
2223 // offsets.
2224 SimplifiedValues[&I] = ConstantInt::getBool(
2225 I.getType(),
2226 ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
2227 ++NumConstantPtrCmps;
2228 return true;
2229 }
2230 }
2231
2232 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2233 for (auto *User : I.users())
2234 if (auto *Instr = dyn_cast<Instruction>(User))
2235 if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2236 return false;
2237 return true;
2238 };
2239
2240 // If the comparison is an equality comparison with null, we can simplify it
2241 // if we know the value (argument) can't be null
2242 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2243 if (isKnownNonNullInCallee(I.getOperand(0))) {
2244 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2245 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2246 : ConstantInt::getFalse(I.getType());
2247 return true;
2248 }
2249 // Implicit null checks act as unconditional branches and their comparisons
2250 // should be treated as simplified and free of cost.
2251 if (isImplicitNullCheckCmp(I))
2252 return true;
2253 }
2254 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2255}
2256
2257bool CallAnalyzer::visitSub(BinaryOperator &I) {
2258 // Try to handle a special case: we can fold computing the difference of two
2259 // constant-related pointers.
2260 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2261 Value *LHSBase, *RHSBase;
2262 APInt LHSOffset, RHSOffset;
2263 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2264 if (LHSBase) {
2265 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2266 if (RHSBase && LHSBase == RHSBase) {
2267 // We have common bases, fold the subtract to a constant based on the
2268 // offsets.
2269 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2270 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2271 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2272 SimplifiedValues[&I] = C;
2273 ++NumConstantPtrDiffs;
2274 return true;
2275 }
2276 }
2277 }
2278
2279 // Otherwise, fall back to the generic logic for simplifying and handling
2280 // instructions.
2281 return Base::visitSub(I);
2282}
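// Illustrative example: if both operands were ptrtoint values tracked in
// ConstantOffsetPtrs as (%base, 16) and (%base, 4), the common base lets the
// subtraction fold to the constant 12 and the instruction is modeled as free.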
2283
2284bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2285 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2286 Constant *CLHS = getDirectOrSimplifiedValue<Constant>(LHS);
2287 Constant *CRHS = getDirectOrSimplifiedValue<Constant>(RHS);
2288
2289 Value *SimpleV = nullptr;
2290 if (auto FI = dyn_cast<FPMathOperator>(&I))
2291 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2292 FI->getFastMathFlags(), DL);
2293 else
2294 SimpleV =
2295 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2296
2297 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2298 SimplifiedValues[&I] = C;
2299
2300 if (SimpleV)
2301 return true;
2302
2303 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2304 disableSROA(LHS);
2305 disableSROA(RHS);
2306
2307 // If the instruction is floating point, and the target says this operation
2308 // is expensive, this may eventually become a library call. Treat the cost
2309 // as such. Unless it's fneg which can be implemented with an xor.
2310 using namespace llvm::PatternMatch;
2311 if (I.getType()->isFloatingPointTy() &&
2312 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2313 !match(&I, m_FNeg(m_Value())))
2314 onCallPenalty();
2315
2316 return false;
2317}
2318
2319bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2320 Value *Op = I.getOperand(0);
2321 Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);
2322
2323 Value *SimpleV = simplifyFNegInst(
2324 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2325
2326 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2327 SimplifiedValues[&I] = C;
2328
2329 if (SimpleV)
2330 return true;
2331
2332 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2333 disableSROA(Op);
2334
2335 return false;
2336}
2337
2338bool CallAnalyzer::visitLoad(LoadInst &I) {
2339 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2340 return true;
2341
2342 // If the data is already loaded from this address and hasn't been clobbered
2343 // by any stores or calls, this load is likely to be redundant and can be
2344 // eliminated.
2345 if (EnableLoadElimination &&
2346 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2347 onLoadEliminationOpportunity();
2348 return true;
2349 }
2350
2351 onMemAccess();
2352 return false;
2353}
2354
2355bool CallAnalyzer::visitStore(StoreInst &I) {
2356 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2357 return true;
2358
2359 // The store can potentially clobber loads and prevent repeated loads from
2360 // being eliminated.
2361 // FIXME:
2362 // 1. We can probably keep an initial set of eliminatable loads subtracted
2363 // from the cost even when we finally see a store. We just need to disable
2364 // *further* accumulation of elimination savings.
2365 // 2. We should probably at some point thread MemorySSA for the callee into
2366 // this and then use that to actually compute *really* precise savings.
2367 disableLoadElimination();
2368
2369 onMemAccess();
2370 return false;
2371}
2372
2373bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2374 Value *Op = I.getAggregateOperand();
2375
2376 // Special handling, because we want to simplify extractvalue with a
2377 // potential insertvalue from the caller.
2378 if (Value *SimpleOp = getSimplifiedValueUnchecked(Op)) {
2379 SimplifyQuery SQ(DL);
2380 Value *SimpleV = simplifyExtractValueInst(SimpleOp, I.getIndices(), SQ);
2381 if (SimpleV) {
2382 SimplifiedValues[&I] = SimpleV;
2383 return true;
2384 }
2385 }
2386
2387 // SROA can't look through these, but they may be free.
2388 return Base::visitExtractValue(I);
2389}
2390
2391bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2392 // Constant folding for insert value is trivial.
2393 if (simplifyInstruction(I))
2394 return true;
2395
2396 // SROA can't look through these, but they may be free.
2397 return Base::visitInsertValue(I);
2398}
2399
2400/// Try to simplify a call site.
2401///
2402/// Takes a concrete function and callsite and tries to actually simplify it by
2403/// analyzing the arguments and call itself with instsimplify. Returns true if
2404/// it has simplified the callsite to some other entity (a constant), making it
2405/// free.
2406bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2407 // FIXME: Using the instsimplify logic directly for this is inefficient
2408 // because we have to continually rebuild the argument list even when no
2409 // simplifications can be performed. Until that is fixed with remapping
2410 // inside of instsimplify, directly constant fold calls here.
2411 if (!canConstantFoldCallTo(&Call, F))
2412 return false;
2413
2414 // Try to re-map the arguments to constants.
2415 SmallVector<Constant *, 4> ConstantArgs;
2416 ConstantArgs.reserve(Call.arg_size());
2417 for (Value *I : Call.args()) {
2418 Constant *C = getDirectOrSimplifiedValue<Constant>(I);
2419 if (!C)
2420 return false; // This argument doesn't map to a constant.
2421
2422 ConstantArgs.push_back(C);
2423 }
2424 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2425 SimplifiedValues[&Call] = C;
2426 return true;
2427 }
2428
2429 return false;
2430}
2431
2432bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2433 const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2434 LibFunc LF;
2435 if (!TLI || !TLI->getLibFunc(*F, LF) || !TLI->has(LF))
2436 return TTI.isLoweredToCall(F);
2437
2438 switch (LF) {
2439 case LibFunc_memcpy_chk:
2440 case LibFunc_memmove_chk:
2441 case LibFunc_mempcpy_chk:
2442 case LibFunc_memset_chk: {
2443 // Calls to __memcpy_chk whose length is known to fit within the object
2444 // size will eventually be replaced by inline stores. Therefore, these
2445 // should not incur a call penalty. This is only really relevant on
2446 // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2447 // other platforms use memcpy intrinsics, which are already exempt from the
2448 // call penalty.
2449 auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(2));
2450 auto *ObjSizeOp =
2451 getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(3));
2452 if (LenOp && ObjSizeOp &&
2453 LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2454 return false;
2455 }
2456 break;
2457 }
2458 default:
2459 break;
2460 }
2461
2462 return TTI.isLoweredToCall(F);
2463}
2464
2465bool CallAnalyzer::visitCallBase(CallBase &Call) {
2466 if (!onCallBaseVisitStart(Call))
2467 return true;
2468
2469 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2470 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2471 // This aborts the entire analysis.
2472 ExposesReturnsTwice = true;
2473 return false;
2474 }
2475 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2476 ContainsNoDuplicateCall = true;
2477
2478 if (InlineAsm *InlineAsmOp = dyn_cast<InlineAsm>(Call.getCalledOperand()))
2479 onInlineAsm(*InlineAsmOp);
2480
2481 Function *F = Call.getCalledFunction();
2482 bool IsIndirectCall = !F;
2483 if (IsIndirectCall) {
2484 // Check if this happens to be an indirect function call to a known function
2485 // in this inline context. If not, we've done all we can.
2486 Value *Callee = Call.getCalledOperand();
2487 F = getSimplifiedValue<Function>(Callee);
2488 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2489 onCallArgumentSetup(Call);
2490
2491 if (!Call.onlyReadsMemory())
2492 disableLoadElimination();
2493 return Base::visitCallBase(Call);
2494 }
2495 }
2496
2497 assert(F && "Expected a call to a known function");
2498
2499 // When we have a concrete function, first try to simplify it directly.
2500 if (simplifyCallSite(F, Call))
2501 return true;
2502
2503 // Next check if it is an intrinsic we know about.
2504 // FIXME: Lift this into part of the InstVisitor.
2505 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2506 switch (II->getIntrinsicID()) {
2507 default:
2508 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2509 disableLoadElimination();
2510 return Base::visitCallBase(Call);
2511
2512 case Intrinsic::load_relative:
2513 onLoadRelativeIntrinsic();
2514 return false;
2515
2516 case Intrinsic::memset:
2517 case Intrinsic::memcpy:
2518 case Intrinsic::memmove:
2519 disableLoadElimination();
2520 // SROA can usually chew through these intrinsics, but they aren't free.
2521 return false;
2522 case Intrinsic::icall_branch_funnel:
2523 case Intrinsic::localescape:
2524 HasUninlineableIntrinsic = true;
2525 return false;
2526 case Intrinsic::vastart:
2527 InitsVargArgs = true;
2528 return false;
2529 case Intrinsic::launder_invariant_group:
2530 case Intrinsic::strip_invariant_group:
2531 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2532 SROAArgValues[II] = SROAArg;
2533 return true;
2534 case Intrinsic::is_constant:
2535 return simplifyIntrinsicCallIsConstant(Call);
2536 case Intrinsic::objectsize:
2537 return simplifyIntrinsicCallObjectSize(Call);
2538 }
2539 }
2540
2541 if (F == Call.getFunction()) {
2542 // This flag will fully abort the analysis, so don't bother with anything
2543 // else.
2544 IsRecursiveCall = true;
2545 if (!AllowRecursiveCall)
2546 return false;
2547 }
2548
2549 if (isLoweredToCall(F, Call)) {
2550 onLoweredCall(F, Call, IsIndirectCall);
2551 }
2552
2553 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2554 disableLoadElimination();
2555 return Base::visitCallBase(Call);
2556}
2557
2558bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2559 // At least one return instruction will be free after inlining.
2560 bool Free = !HasReturn;
2561 HasReturn = true;
2562 return Free;
2563}
2564
2565bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2566 // We model unconditional branches as essentially free -- they really
2567 // shouldn't exist at all, but handling them makes the behavior of the
2568 // inliner more regular and predictable. Interestingly, conditional branches
2569 // which will fold away are also free.
2570 return BI.isUnconditional() ||
2571 getDirectOrSimplifiedValue<ConstantInt>(BI.getCondition()) ||
2572 BI.getMetadata(LLVMContext::MD_make_implicit);
2573}
2574
2575bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2576 bool CheckSROA = SI.getType()->isPointerTy();
2577 Value *TrueVal = SI.getTrueValue();
2578 Value *FalseVal = SI.getFalseValue();
2579
2580 Constant *TrueC = getDirectOrSimplifiedValue<Constant>(TrueVal);
2581 Constant *FalseC = getDirectOrSimplifiedValue<Constant>(FalseVal);
2582 Constant *CondC = getSimplifiedValue<Constant>(SI.getCondition());
2583
2584 if (!CondC) {
2585 // Select C, X, X => X
2586 if (TrueC == FalseC && TrueC) {
2587 SimplifiedValues[&SI] = TrueC;
2588 return true;
2589 }
2590
2591 if (!CheckSROA)
2592 return Base::visitSelectInst(SI);
2593
2594 std::pair<Value *, APInt> TrueBaseAndOffset =
2595 ConstantOffsetPtrs.lookup(TrueVal);
2596 std::pair<Value *, APInt> FalseBaseAndOffset =
2597 ConstantOffsetPtrs.lookup(FalseVal);
2598 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2599 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2600
2601 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2602 SROAArgValues[&SI] = SROAArg;
2603 return true;
2604 }
2605
2606 return Base::visitSelectInst(SI);
2607 }
2608
2609 // Select condition is a constant.
2610 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2611 : (CondC->isNullValue()) ? FalseVal
2612 : nullptr;
2613 if (!SelectedV) {
2614 // Condition is a vector constant that is not all 1s or all 0s. If all
2615 // operands are constants, ConstantFoldSelectInstruction() can handle the
2616 // cases such as select vectors.
2617 if (TrueC && FalseC) {
2618 if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2619 SimplifiedValues[&SI] = C;
2620 return true;
2621 }
2622 }
2623 return Base::visitSelectInst(SI);
2624 }
2625
2626 // Condition is either all 1s or all 0s. SI can be simplified.
2627 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2628 SimplifiedValues[&SI] = SelectedC;
2629 return true;
2630 }
2631
2632 if (!CheckSROA)
2633 return true;
2634
2635 std::pair<Value *, APInt> BaseAndOffset =
2636 ConstantOffsetPtrs.lookup(SelectedV);
2637 if (BaseAndOffset.first) {
2638 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2639
2640 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2641 SROAArgValues[&SI] = SROAArg;
2642 }
2643
2644 return true;
2645}
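// Sketch of the cases above: with a constant condition such as
//   %s = select i1 true, ptr %p, ptr %q
// the select is free; if %p is a constant it is recorded in SimplifiedValues,
// and if %p is a tracked base+offset pointer the select inherits that mapping
// (and the SROA candidacy of %p). A vector condition that is neither all-ones
// nor all-zeros is only folded when both arms are constants.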
2646
2647bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2648 // We model unconditional switches as free, see the comments on handling
2649 // branches.
2650 if (getDirectOrSimplifiedValue<ConstantInt>(SI.getCondition()))
2651 return true;
2652
2653 // Assume the most general case where the switch is lowered into
2654 // either a jump table, bit test, or a balanced binary tree consisting of
2655 // case clusters without merging adjacent clusters with the same
2656 // destination. We do not consider the switches that are lowered with a mix
2657 // of jump table/bit test/binary search tree. The cost of the switch is
2658 // proportional to the size of the tree or the size of jump table range.
2659 //
2660 // NB: We convert large switches which are just used to initialize large phi
2661 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2662 // inlining those. It will prevent inlining in cases where the optimization
2663 // does not (yet) fire.
2664
2665 unsigned JumpTableSize = 0;
2666 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2667 unsigned NumCaseCluster =
2668 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2669
2670 onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUnreachable());
2671 return false;
2672}
2673
2674bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2675 // We never want to inline functions that contain an indirectbr. Inlining is
2676 // incorrect because all the blockaddresses (in static global initializers
2677 // for example) would be referring to the original function, and this
2678 // indirect jump would jump from the inlined copy of the function into the
2679 // original function which is extremely undefined behavior.
2680 // FIXME: This logic isn't really right; we can safely inline functions with
2681 // indirectbr's as long as no other function or global references the
2682 // blockaddress of a block within the current function.
2683 HasIndirectBr = true;
2684 return false;
2685}
2686
2687bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2688 // FIXME: It's not clear that a single instruction is an accurate model for
2689 // the inline cost of a resume instruction.
2690 return false;
2691}
2692
2693bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2694 // FIXME: It's not clear that a single instruction is an accurate model for
2695 // the inline cost of a cleanupret instruction.
2696 return false;
2697}
2698
2699bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2700 // FIXME: It's not clear that a single instruction is an accurate model for
2701 // the inline cost of a catchret instruction.
2702 return false;
2703}
2704
2705bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2706 // FIXME: It might be reasonable to discount the cost of instructions leading
2707 // to unreachable as they have the lowest possible impact on both runtime and
2708 // code size.
2709 return true; // No actual code is needed for unreachable.
2710}
2711
2712bool CallAnalyzer::visitInstruction(Instruction &I) {
2713 // Some instructions are free. All of the free intrinsics can also be
2714 // handled by SROA, etc.
2715 if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2716 TargetTransformInfo::TCC_Free)
2717 return true;
2718
2719 // We found something we don't understand or can't handle. Mark any SROA-able
2720 // values in the operand list as no longer viable.
2721 for (const Use &Op : I.operands())
2722 disableSROA(Op);
2723
2724 return false;
2725}
2726
2727/// Analyze a basic block for its contribution to the inline cost.
2728///
2729/// This method walks the analyzer over every instruction in the given basic
2730/// block and accounts for their cost during inlining at this callsite. It
2731/// aborts early if the threshold has been exceeded or an impossible to inline
2732/// construct has been detected. It returns false if inlining is no longer
2733/// viable, and true if inlining remains viable.
2734InlineResult
2735CallAnalyzer::analyzeBlock(BasicBlock *BB,
2736 const SmallPtrSetImpl<const Value *> &EphValues) {
2737 for (Instruction &I : *BB) {
2738 // FIXME: Currently, the number of instructions in a function, regardless of
2739 // our ability to simplify them during inlining to constants or dead code,
2740 // is actually used by the vector bonus heuristic. As long as that's true,
2741 // we have to special case debug intrinsics here to prevent differences in
2742 // inlining due to debug symbols. Eventually, the number of unsimplified
2743 // instructions shouldn't factor into the cost computation, but until then,
2744 // hack around it here.
2745 // Similarly, skip pseudo-probes.
2746 if (I.isDebugOrPseudoInst())
2747 continue;
2748
2749 // Skip ephemeral values.
2750 if (EphValues.count(&I))
2751 continue;
2752
2753 ++NumInstructions;
2754 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2755 ++NumVectorInstructions;
2756
2757 // If the instruction simplified to a constant, there is no cost to this
2758 // instruction. Visit the instructions using our InstVisitor to account for
2759 // all of the per-instruction logic. The visit tree returns true if we
2760 // consumed the instruction in any way, and false if the instruction's base
2761 // cost should count against inlining.
2762 onInstructionAnalysisStart(&I);
2763
2764 if (Base::visit(&I))
2765 ++NumInstructionsSimplified;
2766 else
2767 onMissedSimplification();
2768
2769 onInstructionAnalysisFinish(&I);
2770 using namespace ore;
2771 // If visiting this instruction detected an uninlinable pattern, abort.
2772 InlineResult IR = InlineResult::success();
2773 if (IsRecursiveCall && !AllowRecursiveCall)
2774 IR = InlineResult::failure("recursive");
2775 else if (ExposesReturnsTwice)
2776 IR = InlineResult::failure("exposes returns twice");
2777 else if (HasDynamicAlloca)
2778 IR = InlineResult::failure("dynamic alloca");
2779 else if (HasIndirectBr)
2780 IR = InlineResult::failure("indirect branch");
2781 else if (HasUninlineableIntrinsic)
2782 IR = InlineResult::failure("uninlinable intrinsic");
2783 else if (InitsVargArgs)
2784 IR = InlineResult::failure("varargs");
2785 if (!IR.isSuccess()) {
2786 if (ORE)
2787 ORE->emit([&]() {
2788 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2789 &CandidateCall)
2790 << NV("Callee", &F) << " has uninlinable pattern ("
2791 << NV("InlineResult", IR.getFailureReason())
2792 << ") and cost is not fully computed";
2793 });
2794 return IR;
2795 }
2796
2797 // If the caller is a recursive function then we don't want to inline
2798 // functions which allocate a lot of stack space because it would increase
2799 // the caller stack usage dramatically.
2800 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2801 auto IR =
2802 InlineResult::failure("recursive and allocates too much stack space");
2803 if (ORE)
2804 ORE->emit([&]() {
2805 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2806 &CandidateCall)
2807 << NV("Callee", &F) << " is "
2808 << NV("InlineResult", IR.getFailureReason())
2809 << ". Cost is not fully computed";
2810 });
2811 return IR;
2812 }
2813
2814 if (shouldStop())
2815 return InlineResult::failure(
2816 "Call site analysis is not favorable to inlining.");
2817 }
2818
2819 return InlineResult::success();
2820}
2821
2822/// Compute the base pointer and cumulative constant offsets for V.
2823///
2824/// This strips all constant offsets off of V, leaving it the base pointer, and
2825/// accumulates the total constant offset applied in the returned constant. It
2826/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2827/// no constant offsets applied.
2828ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2829 if (!V->getType()->isPointerTy())
2830 return nullptr;
2831
2832 unsigned AS = V->getType()->getPointerAddressSpace();
2833 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2834 APInt Offset = APInt::getZero(IntPtrWidth);
2835
2836 // Even though we don't look through PHI nodes, we could be called on an
2837 // instruction in an unreachable block, which may be on a cycle.
2838 SmallPtrSet<Value *, 4> Visited;
2839 Visited.insert(V);
2840 do {
2841 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2842 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2843 return nullptr;
2844 V = GEP->getPointerOperand();
2845 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2846 if (GA->isInterposable())
2847 break;
2848 V = GA->getAliasee();
2849 } else {
2850 break;
2851 }
2852 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2853 } while (Visited.insert(V).second);
2854
2855 Type *IdxPtrTy = DL.getIndexType(V->getType());
2856 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2857}
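// For example, a call argument of the form
//   getelementptr inbounds [16 x i32], ptr %buf, i64 0, i64 3
// is rewritten to %buf with a returned offset of 12 (3 * 4 bytes), which
// analyze() then uses to seed ConstantOffsetPtrs for the matching formal
// argument.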
2858
2859/// Find dead blocks due to deleted CFG edges during inlining.
2860///
2861/// If we know the successor of the current block, \p CurrBB, has to be \p
2862/// NextBB, the other successors of \p CurrBB are dead if these successors have
2863/// no live incoming CFG edges. If one block is found to be dead, we can
2864/// continue growing the dead block list by checking the successors of the dead
2865/// blocks to see if all their incoming edges are dead or not.
2866void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2867 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2868 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2869 // known successor which is not the one under exam.
2870 if (DeadBlocks.count(Pred))
2871 return true;
2872 BasicBlock *KnownSucc = KnownSuccessors[Pred];
2873 return KnownSucc && KnownSucc != Succ;
2874 };
2875
2876 auto IsNewlyDead = [&](BasicBlock *BB) {
2877 // If all the edges to a block are dead, the block is also dead.
2878 return (!DeadBlocks.count(BB) &&
2879 llvm::all_of(predecessors(BB),
2880 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2881 };
2882
2883 for (BasicBlock *Succ : successors(CurrBB)) {
2884 if (Succ == NextBB || !IsNewlyDead(Succ))
2885 continue;
2886 SmallVector<BasicBlock *, 4> NewDead;
2887 NewDead.push_back(Succ);
2888 while (!NewDead.empty()) {
2889 BasicBlock *Dead = NewDead.pop_back_val();
2890 if (DeadBlocks.insert(Dead).second)
2891 // Continue growing the dead block lists.
2892 for (BasicBlock *S : successors(Dead))
2893 if (IsNewlyDead(S))
2894 NewDead.push_back(S);
2895 }
2896 }
2897}
2898
2899/// Analyze a call site for potential inlining.
2900///
2901/// Returns true if inlining this call is viable, and false if it is not
2902/// viable. It computes the cost and adjusts the threshold based on numerous
2903/// factors and heuristics. If this method returns false but the computed cost
2904/// is below the computed threshold, then inlining was forcibly disabled by
2905/// some artifact of the routine.
2906InlineResult CallAnalyzer::analyze() {
2907 ++NumCallsAnalyzed;
2908
2909 auto Result = onAnalysisStart();
2910 if (!Result.isSuccess())
2911 return Result;
2912
2913 if (F.empty())
2914 return InlineResult::success();
2915
2916 Function *Caller = CandidateCall.getFunction();
2917 // Check if the caller function is recursive itself.
2918 for (User *U : Caller->users()) {
2919 CallBase *Call = dyn_cast<CallBase>(U);
2920 if (Call && Call->getFunction() == Caller) {
2921 IsCallerRecursive = true;
2922 break;
2923 }
2924 }
2925
2926 // Populate our simplified values by mapping from function arguments to call
2927 // arguments with known important simplifications.
2928 auto CAI = CandidateCall.arg_begin();
2929 for (Argument &FAI : F.args()) {
2930 assert(CAI != CandidateCall.arg_end());
2931 SimplifiedValues[&FAI] = *CAI;
2932 if (isa<Constant>(*CAI))
2933 ++NumConstantArgs;
2934
2935 Value *PtrArg = *CAI;
2936 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2937 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2938
2939 // We can SROA any pointer arguments derived from alloca instructions.
2940 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2941 SROAArgValues[&FAI] = SROAArg;
2942 onInitializeSROAArg(SROAArg);
2943 EnabledSROAAllocas.insert(SROAArg);
2944 }
2945 }
2946 ++CAI;
2947 }
2948 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2949 NumAllocaArgs = SROAArgValues.size();
2950
2951 // Collecting the ephemeral values of `F` can be expensive, so use the
2952 // ephemeral values cache if available.
2953 SmallPtrSet<const Value *, 32> EphValuesStorage;
2954 const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage;
2955 if (GetEphValuesCache)
2956 EphValues = &GetEphValuesCache(F).ephValues();
2957 else
2958 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F),
2959 EphValuesStorage);
2960
2961 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2962 // adding basic blocks of the callee which can be proven to be dead for this
2963 // particular call site in order to get more accurate cost estimates. This
2964 // requires a somewhat heavyweight iteration pattern: we need to walk the
2965 // basic blocks in a breadth-first order as we insert live successors. To
2966 // accomplish this, prioritizing for small iterations because we exit after
2967 // crossing our threshold, we use a small-size optimized SetVector.
2968 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2969 BBSetVector BBWorklist;
2970 BBWorklist.insert(&F.getEntryBlock());
2971
2972 // Note that we *must not* cache the size, this loop grows the worklist.
2973 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2974 if (shouldStop())
2975 break;
2976
2977 BasicBlock *BB = BBWorklist[Idx];
2978 if (BB->empty())
2979 continue;
2980
2981 onBlockStart(BB);
2982
2983 // Disallow inlining a blockaddress with uses other than strictly callbr.
2984 // A blockaddress only has defined behavior for an indirect branch in the
2985 // same function, and we do not currently support inlining indirect
2986 // branches. But, the inliner may not see an indirect branch that ends up
2987 // being dead code at a particular call site. If the blockaddress escapes
2988 // the function, e.g., via a global variable, inlining may lead to an
2989 // invalid cross-function reference.
2990 // FIXME: pr/39560: continue relaxing this overt restriction.
2991 if (BB->hasAddressTaken())
2992 for (User *U : BlockAddress::get(&*BB)->users())
2993 if (!isa<CallBrInst>(*U))
2994 return InlineResult::failure("blockaddress used outside of callbr");
2995
2996 // Analyze the cost of this block. If we blow through the threshold, this
2997 // returns false, and we can bail out.
2998 InlineResult IR = analyzeBlock(BB, *EphValues);
2999 if (!IR.isSuccess())
3000 return IR;
3001
3002 Instruction *TI = BB->getTerminator();
3003
3004 // Add in the live successors by first checking whether we have terminator
3005 // that may be simplified based on the values simplified by this call.
3006 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
3007 if (BI->isConditional()) {
3008 Value *Cond = BI->getCondition();
3009 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
3010 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
3011 BBWorklist.insert(NextBB);
3012 KnownSuccessors[BB] = NextBB;
3013 findDeadBlocks(BB, NextBB);
3014 continue;
3015 }
3016 }
3017 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
3018 Value *Cond = SI->getCondition();
3019 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
3020 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
3021 BBWorklist.insert(NextBB);
3022 KnownSuccessors[BB] = NextBB;
3023 findDeadBlocks(BB, NextBB);
3024 continue;
3025 }
3026 }
3027
3028 // If we're unable to select a particular successor, just count all of
3029 // them.
3030 BBWorklist.insert_range(successors(BB));
3031
3032 onBlockAnalyzed(BB);
3033 }
3034
3035 // If this is a noduplicate call, we can still inline as long as
3036 // inlining this would cause the removal of the caller (so the instruction
3037 // is not actually duplicated, just moved).
3038 if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
3039 return InlineResult::failure("noduplicate");
3040
3041 // If the callee's stack size exceeds the user-specified threshold,
3042 // do not let it be inlined.
3043 // The command line option overrides a limit set in the function attributes.
3044 size_t FinalStackSizeThreshold = StackSizeThreshold;
3045 if (!StackSizeThreshold.getNumOccurrences())
3046 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
3047 Caller, InlineConstants::MaxInlineStackSizeAttributeName))
3048 FinalStackSizeThreshold = *AttrMaxStackSize;
3049 if (AllocatedSize > FinalStackSizeThreshold)
3050 return InlineResult::failure("stacksize");
3051
3052 return finalizeAnalysis();
3053}
3054
3055void InlineCostCallAnalyzer::print(raw_ostream &OS) {
3056#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
3057 if (PrintInstructionComments)
3058 F.print(OS, &Writer);
3059 DEBUG_PRINT_STAT(NumConstantArgs);
3060 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
3061 DEBUG_PRINT_STAT(NumAllocaArgs);
3062 DEBUG_PRINT_STAT(NumConstantPtrCmps);
3063 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
3064 DEBUG_PRINT_STAT(NumInstructionsSimplified);
3065 DEBUG_PRINT_STAT(NumInstructions);
3066 DEBUG_PRINT_STAT(NumInlineAsmInstructions);
3067 DEBUG_PRINT_STAT(SROACostSavings);
3068 DEBUG_PRINT_STAT(SROACostSavingsLost);
3069 DEBUG_PRINT_STAT(LoadEliminationCost);
3070 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
3071 DEBUG_PRINT_STAT(Cost);
3072 DEBUG_PRINT_STAT(Threshold);
3073#undef DEBUG_PRINT_STAT
3074}
3075
3076#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3077/// Dump stats about this call's analysis.
3078LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
3079#endif
3080
3081/// Test that there are no attribute conflicts between Caller and Callee
3082/// that prevent inlining.
3083 static bool functionsHaveCompatibleAttributes(
3084 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
3085 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
3086 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
3087 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
3088 // object, and always returns the same object (which is overwritten on each
3089 // GetTLI call). Therefore we copy the first result.
3090 auto CalleeTLI = GetTLI(*Callee);
3091 return (IgnoreTTIInlineCompatible ||
3092 TTI.areInlineCompatible(Caller, Callee)) &&
3093 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
3094 InlineCallerSupersetNoBuiltin) &&
3095 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
3096}
3097
3098 int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
3099 const DataLayout &DL) {
3100 int64_t Cost = 0;
3101 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
3102 if (Call.isByValArgument(I)) {
3103 // We approximate the number of loads and stores needed by dividing the
3104 // size of the byval type by the target's pointer size.
3105 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
3106 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
3107 unsigned AS = PTy->getAddressSpace();
3108 unsigned PointerSize = DL.getPointerSizeInBits(AS);
3109 // Ceiling division.
3110 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
3111
3112 // If it generates more than 8 stores it is likely to be expanded as an
3113 // inline memcpy so we take that as an upper bound. Otherwise we assume
3114 // one load and one store per word copied.
3115 // FIXME: The maxStoresPerMemcpy setting from the target should be used
3116 // here instead of a magic number of 8, but it's not available via
3117 // DataLayout.
3118 NumStores = std::min(NumStores, 8U);
3119
3120 Cost += 2 * NumStores * InstrCost;
3121 } else {
3122 // For non-byval arguments subtract off one instruction per call
3123 // argument.
3124 Cost += InstrCost;
3125 }
3126 }
3127 // The call instruction also disappears after inlining.
3128 Cost += InstrCost;
3129 Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);
3130
3131 return std::min<int64_t>(Cost, INT_MAX);
3132}
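// Worked example (hypothetical call, assuming the default -inline-instr-cost
// of 5 and 64-bit pointers; not from this file): for a call passing one
// 48-byte struct byval plus one ordinary argument,
//   TypeSize  = 384 bits, PointerSize = 64 bits
//   NumStores = (384 + 64 - 1) / 64 = 6     (ceiling division, capped at 8)
//   byval     = 2 * 6 * InstrCost = 60      (one load and one store per word)
//   other arg = InstrCost = 5, the call itself = InstrCost = 5
// so getCallsiteCost returns roughly 70 plus whatever
// TTI.getInlineCallPenalty() adds for this caller/call pair.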
3133
3134 InlineCost llvm::getInlineCost(
3135 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
3136 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3137 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3138 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3139 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3140 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3141 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
3142 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE,
3143 GetEphValuesCache);
3144}
3145
3146 std::optional<int> llvm::getInliningCostEstimate(
3147 CallBase &Call, TargetTransformInfo &CalleeTTI,
3148 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3149 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3150 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3151 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3152 const InlineParams Params = {/* DefaultThreshold*/ 0,
3153 /*HintThreshold*/ {},
3154 /*ColdThreshold*/ {},
3155 /*OptSizeThreshold*/ {},
3156 /*OptMinSizeThreshold*/ {},
3157 /*HotCallSiteThreshold*/ {},
3158 /*LocallyHotCallSiteThreshold*/ {},
3159 /*ColdCallSiteThreshold*/ {},
3160 /*ComputeFullInlineCost*/ true,
3161 /*EnableDeferral*/ true};
3162
3163 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
3164 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
3165 /*IgnoreThreshold*/ true);
3166 auto R = CA.analyze();
3167 if (!R.isSuccess())
3168 return std::nullopt;
3169 return CA.getCost();
3170}
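// Minimal usage sketch (assumes a FunctionAnalysisManager FAM and a call site
// CB are in scope; the lambdas are illustrative and not code from this file):
//
//   auto GetAC = [&](Function &F) -> AssumptionCache & {
//     return FAM.getResult<AssumptionAnalysis>(F);
//   };
//   auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
//     return FAM.getResult<TargetLibraryAnalysis>(F);
//   };
//   if (std::optional<int> Cost = getInliningCostEstimate(
//           CB, CalleeTTI, GetAC, /*GetBFI=*/nullptr, GetTLI))
//     LLVM_DEBUG(dbgs() << "estimated inline cost: " << *Cost << "\n");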
3171
3172std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
3173 CallBase &Call, TargetTransformInfo &CalleeTTI,
3174 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3175 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3176 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3177 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3178 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
3179 PSI, ORE, *Call.getCalledFunction(), Call);
3180 auto R = CFA.analyze();
3181 if (!R.isSuccess())
3182 return std::nullopt;
3183 return CFA.features();
3184}
3185
3186 std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
3187 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
3188 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
3189
3190 // Cannot inline indirect calls.
3191 if (!Callee)
3192 return InlineResult::failure("indirect call");
3193
3194 // When a callee coroutine function is inlined into a caller coroutine
3195 // function before the coro-split pass, the coro-early pass cannot handle
3196 // this quite well. So we won't inline a coroutine function that has not
3197 // yet been split.
3198 if (Callee->isPresplitCoroutine())
3199 return InlineResult::failure("unsplited coroutine call");
3200
3201 // Never inline calls with byval arguments that do not have the alloca
3202 // address space. Since byval arguments can be replaced with a copy to an
3203 // alloca, the inlined code would need to be adjusted to handle the fact
3204 // that the argument is now in the alloca address space (which is a bit
3205 // complicated to solve).
3206 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3207 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3208 if (Call.isByValArgument(I)) {
3209 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
3210 if (PTy->getAddressSpace() != AllocaAS)
3211 return InlineResult::failure("byval arguments without alloca"
3212 " address space");
3213 }
3214
3215 // Calls to functions with always-inline attributes should be inlined
3216 // whenever possible.
3217 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
3218 if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
3219 return InlineResult::failure("noinline call site attribute");
3220
3221 auto IsViable = isInlineViable(*Callee);
3222 if (IsViable.isSuccess())
3223 return InlineResult::success();
3224 return InlineResult::failure(IsViable.getFailureReason());
3225 }
3226
3227 // Never inline functions with conflicting attributes (unless callee has
3228 // always-inline attribute).
3229 Function *Caller = Call.getCaller();
3230 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
3231 return InlineResult::failure("conflicting attributes");
3232
3233 // Don't inline this call if the caller has the optnone attribute.
3234 if (Caller->hasOptNone())
3235 return InlineResult::failure("optnone attribute");
3236
3237 // Don't inline a function that treats null pointer as valid into a caller
3238 // that does not have this attribute.
3239 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3240 return InlineResult::failure("nullptr definitions incompatible");
3241
3242 // Don't inline functions which can be interposed at link-time.
3243 if (Callee->isInterposable())
3244 return InlineResult::failure("interposable");
3245
3246 // Don't inline functions marked noinline.
3247 if (Callee->hasFnAttribute(Attribute::NoInline))
3248 return InlineResult::failure("noinline function attribute");
3249
3250 // Don't inline call sites marked noinline.
3251 if (Call.isNoInline())
3252 return InlineResult::failure("noinline call site attribute");
3253
3254 // Don't inline functions that are loader replaceable.
3255 if (Callee->hasFnAttribute("loader-replaceable"))
3256 return InlineResult::failure("loader replaceable function attribute");
3257
3258 return std::nullopt;
3259}
3260
3261 InlineCost llvm::getInlineCost(
3262 CallBase &Call, Function *Callee, const InlineParams &Params,
3263 TargetTransformInfo &CalleeTTI,
3264 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3265 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3266 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3267 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3268 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3269
3270 auto UserDecision =
3271 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3272
3273 if (UserDecision) {
3274 if (UserDecision->isSuccess())
3275 return llvm::InlineCost::getAlways("always inline attribute");
3276 return llvm::InlineCost::getNever(UserDecision->getFailureReason());
3277 }
3278
3281 "Inlining forced by -inline-all-viable-calls");
3282
3283 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3284 << "... (caller:" << Call.getCaller()->getName()
3285 << ")\n");
3286
3287 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3288 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
3289 /*BoostIndirect=*/true, /*IgnoreThreshold=*/false,
3290 GetEphValuesCache);
3291 InlineResult ShouldInline = CA.analyze();
3292
3293 LLVM_DEBUG(CA.dump());
3294
3295 // Always make a cost-benefit based decision explicit.
3296 // We use always/never here since the threshold is not meaningful,
3297 // as it is not what drives the cost-benefit analysis.
3298 if (CA.wasDecidedByCostBenefit()) {
3299 if (ShouldInline.isSuccess())
3300 return InlineCost::getAlways("benefit over cost",
3301 CA.getCostBenefitPair());
3302 else
3303 return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
3304 }
3305
3306 if (CA.wasDecidedByCostThreshold())
3307 return InlineCost::get(CA.getCost(), CA.getThreshold(),
3308 CA.getStaticBonusApplied());
3309
3310 // No details on how the decision was made, simply return always or never.
3311 return ShouldInline.isSuccess()
3312 ? InlineCost::getAlways("empty function")
3313 : InlineCost::getNever(ShouldInline.getFailureReason());
3314}
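// Minimal usage sketch (hypothetical caller code, not from this file): a
// client typically turns the returned InlineCost into a yes/no decision, e.g.
//
//   InlineCost IC = getInlineCost(CB, Params, CalleeTTI, GetAC, GetTLI);
//   if (IC.isAlways() ||
//       (IC.isVariable() && IC.getCost() < IC.getThreshold()))
//     ; // deemed profitable: hand the call site to InlineFunction()
//   else
//     ; // isNever() or over threshold: keep the call site as-is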
3315
3316 InlineResult llvm::isInlineViable(Function &F) {
3317 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
3318 for (BasicBlock &BB : F) {
3319 // Disallow inlining of functions which contain indirect branches.
3320 if (isa<IndirectBrInst>(BB.getTerminator()))
3321 return InlineResult::failure("contains indirect branches");
3322
3323 // Disallow inlining of blockaddresses which are used by non-callbr
3324 // instructions.
3325 if (BB.hasAddressTaken())
3326 for (User *U : BlockAddress::get(&BB)->users())
3327 if (!isa<CallBrInst>(*U))
3328 return InlineResult::failure("blockaddress used outside of callbr");
3329
3330 for (auto &II : BB) {
3331 CallBase *Call = dyn_cast<CallBase>(&II);
3332 if (!Call)
3333 continue;
3334
3335 // Disallow recursive calls.
3336 Function *Callee = Call->getCalledFunction();
3337 if (&F == Callee)
3338 return InlineResult::failure("recursive call");
3339
3340 // Disallow calls which expose returns-twice to a function not previously
3341 // attributed as such.
3342 if (!ReturnsTwice && isa<CallInst>(Call) &&
3343 cast<CallInst>(Call)->canReturnTwice())
3344 return InlineResult::failure("exposes returns-twice attribute");
3345
3346 if (Callee)
3347 switch (Callee->getIntrinsicID()) {
3348 default:
3349 break;
3350 case llvm::Intrinsic::icall_branch_funnel:
3351 // Disallow inlining of @llvm.icall.branch.funnel because the current
3352 // backend can't separate call targets from call arguments.
3353 return InlineResult::failure(
3354 "disallowed inlining of @llvm.icall.branch.funnel");
3355 case llvm::Intrinsic::localescape:
3356 // Disallow inlining functions that call @llvm.localescape. Doing this
3357 // correctly would require major changes to the inliner.
3358 return InlineResult::failure(
3359 "disallowed inlining of @llvm.localescape");
3360 case llvm::Intrinsic::vastart:
3361 // Disallow inlining of functions that initialize VarArgs with
3362 // va_start.
3363 return InlineResult::failure(
3364 "contains VarArgs initialized with va_start");
3365 }
3366 }
3367 }
3368
3369 return InlineResult::success();
3370}
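// Concrete example (hypothetical C source, not from this file): a variadic
// callee that expands va_start is reported as non-viable by the loop above:
//
//   int sum(int n, ...) {
//     va_list ap;
//     va_start(ap, n);  // lowers to the llvm.va_start intrinsic ->
//     ...               //   "contains VarArgs initialized with va_start"
//   }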
3371
3372// APIs to create InlineParams based on command line flags and/or other
3373// parameters.
3374
3374
3375 InlineParams llvm::getInlineParams(int Threshold) {
3376 InlineParams Params;
3377
3378 // This field is the threshold to use for a callee by default. This is
3379 // derived from one or more of:
3380 // * optimization or size-optimization levels,
3381 // * a value passed to createFunctionInliningPass function, or
3382 // * the -inline-threshold flag.
3383 // If the -inline-threshold flag is explicitly specified, that is used
3384 // irrespective of anything else.
3385 if (InlineThreshold.getNumOccurrences() > 0)
3386 Params.DefaultThreshold = InlineThreshold;
3387 else
3388 Params.DefaultThreshold = Threshold;
3389
3390 // Set the HintThreshold knob from the -inlinehint-threshold.
3391 Params.HintThreshold = HintThreshold;
3392
3393 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3394 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3395
3396 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3397 // populate LocallyHotCallSiteThreshold. Later, we populate
3398 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3399 // we know that optimization level is O3 (in the getInlineParams variant that
3400 // takes the opt and size levels).
3401 // FIXME: Remove this check (and make the assignment unconditional) after
3402 // addressing size regression issues at O2.
3403 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3404 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3405
3406 // Set the ColdCallSiteThreshold knob from the
3407 // -inline-cold-callsite-threshold.
3408 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3409
3410 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3411 // -inline-threshold commandline option is not explicitly given. If that
3412 // option is present, then its value applies even for callees with size and
3413 // minsize attributes.
3414 // If the -inline-threshold is not specified, set the ColdThreshold from the
3415 // -inlinecold-threshold even if it is not explicitly passed. If
3416 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3417 // explicitly specified to set the ColdThreshold knob
3418 if (InlineThreshold.getNumOccurrences() == 0) {
3419 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3420 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3421 Params.ColdThreshold = ColdThreshold;
3422 } else if (ColdThreshold.getNumOccurrences() > 0) {
3423 Params.ColdThreshold = ColdThreshold;
3424 }
3425 return Params;
3426}
3427
3428 InlineParams llvm::getInlineParams() {
3429 return getInlineParams(DefaultThreshold);
3430 }
3431
3432// Compute the default threshold for inlining based on the opt level and the
3433// size opt level.
3434static int computeThresholdFromOptLevels(unsigned OptLevel,
3435 unsigned SizeOptLevel) {
3436 if (OptLevel > 2)
3437 return InlineConstants::OptAggressiveThreshold;
3438 if (SizeOptLevel == 1) // -Os
3439 return InlineConstants::OptSizeThreshold;
3440 if (SizeOptLevel == 2) // -Oz
3441 return InlineConstants::OptMinSizeThreshold;
3442 return DefaultThreshold;
3443}
3444
3445InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3446 auto Params =
3447 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3448 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3449 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3450 // when it is specified explicitly.
3451 if (OptLevel > 2)
3452 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3453 return Params;
3454}
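// Worked mapping (assuming neither -inline-threshold nor
// -inlinedefault-threshold is overridden on the command line; illustrative):
//
//   getInlineParams(/*OptLevel=*/3, /*SizeOptLevel=*/0).DefaultThreshold
//       == InlineConstants::OptAggressiveThreshold   // -O3
//   getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/1).DefaultThreshold
//       == InlineConstants::OptSizeThreshold         // -Os
//   getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/2).DefaultThreshold
//       == InlineConstants::OptMinSizeThreshold      // -Oz
//   getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/0).DefaultThreshold
//       == DefaultThreshold                          // 225 by default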
3455
3456 PreservedAnalyses
3457 InlineCostAnnotationPrinterPass::run(Function &F,
3458 FunctionAnalysisManager &FAM) {
3459 PrintInstructionComments = true;
3460 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3461 [&](Function &F) -> AssumptionCache & {
3462 return FAM.getResult<AssumptionAnalysis>(F);
3463 };
3464
3465 auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
3466 ProfileSummaryInfo *PSI =
3467 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
3468 const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
3469
3470 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3471 // In the current implementation, the type of InlineParams doesn't matter as
3472 // the pass serves only for verification of inliner's decisions.
3473 // We can add a flag which determines InlineParams for this run. Right now,
3474 // the default InlineParams are used.
3475 const InlineParams Params = llvm::getInlineParams();
3476 for (BasicBlock &BB : F) {
3477 for (Instruction &I : BB) {
3478 if (auto *CB = dyn_cast<CallBase>(&I)) {
3479 Function *CalledFunction = CB->getCalledFunction();
3480 if (!CalledFunction || CalledFunction->isDeclaration())
3481 continue;
3482 OptimizationRemarkEmitter ORE(CalledFunction);
3483 InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
3484 GetAssumptionCache, nullptr, nullptr, PSI,
3485 &ORE);
3486 ICCA.analyze();
3487 OS << " Analyzing call of " << CalledFunction->getName()
3488 << "... (caller:" << CB->getCaller()->getName() << ")\n";
3489 ICCA.print(OS);
3490 OS << "\n";
3491 }
3492 }
3493 }
3494 return PreservedAnalyses::all();
3495}
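// Usage note (assuming the pass is registered as print<inline-cost> in
// PassRegistry.def; the input file name is illustrative): this printer is
// normally driven from opt, e.g.
//
//   opt -passes='print<inline-cost>' -disable-output input.ll
//
// which emits the per-instruction cost annotations together with the
// DEBUG_PRINT_STAT summary for every direct call to a defined callee.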