LLVM 22.0.0git
AMDGPUResourceUsageAnalysis.cpp
Go to the documentation of this file.
1//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// \brief Analyzes how many registers and other resources are used by
11/// functions.
12///
13/// The results of this analysis are used to fill the register usage, flat
14/// usage, etc. into hardware registers.
15///
16//===----------------------------------------------------------------------===//
17
19#include "AMDGPU.h"
20#include "GCNSubtarget.h"
25#include "llvm/IR/GlobalValue.h"
27
28using namespace llvm;
29using namespace llvm::AMDGPU;
30
31#define DEBUG_TYPE "amdgpu-resource-usage"
32
36
37// In code object v4 and older, we need to tell the runtime some amount ahead of
38// time if we don't know the true stack size. Assume a smaller number if this is
39// only due to dynamic / non-entry block allocas.
41 "amdgpu-assume-external-call-stack-size",
42 cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
43 cl::init(16384));
44
46 "amdgpu-assume-dynamic-stack-object-size",
47 cl::desc("Assumed extra stack use if there are any "
48 "variable sized objects (in bytes)"),
49 cl::Hidden, cl::init(4096));
50
52 "Function register usage analysis", true, true)
53
54static const Function *getCalleeFunction(const MachineOperand &Op) {
55 if (Op.isImm()) {
56 assert(Op.getImm() == 0);
57 return nullptr;
58 }
59 return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
60}
61
63 const SIInstrInfo &TII, unsigned Reg) {
64 for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
65 if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
66 return true;
67 }
68
69 return false;
70}
71
73 MachineFunction &MF) {
74 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
75 if (!TPC)
76 return false;
77
78 const TargetMachine &TM = TPC->getTM<TargetMachine>();
79 const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
80
81 // By default, for code object v5 and later, track only the minimum scratch
82 // size
83 uint32_t AssumedStackSizeForDynamicSizeObjects =
85 uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
88 STI.getTargetTriple().getOS() == Triple::AMDPAL) {
89 if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
90 AssumedStackSizeForDynamicSizeObjects = 0;
91 if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
92 AssumedStackSizeForExternalCall = 0;
93 }
94
96 MF, AssumedStackSizeForDynamicSizeObjects,
97 AssumedStackSizeForExternalCall);
98
99 return false;
100}
101
// Unique address-identity key for this analysis, required by the new pass
// manager to identify and cache the analysis result.
102AnalysisKey AMDGPUResourceUsageAnalysis::Key;
106 const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
107
108 // By default, for code object v5 and later, track only the minimum scratch
109 // size
110 uint32_t AssumedStackSizeForDynamicSizeObjects =
112 uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
116 if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
117 AssumedStackSizeForDynamicSizeObjects = 0;
118 if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
119 AssumedStackSizeForExternalCall = 0;
120 }
121
123 MF, AssumedStackSizeForDynamicSizeObjects,
124 AssumedStackSizeForExternalCall);
125}
126
129 const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
130 uint32_t AssumedStackSizeForExternalCall) const {
132
134 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
135 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
136 const MachineRegisterInfo &MRI = MF.getRegInfo();
137 const SIInstrInfo *TII = ST.getInstrInfo();
138 const SIRegisterInfo &TRI = TII->getRegisterInfo();
139
140 Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
141 MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
142 MRI.isLiveIn(MFI->getPreloadedReg(
144
145 Info.NumNamedBarrier = MFI->getNumNamedBarriers();
146
147 // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
148 // instructions aren't used to access the scratch buffer. Inline assembly may
149 // need it though.
150 //
151 // If we only have implicit uses of flat_scr on flat instructions, it is not
152 // really needed.
153 if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
154 (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
155 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
156 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
157 Info.UsesFlatScratch = false;
158 }
159
160 Info.PrivateSegmentSize = FrameInfo.getStackSize();
161
162 // Assume a big number if there are any unknown sized objects.
163 Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
164 if (Info.HasDynamicallySizedStack)
165 Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
166
167 if (MFI->isStackRealigned())
168 Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
169
170 Info.UsesVCC =
171 MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
172 Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
173 /*IncludeCalls=*/false);
174 if (ST.hasMAIInsts())
175 Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
176 /*IncludeCalls=*/false);
177
178 // If there are no calls, MachineRegisterInfo can tell us the used register
179 // count easily.
180 // A tail call isn't considered a call for MachineFrameInfo's purposes.
181 if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
182 Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
183 /*IncludeCalls=*/false);
184 return Info;
185 }
186
187 int32_t MaxVGPR = -1;
188 Info.CalleeSegmentSize = 0;
189
190 for (const MachineBasicBlock &MBB : MF) {
191 for (const MachineInstr &MI : MBB) {
192 for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
193 const MachineOperand &MO = MI.getOperand(I);
194
195 if (!MO.isReg())
196 continue;
197
198 Register Reg = MO.getReg();
199 switch (Reg) {
200 case AMDGPU::NoRegister:
201 assert(MI.isDebugInstr() &&
202 "Instruction uses invalid noreg register");
203 continue;
204
205 case AMDGPU::XNACK_MASK:
206 case AMDGPU::XNACK_MASK_LO:
207 case AMDGPU::XNACK_MASK_HI:
208 llvm_unreachable("xnack_mask registers should not be used");
209
210 case AMDGPU::LDS_DIRECT:
211 llvm_unreachable("lds_direct register should not be used");
212
213 case AMDGPU::TBA:
214 case AMDGPU::TBA_LO:
215 case AMDGPU::TBA_HI:
216 case AMDGPU::TMA:
217 case AMDGPU::TMA_LO:
218 case AMDGPU::TMA_HI:
219 llvm_unreachable("trap handler registers should not be used");
220
221 case AMDGPU::SRC_VCCZ:
222 llvm_unreachable("src_vccz register should not be used");
223
224 case AMDGPU::SRC_EXECZ:
225 llvm_unreachable("src_execz register should not be used");
226
227 case AMDGPU::SRC_SCC:
228 llvm_unreachable("src_scc register should not be used");
229
230 default:
231 break;
232 }
233
234 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
235 assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
236 TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
237 AMDGPU::TTMP_64RegClass.contains(Reg) ||
238 AMDGPU::TTMP_128RegClass.contains(Reg) ||
239 AMDGPU::TTMP_256RegClass.contains(Reg) ||
240 AMDGPU::TTMP_512RegClass.contains(Reg)) &&
241 "Unknown register class");
242
243 if (!RC || !TRI.isVGPRClass(RC))
244 continue;
245
246 if (MI.isCall() || MI.isMetaInstruction())
247 continue;
248
249 unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
250 unsigned HWReg = TRI.getHWRegIndex(Reg);
251 int MaxUsed = HWReg + Width - 1;
252 MaxVGPR = std::max(MaxUsed, MaxVGPR);
253 }
254
255 if (MI.isCall()) {
256 // Pseudo used just to encode the underlying global. Is there a better
257 // way to track this?
258
259 const MachineOperand *CalleeOp =
260 TII->getNamedOperand(MI, AMDGPU::OpName::callee);
261
262 const Function *Callee = getCalleeFunction(*CalleeOp);
263
264 // Avoid crashing on undefined behavior with an illegal call to a
265 // kernel. If a callsite's calling convention doesn't match the
266 // function's, it's undefined behavior. If the callsite calling
267 // convention does match, that would have errored earlier.
268 if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
269 report_fatal_error("invalid call to entry function");
270
271 auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
272 return F == &MF.getFunction();
273 };
274
275 if (Callee && !isSameFunction(MF, Callee))
276 Info.Callees.push_back(Callee);
277
278 bool IsIndirect = !Callee || Callee->isDeclaration();
279
280 // FIXME: Call site could have norecurse on it
281 if (!Callee || !Callee->doesNotRecurse()) {
282 Info.HasRecursion = true;
283
284 // TODO: If we happen to know there is no stack usage in the
285 // callgraph, we don't need to assume an infinitely growing stack.
286 if (!MI.isReturn()) {
287 // We don't need to assume an unknown stack size for tail calls.
288
289 // FIXME: This only benefits in the case where the kernel does not
290 // directly call the tail called function. If a kernel directly
291 // calls a tail recursive function, we'll assume maximum stack size
292 // based on the regular call instruction.
293 Info.CalleeSegmentSize = std::max(
294 Info.CalleeSegmentSize,
295 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
296 }
297 }
298
299 if (IsIndirect) {
300 Info.CalleeSegmentSize =
301 std::max(Info.CalleeSegmentSize,
302 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
303
304 // Register usage of indirect calls gets handled later
305 Info.UsesVCC = true;
306 Info.UsesFlatScratch = ST.hasFlatAddressSpace();
307 Info.HasDynamicallySizedStack = true;
308 Info.HasIndirectCall = true;
309 }
310 }
311 }
312 }
313
314 Info.NumVGPR = MaxVGPR + 1;
315
316 return Info;
317}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
static cl::opt< uint32_t > clAssumedStackSizeForDynamicSizeObjects("amdgpu-assume-dynamic-stack-object-size", cl::desc("Assumed extra stack use if there are any " "variable sized objects (in bytes)"), cl::Hidden, cl::init(4096))
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, const SIInstrInfo &TII, unsigned Reg)
static cl::opt< uint32_t > clAssumedStackSizeForExternalCall("amdgpu-assume-external-call-stack-size", cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden, cl::init(16384))
#define DEBUG_TYPE
Analyzes how many registers and other resources are used by functions.
MachineBasicBlock & MBB
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:56
Target-Independent Code Generator Pass Configuration Options pass.
Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
This class represents an Operation in the Expression.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:72
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
const MCSubtargetInfo * getMCSubtargetInfo() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:417
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
char & AMDGPUResourceUsageAnalysisID
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects, uint32_t AssumedStackSizeForExternalCall) const
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:29