#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const SITargetLowering *TLI_)
      : FIM(FIM_), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL = nullptr;

  const SITargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large stride access.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

} // namespace
static std::pair<const Value *, const Type *>
getMemoryInstrPtrAndType(const Instruction *Inst) {
  if (const auto *LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (const auto *SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (const auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}
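// Illustrative example (not from this file): given the IR
//   %v = load i32, ptr addrspace(1) %p
// the helper returns {%p, i32}. A {nullptr, nullptr} result is how callers
// below test whether an instruction is a memory access at all.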
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallPtrSet<const Value *, 32> WorkSet;
  SmallPtrSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(V);
    if (!Visited.insert(V).second)
      continue;

    if (const auto *LD = dyn_cast<LoadInst>(V)) {
      const auto *M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        // The address itself was loaded from global memory.
        return true;
      }
      continue;
    }

    if (const auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
      const auto *P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (const auto *U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (const auto *BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (const auto *S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (const auto *E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }
  }

  return false;
}
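// Illustrative IR for the pattern detected above (hypothetical values):
//   %ptr = load ptr addrspace(1), ptr addrspace(1) %table ; address is itself
//   %v   = load i32, ptr addrspace(1) %ptr                ; loaded from memory
// Tracing %v's address through the worklist reaches a load from a global
// address, so %v counts as an indirect access.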
// Returns true if the global load is used by an instruction in the same
// basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into the load or store.
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;

  // Do not consider local memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(const_cast<Value *>(MO),
                                              MAI.Offset, *DL);
  return MAI;
}
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {
  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
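// Worked example (hypothetical offsets): two accesses sharing base %b at
// offsets 0 and 256 give Diff = 256 > LargeStrideThresh (default 64), so the
// second access counts as large stride; offsets 0 and 32 would not.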
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}
bool AMDGPUPerfHintAnalysis::runOnSCC(const GCNTargetMachine &TM,
                                      CallGraphSCC &SCC) {
  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}
bool AMDGPUPerfHintAnalysis::run(const GCNTargetMachine &TM,
                                 LazyCallGraph &CG) {
  bool Changed = false;
  CG.buildRefSCCs();
  for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
    for (LazyCallGraph::SCC &SCC : RC) {
      Function &F = SCC.begin()->getFunction();
      if (F.isDeclaration())
        continue;
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
      if (Analyzer.runOnFunction(F))
        Changed = true;
    }
  }

  return Changed;
}
char AMDGPUPerfHintAnalysisLegacy::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisLegacyID = AMDGPUPerfHintAnalysisLegacy::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysisLegacy, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const GCNTargetMachine &TM = TPC->getTM<GCNTargetMachine>();
  return Impl.runOnSCC(TM, SCC);
}

PreservedAnalyses AMDGPUPerfHintAnalysisPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {
  auto &CG = AM.getResult<LazyCallGraphAnalysis>(M);

  bool Changed = Impl->run(TM, CG);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserve<LazyCallGraphAnalysis>();
  return PA;
}
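// Resulting IR (illustrative): functions flagged by this analysis carry
// string function attributes such as
//   attributes #0 = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" }
// which the early-exit check in runOnFunction() and downstream AMDGPU code
// query via hasFnAttribute().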