AMDGPUAttributor.cpp
1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "GCNSubtarget.h"
15#include "Utils/AMDGPUBaseInfo.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/Target/TargetMachine.h"
19#include "llvm/Transforms/IPO/Attributor.h"
20
21#define DEBUG_TYPE "amdgpu-attributor"
22
23using namespace llvm;
24
26 "amdgpu-indirect-call-specialization-threshold",
28 "A threshold controls whether an indirect call will be specialized"),
29 cl::init(3));
30
31#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
32
33enum ImplicitArgumentPositions {
34#include "AMDGPUAttributes.def"
35 LAST_ARG_POS
36};
37
38#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
39
40enum ImplicitArgumentMask {
41 NOT_IMPLICIT_INPUT = 0,
42#include "AMDGPUAttributes.def"
43 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
44};
45
46#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
47static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
48 ImplicitAttrs[] = {
49#include "AMDGPUAttributes.def"
50};
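
The three expansions above redefine AMDGPU_ATTRIBUTE before each include of AMDGPUAttributes.def: once to assign bit positions, once to turn those positions into one-hot masks, and once to build the mask-to-string table. A minimal self-contained sketch of the same X-macro pattern, using a hypothetical two-entry list in place of the real AMDGPUAttributes.def:

#include <cstdint>
#include <cstdio>
#include <utility>

// Hypothetical stand-in for AMDGPUAttributes.def; each entry pairs an enum
// name with the IR attribute string it controls.
#define ATTRIBUTE_LIST(X)                                                      \
  X(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")                                    \
  X(QUEUE_PTR, "amdgpu-no-queue-ptr")

// Pass 1: bit positions.
#define POS_ENTRY(Name, Str) Name##_POS,
enum ImplicitArgumentPositions { ATTRIBUTE_LIST(POS_ENTRY) LAST_ARG_POS };

// Pass 2: one-hot masks derived from the positions.
#define MASK_ENTRY(Name, Str) Name = 1u << Name##_POS,
enum ImplicitArgumentMask : uint32_t {
  NOT_IMPLICIT_INPUT = 0,
  ATTRIBUTE_LIST(MASK_ENTRY)
  ALL_ARGUMENT_MASK = (1u << LAST_ARG_POS) - 1
};

// Pass 3: mask -> attribute-string table.
#define TABLE_ENTRY(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, const char *> ImplicitAttrs[] =
    {ATTRIBUTE_LIST(TABLE_ENTRY)};

int main() {
  for (auto [Mask, Str] : ImplicitAttrs)
    std::printf("0x%x -> %s\n", static_cast<unsigned>(Mask), Str);
}
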
51
52// We do not need to note the x workitem or workgroup id because they are always
53// initialized.
54//
55// TODO: We should not add the attributes if the known compile time workgroup
56// size is 1 for y/z.
57static ImplicitArgumentMask
58intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
59 bool HasApertureRegs, bool SupportsGetDoorBellID,
60 unsigned CodeObjectVersion) {
61 switch (ID) {
62 case Intrinsic::amdgcn_workitem_id_x:
63 NonKernelOnly = true;
64 return WORKITEM_ID_X;
65 case Intrinsic::amdgcn_workgroup_id_x:
66 NonKernelOnly = true;
67 return WORKGROUP_ID_X;
68 case Intrinsic::amdgcn_workitem_id_y:
69 case Intrinsic::r600_read_tidig_y:
70 return WORKITEM_ID_Y;
71 case Intrinsic::amdgcn_workitem_id_z:
72 case Intrinsic::r600_read_tidig_z:
73 return WORKITEM_ID_Z;
74 case Intrinsic::amdgcn_workgroup_id_y:
75 case Intrinsic::r600_read_tgid_y:
76 return WORKGROUP_ID_Y;
77 case Intrinsic::amdgcn_workgroup_id_z:
78 case Intrinsic::r600_read_tgid_z:
79 return WORKGROUP_ID_Z;
80 case Intrinsic::amdgcn_lds_kernel_id:
81 return LDS_KERNEL_ID;
82 case Intrinsic::amdgcn_dispatch_ptr:
83 return DISPATCH_PTR;
84 case Intrinsic::amdgcn_dispatch_id:
85 return DISPATCH_ID;
86 case Intrinsic::amdgcn_implicitarg_ptr:
87 return IMPLICIT_ARG_PTR;
88 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
89 // queue_ptr.
90 case Intrinsic::amdgcn_queue_ptr:
91 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
92 return QUEUE_PTR;
93 case Intrinsic::amdgcn_is_shared:
94 case Intrinsic::amdgcn_is_private:
95 if (HasApertureRegs)
96 return NOT_IMPLICIT_INPUT;
97 // Under V5, we need implicitarg_ptr + offsets to access private_base or
98 // shared_base. For pre-V5, however, need to access them through queue_ptr +
99 // offsets.
100 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
101 : QUEUE_PTR;
102 case Intrinsic::trap:
103 case Intrinsic::debugtrap:
104 case Intrinsic::ubsantrap:
105 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
106 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
107 : QUEUE_PTR;
108 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
109 return QUEUE_PTR;
110 default:
111 return NOT_IMPLICIT_INPUT;
112 }
113}
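
The mask returned here drives AAAMDAttributesFunction::updateImpl further down: the pass starts by assuming every implicit argument is unneeded, clears the bit for each intrinsic it encounters, and finally manifests an "amdgpu-no-*" attribute for every bit that survives. A hedged, LLVM-free sketch of that assume-then-subtract flow over a plain bit set (toy masks, not the generated ones):

#include <cstdint>
#include <cstdio>

// Toy masks standing in for the generated ImplicitArgumentMask values.
enum : uint32_t {
  WORKITEM_ID_Y = 1u << 0,
  WORKITEM_ID_Z = 1u << 1,
  QUEUE_PTR     = 1u << 2,
  ALL           = (1u << 3) - 1
};

int main() {
  // Start optimistically: assume no implicit argument is needed.
  uint32_t Assumed = ALL;

  // Seeing a call to (say) llvm.amdgcn.workitem.id.y clears that bit,
  // mirroring removeAssumedBits(AttrMask) in the pass.
  Assumed &= ~WORKITEM_ID_Y;

  // Bits still set at the fixpoint become "amdgpu-no-*" attributes.
  if (Assumed & WORKITEM_ID_Z)
    std::printf("would add amdgpu-no-workitem-id-z\n");
  if (Assumed & QUEUE_PTR)
    std::printf("would add amdgpu-no-queue-ptr\n");
  if (!(Assumed & WORKITEM_ID_Y))
    std::printf("workitem id y is (assumed) needed, no attribute added\n");
}
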
114
115static bool castRequiresQueuePtr(unsigned SrcAS) {
116 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
117}
118
119static bool isDSAddress(const Constant *C) {
120 const GlobalValue *GV = dyn_cast<GlobalValue>(C);
121 if (!GV)
122 return false;
123 unsigned AS = GV->getAddressSpace();
124 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
125}
126
127/// Returns true if the function requires the implicit argument be passed
128/// regardless of the function contents.
129static bool funcRequiresHostcallPtr(const Function &F) {
130 // Sanitizers require the hostcall buffer passed in the implicit arguments.
131 return F.hasFnAttribute(Attribute::SanitizeAddress) ||
132 F.hasFnAttribute(Attribute::SanitizeThread) ||
133 F.hasFnAttribute(Attribute::SanitizeMemory) ||
134 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
135 F.hasFnAttribute(Attribute::SanitizeMemTag);
136}
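
funcRequiresHostcallPtr keys off the five sanitize* function attributes because device-side sanitizer runtimes report through the hostcall buffer, so the pointer must be kept even if nothing in the body appears to use it. Conceptually it is just a membership test over the function's attribute strings, as in this stand-alone sketch (attribute names spelled as in LLVM IR):

#include <cstdio>
#include <set>
#include <string>

static bool requiresHostcallPtr(const std::set<std::string> &FnAttrs) {
  for (const char *S : {"sanitize_address", "sanitize_thread",
                        "sanitize_memory", "sanitize_hwaddress",
                        "sanitize_memtag"})
    if (FnAttrs.count(S))
      return true;
  return false;
}

int main() {
  std::set<std::string> Kernel = {"nounwind", "sanitize_address"};
  std::printf("needs hostcall ptr: %s\n",
              requiresHostcallPtr(Kernel) ? "yes" : "no");
}
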
137
138namespace {
139class AMDGPUInformationCache : public InformationCache {
140public:
141 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
142 BumpPtrAllocator &Allocator,
143 SetVector<Function *> *CGSCC, TargetMachine &TM)
144 : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
145 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
146
147 TargetMachine &TM;
148
149 enum ConstantStatus : uint8_t {
150 NONE = 0,
151 DS_GLOBAL = 1 << 0,
152 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
153 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
154 ADDR_SPACE_CAST_BOTH_TO_FLAT =
155 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
156 };
157
158 /// Check if the subtarget has aperture regs.
159 bool hasApertureRegs(Function &F) {
160 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
161 return ST.hasApertureRegs();
162 }
163
164 /// Check if the subtarget supports GetDoorbellID.
165 bool supportsGetDoorbellID(Function &F) {
166 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
167 return ST.supportsGetDoorbellID();
168 }
169
170 std::optional<std::pair<unsigned, unsigned>>
171 getFlatWorkGroupSizeAttr(const Function &F) const {
172 auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
173 if (!R)
174 return std::nullopt;
175 return std::make_pair(R->first, *(R->second));
176 }
177
178 std::pair<unsigned, unsigned>
179 getDefaultFlatWorkGroupSize(const Function &F) const {
180 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
181 return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
182 }
183
184 std::pair<unsigned, unsigned>
185 getMaximumFlatWorkGroupRange(const Function &F) {
186 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
187 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
188 }
189
190 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
191 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
192 return ST.getMaxNumWorkGroups(F);
193 }
194
195 /// Get code object version.
196 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
197
198 /// Get the effective value of "amdgpu-waves-per-eu" for the function,
199 /// accounting for the interaction with the passed value to use for
200 /// "amdgpu-flat-work-group-size".
201 std::pair<unsigned, unsigned>
202 getWavesPerEU(const Function &F,
203 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
204 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
205 return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
206 }
207
208 std::optional<std::pair<unsigned, unsigned>>
209 getWavesPerEUAttr(const Function &F) {
210 auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
211 /*OnlyFirstRequired=*/true);
212 if (!Val)
213 return std::nullopt;
214 if (!Val->second) {
215 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
216 Val->second = ST.getMaxWavesPerEU();
217 }
218 return std::make_pair(Val->first, *(Val->second));
219 }
220
221 std::pair<unsigned, unsigned>
222 getEffectiveWavesPerEU(const Function &F,
223 std::pair<unsigned, unsigned> WavesPerEU,
224 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
225 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
226 return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
227 getLDSSize(F));
228 }
229
230 unsigned getMaxWavesPerEU(const Function &F) {
231 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
232 return ST.getMaxWavesPerEU();
233 }
234
235 unsigned getMaxAddrSpace() const override {
236 return AMDGPUAS::MAX_AMDGPU_ADDRESS;
237 }
238
239private:
240 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
241 /// local to flat. These casts may require the queue pointer.
242 static uint8_t visitConstExpr(const ConstantExpr *CE) {
243 uint8_t Status = NONE;
244
245 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
246 unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
247 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
248 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
249 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
250 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
251 }
252
253 return Status;
254 }
255
256 /// Returns the minimum amount of LDS space used by a workgroup running
257 /// function \p F.
258 static unsigned getLDSSize(const Function &F) {
259 return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
260 {0, UINT32_MAX}, true)
261 .first;
262 }
263
264 /// Get the constant access bitmap for \p C.
265 uint8_t getConstantAccess(const Constant *C,
266 SmallPtrSetImpl<const Constant *> &Visited) {
267 auto It = ConstantStatus.find(C);
268 if (It != ConstantStatus.end())
269 return It->second;
270
271 uint8_t Result = 0;
272 if (isDSAddress(C))
273 Result = DS_GLOBAL;
274
275 if (const auto *CE = dyn_cast<ConstantExpr>(C))
276 Result |= visitConstExpr(CE);
277
278 for (const Use &U : C->operands()) {
279 const auto *OpC = dyn_cast<Constant>(U);
280 if (!OpC || !Visited.insert(OpC).second)
281 continue;
282
283 Result |= getConstantAccess(OpC, Visited);
284 }
285 return Result;
286 }
287
288public:
289 /// Returns true if \p Fn needs the queue pointer because of \p C.
290 bool needsQueuePtr(const Constant *C, Function &Fn) {
291 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
292 bool HasAperture = hasApertureRegs(Fn);
293
294 // No need to explore the constants.
295 if (!IsNonEntryFunc && HasAperture)
296 return false;
297
299 uint8_t Access = getConstantAccess(C, Visited);
300
301 // We need to trap on DS globals in non-entry functions.
302 if (IsNonEntryFunc && (Access & DS_GLOBAL))
303 return true;
304
305 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
306 }
307
308 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
309 SmallPtrSet<const Constant *, 8> Visited;
310 uint8_t Access = getConstantAccess(C, Visited);
311 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
312 }
313
314private:
315 /// Used to determine if the Constant needs the queue pointer.
316 DenseMap<const Constant *, uint8_t> ConstantStatus;
317 const unsigned CodeObjectVersion;
318};
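
getConstantAccess above is a depth-first walk over a constant's operand graph, consulting a per-constant cache and a visited set so shared subexpressions are only explored once. A small stand-alone sketch of a memoized walk in the same spirit, over a generic node graph rather than LLVM Constants:

#include <cstdint>
#include <cstdio>
#include <map>
#include <set>
#include <vector>

struct Node {
  uint8_t LocalFlags = 0;          // flags contributed by this node alone
  std::vector<const Node *> Ops;   // operand edges
};

static std::map<const Node *, uint8_t> Cache;

static uint8_t flagsOf(const Node *N, std::set<const Node *> &Visited) {
  if (auto It = Cache.find(N); It != Cache.end())
    return It->second;
  uint8_t Result = N->LocalFlags;
  for (const Node *Op : N->Ops)
    if (Visited.insert(Op).second)     // skip already-seen operands
      Result |= flagsOf(Op, Visited);
  return Cache[N] = Result;
}

int main() {
  Node Leaf{0x2, {}};
  Node Shared{0x1, {&Leaf}};
  Node Root{0x0, {&Shared, &Shared, &Leaf}}; // diamond: visited set prunes it
  std::set<const Node *> Visited;
  std::printf("flags = 0x%x\n", static_cast<unsigned>(flagsOf(&Root, Visited)));
}
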
319
320struct AAAMDAttributes
321 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
322 AbstractAttribute> {
323 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
324 AbstractAttribute>;
325
326 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
327
328 /// Create an abstract attribute view for the position \p IRP.
329 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
330 Attributor &A);
331
332 /// See AbstractAttribute::getName().
333 StringRef getName() const override { return "AAAMDAttributes"; }
334
335 /// See AbstractAttribute::getIdAddr().
336 const char *getIdAddr() const override { return &ID; }
337
338 /// This function should return true if the type of the \p AA is
339 /// AAAMDAttributes.
340 static bool classof(const AbstractAttribute *AA) {
341 return (AA->getIdAddr() == &ID);
342 }
343
344 /// Unique ID (due to the unique address)
345 static const char ID;
346};
347const char AAAMDAttributes::ID = 0;
348
349struct AAUniformWorkGroupSize
350 : public StateWrapper<BooleanState, AbstractAttribute> {
351 using Base = StateWrapper<BooleanState, AbstractAttribute>;
352 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
353
354 /// Create an abstract attribute view for the position \p IRP.
355 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
356 Attributor &A);
357
358 /// See AbstractAttribute::getName().
359 StringRef getName() const override { return "AAUniformWorkGroupSize"; }
360
361 /// See AbstractAttribute::getIdAddr().
362 const char *getIdAddr() const override { return &ID; }
363
364 /// This function should return true if the type of the \p AA is
365 /// AAUniformWorkGroupSize.
366 static bool classof(const AbstractAttribute *AA) {
367 return (AA->getIdAddr() == &ID);
368 }
369
370 /// Unique ID (due to the unique address)
371 static const char ID;
372};
373const char AAUniformWorkGroupSize::ID = 0;
374
375struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
376 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
377 : AAUniformWorkGroupSize(IRP, A) {}
378
379 void initialize(Attributor &A) override {
380 Function *F = getAssociatedFunction();
381 CallingConv::ID CC = F->getCallingConv();
382
383 if (CC != CallingConv::AMDGPU_KERNEL)
384 return;
385
386 bool InitialValue = false;
387 if (F->hasFnAttribute("uniform-work-group-size"))
388 InitialValue =
389 F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
390 "true";
391
392 if (InitialValue)
393 indicateOptimisticFixpoint();
394 else
395 indicatePessimisticFixpoint();
396 }
397
398 ChangeStatus updateImpl(Attributor &A) override {
399 ChangeStatus Change = ChangeStatus::UNCHANGED;
400
401 auto CheckCallSite = [&](AbstractCallSite CS) {
402 Function *Caller = CS.getInstruction()->getFunction();
403 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
404 << "->" << getAssociatedFunction()->getName() << "\n");
405
406 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
407 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
408 if (!CallerInfo || !CallerInfo->isValidState())
409 return false;
410
411 Change = Change | clampStateAndIndicateChange(this->getState(),
412 CallerInfo->getState());
413
414 return true;
415 };
416
417 bool AllCallSitesKnown = true;
418 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
419 return indicatePessimisticFixpoint();
420
421 return Change;
422 }
423
424 ChangeStatus manifest(Attributor &A) override {
425 SmallVector<Attribute, 8> AttrList;
426 LLVMContext &Ctx = getAssociatedFunction()->getContext();
427
428 AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
429 getAssumed() ? "true" : "false"));
430 return A.manifestAttrs(getIRPosition(), AttrList,
431 /* ForceReplace */ true);
432 }
433
434 bool isValidState() const override {
435 // This state is always valid, even when the state is false.
436 return true;
437 }
438
439 const std::string getAsStr(Attributor *) const override {
440 return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
441 }
442
443 /// See AbstractAttribute::trackStatistics()
444 void trackStatistics() const override {}
445};
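
AAUniformWorkGroupSizeFunction seeds its boolean state from the function's own "uniform-work-group-size" string and then, in updateImpl, clamps it by the state of every caller, so a single caller without the guarantee drags the callee to false. A self-contained sketch of that all-callers meet with plain bools and a hypothetical call graph (not the Attributor API):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Hypothetical call graph: callee -> list of callers.
  std::map<std::string, std::vector<std::string>> Callers = {
      {"helper", {"kernel_a", "kernel_b"}}};
  // Initial per-function assumption (kernels take it from their attribute).
  std::map<std::string, bool> Uniform = {
      {"kernel_a", true}, {"kernel_b", false}, {"helper", true}};

  // One update round: clamp the callee's state by every caller's state.
  for (auto &[Callee, Cs] : Callers) {
    bool Assumed = Uniform[Callee];
    for (const std::string &Caller : Cs)
      Assumed = Assumed && Uniform[Caller];   // meet: any false wins
    Uniform[Callee] = Assumed;
  }

  for (auto &[F, B] : Uniform)
    std::printf("%s: uniform-work-group-size=%s\n", F.c_str(),
                B ? "true" : "false");
}
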
446
447AAUniformWorkGroupSize &
448AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
449 Attributor &A) {
450 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
451 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
452 llvm_unreachable(
453 "AAUniformWorkGroupSize is only valid for function position");
454}
455
456struct AAAMDAttributesFunction : public AAAMDAttributes {
457 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
458 : AAAMDAttributes(IRP, A) {}
459
460 void initialize(Attributor &A) override {
461 Function *F = getAssociatedFunction();
462
463 // If the function requires the implicit arg pointer due to sanitizers,
464 // assume it's needed even if explicitly marked as not requiring it.
465 const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
466 if (NeedsHostcall) {
467 removeAssumedBits(IMPLICIT_ARG_PTR);
468 removeAssumedBits(HOSTCALL_PTR);
469 }
470
471 for (auto Attr : ImplicitAttrs) {
472 if (NeedsHostcall &&
473 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
474 continue;
475
476 if (F->hasFnAttribute(Attr.second))
477 addKnownBits(Attr.first);
478 }
479
480 if (F->isDeclaration())
481 return;
482
483 // Ignore functions with graphics calling conventions, these are currently
484 // not allowed to have kernel arguments.
485 if (AMDGPU::isGraphics(F->getCallingConv())) {
486 indicatePessimisticFixpoint();
487 return;
488 }
489 }
490
491 ChangeStatus updateImpl(Attributor &A) override {
492 Function *F = getAssociatedFunction();
493 // The current assumed state used to determine a change.
494 auto OrigAssumed = getAssumed();
495
496 // Check for Intrinsics and propagate attributes.
497 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
498 *this, this->getIRPosition(), DepClassTy::REQUIRED);
499 if (!AAEdges || !AAEdges->isValidState() ||
500 AAEdges->hasNonAsmUnknownCallee())
501 return indicatePessimisticFixpoint();
502
503 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
504
505 bool NeedsImplicit = false;
506 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
507 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
508 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
509 unsigned COV = InfoCache.getCodeObjectVersion();
510
511 for (Function *Callee : AAEdges->getOptimisticEdges()) {
512 Intrinsic::ID IID = Callee->getIntrinsicID();
513 if (IID == Intrinsic::not_intrinsic) {
514 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
515 *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
516 if (!AAAMD || !AAAMD->isValidState())
517 return indicatePessimisticFixpoint();
518 *this &= *AAAMD;
519 continue;
520 }
521
522 bool NonKernelOnly = false;
523 ImplicitArgumentMask AttrMask =
524 intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
525 HasApertureRegs, SupportsGetDoorbellID, COV);
526 if (AttrMask != NOT_IMPLICIT_INPUT) {
527 if ((IsNonEntryFunc || !NonKernelOnly))
528 removeAssumedBits(AttrMask);
529 }
530 }
531
532 // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
533 if (NeedsImplicit)
534 removeAssumedBits(IMPLICIT_ARG_PTR);
535
536 if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
537 // Under V5, we need implicitarg_ptr + offsets to access private_base or
538 // shared_base. We do not actually need queue_ptr.
539 if (COV >= 5)
540 removeAssumedBits(IMPLICIT_ARG_PTR);
541 else
542 removeAssumedBits(QUEUE_PTR);
543 }
544
545 if (funcRetrievesMultigridSyncArg(A, COV)) {
546 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
547 "multigrid_sync_arg needs implicitarg_ptr");
548 removeAssumedBits(MULTIGRID_SYNC_ARG);
549 }
550
551 if (funcRetrievesHostcallPtr(A, COV)) {
552 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
553 removeAssumedBits(HOSTCALL_PTR);
554 }
555
556 if (funcRetrievesHeapPtr(A, COV)) {
557 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
558 removeAssumedBits(HEAP_PTR);
559 }
560
561 if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
562 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
563 removeAssumedBits(QUEUE_PTR);
564 }
565
566 if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
567 removeAssumedBits(LDS_KERNEL_ID);
568 }
569
570 if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
571 removeAssumedBits(DEFAULT_QUEUE);
572
573 if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
574 removeAssumedBits(COMPLETION_ACTION);
575
576 if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
577 removeAssumedBits(FLAT_SCRATCH_INIT);
578
579 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
580 : ChangeStatus::UNCHANGED;
581 }
582
583 ChangeStatus manifest(Attributor &A) override {
584 SmallVector<Attribute, 8> AttrList;
585 LLVMContext &Ctx = getAssociatedFunction()->getContext();
586
587 for (auto Attr : ImplicitAttrs) {
588 if (isKnown(Attr.first))
589 AttrList.push_back(Attribute::get(Ctx, Attr.second));
590 }
591
592 return A.manifestAttrs(getIRPosition(), AttrList,
593 /* ForceReplace */ true);
594 }
595
596 const std::string getAsStr(Attributor *) const override {
597 std::string Str;
598 raw_string_ostream OS(Str);
599 OS << "AMDInfo[";
600 for (auto Attr : ImplicitAttrs)
601 if (isAssumed(Attr.first))
602 OS << ' ' << Attr.second;
603 OS << " ]";
604 return OS.str();
605 }
606
607 /// See AbstractAttribute::trackStatistics()
608 void trackStatistics() const override {}
609
610private:
611 bool checkForQueuePtr(Attributor &A) {
612 Function *F = getAssociatedFunction();
613 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
614
615 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
616
617 bool NeedsQueuePtr = false;
618
619 auto CheckAddrSpaceCasts = [&](Instruction &I) {
620 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
621 if (castRequiresQueuePtr(SrcAS)) {
622 NeedsQueuePtr = true;
623 return false;
624 }
625 return true;
626 };
627
628 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
629
630 // `checkForAllInstructions` is much cheaper than walking all the
631 // instructions ourselves, so try it first.
632
633 // The queue pointer is not needed if aperture registers are present.
634 if (!HasApertureRegs) {
635 bool UsedAssumedInformation = false;
636 A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
637 {Instruction::AddrSpaceCast},
638 UsedAssumedInformation);
639 }
640
641 // If we found that we need the queue pointer, nothing else to do.
642 if (NeedsQueuePtr)
643 return true;
644
645 if (!IsNonEntryFunc && HasApertureRegs)
646 return false;
647
648 for (BasicBlock &BB : *F) {
649 for (Instruction &I : BB) {
650 for (const Use &U : I.operands()) {
651 if (const auto *C = dyn_cast<Constant>(U)) {
652 if (InfoCache.needsQueuePtr(C, *F))
653 return true;
654 }
655 }
656 }
657 }
658
659 return false;
660 }
661
662 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
663 auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
664 AA::RangeTy Range(Pos, 8);
665 return funcRetrievesImplicitKernelArg(A, Range);
666 }
667
668 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
669 auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
670 AA::RangeTy Range(Pos, 8);
671 return funcRetrievesImplicitKernelArg(A, Range);
672 }
673
674 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
675 auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
676 AA::RangeTy Range(Pos, 8);
677 return funcRetrievesImplicitKernelArg(A, Range);
678 }
679
680 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
681 auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
682 AA::RangeTy Range(Pos, 8);
683 return funcRetrievesImplicitKernelArg(A, Range);
684 }
685
686 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
687 if (COV < 5)
688 return false;
689 AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
690 return funcRetrievesImplicitKernelArg(A, Range);
691 }
692
693 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
694 if (COV < 5)
695 return false;
696 AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
697 return funcRetrievesImplicitKernelArg(A, Range);
698 }
699
700 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
701 // Check if this is a call to the implicitarg_ptr builtin that is
702 // used to retrieve the hostcall pointer. The implicit arg for
703 // hostcall is considered unused only if every use of the
704 // implicitarg_ptr is a load that clearly does not retrieve any byte
705 // of the hostcall pointer. We check this by tracing all the uses of
706 // the initial call to the implicitarg_ptr intrinsic.
707 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
708 auto &Call = cast<CallBase>(I);
709 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
710 return true;
711
712 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
713 *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
714 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
715 return false;
716
717 return PointerInfoAA->forallInterferingAccesses(
718 Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
719 return Acc.getRemoteInst()->isDroppable();
720 });
721 };
722
723 bool UsedAssumedInformation = false;
724 return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
725 UsedAssumedInformation);
726 }
727
728 bool funcRetrievesLDSKernelId(Attributor &A) {
729 auto DoesNotRetrieve = [&](Instruction &I) {
730 auto &Call = cast<CallBase>(I);
731 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
732 };
733 bool UsedAssumedInformation = false;
734 return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
735 UsedAssumedInformation);
736 }
737
738 // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
739 // not to be set.
740 bool needFlatScratchInit(Attributor &A) {
741 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
742
743 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
744 // there is a cast from PRIVATE_ADDRESS.
745 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
746 return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
747 AMDGPUAS::PRIVATE_ADDRESS;
748 };
749
750 bool UsedAssumedInformation = false;
751 if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
752 {Instruction::AddrSpaceCast},
753 UsedAssumedInformation))
754 return true;
755
756 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
757 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
758
759 Function *F = getAssociatedFunction();
760 for (Instruction &I : instructions(F)) {
761 for (const Use &U : I.operands()) {
762 if (const auto *C = dyn_cast<Constant>(U)) {
763 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
764 return true;
765 }
766 }
767 }
768
769 // Finally check callees.
770
771 // This is called on each callee; false means callee shouldn't have
772 // no-flat-scratch-init.
773 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
774 const auto &CB = cast<CallBase>(I);
775 const Function *Callee = CB.getCalledFunction();
776
777 // Callee == 0 for inline asm or indirect call with known callees.
778 // In the latter case, updateImpl() already checked the callees and we
779 // know their FLAT_SCRATCH_INIT bit is set.
780 // If function has indirect call with unknown callees, the bit is
781 // already removed in updateImpl() and execution won't reach here.
782 if (!Callee)
783 return true;
784
785 return Callee->getIntrinsicID() !=
786 Intrinsic::amdgcn_addrspacecast_nonnull;
787 };
788
789 UsedAssumedInformation = false;
790 // If any callee is false (i.e. need FlatScratchInit),
791 // checkForAllCallLikeInstructions returns false, in which case this
792 // function returns true.
793 return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
794 UsedAssumedInformation);
795 }
796};
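
The funcRetrieves* helpers above all reduce to the same question: does any access reachable from an llvm.amdgcn.implicitarg.ptr call overlap the 8-byte slot at a given offset of the implicit argument block? A stand-alone sketch of that interval-overlap test; the offsets are illustrative, not the real code-object-v5 layout:

#include <cstdint>
#include <cstdio>

struct Range {
  uint64_t Offset, Size;
  bool overlaps(const Range &O) const {
    // Half-open intervals [Offset, Offset + Size) overlap test.
    return Offset < O.Offset + O.Size && O.Offset < Offset + Size;
  }
};

int main() {
  // Hypothetical slot of interest, e.g. an 8-byte pointer at offset 80.
  Range HostcallSlot{80, 8};

  // Accesses derived from loads off the implicitarg pointer.
  Range Loads[] = {{0, 8}, {72, 8}, {80, 8}};

  for (const Range &L : Loads)
    std::printf("load [%llu,%llu): %s\n",
                (unsigned long long)L.Offset,
                (unsigned long long)(L.Offset + L.Size),
                L.overlaps(HostcallSlot) ? "retrieves the slot"
                                         : "does not touch it");
}
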
797
798AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
799 Attributor &A) {
800 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
801 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
802 llvm_unreachable("AAAMDAttributes is only valid for function position");
803}
804
805/// Base class to derive different size ranges.
806struct AAAMDSizeRangeAttribute
807 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
808 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
809
810 StringRef AttrName;
811
812 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
813 StringRef AttrName)
814 : Base(IRP, 32), AttrName(AttrName) {}
815
816 /// See AbstractAttribute::trackStatistics()
817 void trackStatistics() const override {}
818
819 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
820 ChangeStatus Change = ChangeStatus::UNCHANGED;
821
822 auto CheckCallSite = [&](AbstractCallSite CS) {
823 Function *Caller = CS.getInstruction()->getFunction();
824 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
825 << "->" << getAssociatedFunction()->getName() << '\n');
826
827 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
828 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
829 if (!CallerInfo || !CallerInfo->isValidState())
830 return false;
831
832 Change |=
833 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
834
835 return true;
836 };
837
838 bool AllCallSitesKnown = true;
839 if (!A.checkForAllCallSites(CheckCallSite, *this,
840 /*RequireAllCallSites=*/true,
841 AllCallSitesKnown))
842 return indicatePessimisticFixpoint();
843
844 return Change;
845 }
846
847 /// Clamp the assumed range to the default value ([Min, Max]) and emit the
848 /// attribute only if it is not the same as the default after clamping.
849 ChangeStatus
850 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
851 std::pair<unsigned, unsigned> Default) {
852 auto [Min, Max] = Default;
853 unsigned Lower = getAssumed().getLower().getZExtValue();
854 unsigned Upper = getAssumed().getUpper().getZExtValue();
855
856 // Clamp the range to the default value.
857 if (Lower < Min)
858 Lower = Min;
859 if (Upper > Max + 1)
860 Upper = Max + 1;
861
862 // No manifest if the value is invalid or same as default after clamp.
863 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
864 return ChangeStatus::UNCHANGED;
865
866 Function *F = getAssociatedFunction();
867 LLVMContext &Ctx = F->getContext();
868 SmallString<10> Buffer;
869 raw_svector_ostream OS(Buffer);
870 OS << Lower << ',' << Upper - 1;
871 return A.manifestAttrs(getIRPosition(),
872 {Attribute::get(Ctx, AttrName, OS.str())},
873 /*ForceReplace=*/true);
874 }
875
876 const std::string getAsStr(Attributor *) const override {
877 std::string Str;
878 raw_string_ostream OS(Str);
879 OS << getName() << '[';
880 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
881 OS << ']';
882 return OS.str();
883 }
884};
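
emitAttributeIfNotDefaultAfterClamp clamps the deduced half-open range [Lower, Upper) into the default [Min, Max] and only emits the "min,max" string when the clamped result is valid and narrower than the default. The same clamp-and-format step as a stand-alone function (hypothetical values in main):

#include <cstdio>
#include <optional>
#include <string>

// Returns the attribute value to emit, or nullopt when the clamped range is
// the default (or invalid). Range is half-open: [Lower, Upper).
static std::optional<std::string> clampAndFormat(unsigned Lower, unsigned Upper,
                                                 unsigned Min, unsigned Max) {
  if (Lower < Min)
    Lower = Min;
  if (Upper > Max + 1)
    Upper = Max + 1;
  if ((Lower == Min && Upper == Max + 1) || Upper < Lower)
    return std::nullopt;
  return std::to_string(Lower) + "," + std::to_string(Upper - 1);
}

int main() {
  // Deduced flat work-group size [64, 256] against a default of [1, 1024].
  if (auto S = clampAndFormat(64, 257, 1, 1024))
    std::printf("amdgpu-flat-work-group-size=\"%s\"\n", S->c_str());
  // Same as the default: nothing is emitted.
  if (!clampAndFormat(1, 1025, 1, 1024))
    std::printf("default range, no attribute\n");
}
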
885
886/// Propagate amdgpu-flat-work-group-size attribute.
887struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
888 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
889 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
890
891 void initialize(Attributor &A) override {
892 Function *F = getAssociatedFunction();
893 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
894
895 bool HasAttr = false;
896 auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
897 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);
898
899 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
900 // We only consider an attribute that is not max range because the front
901 // end always emits the attribute, unfortunately, and sometimes it emits
902 // the max range.
903 if (*Attr != MaxRange) {
904 Range = *Attr;
905 HasAttr = true;
906 }
907 }
908
909 // We don't want to directly clamp the state if it's the max range because
910 // that is basically the worst state.
911 if (Range == MaxRange)
912 return;
913
914 auto [Min, Max] = Range;
915 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
916 IntegerRangeState IRS(CR);
917 clampStateAndIndicateChange(this->getState(), IRS);
918
919 if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
920 indicateOptimisticFixpoint();
921 }
922
923 ChangeStatus updateImpl(Attributor &A) override {
924 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
925 }
926
927 /// Create an abstract attribute view for the position \p IRP.
928 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
929 Attributor &A);
930
931 ChangeStatus manifest(Attributor &A) override {
932 Function *F = getAssociatedFunction();
933 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
934 return emitAttributeIfNotDefaultAfterClamp(
935 A, InfoCache.getMaximumFlatWorkGroupRange(*F));
936 }
937
938 /// See AbstractAttribute::getName()
939 StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
940
941 /// See AbstractAttribute::getIdAddr()
942 const char *getIdAddr() const override { return &ID; }
943
944 /// This function should return true if the type of the \p AA is
945 /// AAAMDFlatWorkGroupSize
946 static bool classof(const AbstractAttribute *AA) {
947 return (AA->getIdAddr() == &ID);
948 }
949
950 /// Unique ID (due to the unique address)
951 static const char ID;
952};
953
954const char AAAMDFlatWorkGroupSize::ID = 0;
955
956AAAMDFlatWorkGroupSize &
957AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
958 Attributor &A) {
959 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
960 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
961 llvm_unreachable(
962 "AAAMDFlatWorkGroupSize is only valid for function position");
963}
964
965struct TupleDecIntegerRangeState : public AbstractState {
966 DecIntegerState<uint32_t> X, Y, Z;
967
968 bool isValidState() const override {
969 return X.isValidState() && Y.isValidState() && Z.isValidState();
970 }
971
972 bool isAtFixpoint() const override {
973 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
974 }
975
976 ChangeStatus indicateOptimisticFixpoint() override {
977 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
978 Z.indicateOptimisticFixpoint();
979 }
980
981 ChangeStatus indicatePessimisticFixpoint() override {
982 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
983 Z.indicatePessimisticFixpoint();
984 }
985
986 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
987 X ^= Other.X;
988 Y ^= Other.Y;
989 Z ^= Other.Z;
990 return *this;
991 }
992
993 bool operator==(const TupleDecIntegerRangeState &Other) const {
994 return X == Other.X && Y == Other.Y && Z == Other.Z;
995 }
996
997 TupleDecIntegerRangeState &getAssumed() { return *this; }
998 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
999};
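
TupleDecIntegerRangeState just composes three per-axis decreasing-integer states; merging caller information moves each axis toward the weaker (larger) bound so the result stays valid for every caller, which is one reading of the DecIntegerState semantics used here. Sketched below with plain unsigned values and hypothetical caller bounds:

#include <algorithm>
#include <cstdio>

struct MaxNumWorkgroups {
  unsigned X, Y, Z;
  // Merge with a caller's bound: keep the weaker (larger) value per axis so
  // the result holds no matter which caller we came from.
  void meet(const MaxNumWorkgroups &Caller) {
    X = std::max(X, Caller.X);
    Y = std::max(Y, Caller.Y);
    Z = std::max(Z, Caller.Z);
  }
};

int main() {
  MaxNumWorkgroups Callee{16, 1, 1};      // seeded from the subtarget query
  MaxNumWorkgroups KernelA{8, 1, 1};
  MaxNumWorkgroups KernelB{32, 4, 1};
  Callee.meet(KernelA);
  Callee.meet(KernelB);
  std::printf("amdgpu-max-num-workgroups=%u,%u,%u\n", Callee.X, Callee.Y,
              Callee.Z);
}
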
1000
1001using AAAMDMaxNumWorkgroupsState =
1002 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1003
1004/// Propagate amdgpu-max-num-workgroups attribute.
1005struct AAAMDMaxNumWorkgroups
1006 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1007 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1008
1009 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1010
1011 void initialize(Attributor &A) override {
1012 Function *F = getAssociatedFunction();
1013 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1014
1015 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
1016
1017 X.takeKnownMinimum(MaxNumWorkgroups[0]);
1018 Y.takeKnownMinimum(MaxNumWorkgroups[1]);
1019 Z.takeKnownMinimum(MaxNumWorkgroups[2]);
1020
1021 if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1022 indicatePessimisticFixpoint();
1023 }
1024
1025 ChangeStatus updateImpl(Attributor &A) override {
1026 ChangeStatus Change = ChangeStatus::UNCHANGED;
1027
1028 auto CheckCallSite = [&](AbstractCallSite CS) {
1029 Function *Caller = CS.getInstruction()->getFunction();
1030 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1031 << "->" << getAssociatedFunction()->getName() << '\n');
1032
1033 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1034 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1035 if (!CallerInfo || !CallerInfo->isValidState())
1036 return false;
1037
1038 Change |=
1039 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
1040 return true;
1041 };
1042
1043 bool AllCallSitesKnown = true;
1044 if (!A.checkForAllCallSites(CheckCallSite, *this,
1045 /*RequireAllCallSites=*/true,
1046 AllCallSitesKnown))
1047 return indicatePessimisticFixpoint();
1048
1049 return Change;
1050 }
1051
1052 /// Create an abstract attribute view for the position \p IRP.
1053 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1054 Attributor &A);
1055
1056 ChangeStatus manifest(Attributor &A) override {
1057 Function *F = getAssociatedFunction();
1058 LLVMContext &Ctx = F->getContext();
1059 SmallString<32> Buffer;
1060 raw_svector_ostream OS(Buffer);
1061 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1062
1063 // TODO: Should annotate loads of the group size for this to do anything
1064 // useful.
1065 return A.manifestAttrs(
1066 getIRPosition(),
1067 {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
1068 /* ForceReplace= */ true);
1069 }
1070
1071 StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1072
1073 const std::string getAsStr(Attributor *) const override {
1074 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1075 raw_string_ostream OS(Buffer);
1076 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1077 << ']';
1078 return OS.str();
1079 }
1080
1081 const char *getIdAddr() const override { return &ID; }
1082
1083 /// This function should return true if the type of the \p AA is
1084 /// AAAMDMaxNumWorkgroups
1085 static bool classof(const AbstractAttribute *AA) {
1086 return (AA->getIdAddr() == &ID);
1087 }
1088
1089 void trackStatistics() const override {}
1090
1091 /// Unique ID (due to the unique address)
1092 static const char ID;
1093};
1094
1095const char AAAMDMaxNumWorkgroups::ID = 0;
1096
1097AAAMDMaxNumWorkgroups &
1098AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1099 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1100 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1101 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1102}
1103
1104/// Propagate amdgpu-waves-per-eu attribute.
1105struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1106 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1107 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1108
1109 void initialize(Attributor &A) override {
1110 Function *F = getAssociatedFunction();
1111 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1112
1113 // If the attribute exists, we will honor it if it is not the default.
1114 if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
1115 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1116 1U, InfoCache.getMaxWavesPerEU(*F)};
1117 if (*Attr != MaxWavesPerEURange) {
1118 auto [Min, Max] = *Attr;
1119 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1120 IntegerRangeState RangeState(Range);
1121 this->getState() = RangeState;
1122 indicateOptimisticFixpoint();
1123 return;
1124 }
1125 }
1126
1127 if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1128 indicatePessimisticFixpoint();
1129 }
1130
1131 ChangeStatus updateImpl(Attributor &A) override {
1132 ChangeStatus Change = ChangeStatus::UNCHANGED;
1133
1134 auto CheckCallSite = [&](AbstractCallSite CS) {
1135 Function *Caller = CS.getInstruction()->getFunction();
1136 Function *Func = getAssociatedFunction();
1137 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1138 << "->" << Func->getName() << '\n');
1139 (void)Func;
1140
1141 const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1142 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1143 if (!CallerAA || !CallerAA->isValidState())
1144 return false;
1145
1146 ConstantRange Assumed = getAssumed();
1147 unsigned Min = std::max(Assumed.getLower().getZExtValue(),
1148 CallerAA->getAssumed().getLower().getZExtValue());
1149 unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
1150 CallerAA->getAssumed().getUpper().getZExtValue());
1151 ConstantRange Range(APInt(32, Min), APInt(32, Max));
1152 IntegerRangeState RangeState(Range);
1153 getState() = RangeState;
1154 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1155 : ChangeStatus::CHANGED;
1156
1157 return true;
1158 };
1159
1160 bool AllCallSitesKnown = true;
1161 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
1162 return indicatePessimisticFixpoint();
1163
1164 return Change;
1165 }
1166
1167 /// Create an abstract attribute view for the position \p IRP.
1168 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1169 Attributor &A);
1170
1171 ChangeStatus manifest(Attributor &A) override {
1172 Function *F = getAssociatedFunction();
1173 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1174 return emitAttributeIfNotDefaultAfterClamp(
1175 A, {1U, InfoCache.getMaxWavesPerEU(*F)});
1176 }
1177
1178 /// See AbstractAttribute::getName()
1179 StringRef getName() const override { return "AAAMDWavesPerEU"; }
1180
1181 /// See AbstractAttribute::getIdAddr()
1182 const char *getIdAddr() const override { return &ID; }
1183
1184 /// This function should return true if the type of the \p AA is
1185 /// AAAMDWavesPerEU
1186 static bool classof(const AbstractAttribute *AA) {
1187 return (AA->getIdAddr() == &ID);
1188 }
1189
1190 /// Unique ID (due to the unique address)
1191 static const char ID;
1192};
1193
1194const char AAAMDWavesPerEU::ID = 0;
1195
1196AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1197 Attributor &A) {
1198 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1199 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1200 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1201}
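
AAAMDWavesPerEU::updateImpl merges the callee's current range with each caller's by taking the maximum of the lower bounds and the maximum of the upper bounds before re-clamping at manifest time. The same rule with plain pairs and made-up caller ranges:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

using WavesRange = std::pair<unsigned, unsigned>; // [min, max] waves per EU

int main() {
  WavesRange Callee{1, 10};                        // start from the full range
  std::vector<WavesRange> Callers = {{4, 10}, {2, 8}};

  for (const WavesRange &C : Callers) {
    Callee.first = std::max(Callee.first, C.first);    // max of lower bounds
    Callee.second = std::max(Callee.second, C.second); // max of upper bounds
  }
  std::printf("amdgpu-waves-per-eu=\"%u,%u\"\n", Callee.first, Callee.second);
}
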
1202
1203static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
1204 for (const auto &CI : IA->ParseConstraints()) {
1205 for (StringRef Code : CI.Codes) {
1206 Code.consume_front("{");
1207 if (Code.starts_with("a"))
1208 return true;
1209 }
1210 }
1211
1212 return false;
1213}
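
inlineAsmUsesAGPRs conservatively treats any constraint code that still begins with 'a' after stripping a leading '{' (for example "a" or "{a0}") as a potential AGPR use. A tiny stand-alone version of that string test:

#include <cstdio>
#include <string>
#include <vector>

// Returns true if a single inline-asm constraint code names an AGPR class.
static bool constraintUsesAGPR(std::string Code) {
  if (!Code.empty() && Code.front() == '{')   // "{a0}" -> "a0}"
    Code.erase(Code.begin());
  return !Code.empty() && Code.front() == 'a';
}

int main() {
  std::vector<std::string> Codes = {"v", "s", "a", "{a1}", "{v2}"};
  for (const std::string &C : Codes)
    std::printf("%-5s -> %s\n", C.c_str(),
                constraintUsesAGPR(C) ? "AGPR" : "not AGPR");
}
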
1214
1215// TODO: Migrate to range merge of amdgpu-agpr-alloc.
1216// FIXME: Why is this using Attribute::NoUnwind?
1217struct AAAMDGPUNoAGPR
1218 : public IRAttribute<Attribute::NoUnwind,
1219 StateWrapper<BooleanState, AbstractAttribute>,
1220 AAAMDGPUNoAGPR> {
1221 AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1222
1223 static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
1224 Attributor &A) {
1225 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1226 return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
1227 llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
1228 }
1229
1230 void initialize(Attributor &A) override {
1231 Function *F = getAssociatedFunction();
1232 auto [MinNumAGPR, MaxNumAGPR] =
1233 AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
1234 /*OnlyFirstRequired=*/true);
1235 if (MinNumAGPR == 0)
1236 indicateOptimisticFixpoint();
1237 }
1238
1239 const std::string getAsStr(Attributor *A) const override {
1240 return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
1241 }
1242
1243 void trackStatistics() const override {}
1244
1245 ChangeStatus updateImpl(Attributor &A) override {
1246 // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
1247
1248 auto CheckForNoAGPRs = [&](Instruction &I) {
1249 const auto &CB = cast<CallBase>(I);
1250 const Value *CalleeOp = CB.getCalledOperand();
1251 const Function *Callee = dyn_cast<Function>(CalleeOp);
1252 if (!Callee) {
1253 if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
1254 return !inlineAsmUsesAGPRs(IA);
1255 return false;
1256 }
1257
1258 // Some intrinsics may use AGPRs, but if we have a choice, we are not
1259 // required to use AGPRs.
1260 if (Callee->isIntrinsic())
1261 return true;
1262
1263 // TODO: Handle callsite attributes
1264 const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
1265 *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
1266 return CalleeInfo && CalleeInfo->isValidState() &&
1267 CalleeInfo->getAssumed();
1268 };
1269
1270 bool UsedAssumedInformation = false;
1271 if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
1272 UsedAssumedInformation))
1273 return indicatePessimisticFixpoint();
1274 return ChangeStatus::UNCHANGED;
1275 }
1276
1277 ChangeStatus manifest(Attributor &A) override {
1278 if (!getAssumed())
1279 return ChangeStatus::UNCHANGED;
1280 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1281 return A.manifestAttrs(getIRPosition(),
1282 {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
1283 }
1284
1285 StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
1286 const char *getIdAddr() const override { return &ID; }
1287
1288 /// This function should return true if the type of the \p AA is
1289 /// AAAMDGPUNoAGPR.
1290 static bool classof(const AbstractAttribute *AA) {
1291 return (AA->getIdAddr() == &ID);
1292 }
1293
1294 static const char ID;
1295};
1296
1297const char AAAMDGPUNoAGPR::ID = 0;
1298
1299static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1300 AMDGPUAttributorOptions Options,
1301 ThinOrFullLTOPhase LTOPhase) {
1302 SetVector<Function *> Functions;
1303 for (Function &F : M) {
1304 if (!F.isIntrinsic())
1305 Functions.insert(&F);
1306 }
1307
1308 CallGraphUpdater CGUpdater;
1309 BumpPtrAllocator Allocator;
1310 AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1311 DenseSet<const char *> Allowed(
1312 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1313 &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1314 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1315 &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1316 &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1317 &AAInstanceInfo::ID, &AANoAliasAddrSpace::ID});
1318
1319 AttributorConfig AC(CGUpdater);
1320 AC.IsClosedWorldModule = Options.IsClosedWorld;
1321 AC.Allowed = &Allowed;
1322 AC.IsModulePass = true;
1323 AC.DefaultInitializeLiveInternals = false;
1324 AC.IndirectCalleeSpecializationCallback =
1325 [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1326 Function &Callee, unsigned NumAssumedCallees) {
1327 return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
1328 (NumAssumedCallees <= IndirectCallSpecializationThreshold);
1329 };
1330 AC.IPOAmendableCB = [](const Function &F) {
1331 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1332 };
1333
1334 Attributor A(Functions, InfoCache, AC);
1335
1336 LLVM_DEBUG({
1337 StringRef LTOPhaseStr = to_string(LTOPhase);
1338 dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
1339 << "[AMDGPUAttributor] Module " << M.getName() << " is "
1340 << (AC.IsClosedWorldModule ? "" : "not ")
1341 << "assumed to be a closed world.\n";
1342 });
1343
1344 for (auto *F : Functions) {
1345 A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
1346 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
1347 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
1348 A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
1349 CallingConv::ID CC = F->getCallingConv();
1350 if (!AMDGPU::isEntryFunctionCC(CC)) {
1351 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
1352 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
1353 }
1354
1355 for (auto &I : instructions(F)) {
1356 Value *Ptr = nullptr;
1357 if (auto *LI = dyn_cast<LoadInst>(&I))
1358 Ptr = LI->getPointerOperand();
1359 else if (auto *SI = dyn_cast<StoreInst>(&I))
1360 Ptr = SI->getPointerOperand();
1361 else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
1362 Ptr = RMW->getPointerOperand();
1363 else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
1364 Ptr = CmpX->getPointerOperand();
1365
1366 if (Ptr) {
1367 A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
1368 A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
1369 }
1370 }
1371 }
1372
1373 return A.run() == ChangeStatus::CHANGED;
1374}
1375} // namespace
1376
1377PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1378 ModuleAnalysisManager &AM) {
1379
1380 FunctionAnalysisManager &FAM =
1381 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1382 AnalysisGetter AG(FAM);
1383
1384 // TODO: Probably preserves CFG
1385 return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
1386 : PreservedAnalyses::all();
1387}