LLVM 22.0.0git
AMDGPUAttributor.cpp
1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "GCNSubtarget.h"
15#include "Utils/AMDGPUBaseInfo.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/Target/TargetMachine.h"
19#include "llvm/Transforms/IPO/Attributor.h"
20
21#define DEBUG_TYPE "amdgpu-attributor"
22
23using namespace llvm;
24
26 "amdgpu-indirect-call-specialization-threshold",
28 "A threshold controls whether an indirect call will be specialized"),
29 cl::init(3));
30
31#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
32
33enum ImplicitArgumentPositions {
34#include "AMDGPUAttributes.def"
35 LAST_ARG_POS
36};
37
38#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
39
40enum ImplicitArgumentMask {
41 NOT_IMPLICIT_INPUT = 0,
42#include "AMDGPUAttributes.def"
43 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
44};
45
46#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
47static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
48 ImplicitAttrs[] = {
49#include "AMDGPUAttributes.def"
50};
51
52// We do not need to note the x workitem or workgroup id because they are always
53// initialized.
54//
55// TODO: We should not add the attributes if the known compile time workgroup
56// size is 1 for y/z.
57static ImplicitArgumentMask
58intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
59 bool HasApertureRegs, bool SupportsGetDoorBellID,
60 unsigned CodeObjectVersion) {
61 switch (ID) {
62 case Intrinsic::amdgcn_workitem_id_x:
63 NonKernelOnly = true;
64 return WORKITEM_ID_X;
65 case Intrinsic::amdgcn_workgroup_id_x:
66 NonKernelOnly = true;
67 return WORKGROUP_ID_X;
68 case Intrinsic::amdgcn_workitem_id_y:
69 case Intrinsic::r600_read_tidig_y:
70 return WORKITEM_ID_Y;
71 case Intrinsic::amdgcn_workitem_id_z:
72 case Intrinsic::r600_read_tidig_z:
73 return WORKITEM_ID_Z;
74 case Intrinsic::amdgcn_workgroup_id_y:
75 case Intrinsic::r600_read_tgid_y:
76 return WORKGROUP_ID_Y;
77 case Intrinsic::amdgcn_workgroup_id_z:
78 case Intrinsic::r600_read_tgid_z:
79 return WORKGROUP_ID_Z;
80 case Intrinsic::amdgcn_lds_kernel_id:
81 return LDS_KERNEL_ID;
82 case Intrinsic::amdgcn_dispatch_ptr:
83 return DISPATCH_PTR;
84 case Intrinsic::amdgcn_dispatch_id:
85 return DISPATCH_ID;
86 case Intrinsic::amdgcn_implicitarg_ptr:
87 return IMPLICIT_ARG_PTR;
88 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
89 // queue_ptr.
90 case Intrinsic::amdgcn_queue_ptr:
91 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
92 return QUEUE_PTR;
93 case Intrinsic::amdgcn_is_shared:
94 case Intrinsic::amdgcn_is_private:
95 if (HasApertureRegs)
96 return NOT_IMPLICIT_INPUT;
97 // Under V5, we need implicitarg_ptr + offsets to access private_base or
98 // shared_base. For pre-V5, however, we need to access them through
99 // queue_ptr + offsets.
100 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
101 : QUEUE_PTR;
102 case Intrinsic::trap:
103 case Intrinsic::debugtrap:
104 case Intrinsic::ubsantrap:
105 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
106 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
107 : QUEUE_PTR;
108 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
109 return QUEUE_PTR;
110 default:
111 return NOT_IMPLICIT_INPUT;
112 }
113}
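// Editorial usage sketch (not part of the upstream file): updateImpl() below
// queries this helper for every intrinsic callee and clears the matching
// "assumed absent" bit whenever the implicit input is actually required, e.g.
//
//   bool NonKernelOnly = false, NeedsImplicit = false;
//   ImplicitArgumentMask Mask = intrinsicToAttrMask(
//       Intrinsic::amdgcn_queue_ptr, NonKernelOnly, NeedsImplicit,
//       /*HasApertureRegs=*/false, /*SupportsGetDoorBellID=*/true,
//       /*CodeObjectVersion=*/AMDGPU::AMDHSA_COV5);
//
// yields Mask == QUEUE_PTR with NeedsImplicit == true, because under code
// object V5 queue_ptr itself is reached via implicitarg_ptr.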
114
115static bool castRequiresQueuePtr(unsigned SrcAS) {
116 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
117}
118
119static bool isDSAddress(const Constant *C) {
120 const GlobalValue *GV = dyn_cast<GlobalValue>(C);
121 if (!GV)
122 return false;
123 unsigned AS = GV->getAddressSpace();
124 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
125}
126
127/// Returns true if the function requires the implicit argument be passed
128/// regardless of the function contents.
129static bool funcRequiresHostcallPtr(const Function &F) {
130 // Sanitizers require the hostcall buffer passed in the implicit arguments.
131 return F.hasFnAttribute(Attribute::SanitizeAddress) ||
132 F.hasFnAttribute(Attribute::SanitizeThread) ||
133 F.hasFnAttribute(Attribute::SanitizeMemory) ||
134 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
135 F.hasFnAttribute(Attribute::SanitizeMemTag);
136}
137
138namespace {
139class AMDGPUInformationCache : public InformationCache {
140public:
141 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
142 BumpPtrAllocator &Allocator,
143 SetVector<Function *> *CGSCC, TargetMachine &TM)
144 : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
145 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
146
147 TargetMachine &TM;
148
149 enum ConstantStatus : uint8_t {
150 NONE = 0,
151 DS_GLOBAL = 1 << 0,
152 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
153 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
154 ADDR_SPACE_CAST_BOTH_TO_FLAT =
155 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
156 };
157
158 /// Check if the subtarget has aperture regs.
159 bool hasApertureRegs(Function &F) {
160 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
161 return ST.hasApertureRegs();
162 }
163
164 /// Check if the subtarget supports GetDoorbellID.
165 bool supportsGetDoorbellID(Function &F) {
166 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
167 return ST.supportsGetDoorbellID();
168 }
169
170 std::optional<std::pair<unsigned, unsigned>>
171 getFlatWorkGroupSizeAttr(const Function &F) const {
172 auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
173 if (!R)
174 return std::nullopt;
175 return std::make_pair(R->first, *(R->second));
176 }
177
178 std::pair<unsigned, unsigned>
179 getDefaultFlatWorkGroupSize(const Function &F) const {
180 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
181 return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
182 }
183
184 std::pair<unsigned, unsigned>
185 getMaximumFlatWorkGroupRange(const Function &F) {
186 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
187 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
188 }
189
190 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
191 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
192 return ST.getMaxNumWorkGroups(F);
193 }
194
195 /// Get code object version.
196 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
197
198 /// Get the effective value of "amdgpu-waves-per-eu" for the function,
199 /// accounting for the interaction with the passed value to use for
200 /// "amdgpu-flat-work-group-size".
201 std::pair<unsigned, unsigned>
202 getWavesPerEU(const Function &F,
203 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
204 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
205 return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
206 }
207
208 std::optional<std::pair<unsigned, unsigned>>
209 getWavesPerEUAttr(const Function &F) {
210 auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
211 /*OnlyFirstRequired=*/true);
212 if (!Val)
213 return std::nullopt;
214 if (!Val->second) {
215 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
216 Val->second = ST.getMaxWavesPerEU();
217 }
218 return std::make_pair(Val->first, *(Val->second));
219 }
220
221 std::pair<unsigned, unsigned>
222 getEffectiveWavesPerEU(const Function &F,
223 std::pair<unsigned, unsigned> WavesPerEU,
224 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
225 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
226 return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
227 getLDSSize(F));
228 }
229
230 unsigned getMaxWavesPerEU(const Function &F) {
231 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
232 return ST.getMaxWavesPerEU();
233 }
234
235 unsigned getMaxAddrSpace() const override {
236 return AMDGPUAS::MAX_AMDGPU_ADDRESS;
237 }
238
239private:
240 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
241 /// local to flat. These casts may require the queue pointer.
242 static uint8_t visitConstExpr(const ConstantExpr *CE) {
243 uint8_t Status = NONE;
244
245 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
246 unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
247 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
248 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
249 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
250 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
251 }
252
253 return Status;
254 }
255
256 /// Returns the minimum amount of LDS space used by a workgroup running
257 /// function \p F.
258 static unsigned getLDSSize(const Function &F) {
259 return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
260 {0, UINT32_MAX}, true)
261 .first;
262 }
263
264 /// Get the constant access bitmap for \p C.
265 uint8_t getConstantAccess(const Constant *C,
266 SmallPtrSetImpl<const Constant *> &Visited) {
267 auto It = ConstantStatus.find(C);
268 if (It != ConstantStatus.end())
269 return It->second;
270
271 uint8_t Result = 0;
272 if (isDSAddress(C))
273 Result = DS_GLOBAL;
274
275 if (const auto *CE = dyn_cast<ConstantExpr>(C))
276 Result |= visitConstExpr(CE);
277
278 for (const Use &U : C->operands()) {
279 const auto *OpC = dyn_cast<Constant>(U);
280 if (!OpC || !Visited.insert(OpC).second)
281 continue;
282
283 Result |= getConstantAccess(OpC, Visited);
284 }
285 return Result;
286 }
287
288public:
289 /// Returns true if \p Fn needs the queue pointer because of \p C.
290 bool needsQueuePtr(const Constant *C, Function &Fn) {
291 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
292 bool HasAperture = hasApertureRegs(Fn);
293
294 // No need to explore the constants.
295 if (!IsNonEntryFunc && HasAperture)
296 return false;
297
298 SmallPtrSet<const Constant *, 8> Visited;
299 uint8_t Access = getConstantAccess(C, Visited);
300
301 // We need to trap on DS globals in non-entry functions.
302 if (IsNonEntryFunc && (Access & DS_GLOBAL))
303 return true;
304
305 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
306 }
307
308 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
309 SmallPtrSet<const Constant *, 8> Visited;
310 uint8_t Access = getConstantAccess(C, Visited);
311 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
312 }
313
314private:
315 /// Used to determine if the Constant needs the queue pointer.
316 DenseMap<const Constant *, uint8_t> ConstantStatus;
317 const unsigned CodeObjectVersion;
318};
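// Editorial summary (not part of the upstream file) of how needsQueuePtr()
// combines the cached ConstantStatus bits with the calling convention:
//
//   entry function with aperture registers    -> constants never force queue_ptr
//   non-entry function touching a DS global   -> queue_ptr needed (to be able to trap)
//   private/local -> flat cast in a constant  -> queue_ptr needed unless the
//                                                subtarget has aperture registers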
319
320struct AAAMDAttributes
321 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
322 AbstractAttribute> {
323 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
324 AbstractAttribute>;
325
326 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
327
328 /// Create an abstract attribute view for the position \p IRP.
329 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
330 Attributor &A);
331
332 /// See AbstractAttribute::getName().
333 StringRef getName() const override { return "AAAMDAttributes"; }
334
335 /// See AbstractAttribute::getIdAddr().
336 const char *getIdAddr() const override { return &ID; }
337
338 /// This function should return true if the type of the \p AA is
339 /// AAAMDAttributes.
340 static bool classof(const AbstractAttribute *AA) {
341 return (AA->getIdAddr() == &ID);
342 }
343
344 /// Unique ID (due to the unique address)
345 static const char ID;
346};
347const char AAAMDAttributes::ID = 0;
348
349struct AAUniformWorkGroupSize
350 : public StateWrapper<BooleanState, AbstractAttribute> {
351 using Base = StateWrapper<BooleanState, AbstractAttribute>;
352 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
353
354 /// Create an abstract attribute view for the position \p IRP.
355 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
356 Attributor &A);
357
358 /// See AbstractAttribute::getName().
359 StringRef getName() const override { return "AAUniformWorkGroupSize"; }
360
361 /// See AbstractAttribute::getIdAddr().
362 const char *getIdAddr() const override { return &ID; }
363
364 /// This function should return true if the type of the \p AA is
365 /// AAUniformWorkGroupSize.
366 static bool classof(const AbstractAttribute *AA) {
367 return (AA->getIdAddr() == &ID);
368 }
369
370 /// Unique ID (due to the unique address)
371 static const char ID;
372};
373const char AAUniformWorkGroupSize::ID = 0;
374
375struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
376 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
377 : AAUniformWorkGroupSize(IRP, A) {}
378
379 void initialize(Attributor &A) override {
380 Function *F = getAssociatedFunction();
381 CallingConv::ID CC = F->getCallingConv();
382
383 if (CC != CallingConv::AMDGPU_KERNEL)
384 return;
385
386 bool InitialValue = false;
387 if (F->hasFnAttribute("uniform-work-group-size"))
388 InitialValue =
389 F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
390 "true";
391
392 if (InitialValue)
393 indicateOptimisticFixpoint();
394 else
395 indicatePessimisticFixpoint();
396 }
397
398 ChangeStatus updateImpl(Attributor &A) override {
399 ChangeStatus Change = ChangeStatus::UNCHANGED;
400
401 auto CheckCallSite = [&](AbstractCallSite CS) {
402 Function *Caller = CS.getInstruction()->getFunction();
403 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
404 << "->" << getAssociatedFunction()->getName() << "\n");
405
406 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
407 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
408 if (!CallerInfo || !CallerInfo->isValidState())
409 return false;
410
411 Change = Change | clampStateAndIndicateChange(this->getState(),
412 CallerInfo->getState());
413
414 return true;
415 };
416
417 bool AllCallSitesKnown = true;
418 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
419 return indicatePessimisticFixpoint();
420
421 return Change;
422 }
423
424 ChangeStatus manifest(Attributor &A) override {
425 SmallVector<Attribute, 8> AttrList;
426 LLVMContext &Ctx = getAssociatedFunction()->getContext();
427
428 AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
429 getAssumed() ? "true" : "false"));
430 return A.manifestAttrs(getIRPosition(), AttrList,
431 /* ForceReplace */ true);
432 }
433
434 bool isValidState() const override {
435 // This state is always valid, even when the state is false.
436 return true;
437 }
438
439 const std::string getAsStr(Attributor *) const override {
440 return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
441 }
442
443 /// See AbstractAttribute::trackStatistics()
444 void trackStatistics() const override {}
445};
446
447AAUniformWorkGroupSize &
448AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
449 Attributor &A) {
450 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
451 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
452 llvm_unreachable(
453 "AAUniformWorkGroupSize is only valid for function position");
454}
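// Editorial example (not part of the upstream file): the deduction only flows
// from callers to callees via checkForAllCallSites(). A device function whose
// every caller carries "uniform-work-group-size"="true" is manifested with
// "true" as well; as soon as one caller lacks the attribute (or a call site is
// unknown), the callee's state is clamped and manifested as "false".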
455
456struct AAAMDAttributesFunction : public AAAMDAttributes {
457 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
458 : AAAMDAttributes(IRP, A) {}
459
460 void initialize(Attributor &A) override {
461 Function *F = getAssociatedFunction();
462
463 // If the function requires the implicit arg pointer due to sanitizers,
464 // assume it's needed even if explicitly marked as not requiring it.
465 const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
466 if (NeedsHostcall) {
467 removeAssumedBits(IMPLICIT_ARG_PTR);
468 removeAssumedBits(HOSTCALL_PTR);
469 }
470
471 for (auto Attr : ImplicitAttrs) {
472 if (NeedsHostcall &&
473 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
474 continue;
475
476 if (F->hasFnAttribute(Attr.second))
477 addKnownBits(Attr.first);
478 }
479
480 if (F->isDeclaration())
481 return;
482
483 // Ignore functions with graphics calling conventions, these are currently
484 // not allowed to have kernel arguments.
485 if (AMDGPU::isGraphics(F->getCallingConv())) {
486 indicatePessimisticFixpoint();
487 return;
488 }
489 }
490
491 ChangeStatus updateImpl(Attributor &A) override {
492 Function *F = getAssociatedFunction();
493 // The current assumed state used to determine a change.
494 auto OrigAssumed = getAssumed();
495
496 // Check for Intrinsics and propagate attributes.
497 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
498 *this, this->getIRPosition(), DepClassTy::REQUIRED);
499 if (!AAEdges || !AAEdges->isValidState() ||
500 AAEdges->hasNonAsmUnknownCallee())
501 return indicatePessimisticFixpoint();
502
503 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
504
505 bool NeedsImplicit = false;
506 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
507 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
508 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
509 unsigned COV = InfoCache.getCodeObjectVersion();
510
511 for (Function *Callee : AAEdges->getOptimisticEdges()) {
512 Intrinsic::ID IID = Callee->getIntrinsicID();
513 if (IID == Intrinsic::not_intrinsic) {
514 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
515 *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
516 if (!AAAMD || !AAAMD->isValidState())
517 return indicatePessimisticFixpoint();
518 *this &= *AAAMD;
519 continue;
520 }
521
522 bool NonKernelOnly = false;
523 ImplicitArgumentMask AttrMask =
524 intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
525 HasApertureRegs, SupportsGetDoorbellID, COV);
526 if (AttrMask != NOT_IMPLICIT_INPUT) {
527 if ((IsNonEntryFunc || !NonKernelOnly))
528 removeAssumedBits(AttrMask);
529 }
530 }
531
532 // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
533 if (NeedsImplicit)
534 removeAssumedBits(IMPLICIT_ARG_PTR);
535
536 if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
537 // Under V5, we need implicitarg_ptr + offsets to access private_base or
538 // shared_base. We do not actually need queue_ptr.
539 if (COV >= 5)
540 removeAssumedBits(IMPLICIT_ARG_PTR);
541 else
542 removeAssumedBits(QUEUE_PTR);
543 }
544
545 if (funcRetrievesMultigridSyncArg(A, COV)) {
546 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
547 "multigrid_sync_arg needs implicitarg_ptr");
548 removeAssumedBits(MULTIGRID_SYNC_ARG);
549 }
550
551 if (funcRetrievesHostcallPtr(A, COV)) {
552 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
553 removeAssumedBits(HOSTCALL_PTR);
554 }
555
556 if (funcRetrievesHeapPtr(A, COV)) {
557 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
558 removeAssumedBits(HEAP_PTR);
559 }
560
561 if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
562 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
563 removeAssumedBits(QUEUE_PTR);
564 }
565
566 if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
567 removeAssumedBits(LDS_KERNEL_ID);
568 }
569
570 if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
571 removeAssumedBits(DEFAULT_QUEUE);
572
573 if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
574 removeAssumedBits(COMPLETION_ACTION);
575
576 if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
577 removeAssumedBits(FLAT_SCRATCH_INIT);
578
579 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
580 : ChangeStatus::UNCHANGED;
581 }
582
583 ChangeStatus manifest(Attributor &A) override {
584 SmallVector<Attribute, 8> AttrList;
585 LLVMContext &Ctx = getAssociatedFunction()->getContext();
586
587 for (auto Attr : ImplicitAttrs) {
588 if (isKnown(Attr.first))
589 AttrList.push_back(Attribute::get(Ctx, Attr.second));
590 }
591
592 return A.manifestAttrs(getIRPosition(), AttrList,
593 /* ForceReplace */ true);
594 }
595
596 const std::string getAsStr(Attributor *) const override {
597 std::string Str;
598 raw_string_ostream OS(Str);
599 OS << "AMDInfo[";
600 for (auto Attr : ImplicitAttrs)
601 if (isAssumed(Attr.first))
602 OS << ' ' << Attr.second;
603 OS << " ]";
604 return OS.str();
605 }
606
607 /// See AbstractAttribute::trackStatistics()
608 void trackStatistics() const override {}
609
610private:
611 bool checkForQueuePtr(Attributor &A) {
612 Function *F = getAssociatedFunction();
613 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
614
615 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
616
617 bool NeedsQueuePtr = false;
618
619 auto CheckAddrSpaceCasts = [&](Instruction &I) {
620 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
621 if (castRequiresQueuePtr(SrcAS)) {
622 NeedsQueuePtr = true;
623 return false;
624 }
625 return true;
626 };
627
628 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
629
630 // `checkForAllInstructions` is much cheaper than going through all
631 // instructions, so try it first.
632
633 // The queue pointer is not needed if aperture registers are present.
634 if (!HasApertureRegs) {
635 bool UsedAssumedInformation = false;
636 A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
637 {Instruction::AddrSpaceCast},
638 UsedAssumedInformation);
639 }
640
641 // If we found that we need the queue pointer, nothing else to do.
642 if (NeedsQueuePtr)
643 return true;
644
645 if (!IsNonEntryFunc && HasApertureRegs)
646 return false;
647
648 for (BasicBlock &BB : *F) {
649 for (Instruction &I : BB) {
650 for (const Use &U : I.operands()) {
651 if (const auto *C = dyn_cast<Constant>(U)) {
652 if (InfoCache.needsQueuePtr(C, *F))
653 return true;
654 }
655 }
656 }
657 }
658
659 return false;
660 }
661
662 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
663 auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
664 AA::RangeTy Range(Pos, 8);
665 return funcRetrievesImplicitKernelArg(A, Range);
666 }
667
668 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
669 auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
670 AA::RangeTy Range(Pos, 8);
671 return funcRetrievesImplicitKernelArg(A, Range);
672 }
673
674 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
675 auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
676 AA::RangeTy Range(Pos, 8);
677 return funcRetrievesImplicitKernelArg(A, Range);
678 }
679
680 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
681 auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
682 AA::RangeTy Range(Pos, 8);
683 return funcRetrievesImplicitKernelArg(A, Range);
684 }
685
686 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
687 if (COV < 5)
688 return false;
689 AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
690 return funcRetrievesImplicitKernelArg(A, Range);
691 }
692
693 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
694 if (COV < 5)
695 return false;
696 AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
697 return funcRetrievesImplicitKernelArg(A, Range);
698 }
699
700 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
701 // Check if this is a call to the implicitarg_ptr builtin and it
702 // is used to retrieve the hostcall pointer. The implicit arg for
703 // hostcall is not used only if every use of the implicitarg_ptr
704 // is a load that clearly does not retrieve any byte of the
705 // hostcall pointer. We check this by tracing all the uses of the
706 // initial call to the implicitarg_ptr intrinsic.
707 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
708 auto &Call = cast<CallBase>(I);
709 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
710 return true;
711
712 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
713 *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
714 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
715 return false;
716
717 return PointerInfoAA->forallInterferingAccesses(
718 Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
719 return Acc.getRemoteInst()->isDroppable();
720 });
721 };
722
723 bool UsedAssumedInformation = false;
724 return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
725 UsedAssumedInformation);
726 }
727
728 bool funcRetrievesLDSKernelId(Attributor &A) {
729 auto DoesNotRetrieve = [&](Instruction &I) {
730 auto &Call = cast<CallBase>(I);
731 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
732 };
733 bool UsedAssumedInformation = false;
734 return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
735 UsedAssumedInformation);
736 }
737
738 // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
739 // not to be set.
740 bool needFlatScratchInit(Attributor &A) {
741 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
742
743 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
744 // there is a cast from PRIVATE_ADDRESS.
745 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
746 return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
747 AMDGPUAS::PRIVATE_ADDRESS;
748 };
749
750 bool UsedAssumedInformation = false;
751 if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
752 {Instruction::AddrSpaceCast},
753 UsedAssumedInformation))
754 return true;
755
756 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
757 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
758
759 Function *F = getAssociatedFunction();
760 for (Instruction &I : instructions(F)) {
761 for (const Use &U : I.operands()) {
762 if (const auto *C = dyn_cast<Constant>(U)) {
763 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
764 return true;
765 }
766 }
767 }
768
769 // Finally check callees.
770
771 // This is called on each callee; false means callee shouldn't have
772 // no-flat-scratch-init.
773 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
774 const auto &CB = cast<CallBase>(I);
775 const Function *Callee = CB.getCalledFunction();
776
777 // Callee == 0 for inline asm or indirect call with known callees.
778 // In the latter case, updateImpl() already checked the callees and we
779 // know their FLAT_SCRATCH_INIT bit is set.
780 // If function has indirect call with unknown callees, the bit is
781 // already removed in updateImpl() and execution won't reach here.
782 if (!Callee)
783 return true;
784
785 return Callee->getIntrinsicID() !=
786 Intrinsic::amdgcn_addrspacecast_nonnull;
787 };
788
789 UsedAssumedInformation = false;
790 // If any callee is false (i.e. need FlatScratchInit),
791 // checkForAllCallLikeInstructions returns false, in which case this
792 // function returns true.
793 return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
794 UsedAssumedInformation);
795 }
796};
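// Editorial example (not part of the upstream file): for a kernel whose only
// implicit-input use is a call to amdgcn_workitem_id_y, the WORKITEM_ID_Y bit
// is removed during updateImpl() while the remaining bits survive to the
// fixpoint, so manifest() adds attributes such as "amdgpu-no-dispatch-ptr",
// "amdgpu-no-queue-ptr" and "amdgpu-no-hostcall-ptr", but no
// "amdgpu-no-workitem-id-y".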
797
798AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
799 Attributor &A) {
800 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
801 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
802 llvm_unreachable("AAAMDAttributes is only valid for function position");
803}
804
805/// Base class to derive different size ranges.
806struct AAAMDSizeRangeAttribute
807 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
808 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
809
810 StringRef AttrName;
811
812 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
813 StringRef AttrName)
814 : Base(IRP, 32), AttrName(AttrName) {}
815
816 /// See AbstractAttribute::trackStatistics()
817 void trackStatistics() const override {}
818
819 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
820 ChangeStatus Change = ChangeStatus::UNCHANGED;
821
822 auto CheckCallSite = [&](AbstractCallSite CS) {
823 Function *Caller = CS.getInstruction()->getFunction();
824 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
825 << "->" << getAssociatedFunction()->getName() << '\n');
826
827 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
828 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
829 if (!CallerInfo || !CallerInfo->isValidState())
830 return false;
831
832 Change |=
833 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
834
835 return true;
836 };
837
838 bool AllCallSitesKnown = true;
839 if (!A.checkForAllCallSites(CheckCallSite, *this,
840 /*RequireAllCallSites=*/true,
841 AllCallSitesKnown))
842 return indicatePessimisticFixpoint();
843
844 return Change;
845 }
846
847 /// Clamp the assumed range to the default value ([Min, Max]) and emit the
848 /// attribute if it is not the same as the default.
849 ChangeStatus
850 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
851 std::pair<unsigned, unsigned> Default) {
852 auto [Min, Max] = Default;
853 unsigned Lower = getAssumed().getLower().getZExtValue();
854 unsigned Upper = getAssumed().getUpper().getZExtValue();
855
856 // Clamp the range to the default value.
857 if (Lower < Min)
858 Lower = Min;
859 if (Upper > Max + 1)
860 Upper = Max + 1;
861
862 // No manifest if the value is invalid or same as default after clamp.
863 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
864 return ChangeStatus::UNCHANGED;
865
866 Function *F = getAssociatedFunction();
867 LLVMContext &Ctx = F->getContext();
868 SmallString<10> Buffer;
869 raw_svector_ostream OS(Buffer);
870 OS << Lower << ',' << Upper - 1;
871 return A.manifestAttrs(getIRPosition(),
872 {Attribute::get(Ctx, AttrName, OS.str())},
873 /*ForceReplace=*/true);
874 }
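// Editorial worked example (not part of the upstream file): with an assumed
// range of [64, 257) and a subtarget default of Min = 1, Max = 1024, nothing
// is clamped, the result differs from the default [1, 1025), and the string
// manifested for AttrName is "64,256" (the internal upper bound is exclusive,
// hence the "- 1" above).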
875
876 const std::string getAsStr(Attributor *) const override {
877 std::string Str;
878 raw_string_ostream OS(Str);
879 OS << getName() << '[';
880 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
881 OS << ']';
882 return OS.str();
883 }
884};
885
886/// Propagate amdgpu-flat-work-group-size attribute.
887struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
888 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
889 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
890
891 void initialize(Attributor &A) override {
892 Function *F = getAssociatedFunction();
893 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
894
895 bool HasAttr = false;
896 auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
897 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);
898
899 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
900 // We only consider an attribute that is not max range because the front
901 // end always emits the attribute, unfortunately, and sometimes it emits
902 // the max range.
903 if (*Attr != MaxRange) {
904 Range = *Attr;
905 HasAttr = true;
906 }
907 }
908
909 // We don't want to directly clamp the state if it's the max range because
910 // that is basically the worst state.
911 if (Range == MaxRange)
912 return;
913
914 auto [Min, Max] = Range;
915 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
916 IntegerRangeState IRS(CR);
917 clampStateAndIndicateChange(this->getState(), IRS);
918
919 if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
920 indicateOptimisticFixpoint();
921 }
922
923 ChangeStatus updateImpl(Attributor &A) override {
924 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
925 }
926
927 /// Create an abstract attribute view for the position \p IRP.
928 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
929 Attributor &A);
930
931 ChangeStatus manifest(Attributor &A) override {
932 Function *F = getAssociatedFunction();
933 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
934 return emitAttributeIfNotDefaultAfterClamp(
935 A, InfoCache.getMaximumFlatWorkGroupRange(*F));
936 }
937
938 /// See AbstractAttribute::getName()
939 StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
940
941 /// See AbstractAttribute::getIdAddr()
942 const char *getIdAddr() const override { return &ID; }
943
944 /// This function should return true if the type of the \p AA is
945 /// AAAMDFlatWorkGroupSize
946 static bool classof(const AbstractAttribute *AA) {
947 return (AA->getIdAddr() == &ID);
948 }
949
950 /// Unique ID (due to the unique address)
951 static const char ID;
952};
953
954const char AAAMDFlatWorkGroupSize::ID = 0;
955
956AAAMDFlatWorkGroupSize &
957AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
958 Attributor &A) {
959 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
960 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
961 llvm_unreachable(
962 "AAAMDFlatWorkGroupSize is only valid for function position");
963}
964
965struct TupleDecIntegerRangeState : public AbstractState {
966 DecIntegerState<uint32_t> X, Y, Z;
967
968 bool isValidState() const override {
969 return X.isValidState() && Y.isValidState() && Z.isValidState();
970 }
971
972 bool isAtFixpoint() const override {
973 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
974 }
975
976 ChangeStatus indicateOptimisticFixpoint() override {
977 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
978 Z.indicateOptimisticFixpoint();
979 }
980
981 ChangeStatus indicatePessimisticFixpoint() override {
982 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
983 Z.indicatePessimisticFixpoint();
984 }
985
986 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
987 X ^= Other.X;
988 Y ^= Other.Y;
989 Z ^= Other.Z;
990 return *this;
991 }
992
993 bool operator==(const TupleDecIntegerRangeState &Other) const {
994 return X == Other.X && Y == Other.Y && Z == Other.Z;
995 }
996
997 TupleDecIntegerRangeState &getAssumed() { return *this; }
998 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
999};
1000
1001using AAAMDMaxNumWorkgroupsState =
1002 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1003
1004/// Propagate amdgpu-max-num-workgroups attribute.
1005struct AAAMDMaxNumWorkgroups
1006 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1007 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1008
1009 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1010
1011 void initialize(Attributor &A) override {
1012 Function *F = getAssociatedFunction();
1013 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1014
1015 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
1016
1017 X.takeKnownMinimum(MaxNumWorkgroups[0]);
1018 Y.takeKnownMinimum(MaxNumWorkgroups[1]);
1019 Z.takeKnownMinimum(MaxNumWorkgroups[2]);
1020
1021 if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1022 indicatePessimisticFixpoint();
1023 }
1024
1025 ChangeStatus updateImpl(Attributor &A) override {
1026 ChangeStatus Change = ChangeStatus::UNCHANGED;
1027
1028 auto CheckCallSite = [&](AbstractCallSite CS) {
1029 Function *Caller = CS.getInstruction()->getFunction();
1030 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1031 << "->" << getAssociatedFunction()->getName() << '\n');
1032
1033 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1034 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1035 if (!CallerInfo || !CallerInfo->isValidState())
1036 return false;
1037
1038 Change |=
1039 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
1040 return true;
1041 };
1042
1043 bool AllCallSitesKnown = true;
1044 if (!A.checkForAllCallSites(CheckCallSite, *this,
1045 /*RequireAllCallSites=*/true,
1046 AllCallSitesKnown))
1047 return indicatePessimisticFixpoint();
1048
1049 return Change;
1050 }
1051
1052 /// Create an abstract attribute view for the position \p IRP.
1053 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1054 Attributor &A);
1055
1056 ChangeStatus manifest(Attributor &A) override {
1057 Function *F = getAssociatedFunction();
1058 LLVMContext &Ctx = F->getContext();
1059 SmallString<32> Buffer;
1060 raw_svector_ostream OS(Buffer);
1061 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1062
1063 // TODO: Should annotate loads of the group size for this to do anything
1064 // useful.
1065 return A.manifestAttrs(
1066 getIRPosition(),
1067 {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
1068 /* ForceReplace= */ true);
1069 }
1070
1071 StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1072
1073 const std::string getAsStr(Attributor *) const override {
1074 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1075 raw_string_ostream OS(Buffer);
1076 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1077 << ']';
1078 return OS.str();
1079 }
1080
1081 const char *getIdAddr() const override { return &ID; }
1082
1083 /// This function should return true if the type of the \p AA is
1084 /// AAAMDMaxNumWorkgroups
1085 static bool classof(const AbstractAttribute *AA) {
1086 return (AA->getIdAddr() == &ID);
1087 }
1088
1089 void trackStatistics() const override {}
1090
1091 /// Unique ID (due to the unique address)
1092 static const char ID;
1093};
1094
1095const char AAAMDMaxNumWorkgroups::ID = 0;
1096
1097AAAMDMaxNumWorkgroups &
1098AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1099 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1100 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1101 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1102}
1103
1104/// Propagate amdgpu-waves-per-eu attribute.
1105struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1106 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1107 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1108
1109 void initialize(Attributor &A) override {
1110 Function *F = getAssociatedFunction();
1111 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1112
1113 // If the attribute exists, we will honor it if it is not the default.
1114 if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
1115 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1116 1U, InfoCache.getMaxWavesPerEU(*F)};
1117 if (*Attr != MaxWavesPerEURange) {
1118 auto [Min, Max] = *Attr;
1119 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1120 IntegerRangeState RangeState(Range);
1121 this->getState() = RangeState;
1122 indicateOptimisticFixpoint();
1123 return;
1124 }
1125 }
1126
1127 if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1128 indicatePessimisticFixpoint();
1129 }
1130
1131 ChangeStatus updateImpl(Attributor &A) override {
1132 ChangeStatus Change = ChangeStatus::UNCHANGED;
1133
1134 auto CheckCallSite = [&](AbstractCallSite CS) {
1135 Function *Caller = CS.getInstruction()->getFunction();
1136 Function *Func = getAssociatedFunction();
1137 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1138 << "->" << Func->getName() << '\n');
1139 (void)Func;
1140
1141 const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1142 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1143 if (!CallerAA || !CallerAA->isValidState())
1144 return false;
1145
1146 ConstantRange Assumed = getAssumed();
1147 unsigned Min = std::max(Assumed.getLower().getZExtValue(),
1148 CallerAA->getAssumed().getLower().getZExtValue());
1149 unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
1150 CallerAA->getAssumed().getUpper().getZExtValue());
1151 ConstantRange Range(APInt(32, Min), APInt(32, Max));
1152 IntegerRangeState RangeState(Range);
1153 getState() = RangeState;
1154 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1155 : ChangeStatus::CHANGED;
1156
1157 return true;
1158 };
1159
1160 bool AllCallSitesKnown = true;
1161 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
1162 return indicatePessimisticFixpoint();
1163
1164 return Change;
1165 }
1166
1167 /// Create an abstract attribute view for the position \p IRP.
1168 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1169 Attributor &A);
1170
1171 ChangeStatus manifest(Attributor &A) override {
1172 Function *F = getAssociatedFunction();
1173 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1174 return emitAttributeIfNotDefaultAfterClamp(
1175 A, {1U, InfoCache.getMaxWavesPerEU(*F)});
1176 }
1177
1178 /// See AbstractAttribute::getName()
1179 StringRef getName() const override { return "AAAMDWavesPerEU"; }
1180
1181 /// See AbstractAttribute::getIdAddr()
1182 const char *getIdAddr() const override { return &ID; }
1183
1184 /// This function should return true if the type of the \p AA is
1185 /// AAAMDWavesPerEU
1186 static bool classof(const AbstractAttribute *AA) {
1187 return (AA->getIdAddr() == &ID);
1188 }
1189
1190 /// Unique ID (due to the unique address)
1191 static const char ID;
1192};
1193
1194const char AAAMDWavesPerEU::ID = 0;
1195
1196AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1197 Attributor &A) {
1198 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1199 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1200 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1201}
1202
1203static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
1204 for (const auto &CI : IA->ParseConstraints()) {
1205 for (StringRef Code : CI.Codes) {
1206 Code.consume_front("{");
1207 if (Code.starts_with("a"))
1208 return true;
1209 }
1210 }
1211
1212 return false;
1213}
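// Editorial note (not part of the upstream file): the scan above is purely
// textual and conservative. A constraint such as "{a0}" (or a bare "a") is
// consumed to something starting with 'a' and therefore counts as an AGPR
// use, while a VGPR constraint like "{v0}" does not.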
1214
1215// TODO: Migrate to range merge of amdgpu-agpr-alloc.
1216// FIXME: Why is this using Attribute::NoUnwind?
1217struct AAAMDGPUNoAGPR
1218 : public IRAttribute<Attribute::NoUnwind,
1219 StateWrapper<BooleanState, AbstractAttribute>,
1220 AAAMDGPUNoAGPR> {
1221 AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1222
1223 static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
1224 Attributor &A) {
1225 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1226 return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
1227 llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
1228 }
1229
1230 void initialize(Attributor &A) override {
1231 Function *F = getAssociatedFunction();
1232 auto [MinNumAGPR, MaxNumAGPR] =
1233 AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
1234 /*OnlyFirstRequired=*/true);
1235 if (MinNumAGPR == 0)
1236 indicateOptimisticFixpoint();
1237 }
1238
1239 const std::string getAsStr(Attributor *A) const override {
1240 return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
1241 }
1242
1243 void trackStatistics() const override {}
1244
1245 ChangeStatus updateImpl(Attributor &A) override {
1246 // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
1247
1248 auto CheckForNoAGPRs = [&](Instruction &I) {
1249 const auto &CB = cast<CallBase>(I);
1250 const Value *CalleeOp = CB.getCalledOperand();
1251 const Function *Callee = dyn_cast<Function>(CalleeOp);
1252 if (!Callee) {
1253 if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
1254 return !inlineAsmUsesAGPRs(IA);
1255 return false;
1256 }
1257
1258 // Some intrinsics may use AGPRs, but if we have a choice, we are not
1259 // required to use AGPRs.
1260 if (Callee->isIntrinsic())
1261 return true;
1262
1263 // TODO: Handle callsite attributes
1264 const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
1265 *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
1266 return CalleeInfo && CalleeInfo->isValidState() &&
1267 CalleeInfo->getAssumed();
1268 };
1269
1270 bool UsedAssumedInformation = false;
1271 if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
1272 UsedAssumedInformation))
1273 return indicatePessimisticFixpoint();
1274 return ChangeStatus::UNCHANGED;
1275 }
1276
1277 ChangeStatus manifest(Attributor &A) override {
1278 if (!getAssumed())
1279 return ChangeStatus::UNCHANGED;
1280 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1281 return A.manifestAttrs(getIRPosition(),
1282 {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
1283 }
1284
1285 StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
1286 const char *getIdAddr() const override { return &ID; }
1287
1288 /// This function should return true if the type of the \p AA is
1289 /// AAAMDGPUNoAGPR
1290 static bool classof(const AbstractAttribute *AA) {
1291 return (AA->getIdAddr() == &ID);
1292 }
1293
1294 static const char ID;
1295};
1296
1297const char AAAMDGPUNoAGPR::ID = 0;
1298
1299static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1300 AMDGPUAttributorOptions Options,
1301 ThinOrFullLTOPhase LTOPhase) {
1302 SetVector<Function *> Functions;
1303 for (Function &F : M) {
1304 if (!F.isIntrinsic())
1305 Functions.insert(&F);
1306 }
1307
1308 CallGraphUpdater CGUpdater;
1309 BumpPtrAllocator Allocator;
1310 AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1311 DenseSet<const char *> Allowed(
1312 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1313 &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1314 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1315 &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1316 &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1317 &AAInstanceInfo::ID, &AANoAliasAddrSpace::ID});
1318
1319 AttributorConfig AC(CGUpdater);
1320 AC.IsClosedWorldModule = Options.IsClosedWorld;
1321 AC.Allowed = &Allowed;
1322 AC.IsModulePass = true;
1323 AC.DefaultInitializeLiveInternals = false;
1324 AC.IndirectCalleeSpecializationCallback =
1325 [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1326 Function &Callee, unsigned NumAssumedCallees) {
1327 return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
1328 (NumAssumedCallees <= IndirectCallSpecializationThreshold);
1329 };
1330 AC.IPOAmendableCB = [](const Function &F) {
1331 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1332 };
1333
1334 Attributor A(Functions, InfoCache, AC);
1335
1336 LLVM_DEBUG({
1337 StringRef LTOPhaseStr = to_string(LTOPhase);
1338 dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
1339 << "[AMDGPUAttributor] Module " << M.getName() << " is "
1340 << (AC.IsClosedWorldModule ? "" : "not ")
1341 << "assumed to be a closed world.\n";
1342 });
1343
1344 for (auto *F : Functions) {
1345 A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
1346 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
1347 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
1348 A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
1349 CallingConv::ID CC = F->getCallingConv();
1350 if (!AMDGPU::isEntryFunctionCC(CC)) {
1351 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
1352 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
1353 }
1354
1355 for (auto &I : instructions(F)) {
1356 Value *Ptr = nullptr;
1357 if (auto *LI = dyn_cast<LoadInst>(&I))
1358 Ptr = LI->getPointerOperand();
1359 else if (auto *SI = dyn_cast<StoreInst>(&I))
1360 Ptr = SI->getPointerOperand();
1361 else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
1362 Ptr = RMW->getPointerOperand();
1363 else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
1364 Ptr = CmpX->getPointerOperand();
1365
1366 if (Ptr) {
1367 A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
1368 A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
1369 }
1370 }
1371 }
1372
1373 return A.run() == ChangeStatus::CHANGED;
1374}
1375} // namespace
1376
1377PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1378 ModuleAnalysisManager &AM) {
1379
1380 FunctionAnalysisManager &FAM =
1381 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1382 AnalysisGetter AG(FAM);
1383
1384 // TODO: Probably preserves CFG
1385 return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
1386 : PreservedAnalyses::all();
1387}
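// Editorial usage note (not part of the upstream file): the pass is registered
// as "amdgpu-attributor" with the new pass manager, so its deductions can be
// inspected in isolation with something like
//
//   opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -S input.ll
//
// and checking the "amdgpu-no-*", "amdgpu-flat-work-group-size" and
// "amdgpu-waves-per-eu" attributes added to the printed IR.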