LLVM 22.0.0git
AMDGPUAttributor.cpp
1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "GCNSubtarget.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
20
21#define DEBUG_TYPE "amdgpu-attributor"
22
23using namespace llvm;
24
26 "amdgpu-indirect-call-specialization-threshold",
28 "A threshold controls whether an indirect call will be specialized"),
29 cl::init(3));
30
31#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
32
33enum ImplicitArgumentPositions {
34#include "AMDGPUAttributes.def"
35 LAST_ARG_POS
36};
37
38#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
39
40enum ImplicitArgumentMask {
41 NOT_IMPLICIT_INPUT = 0,
42#include "AMDGPUAttributes.def"
43 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
44};
45
46#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
47static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
48 ImplicitAttrs[] = {
49#include "AMDGPUAttributes.def"
50};
51
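// Illustrative expansion (the authoritative entries live in
// AMDGPUAttributes.def, which is not reproduced here): each implicit input
// gets a bit position, a mask bit, and an "amdgpu-no-*" attribute string.
// An entry along the lines of
//   AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
// pairs DISPATCH_PTR with "amdgpu-no-dispatch-ptr" in ImplicitAttrs[]. A bit
// that is still assumed once the fixpoint is reached means the input is not
// needed, and manifest() below emits the corresponding attribute.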
52// We do not need to note the x workitem or workgroup id because they are always
53// initialized.
54//
55// TODO: We should not add the attributes if the known compile time workgroup
56// size is 1 for y/z.
57static ImplicitArgumentMask
58intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
59 bool HasApertureRegs, bool SupportsGetDoorBellID,
60 unsigned CodeObjectVersion) {
61 switch (ID) {
62 case Intrinsic::amdgcn_workitem_id_x:
63 NonKernelOnly = true;
64 return WORKITEM_ID_X;
65 case Intrinsic::amdgcn_workgroup_id_x:
66 NonKernelOnly = true;
67 return WORKGROUP_ID_X;
68 case Intrinsic::amdgcn_workitem_id_y:
69 case Intrinsic::r600_read_tidig_y:
70 return WORKITEM_ID_Y;
71 case Intrinsic::amdgcn_workitem_id_z:
72 case Intrinsic::r600_read_tidig_z:
73 return WORKITEM_ID_Z;
74 case Intrinsic::amdgcn_workgroup_id_y:
75 case Intrinsic::r600_read_tgid_y:
76 return WORKGROUP_ID_Y;
77 case Intrinsic::amdgcn_workgroup_id_z:
78 case Intrinsic::r600_read_tgid_z:
79 return WORKGROUP_ID_Z;
80 case Intrinsic::amdgcn_cluster_id_x:
81 NonKernelOnly = true;
82 return CLUSTER_ID_X;
83 case Intrinsic::amdgcn_cluster_id_y:
84 return CLUSTER_ID_Y;
85 case Intrinsic::amdgcn_cluster_id_z:
86 return CLUSTER_ID_Z;
87 case Intrinsic::amdgcn_lds_kernel_id:
88 return LDS_KERNEL_ID;
89 case Intrinsic::amdgcn_dispatch_ptr:
90 return DISPATCH_PTR;
91 case Intrinsic::amdgcn_dispatch_id:
92 return DISPATCH_ID;
93 case Intrinsic::amdgcn_implicitarg_ptr:
94 return IMPLICIT_ARG_PTR;
95 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
96 // queue_ptr.
97 case Intrinsic::amdgcn_queue_ptr:
98 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
99 return QUEUE_PTR;
100 case Intrinsic::amdgcn_is_shared:
101 case Intrinsic::amdgcn_is_private:
102 if (HasApertureRegs)
103 return NOT_IMPLICIT_INPUT;
104 // Under V5, we need implicitarg_ptr + offsets to access private_base or
105 // shared_base. Pre-V5, however, we need to access them through queue_ptr +
106 // offsets.
107 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
108 : QUEUE_PTR;
109 case Intrinsic::trap:
110 case Intrinsic::debugtrap:
111 case Intrinsic::ubsantrap:
112 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
113 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
114 : QUEUE_PTR;
115 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
116 return QUEUE_PTR;
117 default:
118 return NOT_IMPLICIT_INPUT;
119 }
120}
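// Example: in a COV5 module on a subtarget without aperture registers, a call
// to llvm.amdgcn.is.shared maps to IMPLICIT_ARG_PTR (the shared/private
// apertures are read via implicitarg_ptr + offsets), while with aperture
// registers it maps to NOT_IMPLICIT_INPUT and no implicit argument is
// required.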
121
122static bool castRequiresQueuePtr(unsigned SrcAS) {
123 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
124}
125
126static bool isDSAddress(const Constant *C) {
127 const auto *GV = dyn_cast<GlobalValue>(C);
128 if (!GV)
129 return false;
130 unsigned AS = GV->getAddressSpace();
131 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
132}
133
134/// Returns true if the function requires the implicit argument be passed
135/// regardless of the function contents.
136static bool funcRequiresHostcallPtr(const Function &F) {
137 // Sanitizers require the hostcall buffer passed in the implicit arguments.
138 return F.hasFnAttribute(Attribute::SanitizeAddress) ||
139 F.hasFnAttribute(Attribute::SanitizeThread) ||
140 F.hasFnAttribute(Attribute::SanitizeMemory) ||
141 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
142 F.hasFnAttribute(Attribute::SanitizeMemTag);
143}
144
145namespace {
146class AMDGPUInformationCache : public InformationCache {
147public:
148 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
149 BumpPtrAllocator &Allocator,
150 SetVector<Function *> *CGSCC, TargetMachine &TM)
151 : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
152 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
153
154 TargetMachine &TM;
155
156 enum ConstantStatus : uint8_t {
157 NONE = 0,
158 DS_GLOBAL = 1 << 0,
159 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
160 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
161 ADDR_SPACE_CAST_BOTH_TO_FLAT =
162 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
163 };
164
165 /// Check if the subtarget has aperture regs.
166 bool hasApertureRegs(Function &F) {
167 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
168 return ST.hasApertureRegs();
169 }
170
171 /// Check if the subtarget supports GetDoorbellID.
172 bool supportsGetDoorbellID(Function &F) {
173 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
174 return ST.supportsGetDoorbellID();
175 }
176
177 std::optional<std::pair<unsigned, unsigned>>
178 getFlatWorkGroupSizeAttr(const Function &F) const {
179 auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
180 if (!R)
181 return std::nullopt;
182 return std::make_pair(R->first, *(R->second));
183 }
184
185 std::pair<unsigned, unsigned>
186 getDefaultFlatWorkGroupSize(const Function &F) const {
187 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
188 return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
189 }
190
191 std::pair<unsigned, unsigned>
192 getMaximumFlatWorkGroupRange(const Function &F) {
193 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
194 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
195 }
196
197 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
198 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
199 return ST.getMaxNumWorkGroups(F);
200 }
201
202 /// Get code object version.
203 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
204
205 /// Get the effective value of "amdgpu-waves-per-eu" for the function,
206 /// accounting for the interaction with the passed value to use for
207 /// "amdgpu-flat-work-group-size".
208 std::pair<unsigned, unsigned>
209 getWavesPerEU(const Function &F,
210 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
211 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
212 return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
213 }
214
215 std::optional<std::pair<unsigned, unsigned>>
216 getWavesPerEUAttr(const Function &F) {
217 auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
218 /*OnlyFirstRequired=*/true);
219 if (!Val)
220 return std::nullopt;
221 if (!Val->second) {
222 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
223 Val->second = ST.getMaxWavesPerEU();
224 }
225 return std::make_pair(Val->first, *(Val->second));
226 }
227
228 std::pair<unsigned, unsigned>
229 getEffectiveWavesPerEU(const Function &F,
230 std::pair<unsigned, unsigned> WavesPerEU,
231 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
232 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
233 return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
234 getLDSSize(F));
235 }
236
237 unsigned getMaxWavesPerEU(const Function &F) {
238 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
239 return ST.getMaxWavesPerEU();
240 }
241
242 unsigned getMaxAddrSpace() const override {
243 return AMDGPUAS::MAX_AMDGPU_ADDRESS;
244 }
245
246private:
247 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
248 /// local to flat. These casts may require the queue pointer.
249 static uint8_t visitConstExpr(const ConstantExpr *CE) {
250 uint8_t Status = NONE;
251
252 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
253 unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
254 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
255 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
256 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
257 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
258 }
259
260 return Status;
261 }
262
263 /// Returns the minimum amount of LDS space used by a workgroup running
264 /// function \p F.
265 static unsigned getLDSSize(const Function &F) {
266 return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
267 {0, UINT32_MAX}, true)
268 .first;
269 }
270
271 /// Get the constant access bitmap for \p C.
272 uint8_t getConstantAccess(const Constant *C,
273 SmallPtrSetImpl<const Constant *> &Visited) {
274 auto It = ConstantStatus.find(C);
275 if (It != ConstantStatus.end())
276 return It->second;
277
278 uint8_t Result = 0;
279 if (isDSAddress(C))
280 Result = DS_GLOBAL;
281
282 if (const auto *CE = dyn_cast<ConstantExpr>(C))
283 Result |= visitConstExpr(CE);
284
285 for (const Use &U : C->operands()) {
286 const auto *OpC = dyn_cast<Constant>(U);
287 if (!OpC || !Visited.insert(OpC).second)
288 continue;
289
290 Result |= getConstantAccess(OpC, Visited);
291 }
292 return Result;
293 }
294
295public:
296 /// Returns true if \p Fn needs the queue pointer because of \p C.
297 bool needsQueuePtr(const Constant *C, Function &Fn) {
298 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
299 bool HasAperture = hasApertureRegs(Fn);
300
301 // No need to explore the constants.
302 if (!IsNonEntryFunc && HasAperture)
303 return false;
304
305 SmallPtrSet<const Constant *, 8> Visited;
306 uint8_t Access = getConstantAccess(C, Visited);
307
308 // We need to trap on DS globals in non-entry functions.
309 if (IsNonEntryFunc && (Access & DS_GLOBAL))
310 return true;
311
312 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
313 }
314
315 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
316 SmallPtrSet<const Constant *, 8> Visited;
317 uint8_t Access = getConstantAccess(C, Visited);
318 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
319 }
320
321private:
322 /// Used to determine if the Constant needs the queue pointer.
323 DenseMap<const Constant *, uint8_t> ConstantStatus;
324 const unsigned CodeObjectVersion;
325};
326
327struct AAAMDAttributes
328 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
329 AbstractAttribute> {
330 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
331 AbstractAttribute>;
332
333 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
334
335 /// Create an abstract attribute view for the position \p IRP.
336 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
337 Attributor &A);
338
339 /// See AbstractAttribute::getName().
340 StringRef getName() const override { return "AAAMDAttributes"; }
341
342 /// See AbstractAttribute::getIdAddr().
343 const char *getIdAddr() const override { return &ID; }
344
345 /// This function should return true if the type of the \p AA is
346 /// AAAMDAttributes.
347 static bool classof(const AbstractAttribute *AA) {
348 return (AA->getIdAddr() == &ID);
349 }
350
351 /// Unique ID (due to the unique address)
352 static const char ID;
353};
354const char AAAMDAttributes::ID = 0;
355
356struct AAUniformWorkGroupSize
357 : public StateWrapper<BooleanState, AbstractAttribute> {
358 using Base = StateWrapper<BooleanState, AbstractAttribute>;
359 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
360
361 /// Create an abstract attribute view for the position \p IRP.
362 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
363 Attributor &A);
364
365 /// See AbstractAttribute::getName().
366 StringRef getName() const override { return "AAUniformWorkGroupSize"; }
367
368 /// See AbstractAttribute::getIdAddr().
369 const char *getIdAddr() const override { return &ID; }
370
371 /// This function should return true if the type of the \p AA is
372 /// AAAMDAttributes.
373 static bool classof(const AbstractAttribute *AA) {
374 return (AA->getIdAddr() == &ID);
375 }
376
377 /// Unique ID (due to the unique address)
378 static const char ID;
379};
380const char AAUniformWorkGroupSize::ID = 0;
381
382struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
383 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
384 : AAUniformWorkGroupSize(IRP, A) {}
385
386 void initialize(Attributor &A) override {
387 Function *F = getAssociatedFunction();
388 CallingConv::ID CC = F->getCallingConv();
389
390 if (CC != CallingConv::AMDGPU_KERNEL)
391 return;
392
393 bool InitialValue = false;
394 if (F->hasFnAttribute("uniform-work-group-size"))
395 InitialValue =
396 F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
397 "true";
398
399 if (InitialValue)
400 indicateOptimisticFixpoint();
401 else
402 indicatePessimisticFixpoint();
403 }
404
405 ChangeStatus updateImpl(Attributor &A) override {
406 ChangeStatus Change = ChangeStatus::UNCHANGED;
407
408 auto CheckCallSite = [&](AbstractCallSite CS) {
409 Function *Caller = CS.getInstruction()->getFunction();
410 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
411 << "->" << getAssociatedFunction()->getName() << "\n");
412
413 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
414 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
415 if (!CallerInfo || !CallerInfo->isValidState())
416 return false;
417
418 Change = Change | clampStateAndIndicateChange(this->getState(),
419 CallerInfo->getState());
420
421 return true;
422 };
423
424 bool AllCallSitesKnown = true;
425 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
426 return indicatePessimisticFixpoint();
427
428 return Change;
429 }
430
431 ChangeStatus manifest(Attributor &A) override {
432 SmallVector<Attribute, 8> AttrList;
433 LLVMContext &Ctx = getAssociatedFunction()->getContext();
434
435 AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
436 getAssumed() ? "true" : "false"));
437 return A.manifestAttrs(getIRPosition(), AttrList,
438 /* ForceReplace */ true);
439 }
440
441 bool isValidState() const override {
442 // This state is always valid, even when the state is false.
443 return true;
444 }
445
446 const std::string getAsStr(Attributor *) const override {
447 return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
448 }
449
450 /// See AbstractAttribute::trackStatistics()
451 void trackStatistics() const override {}
452};
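// Illustrative IR outcome (not taken from a real test): a device function
// reachable only from kernels marked "uniform-work-group-size"="true" is
// manifested with the same attribute, e.g.
//   define internal void @helper() #0 { ... }
//   attributes #0 = { "uniform-work-group-size"="true" }
// whereas a single caller assuming "false", or an unknown call site, clamps
// the state and the manifested value becomes "false".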
453
454AAUniformWorkGroupSize &
455AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
456 Attributor &A) {
457 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
458 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
459 llvm_unreachable(
460 "AAUniformWorkGroupSize is only valid for function position");
461}
462
463struct AAAMDAttributesFunction : public AAAMDAttributes {
464 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
465 : AAAMDAttributes(IRP, A) {}
466
467 void initialize(Attributor &A) override {
468 Function *F = getAssociatedFunction();
469
470 // If the function requires the implicit arg pointer due to sanitizers,
471 // assume it's needed even if explicitly marked as not requiring it.
472 const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
473 if (NeedsHostcall) {
474 removeAssumedBits(IMPLICIT_ARG_PTR);
475 removeAssumedBits(HOSTCALL_PTR);
476 }
477
478 for (auto Attr : ImplicitAttrs) {
479 if (NeedsHostcall &&
480 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
481 continue;
482
483 if (F->hasFnAttribute(Attr.second))
484 addKnownBits(Attr.first);
485 }
486
487 if (F->isDeclaration())
488 return;
489
490 // Ignore functions with graphics calling conventions; these are currently
491 // not allowed to have kernel arguments.
492 if (AMDGPU::isGraphics(F->getCallingConv())) {
493 indicatePessimisticFixpoint();
494 return;
495 }
496 }
497
498 ChangeStatus updateImpl(Attributor &A) override {
499 Function *F = getAssociatedFunction();
500 // The current assumed state used to determine a change.
501 auto OrigAssumed = getAssumed();
502
503 // Check for Intrinsics and propagate attributes.
504 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
505 *this, this->getIRPosition(), DepClassTy::REQUIRED);
506 if (!AAEdges || !AAEdges->isValidState() ||
507 AAEdges->hasNonAsmUnknownCallee())
508 return indicatePessimisticFixpoint();
509
510 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
511
512 bool NeedsImplicit = false;
513 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
514 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
515 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
516 unsigned COV = InfoCache.getCodeObjectVersion();
517
518 for (Function *Callee : AAEdges->getOptimisticEdges()) {
519 Intrinsic::ID IID = Callee->getIntrinsicID();
520 if (IID == Intrinsic::not_intrinsic) {
521 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
522 *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
523 if (!AAAMD || !AAAMD->isValidState())
524 return indicatePessimisticFixpoint();
525 *this &= *AAAMD;
526 continue;
527 }
528
529 bool NonKernelOnly = false;
530 ImplicitArgumentMask AttrMask =
531 intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
532 HasApertureRegs, SupportsGetDoorbellID, COV);
533 if (AttrMask != NOT_IMPLICIT_INPUT) {
534 if ((IsNonEntryFunc || !NonKernelOnly))
535 removeAssumedBits(AttrMask);
536 }
537 }
538
539 // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
540 if (NeedsImplicit)
541 removeAssumedBits(IMPLICIT_ARG_PTR);
542
543 if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
544 // Under V5, we need implicitarg_ptr + offsets to access private_base or
545 // shared_base. We do not actually need queue_ptr.
546 if (COV >= 5)
547 removeAssumedBits(IMPLICIT_ARG_PTR);
548 else
549 removeAssumedBits(QUEUE_PTR);
550 }
551
552 if (funcRetrievesMultigridSyncArg(A, COV)) {
553 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
554 "multigrid_sync_arg needs implicitarg_ptr");
555 removeAssumedBits(MULTIGRID_SYNC_ARG);
556 }
557
558 if (funcRetrievesHostcallPtr(A, COV)) {
559 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
560 removeAssumedBits(HOSTCALL_PTR);
561 }
562
563 if (funcRetrievesHeapPtr(A, COV)) {
564 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
565 removeAssumedBits(HEAP_PTR);
566 }
567
568 if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
569 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
570 removeAssumedBits(QUEUE_PTR);
571 }
572
573 if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
574 removeAssumedBits(LDS_KERNEL_ID);
575 }
576
577 if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
578 removeAssumedBits(DEFAULT_QUEUE);
579
580 if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
581 removeAssumedBits(COMPLETION_ACTION);
582
583 if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
584 removeAssumedBits(FLAT_SCRATCH_INIT);
585
586 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
587 : ChangeStatus::UNCHANGED;
588 }
589
590 ChangeStatus manifest(Attributor &A) override {
591 SmallVector<Attribute, 8> AttrList;
592 LLVMContext &Ctx = getAssociatedFunction()->getContext();
593
594 for (auto Attr : ImplicitAttrs) {
595 if (isKnown(Attr.first))
596 AttrList.push_back(Attribute::get(Ctx, Attr.second));
597 }
598
599 return A.manifestAttrs(getIRPosition(), AttrList,
600 /* ForceReplace */ true);
601 }
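// Illustrative result (attribute spellings come from AMDGPUAttributes.def):
// a function whose transitive callees never query the dispatch pointer or the
// y/z workitem ids would typically end up carrying attributes such as
// "amdgpu-no-dispatch-ptr", "amdgpu-no-workitem-id-y" and
// "amdgpu-no-workitem-id-z", which lets the backend avoid preloading the
// corresponding SGPRs/VGPRs.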
602
603 const std::string getAsStr(Attributor *) const override {
604 std::string Str;
605 raw_string_ostream OS(Str);
606 OS << "AMDInfo[";
607 for (auto Attr : ImplicitAttrs)
608 if (isAssumed(Attr.first))
609 OS << ' ' << Attr.second;
610 OS << " ]";
611 return OS.str();
612 }
613
614 /// See AbstractAttribute::trackStatistics()
615 void trackStatistics() const override {}
616
617private:
618 bool checkForQueuePtr(Attributor &A) {
619 Function *F = getAssociatedFunction();
620 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
621
622 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
623
624 bool NeedsQueuePtr = false;
625
626 auto CheckAddrSpaceCasts = [&](Instruction &I) {
627 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
628 if (castRequiresQueuePtr(SrcAS)) {
629 NeedsQueuePtr = true;
630 return false;
631 }
632 return true;
633 };
634
635 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
636
637 // `checkForAllInstructions` is much cheaper than going through all
638 // instructions manually, so try it first.
639
640 // The queue pointer is not needed if aperture registers are present.
641 if (!HasApertureRegs) {
642 bool UsedAssumedInformation = false;
643 A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
644 {Instruction::AddrSpaceCast},
645 UsedAssumedInformation);
646 }
647
648 // If we found that we need the queue pointer, nothing else to do.
649 if (NeedsQueuePtr)
650 return true;
651
652 if (!IsNonEntryFunc && HasApertureRegs)
653 return false;
654
655 for (BasicBlock &BB : *F) {
656 for (Instruction &I : BB) {
657 for (const Use &U : I.operands()) {
658 if (const auto *C = dyn_cast<Constant>(U)) {
659 if (InfoCache.needsQueuePtr(C, *F))
660 return true;
661 }
662 }
663 }
664 }
665
666 return false;
667 }
668
669 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
670 auto Pos = AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
671 AA::RangeTy Range(Pos, 8);
672 return funcRetrievesImplicitKernelArg(A, Range);
673 }
674
675 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
676 auto Pos = AMDGPU::getHostcallImplicitArgPosition(COV);
677 AA::RangeTy Range(Pos, 8);
678 return funcRetrievesImplicitKernelArg(A, Range);
679 }
680
681 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
682 auto Pos = AMDGPU::getDefaultQueueImplicitArgPosition(COV);
683 AA::RangeTy Range(Pos, 8);
684 return funcRetrievesImplicitKernelArg(A, Range);
685 }
686
687 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
688 auto Pos = AMDGPU::getCompletionActionImplicitArgPosition(COV);
689 AA::RangeTy Range(Pos, 8);
690 return funcRetrievesImplicitKernelArg(A, Range);
691 }
692
693 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
694 if (COV < 5)
695 return false;
696 AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
697 return funcRetrievesImplicitKernelArg(A, Range);
698 }
699
700 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
701 if (COV < 5)
702 return false;
703 AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
704 return funcRetrievesImplicitKernelArg(A, Range);
705 }
706
707 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
708 // Check if this is a call to the implicitarg_ptr builtin and it
709 // is used to retrieve the hostcall pointer. The implicit arg for
710 // hostcall is not used only if every use of the implicitarg_ptr
711 // is a load that clearly does not retrieve any byte of the
712 // hostcall pointer. We check this by tracing all the uses of the
713 // initial call to the implicitarg_ptr intrinsic.
714 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
715 auto &Call = cast<CallBase>(I);
716 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
717 return true;
718
719 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
720 *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
721 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
722 return false;
723
724 return PointerInfoAA->forallInterferingAccesses(
725 Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
726 return Acc.getRemoteInst()->isDroppable();
727 });
728 };
729
730 bool UsedAssumedInformation = false;
731 return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
732 UsedAssumedInformation);
733 }
734
735 bool funcRetrievesLDSKernelId(Attributor &A) {
736 auto DoesNotRetrieve = [&](Instruction &I) {
737 auto &Call = cast<CallBase>(I);
738 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
739 };
740 bool UsedAssumedInformation = false;
741 return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
742 UsedAssumedInformation);
743 }
744
745 // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
746 // not to be set.
747 bool needFlatScratchInit(Attributor &A) {
748 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
749
750 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
751 // there is a cast from PRIVATE_ADDRESS.
752 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
753 return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
754 AMDGPUAS::PRIVATE_ADDRESS;
755 };
756
757 bool UsedAssumedInformation = false;
758 if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
759 {Instruction::AddrSpaceCast},
760 UsedAssumedInformation))
761 return true;
762
763 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
764 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
765
766 Function *F = getAssociatedFunction();
767 for (Instruction &I : instructions(F)) {
768 for (const Use &U : I.operands()) {
769 if (const auto *C = dyn_cast<Constant>(U)) {
770 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
771 return true;
772 }
773 }
774 }
775
776 // Finally check callees.
777
778 // This is called on each callee; false means callee shouldn't have
779 // no-flat-scratch-init.
780 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
781 const auto &CB = cast<CallBase>(I);
782 const Function *Callee = CB.getCalledFunction();
783
784 // Callee == 0 for inline asm or indirect call with known callees.
785 // In the latter case, updateImpl() already checked the callees and we
786 // know their FLAT_SCRATCH_INIT bit is set.
787 // If function has indirect call with unknown callees, the bit is
788 // already removed in updateImpl() and execution won't reach here.
789 if (!Callee)
790 return true;
791
792 return Callee->getIntrinsicID() !=
793 Intrinsic::amdgcn_addrspacecast_nonnull;
794 };
795
796 UsedAssumedInformation = false;
797 // If any callee is false (i.e. need FlatScratchInit),
798 // checkForAllCallLikeInstructions returns false, in which case this
799 // function returns true.
800 return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
801 UsedAssumedInformation);
802 }
803};
804
805AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
806 Attributor &A) {
807 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
808 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
809 llvm_unreachable("AAAMDAttributes is only valid for function position");
810}
811
812/// Base class to derive different size ranges.
813struct AAAMDSizeRangeAttribute
814 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
815 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
816
817 StringRef AttrName;
818
819 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
820 StringRef AttrName)
821 : Base(IRP, 32), AttrName(AttrName) {}
822
823 /// See AbstractAttribute::trackStatistics()
824 void trackStatistics() const override {}
825
826 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
827 ChangeStatus Change = ChangeStatus::UNCHANGED;
828
829 auto CheckCallSite = [&](AbstractCallSite CS) {
830 Function *Caller = CS.getInstruction()->getFunction();
831 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
832 << "->" << getAssociatedFunction()->getName() << '\n');
833
834 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
835 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
836 if (!CallerInfo || !CallerInfo->isValidState())
837 return false;
838
839 Change |=
840 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
841
842 return true;
843 };
844
845 bool AllCallSitesKnown = true;
846 if (!A.checkForAllCallSites(CheckCallSite, *this,
847 /*RequireAllCallSites=*/true,
848 AllCallSitesKnown))
849 return indicatePessimisticFixpoint();
850
851 return Change;
852 }
853
854 /// Clamp the assumed range to the default value ([Min, Max]) and emit the
855 /// attribute if it is not the same as the default after clamping.
856 ChangeStatus
857 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
858 std::pair<unsigned, unsigned> Default) {
859 auto [Min, Max] = Default;
860 unsigned Lower = getAssumed().getLower().getZExtValue();
861 unsigned Upper = getAssumed().getUpper().getZExtValue();
862
863 // Clamp the range to the default value.
864 if (Lower < Min)
865 Lower = Min;
866 if (Upper > Max + 1)
867 Upper = Max + 1;
868
869 // No manifest if the value is invalid or same as default after clamp.
870 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
871 return ChangeStatus::UNCHANGED;
872
873 Function *F = getAssociatedFunction();
874 LLVMContext &Ctx = F->getContext();
875 SmallString<10> Buffer;
876 raw_svector_ostream OS(Buffer);
877 OS << Lower << ',' << Upper - 1;
878 return A.manifestAttrs(getIRPosition(),
879 {Attribute::get(Ctx, AttrName, OS.str())},
880 /*ForceReplace=*/true);
881 }
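// Worked example with hypothetical numbers: for Default = {1, 1024} and an
// assumed range of [32, 257), nothing is clamped and the manifested string is
// "32,256"; if the assumed range equals [1, 1025) it matches the default and
// no attribute is emitted.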
882
883 const std::string getAsStr(Attributor *) const override {
884 std::string Str;
885 raw_string_ostream OS(Str);
886 OS << getName() << '[';
887 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
888 OS << ']';
889 return OS.str();
890 }
891};
892
893/// Propagate amdgpu-flat-work-group-size attribute.
894struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
895 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
896 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
897
898 void initialize(Attributor &A) override {
899 Function *F = getAssociatedFunction();
900 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
901
902 bool HasAttr = false;
903 auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
904 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);
905
906 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
907 // We only consider an attribute that is not max range because the front
908 // end always emits the attribute, unfortunately, and sometimes it emits
909 // the max range.
910 if (*Attr != MaxRange) {
911 Range = *Attr;
912 HasAttr = true;
913 }
914 }
915
916 // We don't want to directly clamp the state if it's the max range because
917 // that is basically the worst state.
918 if (Range == MaxRange)
919 return;
920
921 auto [Min, Max] = Range;
922 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
923 IntegerRangeState IRS(CR);
924 clampStateAndIndicateChange(this->getState(), IRS);
925
926 if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
927 indicateOptimisticFixpoint();
928 }
929
930 ChangeStatus updateImpl(Attributor &A) override {
931 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
932 }
933
934 /// Create an abstract attribute view for the position \p IRP.
935 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
936 Attributor &A);
937
938 ChangeStatus manifest(Attributor &A) override {
939 Function *F = getAssociatedFunction();
940 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
941 return emitAttributeIfNotDefaultAfterClamp(
942 A, InfoCache.getMaximumFlatWorkGroupRange(*F));
943 }
944
945 /// See AbstractAttribute::getName()
946 StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
947
948 /// See AbstractAttribute::getIdAddr()
949 const char *getIdAddr() const override { return &ID; }
950
951 /// This function should return true if the type of the \p AA is
952 /// AAAMDFlatWorkGroupSize
953 static bool classof(const AbstractAttribute *AA) {
954 return (AA->getIdAddr() == &ID);
955 }
956
957 /// Unique ID (due to the unique address)
958 static const char ID;
959};
960
961const char AAAMDFlatWorkGroupSize::ID = 0;
962
963AAAMDFlatWorkGroupSize &
964AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
965 Attributor &A) {
966 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
967 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
968 llvm_unreachable(
969 "AAAMDFlatWorkGroupSize is only valid for function position");
970}
971
972struct TupleDecIntegerRangeState : public AbstractState {
973 DecIntegerState<uint32_t> X, Y, Z;
974
975 bool isValidState() const override {
976 return X.isValidState() && Y.isValidState() && Z.isValidState();
977 }
978
979 bool isAtFixpoint() const override {
980 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
981 }
982
983 ChangeStatus indicateOptimisticFixpoint() override {
984 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
985 Z.indicateOptimisticFixpoint();
986 }
987
988 ChangeStatus indicatePessimisticFixpoint() override {
989 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
990 Z.indicatePessimisticFixpoint();
991 }
992
993 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
994 X ^= Other.X;
995 Y ^= Other.Y;
996 Z ^= Other.Z;
997 return *this;
998 }
999
1000 bool operator==(const TupleDecIntegerRangeState &Other) const {
1001 return X == Other.X && Y == Other.Y && Z == Other.Z;
1002 }
1003
1004 TupleDecIntegerRangeState &getAssumed() { return *this; }
1005 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
1006};
1007
1008using AAAMDMaxNumWorkgroupsState =
1009 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1010
1011/// Propagate amdgpu-max-num-workgroups attribute.
1012struct AAAMDMaxNumWorkgroups
1013 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1014 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1015
1016 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1017
1018 void initialize(Attributor &A) override {
1019 Function *F = getAssociatedFunction();
1020 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1021
1022 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
1023
1024 X.takeKnownMinimum(MaxNumWorkgroups[0]);
1025 Y.takeKnownMinimum(MaxNumWorkgroups[1]);
1026 Z.takeKnownMinimum(MaxNumWorkgroups[2]);
1027
1028 if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1029 indicatePessimisticFixpoint();
1030 }
1031
1032 ChangeStatus updateImpl(Attributor &A) override {
1033 ChangeStatus Change = ChangeStatus::UNCHANGED;
1034
1035 auto CheckCallSite = [&](AbstractCallSite CS) {
1036 Function *Caller = CS.getInstruction()->getFunction();
1037 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1038 << "->" << getAssociatedFunction()->getName() << '\n');
1039
1040 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1041 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1042 if (!CallerInfo || !CallerInfo->isValidState())
1043 return false;
1044
1045 Change |=
1046 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
1047 return true;
1048 };
1049
1050 bool AllCallSitesKnown = true;
1051 if (!A.checkForAllCallSites(CheckCallSite, *this,
1052 /*RequireAllCallSites=*/true,
1053 AllCallSitesKnown))
1054 return indicatePessimisticFixpoint();
1055
1056 return Change;
1057 }
1058
1059 /// Create an abstract attribute view for the position \p IRP.
1060 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1061 Attributor &A);
1062
1063 ChangeStatus manifest(Attributor &A) override {
1064 Function *F = getAssociatedFunction();
1065 LLVMContext &Ctx = F->getContext();
1066 SmallString<32> Buffer;
1067 raw_svector_ostream OS(Buffer);
1068 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1069
1070 // TODO: Should annotate loads of the group size for this to do anything
1071 // useful.
1072 return A.manifestAttrs(
1073 getIRPosition(),
1074 {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
1075 /* ForceReplace= */ true);
1076 }
1077
1078 StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1079
1080 const std::string getAsStr(Attributor *) const override {
1081 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1082 raw_string_ostream OS(Buffer);
1083 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1084 << ']';
1085 return OS.str();
1086 }
1087
1088 const char *getIdAddr() const override { return &ID; }
1089
1090 /// This function should return true if the type of the \p AA is
1091 /// AAAMDMaxNumWorkgroups
1092 static bool classof(const AbstractAttribute *AA) {
1093 return (AA->getIdAddr() == &ID);
1094 }
1095
1096 void trackStatistics() const override {}
1097
1098 /// Unique ID (due to the unique address)
1099 static const char ID;
1100};
1101
1102const char AAAMDMaxNumWorkgroups::ID = 0;
1103
1104AAAMDMaxNumWorkgroups &
1105AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1106 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1107 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1108 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1109}
1110
1111/// Propagate amdgpu-waves-per-eu attribute.
1112struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1113 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1114 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1115
1116 void initialize(Attributor &A) override {
1117 Function *F = getAssociatedFunction();
1118 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1119
1120 // If the attribute exists, we will honor it if it is not the default.
1121 if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
1122 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1123 1U, InfoCache.getMaxWavesPerEU(*F)};
1124 if (*Attr != MaxWavesPerEURange) {
1125 auto [Min, Max] = *Attr;
1126 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1127 IntegerRangeState RangeState(Range);
1128 this->getState() = RangeState;
1129 indicateOptimisticFixpoint();
1130 return;
1131 }
1132 }
1133
1134 if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1135 indicatePessimisticFixpoint();
1136 }
1137
1138 ChangeStatus updateImpl(Attributor &A) override {
1139 ChangeStatus Change = ChangeStatus::UNCHANGED;
1140
1141 auto CheckCallSite = [&](AbstractCallSite CS) {
1142 Function *Caller = CS.getInstruction()->getFunction();
1143 Function *Func = getAssociatedFunction();
1144 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1145 << "->" << Func->getName() << '\n');
1146 (void)Func;
1147
1148 const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1149 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1150 if (!CallerAA || !CallerAA->isValidState())
1151 return false;
1152
1153 ConstantRange Assumed = getAssumed();
1154 unsigned Min = std::max(Assumed.getLower().getZExtValue(),
1155 CallerAA->getAssumed().getLower().getZExtValue());
1156 unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
1157 CallerAA->getAssumed().getUpper().getZExtValue());
1158 ConstantRange Range(APInt(32, Min), APInt(32, Max));
1159 IntegerRangeState RangeState(Range);
1160 getState() = RangeState;
1161 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1162 : ChangeStatus::CHANGED;
1163
1164 return true;
1165 };
1166
1167 bool AllCallSitesKnown = true;
1168 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
1169 return indicatePessimisticFixpoint();
1170
1171 return Change;
1172 }
1173
1174 /// Create an abstract attribute view for the position \p IRP.
1175 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1176 Attributor &A);
1177
1178 ChangeStatus manifest(Attributor &A) override {
1179 Function *F = getAssociatedFunction();
1180 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1181 return emitAttributeIfNotDefaultAfterClamp(
1182 A, {1U, InfoCache.getMaxWavesPerEU(*F)});
1183 }
1184
1185 /// See AbstractAttribute::getName()
1186 StringRef getName() const override { return "AAAMDWavesPerEU"; }
1187
1188 /// See AbstractAttribute::getIdAddr()
1189 const char *getIdAddr() const override { return &ID; }
1190
1191 /// This function should return true if the type of the \p AA is
1192 /// AAAMDWavesPerEU
1193 static bool classof(const AbstractAttribute *AA) {
1194 return (AA->getIdAddr() == &ID);
1195 }
1196
1197 /// Unique ID (due to the unique address)
1198 static const char ID;
1199};
1200
1201const char AAAMDWavesPerEU::ID = 0;
1202
1203AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1204 Attributor &A) {
1205 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1206 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1207 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1208}
1209
1210static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
1211 for (const auto &CI : IA->ParseConstraints()) {
1212 for (StringRef Code : CI.Codes) {
1213 Code.consume_front("{");
1214 if (Code.starts_with("a"))
1215 return true;
1216 }
1217 }
1218
1219 return false;
1220}
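// Illustrative: constraint codes naming the AGPR class or a specific AGPR,
// e.g. "a", "{a0}", or the "=a" output form (whose parsed code is "a"), make
// this return true; VGPR/SGPR codes such as "v", "s" or "{v0}" do not.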
1221
1222// TODO: Migrate to range merge of amdgpu-agpr-alloc.
1223// FIXME: Why is this using Attribute::NoUnwind?
1224struct AAAMDGPUNoAGPR
1225 : public IRAttribute<Attribute::NoUnwind,
1226 StateWrapper<BooleanState, AbstractAttribute>,
1227 AAAMDGPUNoAGPR> {
1228 AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1229
1230 static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
1231 Attributor &A) {
1232 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1233 return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
1234 llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
1235 }
1236
1237 void initialize(Attributor &A) override {
1238 Function *F = getAssociatedFunction();
1239 auto [MinNumAGPR, MaxNumAGPR] =
1240 AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
1241 /*OnlyFirstRequired=*/true);
1242 if (MinNumAGPR == 0)
1243 indicateOptimisticFixpoint();
1244 }
1245
1246 const std::string getAsStr(Attributor *A) const override {
1247 return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
1248 }
1249
1250 void trackStatistics() const override {}
1251
1252 ChangeStatus updateImpl(Attributor &A) override {
1253 // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
1254
1255 auto CheckForNoAGPRs = [&](Instruction &I) {
1256 const auto &CB = cast<CallBase>(I);
1257 const Value *CalleeOp = CB.getCalledOperand();
1258 const Function *Callee = dyn_cast<Function>(CalleeOp);
1259 if (!Callee) {
1260 if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
1261 return !inlineAsmUsesAGPRs(IA);
1262 return false;
1263 }
1264
1265 // Some intrinsics may use AGPRs, but if we have a choice, we are not
1266 // required to use AGPRs.
1267 if (Callee->isIntrinsic())
1268 return true;
1269
1270 // TODO: Handle callsite attributes
1271 const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
1272 *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
1273 return CalleeInfo && CalleeInfo->isValidState() &&
1274 CalleeInfo->getAssumed();
1275 };
1276
1277 bool UsedAssumedInformation = false;
1278 if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
1279 UsedAssumedInformation))
1280 return indicatePessimisticFixpoint();
1281 return ChangeStatus::UNCHANGED;
1282 }
1283
1284 ChangeStatus manifest(Attributor &A) override {
1285 if (!getAssumed())
1286 return ChangeStatus::UNCHANGED;
1287 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1288 return A.manifestAttrs(getIRPosition(),
1289 {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
1290 }
1291
1292 StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
1293 const char *getIdAddr() const override { return &ID; }
1294
1295 /// This function should return true if the type of the \p AA is
1296 /// AAAMDGPUNoAGPRs
1297 static bool classof(const AbstractAttribute *AA) {
1298 return (AA->getIdAddr() == &ID);
1299 }
1300
1301 static const char ID;
1302};
1303
1304const char AAAMDGPUNoAGPR::ID = 0;
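// Illustrative: a function (including everything it transitively calls) that
// provably never touches AGPRs is manifested with "amdgpu-agpr-alloc"="0",
// which on subtargets where AGPRs and VGPRs share a register file (e.g.
// gfx90a) frees the accumulator budget for ordinary VGPR allocation.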
1305
1306/// An abstract attribute to propagate the function attribute
1307/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
1308struct AAAMDGPUClusterDims
1309 : public StateWrapper<BooleanState, AbstractAttribute> {
1310 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1311 AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1312
1313 /// Create an abstract attribute view for the position \p IRP.
1314 static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
1315 Attributor &A);
1316
1317 /// See AbstractAttribute::getName().
1318 StringRef getName() const override { return "AAAMDGPUClusterDims"; }
1319
1320 /// See AbstractAttribute::getIdAddr().
1321 const char *getIdAddr() const override { return &ID; }
1322
1323 /// This function should return true if the type of the \p AA is
1324 /// AAAMDGPUClusterDims.
1325 static bool classof(const AbstractAttribute *AA) {
1326 return AA->getIdAddr() == &ID;
1327 }
1328
1329 virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0;
1330
1331 /// Unique ID (due to the unique address)
1332 static const char ID;
1333};
1334
1335const char AAAMDGPUClusterDims::ID = 0;
1336
1337struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
1338 AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
1339 : AAAMDGPUClusterDims(IRP, A) {}
1340
1341 void initialize(Attributor &A) override {
1342 Function *F = getAssociatedFunction();
1343 assert(F && "empty associated function");
1344
1346
1347 // No matter what a kernel function has, it is final.
1348 if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1349 if (Attr.isUnknown())
1350 indicatePessimisticFixpoint();
1351 else
1352 indicateOptimisticFixpoint();
1353 }
1354 }
1355
1356 const std::string getAsStr(Attributor *A) const override {
1357 if (!getAssumed() || Attr.isUnknown())
1358 return "unknown";
1359 if (Attr.isNoCluster())
1360 return "no";
1361 if (Attr.isVariableDims())
1362 return "variable";
1363 return Attr.to_string();
1364 }
1365
1366 void trackStatistics() const override {}
1367
1368 ChangeStatus updateImpl(Attributor &A) override {
1369 auto OldState = Attr;
1370
1371 auto CheckCallSite = [&](AbstractCallSite CS) {
1372 const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
1373 *this, IRPosition::function(*CS.getInstruction()->getFunction()),
1374 DepClassTy::REQUIRED);
1375 if (!CallerAA || !CallerAA->isValidState())
1376 return false;
1377
1378 return merge(CallerAA->getClusterDims());
1379 };
1380
1381 bool UsedAssumedInformation = false;
1382 if (!A.checkForAllCallSites(CheckCallSite, *this,
1383 /*RequireAllCallSites=*/true,
1384 UsedAssumedInformation))
1385 return indicatePessimisticFixpoint();
1386
1387 return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1388 }
1389
1390 ChangeStatus manifest(Attributor &A) override {
1391 if (Attr.isUnknown())
1392 return ChangeStatus::UNCHANGED;
1393 return A.manifestAttrs(
1394 getIRPosition(),
1395 {Attribute::get(getAssociatedFunction()->getContext(), AttrName,
1396 Attr.to_string())},
1397 /*ForceReplace=*/true);
1398 }
1399
1400 const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
1401 return Attr;
1402 }
1403
1404private:
1405 bool merge(const AMDGPU::ClusterDimsAttr &Other) {
1406 // Case 1: Both of them are still unknown; we do nothing and keep waiting
1407 // for propagation.
1408 if (Attr.isUnknown() && Other.isUnknown())
1409 return true;
1410
1411 // Case 2: The other is determined, but we are unknown yet, we simply take
1412 // the other's value.
1413 if (Attr.isUnknown()) {
1414 Attr = Other;
1415 return true;
1416 }
1417
1418 // Case 3: We are determined but the other is unknown yet, we simply keep
1419 // everything unchanged.
1420 if (Other.isUnknown())
1421 return true;
1422
1423 // After this point, both are determined.
1424
1425 // Case 4: If they are same, we do nothing.
1426 if (Attr == Other)
1427 return true;
1428
1429 // Now they are not same.
1430
1431 // Case 5: Exactly one of us uses clusters (if neither did, case 4 would
1432 // hold), so it is unknown whether clusters will be used, and unlike case
1433 // 1 the state is final.
1434 if (Attr.isNoCluster() || Other.isNoCluster()) {
1435 Attr.setUnknown();
1436 return false;
1437 }
1438
1439 // Case 6: Both of us use clusters, but the dims differ, so clusters are
1440 // used but we do not have fixed dims.
1441 Attr.setVariableDims();
1442 return true;
1443 }
1444
1445 AMDGPU::ClusterDimsAttr Attr;
1446
1447 static constexpr const char AttrName[] = "amdgpu-cluster-dims";
1448};
1449
1450AAAMDGPUClusterDims &
1451AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
1452 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1453 return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A);
1454 llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
1455}
1456
1457static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1458 AMDGPUAttributorOptions Options,
1459 ThinOrFullLTOPhase LTOPhase) {
1460 SetVector<Function *> Functions;
1461 for (Function &F : M) {
1462 if (!F.isIntrinsic())
1463 Functions.insert(&F);
1464 }
1465
1466 CallGraphUpdater CGUpdater;
1467 BumpPtrAllocator Allocator;
1468 AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1469 DenseSet<const char *> Allowed(
1470 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1471 &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1472 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1473 &AACallEdges::ID, &AAPointerInfo::ID,
1474 &AAAddressSpace::ID, &AANoAliasAddrSpace::ID,
1475 &AAIndirectCallInfo::ID, &AAAMDGPUClusterDims::ID});
1476
1477 AttributorConfig AC(CGUpdater);
1478 AC.IsClosedWorldModule = Options.IsClosedWorld;
1479 AC.Allowed = &Allowed;
1480 AC.IsModulePass = true;
1481 AC.DefaultInitializeLiveInternals = false;
1482 AC.IndirectCalleeSpecializationCallback =
1483 [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1484 Function &Callee, unsigned NumAssumedCallees) {
1485 return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
1486 (NumAssumedCallees <= IndirectCallSpecializationThreshold);
1487 };
1488 AC.IPOAmendableCB = [](const Function &F) {
1489 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1490 };
1491
1492 Attributor A(Functions, InfoCache, AC);
1493
1494 LLVM_DEBUG({
1495 StringRef LTOPhaseStr = to_string(LTOPhase);
1496 dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
1497 << "[AMDGPUAttributor] Module " << M.getName() << " is "
1498 << (AC.IsClosedWorldModule ? "" : "not ")
1499 << "assumed to be a closed world.\n";
1500 });
1501
1502 for (auto *F : Functions) {
1503 A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
1504 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
1505 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
1506 A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
1507 CallingConv::ID CC = F->getCallingConv();
1508 if (!AMDGPU::isEntryFunctionCC(CC)) {
1509 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
1510 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
1511 }
1512
1513 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
1514 if (!F->isDeclaration() && ST.hasClusters())
1515 A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
1516
1517 for (auto &I : instructions(F)) {
1518 Value *Ptr = nullptr;
1519 if (auto *LI = dyn_cast<LoadInst>(&I))
1520 Ptr = LI->getPointerOperand();
1521 else if (auto *SI = dyn_cast<StoreInst>(&I))
1522 Ptr = SI->getPointerOperand();
1523 else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
1524 Ptr = RMW->getPointerOperand();
1525 else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
1526 Ptr = CmpX->getPointerOperand();
1527
1528 if (Ptr) {
1529 A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
1530 A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
1531 }
1532 }
1533 }
1534
1535 return A.run() == ChangeStatus::CHANGED;
1536}
1537} // namespace
1538
1539PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1540 ModuleAnalysisManager &AM) {
1541
1542 FunctionAnalysisManager &FAM =
1543 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1544 AnalysisGetter AG(FAM);
1545
1546 // TODO: Probably preserves CFG
1547 return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
1548 : PreservedAnalyses::all();
1549}
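// Usage sketch (pass name as registered by the AMDGPU target, assumed here):
//   opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -S in.ll -o out.ll
// runs the attributor standalone; the regular codegen pipeline schedules it as
// a module pass before instruction selection.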