LLVM 22.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
12/// code. When passed an MCAsmStreamer it prints assembly, and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
44#include "llvm/MC/MCValue.h"
51
52using namespace llvm;
53using namespace llvm::AMDGPU;
54
55// This should get the default rounding mode from the kernel. We just set the
56// default here, but this could change if the OpenCL rounding mode pragmas are
57// used.
58//
59// The denormal mode here should match what is reported by the OpenCL runtime
60// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62//
63// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64// precision, and leaves single precision to flush all and does not report
65// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66// CL_FP_DENORM for both.
67//
68// FIXME: It seems some instructions do not support single precision denormals
69// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
70// and sin_f32, cos_f32 on most parts).
71
72// We want to use these instructions, and using fp32 denormals also causes
73// instructions to run at the double precision rate for the device so it's
74// probably best to just report no single precision denormals.
78 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
79 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
80}
81
82static AsmPrinter *
84 std::unique_ptr<MCStreamer> &&Streamer) {
85 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
86}
87
94}
95
97 std::unique_ptr<MCStreamer> Streamer)
98 : AsmPrinter(TM, std::move(Streamer)) {
99 assert(OutStreamer && "AsmPrinter constructed without streamer");
100}
101
103 return "AMDGPU Assembly Printer";
104}
105
107 return TM.getMCSubtargetInfo();
108}
109
111 if (!OutStreamer)
112 return nullptr;
113 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
114}
115
118}
119
// Prepares the target streamer for module \p M: makes sure a target ID has
// been set up (via initializeTargetID) and, further down, starts the HSA
// metadata stream with that target ID.
// NOTE(review): several statements of this function are absent from this
// listing, so the conditions guarding the early return and the metadata
// set-up cannot be confirmed from here.
120void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
122
123 // TODO: Which one is called first, emitStartOfAsmFile or
124 // emitFunctionBodyStart?
// Only initialize the target ID when a target streamer exists and it does not
// already carry one.
125 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126 initializeTargetID(M);
127
130 return;
131
133
136 CodeObjectVersion);
// Start HSA metadata emission for this module using the streamer's target ID.
137 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
138 }
139
142}
143
145 // Init target streamer if it has not yet happened
147 initTargetStreamer(M);
148
151
152 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
153 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
155 HSAMetadataStream->end();
156 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
157 (void)Success;
158 assert(Success && "Malformed HSA Metadata");
159 }
160}
161
164 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165 const Function &F = MF->getFunction();
166
167 // TODO: We're checking this late, would be nice to check it earlier.
168 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
170 STM.getCPU() + " is only available on code object version 6 or better");
171 }
172
173 // TODO: Which one is called first, emitStartOfAsmFile or
174 // emitFunctionBodyStart?
175 if (!getTargetStreamer()->getTargetID())
176 initializeTargetID(*F.getParent());
177
178 const auto &FunctionTargetID = STM.getTargetID();
179 // Make sure function's xnack settings are compatible with module's
180 // xnack settings.
181 if (FunctionTargetID.isXnackSupported() &&
182 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
185 "' function does not match module xnack setting");
186 return;
187 }
188 // Make sure function's sramecc settings are compatible with module's
189 // sramecc settings.
190 if (FunctionTargetID.isSramEccSupported() &&
191 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
194 "' function does not match module sramecc setting");
195 return;
196 }
197
198 if (!MFI.isEntryFunction())
199 return;
200
201 if (STM.isMesaKernel(F) &&
202 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
203 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204 AMDGPUMCKernelCodeT KernelCode;
205 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
206 KernelCode.validate(&STM, MF->getContext());
208 }
209
210 if (STM.isAmdHsaOS())
211 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
212}
213
216 if (!MFI.isEntryFunction())
217 return;
218
220 return;
221
222 auto &Streamer = getTargetStreamer()->getStreamer();
223 auto &Context = Streamer.getContext();
224 auto &ObjectFileInfo = *Context.getObjectFileInfo();
225 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227 Streamer.pushSection();
228 Streamer.switchSection(&ReadOnlySection);
229
230 // CP microcode requires the kernel descriptor to be allocated on 64 byte
231 // alignment.
232 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
233 ReadOnlySection.ensureMinAlignment(Align(64));
234
235 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237 SmallString<128> KernelName;
238 getNameWithPrefix(KernelName, &MF->getFunction());
240 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
241 CurrentProgramInfo.NumVGPRsForWavesPerEU,
243 CurrentProgramInfo.NumSGPRsForWavesPerEU,
245 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
247 Context),
248 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
249
250 Streamer.popSection();
251}
252
254 Register RegNo = MI->getOperand(0).getReg();
255
258 OS << "implicit-def: "
259 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(OS.str());
265 OutStreamer->addBlankLine();
266}
267
271 return;
272 }
273
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
277 SmallString<128> SymbolName;
278 getNameWithPrefix(SymbolName, &MF->getFunction()),
280 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
286 HexLines.emplace_back("");
287 }
288
290}
291
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
299 HexLines.emplace_back("");
300 }
302}
303
306 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
308 Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getDataLayout();
326 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
327 Align Alignment = GV->getAlign().value_or(Align(4));
328
329 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto *TS = getTargetStreamer();
332 TS->emitAMDGPULDS(GVSym, Size, Alignment);
333 return;
334 }
335
337}
338
340 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
343 switch (CodeObjectVersion) {
345 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346 break;
348 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349 break;
351 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352 break;
353 default:
354 reportFatalUsageError("unsupported code object version");
355 }
356 }
357
359}
360
// Checks the resource-usage symbols recorded for \p F (looked up through
// RI.getSymbol) against the subtarget limits and emits diagnostics through
// F.getContext() when a limit is exceeded. Each check is only performed when
// the corresponding symbol is a variable whose value evaluates to an absolute
// constant. The checks cover the private segment (scratch) size, the number of
// SGPRs before and after adding the implicit extra SGPRs, and the occupancy
// requested through the "amdgpu-waves-per-eu" attribute.
361void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
// Declarations and functions without a module-entry calling convention are
// not validated.
362 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
363 return;
364
367 MCSymbol *FnSym = TM.getSymbol(&F);
368 bool IsLocal = F.hasLocalLinkage();
369
// Evaluates Value as an absolute constant. On success stores the value in Res
// and returns true; otherwise returns false and leaves Res untouched.
370 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
371 int64_t Val;
372 if (Value->evaluateAsAbsolute(Val)) {
373 Res = Val;
374 return true;
375 }
376 return false;
377 };
378
// Scratch check: diagnose (DS_Error) when the resolved private segment size
// exceeds MaxScratchPerWorkitem. Validation continues afterwards.
379 const uint64_t MaxScratchPerWorkitem =
381 MCSymbol *ScratchSizeSymbol = RI.getSymbol(
382 FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
383 uint64_t ScratchSize;
384 if (ScratchSizeSymbol->isVariable() &&
385 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
386 ScratchSize > MaxScratchPerWorkitem) {
387 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
388 DS_Error);
389 F.getContext().diagnose(DiagStackSize);
390 }
391
392 // Validate addressable scalar registers (i.e., prior to added implicit
393 // SGPRs).
394 MCSymbol *NumSGPRSymbol =
395 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
397 !STM.hasSGPRInitBug()) {
398 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
399 uint64_t NumSgpr;
400 if (NumSGPRSymbol->isVariable() &&
401 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
402 NumSgpr > MaxAddressableNumSGPRs) {
403 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
404 NumSgpr, MaxAddressableNumSGPRs,
406 F.getContext().diagnose(Diag);
// A violation here ends validation of this function.
407 return;
408 }
409 }
410
411 MCSymbol *VCCUsedSymbol =
412 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
413 MCSymbol *FlatUsedSymbol = RI.getSymbol(
414 FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
415 uint64_t VCCUsed, FlatUsed, NumSgpr;
416
// The remaining checks need the SGPR count plus the VCC / flat-scratch usage
// flags, so they only run when all three symbols resolve to constants.
417 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
418 FlatUsedSymbol->isVariable() &&
419 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
420 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
421 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
422
423 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
424 // resolvable.
425 NumSgpr += IsaInfo::getNumExtraSGPRs(
426 &STM, VCCUsed, FlatUsed,
427 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
429 STM.hasSGPRInitBug()) {
430 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
431 if (NumSgpr > MaxAddressableNumSGPRs) {
432 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
433 MaxAddressableNumSGPRs, DS_Error,
435 F.getContext().diagnose(Diag);
436 return;
437 }
438 }
439
440 MCSymbol *NumVgprSymbol =
441 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
442 MCSymbol *NumAgprSymbol =
443 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
444 uint64_t NumVgpr, NumAgpr;
445
447 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
// Occupancy check: requires a machine function for F and resolvable VGPR and
// AGPR counts.
449 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
450 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
451 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
453 unsigned MaxWaves = MFI.getMaxWavesPerEU();
454 uint64_t TotalNumVgpr =
455 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
// Both per-wave register counts are clamped to be at least 1 and at least the
// subtarget minimum for MaxWaves.
456 uint64_t NumVGPRsForWavesPerEU =
457 std::max({TotalNumVgpr, (uint64_t)1,
459 MaxWaves, MFI.getDynamicVGPRBlockSize())});
460 uint64_t NumSGPRsForWavesPerEU = std::max(
461 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
462 const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
463 STM.getOccupancyWithWorkGroupSizes(*MF).second,
464 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
465 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
467 uint64_t Occupancy;
468
// Read the requested waves-per-EU range from the function attribute, with
// {0, 0} passed as the default pair.
469 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
470 F, "amdgpu-waves-per-eu", {0, 0}, true);
471
// Diagnose when the achieved occupancy is below the requested minimum.
472 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
474 F, F.getSubprogram(),
475 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
476 "'" +
477 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
478 ", final occupancy is " + Twine(Occupancy));
479 F.getContext().diagnose(Diag);
480 return;
481 }
482 }
483 }
484}
485
487 // Pad with s_code_end to help tools and guard against instruction prefetch
488 // causing stale data in caches. Arguably this should be done by the linker,
489 // which is why this isn't done for Mesa.
490 // Don't do it if there is no code.
491 const MCSubtargetInfo &STI = *getGlobalSTI();
492 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
496 if (TextSect->hasInstructions()) {
497 OutStreamer->switchSection(TextSect);
499 }
500 }
501
502 // Assign expressions which can only be resolved when all other functions are
503 // known.
505
506 // Switch section and emit all GPR maximums within the processed module.
507 OutStreamer->pushSection();
508 MCSectionELF *MaxGPRSection =
509 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
510 OutStreamer->switchSection(MaxGPRSection);
514 OutStreamer->popSection();
515
516 for (Function &F : M.functions())
517 validateMCResourceInfo(F);
518
519 RI.reset();
520
522}
523
// Returns the textual form of \p Value. The expression is first passed through
// foldAMDGPUMCExpr (using the MCContext of the target streamer's MCStreamer)
// and the folded expression is then printed with printAMDGPUMCExpr into the
// returned string.
524SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
// OSS writes into Str, which is the buffer returned to the caller.
526 raw_svector_ostream OSS(Str);
527 auto &Streamer = getTargetStreamer()->getStreamer();
528 auto &Context = Streamer.getContext();
// Print the folded expression rather than the original one.
529 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
530 printAMDGPUMCExpr(New, OSS, MAI);
531 return Str;
532}
533
534// Print comments that apply to both callable functions and entry points.
535void AMDGPUAsmPrinter::emitCommonFunctionComments(
536 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
537 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
538 const AMDGPUMachineFunction *MFI) {
539 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
540 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
541 false);
542 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
543 if (NumAGPR && TotalNumVGPR) {
544 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
545 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
546 false);
547 }
548 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
549 false);
550 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
551 false);
552}
553
// Builds the kernel code properties expression for \p MF.
//
// The statically known part is accumulated in a 16-bit mask: one
// amdhsa::KERNEL_CODE_PROPERTY_* bit for every user SGPR that UserSGPRInfo
// reports as present, plus the wave32 and CU-stores bits when the subtarget
// queries below return true. The dynamic-stack bit comes from
// CurrentProgramInfo.DynamicCallStack, which is an MCExpr, so it is shifted
// into position and OR-ed in at the expression level.
//
// \returns an MCExpr that ORs the constant mask with the shifted
// DynamicCallStack expression.
554const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
555 const MachineFunction &MF) const {
557 MCContext &Ctx = MF.getContext();
558 uint16_t KernelCodeProperties = 0;
559 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
561
// One ENABLE_SGPR_* bit per user SGPR kind that is in use.
562 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
563 KernelCodeProperties |=
564 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
565 }
566 if (UserSGPRInfo.hasDispatchPtr()) {
567 KernelCodeProperties |=
568 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
569 }
570 if (UserSGPRInfo.hasQueuePtr()) {
571 KernelCodeProperties |=
572 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
573 }
574 if (UserSGPRInfo.hasKernargSegmentPtr()) {
575 KernelCodeProperties |=
576 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
577 }
578 if (UserSGPRInfo.hasDispatchID()) {
579 KernelCodeProperties |=
580 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
581 }
582 if (UserSGPRInfo.hasFlatScratchInit()) {
583 KernelCodeProperties |=
584 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
585 }
586 if (UserSGPRInfo.hasPrivateSegmentSize()) {
587 KernelCodeProperties |=
588 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
589 }
// Subtarget-dependent bits.
590 if (ST.isWave32()) {
591 KernelCodeProperties |=
592 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
593 }
594 if (isGFX1250(ST) && ST.hasCUStores()) {
595 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
596 }
597
598 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
599 // un-evaluatable at this point so it cannot be conditionally checked here.
600 // Instead, we'll directly shift the possibly unknown MCExpr into its place
601 // and bitwise-or it into KernelCodeProperties.
602 const MCExpr *KernelCodePropExpr =
603 MCConstantExpr::create(KernelCodeProperties, Ctx);
// OrValue first holds the shift amount, then the shifted DynamicCallStack.
604 const MCExpr *OrValue = MCConstantExpr::create(
605 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
606 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
607 OrValue, Ctx);
608 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
609
610 return KernelCodePropExpr;
611}
612
614AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
615 const SIProgramInfo &PI) const {
617 const Function &F = MF.getFunction();
619 MCContext &Ctx = MF.getContext();
620
621 MCKernelDescriptor KernelDescriptor;
622
623 KernelDescriptor.group_segment_fixed_size =
625 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
626
627 Align MaxKernArgAlign;
628 KernelDescriptor.kernarg_size = MCConstantExpr::create(
629 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
630
631 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
632 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
633 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
634
635 int64_t PGRM_Rsrc3 = 1;
636 bool EvaluatableRsrc3 =
637 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
638 (void)PGRM_Rsrc3;
639 (void)EvaluatableRsrc3;
641 STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 ||
642 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
643 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
644
645 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
646 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
647 Ctx);
648
649 return KernelDescriptor;
650}
651
653 // Init target streamer lazily on the first function so that previous passes
654 // can set metadata.
656 initTargetStreamer(*MF.getFunction().getParent());
657
658 ResourceUsage =
659 &getAnalysis<AMDGPUResourceUsageAnalysisWrapperPass>().getResourceInfo();
660 CurrentProgramInfo.reset(MF);
661
663 MCContext &Ctx = MF.getContext();
664
665 // The starting address of all shader programs must be 256 bytes aligned.
666 // Regular functions just need the basic required instruction alignment.
667 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
668
670
673 bool IsLocal = MF.getFunction().hasLocalLinkage();
674 // FIXME: This should be an explicit check for Mesa.
675 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
676 MCSectionELF *ConfigSection =
677 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
678 OutStreamer->switchSection(ConfigSection);
679 }
680
681 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
682
683 if (MFI->isModuleEntryFunction()) {
684 getSIProgramInfo(CurrentProgramInfo, MF);
685 }
686
687 if (STM.isAmdPalOS()) {
688 if (MFI->isEntryFunction())
689 EmitPALMetadata(MF, CurrentProgramInfo);
690 else if (MFI->isModuleEntryFunction())
691 emitPALFunctionMetadata(MF);
692 } else if (!STM.isAmdHsaOS()) {
693 EmitProgramInfoSI(MF, CurrentProgramInfo);
694 }
695
696 DumpCodeInstEmitter = nullptr;
697 if (STM.dumpCode()) {
698 // For -dumpcode, get the assembler out of the streamer. This only works
699 // with -filetype=obj.
700 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
701 if (Assembler)
702 DumpCodeInstEmitter = Assembler->getEmitterPtr();
703 }
704
705 DisasmLines.clear();
706 HexLines.clear();
708
710
711 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
712 STM.hasMAIInsts());
713
714 {
717 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
718 IsLocal),
719 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
720 IsLocal),
721 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
722 IsLocal),
723 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
724 OutContext, IsLocal),
725 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
726 OutContext, IsLocal),
727 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
728 IsLocal),
729 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
730 OutContext, IsLocal),
731 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
732 OutContext, IsLocal),
733 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
734 IsLocal),
735 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
736 OutContext, IsLocal));
737 }
738
739 // Emit _dvgpr$ symbol when appropriate.
740 emitDVgprSymbol(MF);
741
742 if (isVerbose()) {
743 MCSectionELF *CommentSection =
744 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
745 OutStreamer->switchSection(CommentSection);
746
747 if (!MFI->isEntryFunction()) {
749 OutStreamer->emitRawComment(" Function info:", false);
750
751 emitCommonFunctionComments(
752 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
753 IsLocal)
755 STM.hasMAIInsts()
756 ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
757 OutContext, IsLocal)
759 : nullptr,
760 RI.createTotalNumVGPRs(MF, Ctx),
762 MF,
764 Ctx),
765 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
766 OutContext, IsLocal)
768 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
769 return false;
770 }
771
772 OutStreamer->emitRawComment(" Kernel info:", false);
773 emitCommonFunctionComments(
774 CurrentProgramInfo.NumArchVGPR,
775 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
776 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
777 CurrentProgramInfo.ScratchSize,
778 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
779
780 OutStreamer->emitRawComment(
781 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
782 OutStreamer->emitRawComment(
783 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
784 OutStreamer->emitRawComment(
785 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
786 " bytes/workgroup (compile time only)", false);
787
788 OutStreamer->emitRawComment(
789 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
790
791 OutStreamer->emitRawComment(
792 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
793
794 OutStreamer->emitRawComment(
795 " NumSGPRsForWavesPerEU: " +
796 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
797 false);
798 OutStreamer->emitRawComment(
799 " NumVGPRsForWavesPerEU: " +
800 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
801 false);
802
803 if (STM.hasGFX90AInsts()) {
804 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
805 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
806 AdjustedAccum = MCBinaryExpr::createMul(
807 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
808 OutStreamer->emitRawComment(
809 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
810 }
811
812 if (AMDGPU::isGFX1250(STM))
813 OutStreamer->emitRawComment(
814 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
815 false);
816
817 OutStreamer->emitRawComment(
818 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
819
820 OutStreamer->emitRawComment(
821 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
822
823 OutStreamer->emitRawComment(
824 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
825 getMCExprStr(CurrentProgramInfo.ScratchEnable),
826 false);
827 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
828 Twine(CurrentProgramInfo.UserSGPR),
829 false);
830 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
831 Twine(CurrentProgramInfo.TrapHandlerEnable),
832 false);
833 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
834 Twine(CurrentProgramInfo.TGIdXEnable),
835 false);
836 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
837 Twine(CurrentProgramInfo.TGIdYEnable),
838 false);
839 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
840 Twine(CurrentProgramInfo.TGIdZEnable),
841 false);
842 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
843 Twine(CurrentProgramInfo.TIdIGCompCount),
844 false);
845
846 [[maybe_unused]] int64_t PGMRSrc3;
848 STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) ||
849 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
850 static_cast<uint64_t>(PGMRSrc3) == 0));
851 if (STM.hasGFX90AInsts()) {
852 OutStreamer->emitRawComment(
853 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
854 getMCExprStr(MCKernelDescriptor::bits_get(
855 CurrentProgramInfo.ComputePGMRSrc3,
856 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
857 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
858 false);
859 OutStreamer->emitRawComment(
860 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
861 getMCExprStr(MCKernelDescriptor::bits_get(
862 CurrentProgramInfo.ComputePGMRSrc3,
863 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
864 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
865 false);
866 }
867 }
868
869 if (DumpCodeInstEmitter) {
870
871 OutStreamer->switchSection(
872 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
873
874 for (size_t i = 0; i < DisasmLines.size(); ++i) {
875 std::string Comment = "\n";
876 if (!HexLines[i].empty()) {
877 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
878 Comment += " ; " + HexLines[i] + "\n";
879 }
880
881 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
882 OutStreamer->emitBytes(StringRef(Comment));
883 }
884 }
885
886 return false;
887}
888
889// When appropriate, add a _dvgpr$ symbol, with the value of the function
890// symbol, plus an offset encoding one less than the number of VGPR blocks used
891// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
892// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
893// used by a front-end to have functions that are chained rather than called,
894// and a dispatcher that dynamically resizes the VGPR count before dispatching
895// to a function.
//
// The new symbol is named "_dvgpr$" followed by the current function symbol's
// name and is given the visibility and linkage of the function.
896void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
898 if (MFI.isDynamicVGPREnabled() &&
900 MCContext &Ctx = MF.getContext();
901 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
// The VGPR count must already resolve to an absolute constant at this point;
// anything else is treated as unreachable.
902 MCValue NumVGPRs;
903 if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
904 NumVGPRs, nullptr) ||
905 !NumVGPRs.isAbsolute()) {
906 llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
907 }
908 // Calculate number of VGPR blocks.
909 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
910 unsigned NumBlocks =
911 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
912
// NumBlocks - 1 has to fit into the three bits 5..3, so at most 8 blocks can
// be encoded; bail out without emitting the symbol otherwise.
913 if (NumBlocks > 8) {
915 "too many DVGPR blocks for _dvgpr$ symbol for '" +
916 Twine(CurrentFnSym->getName()) + "'");
917 return;
918 }
// Place (NumBlocks - 1) at bit 3 and up.
919 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
920 // Add to function symbol to create _dvgpr$ symbol.
921 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
923 MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
924 MCSymbol *DVgprFuncSym =
925 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
926 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
927 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
928 emitLinkage(&MF.getFunction(), DVgprFuncSym);
929 }
930}
931
932// TODO: Fold this into emitFunctionBodyStart.
//
// Sets up the target streamer's target ID for module \p M. The ID is first
// seeded from the global subtarget's feature string. For a non-empty module,
// any xnack or sramecc setting that is supported but still 'Any' is then
// replaced by the setting taken from STMTargetID while walking the module's
// functions; the walk stops once no supported feature is left undecided.
933void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
934 // In the beginning all features are either 'Any' or 'NotSupported',
935 // depending on global target features. This will cover empty modules.
937 getGlobalSTI()->getFeatureString());
938
939 // If module is empty, we are done.
940 if (M.empty())
941 return;
942
943 // If module is not empty, need to find first 'Off' or 'On' feature
944 // setting per feature from functions in module.
945 for (auto &F : M) {
946 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Stop as soon as every supported feature has a definite On/Off setting.
947 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
948 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
949 break;
950
// NOTE(review): STM is declared on a line not present in this listing;
// presumably it is the subtarget of F -- confirm against the full source.
952 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only settings that are still 'Any' are overwritten.
953 if (TSTargetID->isXnackSupported())
954 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
955 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
956 if (TSTargetID->isSramEccSupported())
957 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
958 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
959 }
960}
961
962// AccumOffset computed for the MCExpr equivalent of:
963// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
964static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
965 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
966 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
967
968 // Can't be lower than 1 for subsequent alignTo.
969 const MCExpr *MaximumTaken =
970 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
971
972 // Practically, it's computing divideCeil(MaximumTaken, 4).
973 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
974 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
975 Ctx);
976
977 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
978}
979
// Fills ProgInfo for the machine function MF.
//
// Register counts, scratch size and VCC / flat-scratch usage are taken as
// references to per-function resource-info symbols (see GetSymRefExpr below),
// so most derived fields are built as MCExprs instead of plain integers.
// Values are only inspected eagerly, for diagnostics, when the expression
// already evaluates to an absolute constant (see TryGetMCExprValue).
//
// NOTE(review): the embedded line numbering in this listing has gaps (e.g.
// 981 -> 983, 1005 -> 1007, 1036 -> 1038), so several source lines are
// missing here, including the declarations of STM, MFI and RIK and the
// opening lines of some statements. Confirm against the complete file.
980void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
981 const MachineFunction &MF) {
983 bool IsLocal = MF.getFunction().hasLocalLinkage();
984 MCContext &Ctx = MF.getContext();
985
// Wraps an integer in an MCConstantExpr created in Ctx.
986 auto CreateExpr = [&Ctx](int64_t Value) {
987 return MCConstantExpr::create(Value, Ctx);
988 };
989
// Tries to evaluate Value as an absolute constant. On success the value is
// stored in Res and true is returned; otherwise Res is left untouched.
990 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
991 int64_t Val;
992 if (Value->evaluateAsAbsolute(Val)) {
993 Res = Val;
994 return true;
995 }
996 return false;
997 };
998
// Returns a reference to the current function's resource-info symbol of the
// requested kind.
999 auto GetSymRefExpr =
1000 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1001 MCSymbol *Sym =
1002 RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
1003 return MCSymbolRefExpr::create(Sym, Ctx);
1004 };
1005
1007 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1008 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1010 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1011
1012 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1013 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1014 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1015 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1016 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1017 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
1018 ProgInfo.DynamicCallStack =
1019 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1020 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1021
// NamedBarCnt = alignTo(NumNamedBarrier, 4) / 4, i.e. the named-barrier count
// rounded up to whole groups of four.
1022 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1023 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1024 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1025 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1026
1028
1029 // The calculations related to SGPR/VGPR blocks are
1030 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1031 // unified.
1032 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1033 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1034 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1035
1036 // Check the addressable register limit before we add ExtraSGPRs.
1038 !STM.hasSGPRInitBug()) {
1039 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1040 uint64_t NumSgpr;
1041 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1042 NumSgpr > MaxAddressableNumSGPRs) {
1043 // This can happen due to a compiler bug or when using inline asm.
1046 MF.getFunction(), "addressable scalar registers", NumSgpr,
1047 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
1048 Ctx.diagnose(Diag);
// After reporting, fall back to one below the addressable limit.
1049 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1050 }
1051 }
1052
1053 // Account for extra SGPRs and VGPRs reserved for debugger use.
1054 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1055
1056 const Function &F = MF.getFunction();
1057
1058 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1059 // dispatch registers as function args.
1060 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1061 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1062
1063 if (WaveDispatchNumSGPR) {
1065 {ProgInfo.NumSGPR,
1066 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1067 Ctx)},
1068 Ctx);
1069 }
1070
1071 if (WaveDispatchNumVGPR) {
1073 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1074
1076 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1077 }
1078
1079 // Adjust number of registers used to meet default/requested minimum/maximum
1080 // number of waves per execution unit request.
1081 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1082 ProgInfo.NumSGPRsForWavesPerEU =
1083 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1084 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1085 Ctx);
1086 ProgInfo.NumVGPRsForWavesPerEU =
1087 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1088 CreateExpr(STM.getMinNumVGPRs(
1089 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1090 Ctx);
1091
1093 STM.hasSGPRInitBug()) {
1094 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1095 uint64_t NumSgpr;
1096 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1097 NumSgpr > MaxAddressableNumSGPRs) {
1098 // This can happen due to a compiler bug or when using inline asm to use
1099 // the registers which are usually reserved for vcc etc.
1101 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1102 NumSgpr, MaxAddressableNumSGPRs,
1104 Ctx.diagnose(Diag);
// After reporting, clamp both SGPR counts to the addressable limit.
1105 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1106 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1107 }
1108 }
1109
1110 if (STM.hasSGPRInitBug()) {
1111 ProgInfo.NumSGPR =
1113 ProgInfo.NumSGPRsForWavesPerEU =
1115 }
1116
1117 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1119 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1120 MFI->getNumUserSGPRs(),
1122 Ctx.diagnose(Diag);
1123 }
1124
1125 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1128 MF.getFunction(), "local memory", MFI->getLDSSize(),
1130 Ctx.diagnose(Diag);
1131 }
1132 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1133 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1134 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1135 unsigned Granule) {
1136 const MCExpr *OneConst = CreateExpr(1ul);
1137 const MCExpr *GranuleConst = CreateExpr(Granule);
1138 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1139 const MCExpr *AlignToGPR =
1140 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1141 const MCExpr *DivGPR =
1142 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1143 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1144 return SubGPR;
1145 };
1146 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1148 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1149 } else {
1150 ProgInfo.SGPRBlocks = GetNumGPRBlocks(
1152 }
1153 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1155
1156 const SIModeRegisterDefaults Mode = MFI->getMode();
1157
1158 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1159 // register.
1160 ProgInfo.FloatMode = getFPMode(Mode);
1161
1162 ProgInfo.IEEEMode = Mode.IEEE;
1163
1164 // Make the clamp modifier return 0 on NaN input.
1165 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1166
// LDSAlignShift is log2 of the LDS allocation granule used to round LDSSize
// up to whole blocks further below.
1167 unsigned LDSAlignShift;
1168 if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
1169 // LDS is allocated in 256 dword blocks.
1170 LDSAlignShift = 10;
1171 } else if (STM.getFeatureBits().test(
1172 FeatureAddressableLocalMemorySize163840)) {
1173 // LDS is allocated in 320 dword blocks.
// NOTE(review): by the convention of the sibling branches (shift 10 <-> 256
// dwords, shift 9 <-> 128 dwords) a shift of 11 corresponds to 512 dwords,
// not 320; confirm which of the comment and the shift is intended.
1174 LDSAlignShift = 11;
1175 } else if (STM.getFeatureBits().test(
1176 FeatureAddressableLocalMemorySize65536)) {
1177 // LDS is allocated in 128 dword blocks.
1178 LDSAlignShift = 9;
1179 } else {
1180 // LDS is allocated in 64 dword blocks.
1181 LDSAlignShift = 8;
1182 }
1183
1184 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1185 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1186
1187 ProgInfo.LDSSize = MFI->getLDSSize();
// Number of allocation granules needed to hold LDSSize (rounded up).
1188 ProgInfo.LDSBlocks =
1189 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1190
1191 // The MCExpr equivalent of divideCeil.
1192 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1193 const MCExpr *Ceil =
1194 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1195 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1196 };
1197
1198 // Scratch is allocated in 64-dword or 256-dword blocks.
1199 unsigned ScratchAlignShift =
1200 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1201 // We need to program the hardware with the amount of scratch memory that
1202 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1203 // scratch memory used per thread.
1204 ProgInfo.ScratchBlocks = DivideCeil(
1206 CreateExpr(STM.getWavefrontSize()), Ctx),
1207 CreateExpr(1ULL << ScratchAlignShift));
1208
1209 if (STM.supportsWGP()) {
1210 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1211 }
1212
1213 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1214 ProgInfo.MemOrdered = 1;
1215 ProgInfo.FwdProgress = 1;
1216 }
1217
1218 // 0 = X, 1 = XY, 2 = XYZ
1219 unsigned TIDIGCompCnt = 0;
1220 if (MFI->hasWorkItemIDZ())
1221 TIDIGCompCnt = 2;
1222 else if (MFI->hasWorkItemIDY())
1223 TIDIGCompCnt = 1;
1224
1225 // The private segment wave byte offset is the last of the system SGPRs. We
1226 // initially assumed it was allocated, and may have used it. It shouldn't harm
1227 // anything to disable it if we know the stack isn't used here. We may still
1228 // have emitted code reading it to initialize scratch, but if that's unused
1229 // reading garbage should be OK.
1232 MCConstantExpr::create(0, Ctx), Ctx),
1233 ProgInfo.DynamicCallStack, Ctx);
1234
1235 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1236 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1237 ProgInfo.TrapHandlerEnable =
1238 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1239 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1240 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1241 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1242 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1243 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1244 ProgInfo.EXCPEnMSB = 0;
1245 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1246 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1247 ProgInfo.EXCPEnable = 0;
1248
1249 // return ((Dst & ~Mask) | (Value << Shift))
1250 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1251 uint32_t Shift) {
1252 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1253 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1254 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1256 Ctx);
1257 return Dst;
1258 };
1259
// COMPUTE_PGM_RSRC3 is assembled field by field; each SetBits call replaces
// one bit-field of the expression built so far.
1260 if (STM.hasGFX90AInsts()) {
1261 ProgInfo.ComputePGMRSrc3 =
1262 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1263 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1264 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1265 ProgInfo.ComputePGMRSrc3 =
1266 SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1267 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1268 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1269 }
1270
1271 if (AMDGPU::isGFX1250(STM))
1272 ProgInfo.ComputePGMRSrc3 =
1273 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1274 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1275 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1276
1278 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1280 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1281
// Report when the resolved occupancy falls below the minimum requested via
// the "amdgpu-waves-per-eu" attribute. The check is skipped when the
// occupancy expression cannot be evaluated to a constant yet.
1282 const auto [MinWEU, MaxWEU] =
1283 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1284 uint64_t Occupancy;
1285 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1287 F, F.getSubprogram(),
1288 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1289 "'" +
1290 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1291 ", final occupancy is " + Twine(Occupancy));
1292 F.getContext().diagnose(Diag);
1293 }
1294
// GFX11+: derive the INST_PREF_SIZE field from a lower bound on the function's
// code size. The size is saturated to 32 bits, converted to 128-byte units
// (rounded up) and then saturated to the largest value the field can hold.
1295 if (isGFX11Plus(STM)) {
1296 uint32_t CodeSizeInBytes = (uint32_t)std::min(
1297 ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1298 (uint64_t)std::numeric_limits<uint32_t>::max());
1299 uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1300 uint32_t Field, Shift, Width;
1301 if (isGFX11(STM)) {
1302 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1303 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1304 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1305 } else {
1306 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1307 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1308 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1309 }
1310 uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1311 ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1312 CreateExpr(InstPrefSize), Field, Shift);
1313 }
1314}
1315
// Maps a calling convention to an unsigned register identifier; the result is
// consumed as `RsrcReg` by EmitProgramInfoSI.
// NOTE(review): the individual `case` labels and return statements of the
// switch (listing lines 1319-1325) are missing from this listing; only the
// `default` label, which falls through to the first case, is visible.
1316static unsigned getRsrcReg(CallingConv::ID CallConv) {
1317 switch (CallConv) {
1318 default: [[fallthrough]];
1326 }
1327}
1328
// Writes the program-info register values for MF to OutStreamer as 32-bit
// words.
//
// Fields held as MCExprs are emitted as plain integers when they already
// evaluate to an absolute value and as expressions otherwise (see
// EmitResolvedOrExpr). The function ends by emitting the spilled SGPR/VGPR
// counts, each preceded by its R_SPILLED_* identifier.
//
// NOTE(review): the embedded line numbering has gaps (e.g. 1330 -> 1332,
// 1351 -> 1354), so several lines are missing from this listing, including the
// declaration of MFI and the conditions opening the first branches. Confirm
// against the complete file.
1329void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1330 const SIProgramInfo &CurrentProgramInfo) {
1332 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1333 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1334 MCContext &Ctx = MF.getContext();
1335
1336 // (((Value) & Mask) << Shift)
1337 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1338 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1339 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1341 shft, Ctx);
1342 };
1343
// Emits Value as a Size-byte integer when it folds to a constant, otherwise as
// a symbolic expression of the same size.
1344 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1345 int64_t Val;
1346 if (Value->evaluateAsAbsolute(Val))
1347 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1348 else
1349 OutStreamer->emitValue(Value, Size);
1350 };
1351
1354
1355 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1356 /*Size=*/4);
1357
1359 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1360
1362
1363 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1364 // appropriate generation.
1365 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1366 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1367 /*Mask=*/0x3FFFF, /*Shift=*/12),
1368 /*Size=*/4);
1369 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1370 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1371 /*Mask=*/0x7FFF, /*Shift=*/12),
1372 /*Size=*/4);
1373 } else {
1374 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1375 /*Mask=*/0x1FFF, /*Shift=*/12),
1376 /*Size=*/4);
1377 }
1378
1379 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1380 // 0" comment but I don't see a corresponding field in the register spec.
1381 } else {
1382 OutStreamer->emitInt32(RsrcReg);
1383
// VGPR block count occupies bits [5:0] and SGPR block count bits [9:6] of the
// emitted word.
1384 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1385 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1386 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1387 MF.getContext());
1388 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1390
1391 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1392 // appropriate generation.
1393 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1394 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1395 /*Mask=*/0x3FFFF, /*Shift=*/12),
1396 /*Size=*/4);
1397 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1398 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1399 /*Mask=*/0x7FFF, /*Shift=*/12),
1400 /*Size=*/4);
1401 } else {
1402 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1403 /*Mask=*/0x1FFF, /*Shift=*/12),
1404 /*Size=*/4);
1405 }
1406 }
1407
// On GFX11+ the extra LDS size is expressed in units twice as large, so the
// block count is halved (rounding up).
1410 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1411 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1412 : CurrentProgramInfo.LDSBlocks;
1413 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1415 OutStreamer->emitInt32(MFI->getPSInputEnable());
1417 OutStreamer->emitInt32(MFI->getPSInputAddr());
1418 }
1419
1420 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1421 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1422 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1423 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1424}
1425
1426// Helper function to add common PAL Metadata 3.0+
1428 const SIProgramInfo &CurrentProgramInfo,
1429 CallingConv::ID CC, const GCNSubtarget &ST,
1430 unsigned DynamicVGPRBlockSize) {
1431 if (ST.hasIEEEMode())
1432 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1433
1434 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1435 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1436 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1437
1438 if (AMDGPU::isCompute(CC)) {
1439 MD->setHwStage(CC, ".trap_present",
1440 (bool)CurrentProgramInfo.TrapHandlerEnable);
1441 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1442
1443 if (DynamicVGPRBlockSize != 0)
1444 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1445 }
1446
1447 MD->setHwStage(CC, ".lds_size",
1448 (unsigned)(CurrentProgramInfo.LdsSize *
1449 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1450}
1451
1452// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1453// is AMDPAL. It stores each compute/SPI register setting and other PAL
1454// metadata items into the PALMD::Metadata, combining with any provided by the
1455// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1456// is then written as a single block in the .note section.
//
// Two encodings are produced depending on MD->getPALMajorVersion(): versions
// below 3 receive packed register values (setRsrc1 / setRsrc2 /
// setSpiPsInput*), later versions receive named fields (setHwStage /
// setGraphicsRegisters).
//
// NOTE(review): the embedded line numbering has gaps (e.g. 1458 -> 1460,
// 1470 -> 1472, 1506 -> 1508), so several lines are missing from this listing,
// including the declaration of MFI and the condition guarding the final
// block. Confirm against the complete file.
1457void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1458 const SIProgramInfo &CurrentProgramInfo) {
1460 auto CC = MF.getFunction().getCallingConv();
1461 auto *MD = getTargetStreamer()->getPALMetadata();
1462 auto &Ctx = MF.getContext();
1463
1464 MD->setEntryPoint(CC, MF.getFunction().getName());
1465 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1466
1467 // For targets that support dynamic VGPRs, set the number of saved dynamic
1468 // VGPRs (if any) in the PAL metadata.
1469 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1470 if (MFI->isDynamicVGPREnabled() &&
1472 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1474
1475 // Only set AGPRs for supported devices
1476 if (STM.hasMAIInsts()) {
1477 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1478 }
1479
1480 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1481 if (MD->getPALMajorVersion() < 3) {
1482 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1483 if (AMDGPU::isCompute(CC)) {
1484 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1485 } else {
// Non-compute stages only record whether any scratch blocks are needed, placed
// at the SCRATCH_EN bit position.
1486 const MCExpr *HasScratchBlocks =
1487 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1488 MCConstantExpr::create(0, Ctx), Ctx);
1489 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1490 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1491 }
1492 } else {
1493 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1494 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1495 CurrentProgramInfo.ScratchEnable);
1496 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1498 }
1499
1500 // ScratchSize is in bytes, 16 aligned.
1501 MD->setScratchSize(
1502 CC,
1503 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1504 MCConstantExpr::create(16, Ctx), Ctx),
1505 Ctx);
1506
// On GFX11+ the extra LDS size is expressed in units twice as large, so the
// block count is halved (rounding up).
1508 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1509 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1510 : CurrentProgramInfo.LDSBlocks;
1511 if (MD->getPALMajorVersion() < 3) {
1512 MD->setRsrc2(
1513 CC,
1515 Ctx);
1516 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1517 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1518 } else {
1519 // Graphics registers
1520 const unsigned ExtraLdsDwGranularity =
1521 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1522 MD->setGraphicsRegisters(
1523 ".ps_extra_lds_size",
1524 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1525
1526 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// The table index of each name is the bit position tested in the
// PSInputEna / PSInputAddr masks below.
1527 static StringLiteral const PsInputFields[] = {
1528 ".persp_sample_ena", ".persp_center_ena",
1529 ".persp_centroid_ena", ".persp_pull_model_ena",
1530 ".linear_sample_ena", ".linear_center_ena",
1531 ".linear_centroid_ena", ".line_stipple_tex_ena",
1532 ".pos_x_float_ena", ".pos_y_float_ena",
1533 ".pos_z_float_ena", ".pos_w_float_ena",
1534 ".front_face_ena", ".ancillary_ena",
1535 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1536 unsigned PSInputEna = MFI->getPSInputEnable();
1537 unsigned PSInputAddr = MFI->getPSInputAddr();
1538 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1539 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1540 (bool)((PSInputEna >> Idx) & 1));
1541 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1542 (bool)((PSInputAddr >> Idx) & 1));
1543 }
1544 }
1545 }
1546
1547 // For version 3 and above the wave front size is already set in the metadata
1548 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1549 MD->setWave32(MF.getFunction().getCallingConv());
1550}
1551
// Records per-function PAL metadata for MF, keyed by the function's name:
// the frame's stack size as the function scratch size, compute register /
// hardware-stage settings (encoding chosen by the PAL major version), and the
// LDS size and VGPR/SGPR counts from CurrentProgramInfo.
//
// NOTE(review): the embedded line numbering has gaps (1556 -> 1558,
// 1562 -> 1564, 1567 -> 1569, 1569 -> 1571), so the declarations of ST and of
// CurrentProgramInfo (if local) and parts of two calls are missing from this
// listing. Confirm against the complete file.
1552void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1553 auto *MD = getTargetStreamer()->getPALMetadata();
1554 const MachineFrameInfo &MFI = MF.getFrameInfo();
1555 StringRef FnName = MF.getFunction().getName();
1556 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1558 MCContext &Ctx = MF.getContext();
1559
1560 if (MD->getPALMajorVersion() < 3) {
1561 // Set compute registers
1562 MD->setRsrc1(
1564 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1565 MD->setRsrc2(CallingConv::AMDGPU_CS,
1566 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1567 } else {
1569 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1571 }
1572
1573 // Set optional info
1574 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1575 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1576 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1577}
1578
1579// This is supposed to be log2(Size)
1581 switch (Size) {
1582 case 4:
1583 return AMD_ELEMENT_4_BYTES;
1584 case 8:
1585 return AMD_ELEMENT_8_BYTES;
1586 case 16:
1587 return AMD_ELEMENT_16_BYTES;
1588 default:
1589 llvm_unreachable("invalid private_element_size");
1590 }
1591}
1592
// Populates the kernel code descriptor Out for the kernel MF from
// CurrentProgramInfo, the subtarget and the function's user-SGPR usage.
// Asserts that the function uses the AMDGPU_KERNEL or SPIR_KERNEL calling
// convention.
//
// NOTE(review): the embedded line numbering has gaps throughout this function
// (e.g. 1605 -> 1607, 1618 -> 1620, 1622 -> 1624), so the declaration of MFI,
// the left-hand sides of several assignments and the statements controlled by
// the `if (UserSGPRInfo.has...())` checks are missing from this listing.
// Confirm against the complete file.
1593void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1594 const SIProgramInfo &CurrentProgramInfo,
1595 const MachineFunction &MF) const {
1596 const Function &F = MF.getFunction();
1597 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1598 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1599
1601 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1602 MCContext &Ctx = MF.getContext();
1603
1604 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1605
1607 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1609 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1611
1612 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1613
1615 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1616
1617 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1618 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1620 }
1621
1622 if (UserSGPRInfo.hasDispatchPtr())
1624
1625 if (UserSGPRInfo.hasQueuePtr())
1627
1628 if (UserSGPRInfo.hasKernargSegmentPtr())
1630
1631 if (UserSGPRInfo.hasDispatchID())
1633
1634 if (UserSGPRInfo.hasFlatScratchInit())
1636
1637 if (UserSGPRInfo.hasPrivateSegmentSize())
1639
1640 if (STM.isXNACKEnabled())
1642
// getKernArgSegmentSize also reports the largest kernel-argument alignment
// through MaxKernArgAlign, which is used for kernarg_segment_alignment below.
1643 Align MaxKernArgAlign;
1644 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1645 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1646 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1647 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1648 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1649
1650 // kernarg_segment_alignment is specified as log of the alignment.
1651 // The minimum alignment is 16.
1652 // FIXME: The metadata treats the minimum as 4?
1653 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1654}
1655
// Prints operand OpNo of the inline-asm instruction MI to O.
//
// Returns false when the operand was printed and true when it could not be
// handled. The generic AsmPrinter implementation is tried first; after that
// only an empty modifier or the single-character modifier 'r' is accepted,
// and only register and immediate operands are printed.
//
// NOTE(review): the first line of the signature (listing line 1656), the
// register-printing statements (1677-1678) and the first branch condition of
// the immediate formatting (1683) are missing from this listing. Confirm
// against the complete file.
1657 const char *ExtraCode, raw_ostream &O) {
1658 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1659 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1660 return false;
1661
1662 if (ExtraCode && ExtraCode[0]) {
1663 if (ExtraCode[1] != 0)
1664 return true; // Unknown modifier.
1665
1666 switch (ExtraCode[0]) {
1667 case 'r':
1668 break;
1669 default:
1670 return true;
1671 }
1672 }
1673
1674 // TODO: Should be able to support other operand types like globals.
1675 const MachineOperand &MO = MI->getOperand(OpNo);
1676 if (MO.isReg()) {
1679 return false;
1680 }
1681 if (MO.isImm()) {
1682 int64_t Val = MO.getImm();
// Immediates not covered by the first branch are printed in hexadecimal using
// the narrowest of 16, 32 or 64 bits that holds the value as unsigned.
1684 O << Val;
1685 } else if (isUInt<16>(Val)) {
1686 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1687 } else if (isUInt<32>(Val)) {
1688 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1689 } else {
1690 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1691 }
1692 return false;
1693 }
1694 return true;
1695}
1696
1703}
1704
// Emits the "kernel-resource-usage" analysis remarks for MF through ORE, one
// remark per reported item: function name, SGPRs, VGPRs, AGPRs (only when
// hasMAIInsts), scratch size, dynamic stack, occupancy, SGPR/VGPR spills and
// LDS size (only when isModuleEntryFunction). Returns immediately when ORE is
// null.
//
// NOTE(review): the conditions of the two early returns after the ORE check
// (listing lines 1715-1716 and 1720) and one argument line of the remark
// constructor (1734) are missing from this listing. Confirm against the
// complete file.
1705void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1706 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1707 bool isModuleEntryFunction, bool hasMAIInsts) {
1708 if (!ORE)
1709 return;
1710
1711 const char *Name = "kernel-resource-usage";
1712 const char *Indent = " ";
1713
1714 // If the remark is not specifically enabled, do not output to yaml
1717 return;
1718
1719 // Currently non-kernel functions have no resources to emit.
1721 return;
1722
// Emits a single remark of the form "<RemarkLabel>: <Argument>".
1723 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1724 StringRef RemarkLabel, auto Argument) {
1725 // Add an indent for every line besides the line with the kernel name. This
1726 // makes it easier to tell which resource usage go with which kernel since
1727 // the kernel name will always be displayed first.
1728 std::string LabelStr = RemarkLabel.str() + ": ";
1729 if (RemarkName != "FunctionName")
1730 LabelStr = Indent + LabelStr;
1731
1732 ORE->emit([&]() {
1733 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1735 &MF.front())
1736 << LabelStr << ore::NV(RemarkName, Argument);
1737 });
1738 };
1739
1740 // FIXME: Formatting here is pretty nasty because clang does not accept
1741 // newlines from diagnostics. This forces us to emit multiple diagnostic
1742 // remarks to simulate newlines. If and when clang does accept newlines, this
1743 // formatting should be aggregated into one remark with newlines to avoid
1744 // printing multiple diagnostic location and diag opts.
1745 EmitResourceUsageRemark("FunctionName", "Function Name",
1746 MF.getFunction().getName());
1747 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1748 getMCExprStr(CurrentProgramInfo.NumSGPR));
1749 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1750 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1751 if (hasMAIInsts) {
1752 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1753 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1754 }
1755 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1756 getMCExprStr(CurrentProgramInfo.ScratchSize));
// The dynamic-stack flag is reported as "False" both when it evaluates to zero
// and when the expression cannot be evaluated to a constant.
1757 int64_t DynStack;
1758 bool DynStackEvaluatable =
1759 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1760 StringRef DynamicStackStr =
1761 DynStackEvaluatable && DynStack ? "True" : "False";
1762 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1763 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1764 getMCExprStr(CurrentProgramInfo.Occupancy));
1765 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1766 CurrentProgramInfo.SGPRSpill);
1767 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1768 CurrentProgramInfo.VGPRSpill);
1769 if (isModuleEntryFunction)
1770 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1771 CurrentProgramInfo.LDSSize);
1772}
1773
// Definition of the pass's static ID member.
1774char AMDGPUAsmPrinter::ID = 0;
1775
// Registers the pass under the argument string "amdgpu-asm-printer" with the
// display name "AMDGPU Assembly Printer"; the two trailing `false` values are
// the macro's `cfg` and `analysis` parameters.
1776INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1777 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ABI
Definition: Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:132
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:56
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1109
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1247
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1229
#define C_00B84C_SCRATCH_EN
Definition: SIDefines.h:1145
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1221
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1180
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1242
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1132
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1131
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1140
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1179
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1118
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1240
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1182
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1261
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1228
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1239
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1123
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1262
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1117
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1142
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1116
raw_pwrite_stream & OS
static const int BlockSize
Definition: TarWriter.cpp:33
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:79
static const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:93
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:90
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:413
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:706
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:728
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:93
const MCAsmInfo * MAI
Target Asm Printer information.
Definition: AsmPrinter.h:96
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:108
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:464
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:661
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:455
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:409
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:120
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition: AsmPrinter.h:127
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition: AsmPrinter.h:111
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:100
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:105
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:307
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:701
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1915
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
bool hasMAIInsts() const
Definition: GCNSubtarget.h:878
bool hasSGPRInitBug() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:660
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:664
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:346
bool dumpCode() const
Definition: GCNSubtarget.h:558
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:652
bool isWave32() const
bool supportsWGP() const
Definition: GCNSubtarget.h:394
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
Definition: GCNSubtarget.h:356
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:360
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasPrivateSegmentSize() const
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:250
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:316
bool hasLocalLinkage() const
Definition: GlobalValue.h:530
unsigned getAddressSpace() const
Definition: GlobalValue.h:207
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:132
Type * getValueType() const
Definition: GlobalValue.h:298
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:173
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:343
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:348
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:408
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:378
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:398
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:363
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:353
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:413
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:212
Context object for machine code objects.
Definition: MCContext.h:83
MCSectionELF * getELFSection(const Twine &Section, unsigned Type, unsigned Flags)
Definition: MCContext.h:549
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1115
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Look up the symbol with the specified Name inside this context.
Definition: MCContext.cpp:203
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition: MCExpr.cpp:450
MCSection * getTextSection() const
MCContext & getContext() const
MCSymbol * getMaxSGPRSymbol(MCContext &OutContext)
MCSymbol * getMaxAGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx)
void finalize(MCContext &OutContext)
MCSymbol * getSymbol(StringRef FuncName, ResourceInfoKind RIK, MCContext &OutContext, bool IsLocal)
MCSymbol * getMaxVGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack, MCContext &Ctx)
void gatherResourceInfo(const MachineFunction &MF, const AMDGPUResourceUsageAnalysisWrapperPass::FunctionResourceInfo &FRI, MCContext &OutContext)
AMDGPUResourceUsageAnalysis gathers resource usage on a per-function granularity.
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:27
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:496
bool hasInstructions() const
Definition: MCSection.h:591
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition: MCSymbol.h:270
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:273
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:72
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:233
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:417
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
LLVM Value Representation.
Definition: Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:692
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
bool isGFX1250(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1140
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1422
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:126
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1886
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition: Error.cpp:180
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:856
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:32
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:74
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:70
uint64_t getFunctionCodeSize(const MachineFunction &MF, bool IsLowerBound=false)
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:36
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:52
const MCExpr * ComputePGMRSrc3
Definition: SIProgramInfo.h:67
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:97
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:78
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:57
const MCExpr * NamedBarCnt
Definition: SIProgramInfo.h:87
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:55
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:72
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:71
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:94
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:37
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:84
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:69
const MCExpr * getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const
const MCExpr * Occupancy
Definition: SIProgramInfo.h:90
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:48
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:81
void reset(const MachineFunction &MF)
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.