LLVM 22.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
44#include "llvm/MC/MCValue.h"
51
52using namespace llvm;
53using namespace llvm::AMDGPU;
54
55// This should get the default rounding mode from the kernel. We just set the
56// default here, but this could change if the OpenCL rounding mode pragmas are
57// used.
58//
59// The denormal mode here should match what is reported by the OpenCL runtime
60// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62//
63// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64// precision, and leaves single precision to flush all and does not report
65// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66// CL_FP_DENORM for both.
67//
68// FIXME: It seems some instructions do not support single precision denormals
69// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
70// and sin_f32, cos_f32 on most parts).
71
72// We want to use these instructions, and using fp32 denormals also causes
73// instructions to run at the double precision rate for the device so it's
74// probably best to just report no single precision denormals.
81
// Target-registry factory callback: constructs the AMDGPU asm printer for a
// target machine, taking ownership of the streamer.
// NOTE(review): original source line 83 (the TargetMachine parameter, `tm`)
// is missing from this listing.
82static AsmPrinter *
84 std::unique_ptr<MCStreamer> &&Streamer) {
85 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
86}
87
95
// Constructor: forwards the target machine and streamer to the base
// AsmPrinter and asserts a streamer is present.
// NOTE(review): original source line 96 (the class-qualified constructor name
// and first parameter) is missing from this listing.
97 std::unique_ptr<MCStreamer> Streamer)
98 : AsmPrinter(TM, std::move(Streamer)) {
99 assert(OutStreamer && "AsmPrinter constructed without streamer");
100}
101
// Pass name reported to the pass manager / -debug-pass output.
// NOTE(review): the signature line (102) is missing from this listing.
103 return "AMDGPU Assembly Printer";
104}
105
// Returns the module-level (global) MCSubtargetInfo from the target machine.
// NOTE(review): the signature line (106) is missing from this listing.
107 return TM.getMCSubtargetInfo();
108}
109
// Returns the AMDGPU target streamer, or null if no output streamer exists
// (e.g. -filetype=null).
// NOTE(review): the signature line (110) is missing from this listing.
111 if (!OutStreamer)
112 return nullptr;
113 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
114}
115
119
// One-time, module-level initialization of the target streamer: resolves the
// target ID from the module and (on AMDHSA, per the elided condition) starts
// the HSA metadata stream.
// NOTE(review): several original source lines (121, 128-130, 132-135,
// 140-141) are missing from this listing; the early return and the
// HSAMetadataStream setup are only partially visible.
120void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
122
123 // TODO: Which one is called first, emitStartOfAsmFile or
124 // emitFunctionBodyStart?
125 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126 initializeTargetID(M);
127
130 return;
131
133
136 CodeObjectVersion);
137 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
138 }
139
142}
143
// End-of-module hook: ensures the target streamer was initialized (covers
// empty modules), then finalizes and emits the HSA metadata on AMDHSA.
// NOTE(review): the signature line (144) and lines 146/150 are missing from
// this listing; the body of the first OS check is not visible.
145 // Init target streamer if it has not yet happened
147 initTargetStreamer(M);
148
149 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
151
152 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
153 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
154 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
155 HSAMetadataStream->end();
156 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
157 (void)Success;
158 assert(Success && "Malformed HSA Metadata");
159 }
160}
161
// Per-function start hook: validates code-object-version requirements and
// that the function's xnack/sramecc target-ID settings agree with the
// module's; for entry functions, emits the amd_kernel_code_t (Mesa) and the
// HSA metadata kernel record.
// NOTE(review): the signature line (162) and lines 169/207 are missing from
// this listing.
163 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
164 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165 const Function &F = MF->getFunction();
166
167 // TODO: We're checking this late, would be nice to check it earlier.
168 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
170 STM.getCPU() + " is only available on code object version 6 or better");
171 }
172
173 // TODO: Which one is called first, emitStartOfAsmFile or
174 // emitFunctionBodyStart?
175 if (!getTargetStreamer()->getTargetID())
176 initializeTargetID(*F.getParent());
177
178 const auto &FunctionTargetID = STM.getTargetID();
179 // Make sure function's xnack settings are compatible with module's
180 // xnack settings.
181 if (FunctionTargetID.isXnackSupported() &&
182 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
185 "' function does not match module xnack setting");
186 return;
187 }
188 // Make sure function's sramecc settings are compatible with module's
189 // sramecc settings.
190 if (FunctionTargetID.isSramEccSupported() &&
191 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
194 "' function does not match module sramecc setting");
195 return;
196 }
197
// Everything below applies to kernel entry points only.
198 if (!MFI.isEntryFunction())
199 return;
200
201 if (STM.isMesaKernel(F) &&
202 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
203 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204 AMDGPUMCKernelCodeT KernelCode;
205 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
206 KernelCode.validate(&STM, MF->getContext());
208 }
209
210 if (STM.isAmdHsaOS())
211 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
212}
213
// Per-function end hook (AMDHSA entry functions only): emits the HSA kernel
// descriptor into the read-only section at 64-byte alignment, as required by
// CP microcode.
// NOTE(review): the signature line (214) and lines 239/242/244 are missing
// from this listing; the emitAmdhsaKernelDescriptor call is only partially
// visible.
215 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
216 if (!MFI.isEntryFunction())
217 return;
218
219 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
220 return;
221
222 auto &Streamer = getTargetStreamer()->getStreamer();
223 auto &Context = Streamer.getContext();
224 auto &ObjectFileInfo = *Context.getObjectFileInfo();
225 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227 Streamer.pushSection();
228 Streamer.switchSection(&ReadOnlySection);
229
230 // CP microcode requires the kernel descriptor to be allocated on 64 byte
231 // alignment.
232 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
233 ReadOnlySection.ensureMinAlignment(Align(64));
234
235 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237 SmallString<128> KernelName;
238 getNameWithPrefix(KernelName, &MF->getFunction());
240 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
241 CurrentProgramInfo.NumVGPRsForWavesPerEU,
243 CurrentProgramInfo.NumSGPRsForWavesPerEU,
245 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
247 Context),
248 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
249
250 Streamer.popSection();
251}
252
// Emits an "implicit-def: <reg>" assembly comment for an IMPLICIT_DEF
// instruction, annotating SGPR-spill-to-VGPR-lane defs specially.
// NOTE(review): the signature line (253) and line 256 (the SmallString
// backing `Str`) are missing from this listing.
254 Register RegNo = MI->getOperand(0).getReg();
255
257 raw_svector_ostream OS(Str);
258 OS << "implicit-def: "
259 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(OS.str());
265 OutStreamer->addBlankLine();
266}
267
// Emits the function entry label; for non-HSA entry functions it emits an
// STT_AMDGPU_HSA_KERNEL symbol, and for -dumpcode it records the label text
// for the later disassembly dump.
// NOTE(review): the signature line (268) and lines 270/279/289 are missing
// from this listing; the AMDHSA early-exit body is not fully visible.
269 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
271 return;
272 }
273
274 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
277 SmallString<128> SymbolName;
278 getNameWithPrefix(SymbolName, &MF->getFunction()),
280 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
286 HexLines.emplace_back("");
287 }
288
290}
291
// For -dumpcode, records a "BB<fn>_<bb>:" label line for basic blocks that
// are reachable other than by fallthrough, keeping DisasmLines/HexLines in
// lock-step.
// NOTE(review): the signature line (292) and line 301 (presumably the
// delegating call to the base class) are missing from this listing.
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
299 HexLines.emplace_back("");
300 }
302}
303
// Emits a global variable. The visible branch handles LDS (group-segment)
// globals: rejects non-undef initializers, skips emission on HSA/PAL, and
// otherwise defines the symbol via emitAMDGPULDS.
// NOTE(review): the signature line and the surrounding address-space check
// (304-305) plus line 336 (fallback to the base AsmPrinter) are missing from
// this listing.
306 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
307 OutContext.reportError({},
308 Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
314 const Triple::OSType OS = TM.getTargetTriple().getOS();
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getDataLayout();
326 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
// Default LDS alignment of 4 bytes when the global specifies none.
327 Align Alignment = GV->getAlign().value_or(Align(4));
328
329 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto *TS = getTargetStreamer();
332 TS->emitAMDGPULDS(GVSym, Size, Alignment);
333 return;
334 }
335
337}
338
// Start-of-module hook: reads the code object version from module metadata
// and, on AMDHSA, instantiates the matching msgpack metadata streamer
// (V4/V5/V6), rejecting unsupported versions.
// NOTE(review): the signature line (339) and the case labels (344/347/350)
// plus line 358 are missing from this listing.
340 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
342 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
343 switch (CodeObjectVersion) {
345 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346 break;
348 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349 break;
351 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352 break;
353 default:
354 reportFatalUsageError("unsupported code object version");
355 }
356 }
357
359}
360
// After all MC resource-info symbols are finalized, diagnose per-function
// resource overflows for module entry functions: scratch per workitem,
// addressable SGPRs (before and after implicit VCC/flat-scratch/xnack SGPRs),
// and failure to reach the occupancy requested by "amdgpu-waves-per-eu".
// NOTE(review): several original lines (365-366, 380, 396, 405, 428, 434,
// 447, 466) are missing from this listing — in particular the declarations
// of STM and the subtarget checks guarding the SGPR-init-bug paths.
361void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
362 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
363 return;
364
367 MCSymbol *FnSym = TM.getSymbol(&F);
368 bool IsLocal = F.hasLocalLinkage();
369
// Evaluate a resource-info MCExpr to an absolute value, if possible.
370 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
371 int64_t Val;
372 if (Value->evaluateAsAbsolute(Val)) {
373 Res = Val;
374 return true;
375 }
376 return false;
377 };
378
379 const uint64_t MaxScratchPerWorkitem =
381 MCSymbol *ScratchSizeSymbol = RI.getSymbol(
382 FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
383 uint64_t ScratchSize;
384 if (ScratchSizeSymbol->isVariable() &&
385 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
386 ScratchSize > MaxScratchPerWorkitem) {
387 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
388 DS_Error);
389 F.getContext().diagnose(DiagStackSize);
390 }
391
392 // Validate addressable scalar registers (i.e., prior to added implicit
393 // SGPRs).
394 MCSymbol *NumSGPRSymbol =
395 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
397 !STM.hasSGPRInitBug()) {
398 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
399 uint64_t NumSgpr;
400 if (NumSGPRSymbol->isVariable() &&
401 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
402 NumSgpr > MaxAddressableNumSGPRs) {
403 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
404 NumSgpr, MaxAddressableNumSGPRs,
406 F.getContext().diagnose(Diag);
407 return;
408 }
409 }
410
411 MCSymbol *VCCUsedSymbol =
412 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
413 MCSymbol *FlatUsedSymbol = RI.getSymbol(
414 FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
415 uint64_t VCCUsed, FlatUsed, NumSgpr;
416
417 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
418 FlatUsedSymbol->isVariable() &&
419 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
420 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
421 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
422
423 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
424 // resolvable.
425 NumSgpr += IsaInfo::getNumExtraSGPRs(
426 &STM, VCCUsed, FlatUsed,
427 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
429 STM.hasSGPRInitBug()) {
430 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
431 if (NumSgpr > MaxAddressableNumSGPRs) {
432 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
433 MaxAddressableNumSGPRs, DS_Error,
435 F.getContext().diagnose(Diag);
436 return;
437 }
438 }
439
440 MCSymbol *NumVgprSymbol =
441 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
442 MCSymbol *NumAgprSymbol =
443 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
444 uint64_t NumVgpr, NumAgpr;
445
446 MachineModuleInfo &MMI =
448 MachineFunction *MF = MMI.getMachineFunction(F);
449 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
450 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
451 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
452 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
453 unsigned MaxWaves = MFI.getMaxWavesPerEU();
454 uint64_t TotalNumVgpr =
455 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
// Clamp the per-EU granule counts to at least 1 and to the subtarget
// minimum implied by the wave limit.
456 uint64_t NumVGPRsForWavesPerEU =
457 std::max({TotalNumVgpr, (uint64_t)1,
458 (uint64_t)STM.getMinNumVGPRs(
459 MaxWaves, MFI.getDynamicVGPRBlockSize())});
460 uint64_t NumSGPRsForWavesPerEU = std::max(
461 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
462 const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
463 STM.getOccupancyWithWorkGroupSizes(*MF).second,
464 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
465 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
467 uint64_t Occupancy;
468
469 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
470 F, "amdgpu-waves-per-eu", {0, 0}, true);
471
472 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
473 DiagnosticInfoOptimizationFailure Diag(
474 F, F.getSubprogram(),
475 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
476 "'" +
477 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
478 ", final occupancy is " + Twine(Occupancy));
479 F.getContext().diagnose(Diag);
480 return;
481 }
482 }
483 }
484}
485
// Module finalization: pads .text with s_code_end on GFX10+/GFX90A, resolves
// all deferred resource-info expressions, emits module-wide GPR maximums into
// .AMDGPU.gpr_maximums, then validates each function's resource usage and
// resets the resource-info state.
// NOTE(review): the signature line (486) and lines 493-495/498/511/521 are
// missing from this listing.
487 // Pad with s_code_end to help tools and guard against instruction prefetch
488 // causing stale data in caches. Arguably this should be done by the linker,
489 // which is why this isn't done for Mesa.
490 // Don't do it if there is no code.
491 const MCSubtargetInfo &STI = *getGlobalSTI();
492 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
496 if (TextSect->hasInstructions()) {
497 OutStreamer->switchSection(TextSect);
499 }
500 }
501
502 // Assign expressions which can only be resolved when all other functions are
503 // known.
504 RI.finalize(OutContext);
505
506 // Switch section and emit all GPR maximums within the processed module.
507 OutStreamer->pushSection();
508 MCSectionELF *MaxGPRSection =
509 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
510 OutStreamer->switchSection(MaxGPRSection);
512 RI.getMaxAGPRSymbol(OutContext),
513 RI.getMaxSGPRSymbol(OutContext));
514 OutStreamer->popSection();
515
516 for (Function &F : M.functions())
517 validateMCResourceInfo(F);
518
519 RI.reset();
520
522}
523
// Folds an MCExpr with the AMDGPU-specific folder and pretty-prints it into a
// SmallString for use in assembly comments.
// NOTE(review): original line 525 (the `SmallString<128> Str;` local backing
// `OSS` and the return value) is missing from this listing.
524SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
526 raw_svector_ostream OSS(Str);
527 auto &Streamer = getTargetStreamer()->getStreamer();
528 auto &Context = Streamer.getContext();
529 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
530 printAMDGPUMCExpr(New, OSS, MAI);
531 return Str;
532}
533
534// Print comments that apply to both callable functions and entry points.
535void AMDGPUAsmPrinter::emitCommonFunctionComments(
536 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
537 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
538 const AMDGPUMachineFunction *MFI) {
539 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
540 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
541 false);
542 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
543 if (NumAGPR && TotalNumVGPR) {
544 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
545 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
546 false);
547 }
548 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
549 false);
550 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
551 false);
552}
553
554const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
555 const MachineFunction &MF) const {
556 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
557 MCContext &Ctx = MF.getContext();
558 uint16_t KernelCodeProperties = 0;
559 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
560
561 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
562 KernelCodeProperties |=
563 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
564 }
565 if (UserSGPRInfo.hasDispatchPtr()) {
566 KernelCodeProperties |=
567 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
568 }
569 if (UserSGPRInfo.hasQueuePtr()) {
570 KernelCodeProperties |=
571 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
572 }
573 if (UserSGPRInfo.hasKernargSegmentPtr()) {
574 KernelCodeProperties |=
575 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
576 }
577 if (UserSGPRInfo.hasDispatchID()) {
578 KernelCodeProperties |=
579 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
580 }
581 if (UserSGPRInfo.hasFlatScratchInit()) {
582 KernelCodeProperties |=
583 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
584 }
585 if (UserSGPRInfo.hasPrivateSegmentSize()) {
586 KernelCodeProperties |=
587 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
588 }
589 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
590 KernelCodeProperties |=
591 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
592 }
593
594 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
595 // un-evaluatable at this point so it cannot be conditionally checked here.
596 // Instead, we'll directly shift the possibly unknown MCExpr into its place
597 // and bitwise-or it into KernelCodeProperties.
598 const MCExpr *KernelCodePropExpr =
599 MCConstantExpr::create(KernelCodeProperties, Ctx);
600 const MCExpr *OrValue = MCConstantExpr::create(
601 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
602 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
603 OrValue, Ctx);
604 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
605
606 return KernelCodePropExpr;
607}
608
// Assembles the MCKernelDescriptor (HSA kernel descriptor) for a kernel:
// segment sizes, kernarg size, the three compute_pgm_rsrc words, kernel code
// properties and the kernarg-preload count.
// NOTE(review): original lines 620 (the group_segment_fixed_size right-hand
// side) and 636 (the assert guarding rsrc3) are missing from this listing.
609MCKernelDescriptor
610AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
611 const SIProgramInfo &PI) const {
612 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
613 const Function &F = MF.getFunction();
614 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
615 MCContext &Ctx = MF.getContext();
616
617 MCKernelDescriptor KernelDescriptor;
618
619 KernelDescriptor.group_segment_fixed_size =
621 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
622
623 Align MaxKernArgAlign;
624 KernelDescriptor.kernarg_size = MCConstantExpr::create(
625 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
626
627 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
628 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
629 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
630
// Sanity check (assert elided in this listing): rsrc3 must be zero unless the
// subtarget actually uses it or the expression is not yet evaluatable.
631 int64_t PGRM_Rsrc3 = 1;
632 bool EvaluatableRsrc3 =
633 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
634 (void)PGRM_Rsrc3;
635 (void)EvaluatableRsrc3;
637 STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 ||
638 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
639 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
640
641 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
642 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
643 Ctx);
644
645 return KernelDescriptor;
646}
647
// Main per-function driver: gathers resource info, emits program info /
// PAL metadata depending on OS, emits the resource-info symbol table and the
// _dvgpr$ symbol, prints verbose .AMDGPU.csdata comments, and dumps
// disassembly for -dumpcode.
// NOTE(review): this listing elides many original lines (651, 653, 655, 657,
// 665, 668, 703, 705, 711-712, 744, and others), including the function
// signature and several statement heads — the visible fragments below are
// kept byte-for-byte.
649 // Init target streamer lazily on the first function so that previous passes
650 // can set metadata.
652 initTargetStreamer(*MF.getFunction().getParent());
654 ResourceUsage =
656 CurrentProgramInfo.reset(MF);
658 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
659 MCContext &Ctx = MF.getContext();
660
661 // The starting address of all shader programs must be 256 bytes aligned.
662 // Regular functions just need the basic required instruction alignment.
663 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
664
666
667 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
669 bool IsLocal = MF.getFunction().hasLocalLinkage();
670 // FIXME: This should be an explicit check for Mesa.
671 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
672 MCSectionELF *ConfigSection =
673 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
674 OutStreamer->switchSection(ConfigSection);
675 }
676
677 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
678
679 if (MFI->isModuleEntryFunction()) {
680 getSIProgramInfo(CurrentProgramInfo, MF);
681 }
682
// OS-specific program-info emission: PAL metadata for AMDPAL, legacy SI
// program info for everything that is neither HSA nor PAL.
683 if (STM.isAmdPalOS()) {
684 if (MFI->isEntryFunction())
685 EmitPALMetadata(MF, CurrentProgramInfo);
686 else if (MFI->isModuleEntryFunction())
687 emitPALFunctionMetadata(MF);
688 } else if (!STM.isAmdHsaOS()) {
689 EmitProgramInfoSI(MF, CurrentProgramInfo);
690 }
691
692 DumpCodeInstEmitter = nullptr;
693 if (STM.dumpCode()) {
694 // For -dumpcode, get the assembler out of the streamer. This only works
695 // with -filetype=obj.
696 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
697 if (Assembler)
698 DumpCodeInstEmitter = Assembler->getEmitterPtr();
699 }
700
701 DisasmLines.clear();
702 HexLines.clear();
704
706
707 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
708 STM.hasMAIInsts());
709
// Emit the per-function resource-info symbols (the call head on line 711-712
// is elided in this listing).
710 {
713 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
714 IsLocal),
715 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
716 IsLocal),
717 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
718 IsLocal),
719 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
720 OutContext, IsLocal),
721 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
722 OutContext, IsLocal),
723 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
724 IsLocal),
725 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
726 OutContext, IsLocal),
727 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
728 OutContext, IsLocal),
729 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
730 IsLocal),
731 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
732 OutContext, IsLocal));
733 }
734
735 // Emit _dvgpr$ symbol when appropriate.
736 emitDVgprSymbol(MF);
737
// Verbose-assembly comments: .AMDGPU.csdata section.
738 if (isVerbose()) {
739 MCSectionELF *CommentSection =
740 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
741 OutStreamer->switchSection(CommentSection);
742
743 if (!MFI->isEntryFunction()) {
745 OutStreamer->emitRawComment(" Function info:", false);
746
747 emitCommonFunctionComments(
748 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
749 IsLocal)
750 ->getVariableValue(),
751 STM.hasMAIInsts()
752 ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
753 OutContext, IsLocal)
754 ->getVariableValue()
755 : nullptr,
756 RI.createTotalNumVGPRs(MF, Ctx),
757 RI.createTotalNumSGPRs(
758 MF,
759 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
760 Ctx),
761 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
762 OutContext, IsLocal)
763 ->getVariableValue(),
764 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
765 return false;
766 }
767
768 OutStreamer->emitRawComment(" Kernel info:", false);
769 emitCommonFunctionComments(
770 CurrentProgramInfo.NumArchVGPR,
771 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
772 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
773 CurrentProgramInfo.ScratchSize,
774 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
775
776 OutStreamer->emitRawComment(
777 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
778 OutStreamer->emitRawComment(
779 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
780 OutStreamer->emitRawComment(
781 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
782 " bytes/workgroup (compile time only)", false);
783
784 OutStreamer->emitRawComment(
785 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
786
787 OutStreamer->emitRawComment(
788 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
789
790 OutStreamer->emitRawComment(
791 " NumSGPRsForWavesPerEU: " +
792 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
793 false);
794 OutStreamer->emitRawComment(
795 " NumVGPRsForWavesPerEU: " +
796 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
797 false);
798
799 if (STM.hasGFX90AInsts()) {
// Printed AccumOffset is decoded back to a register count: (field + 1) * 4.
800 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
801 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
802 AdjustedAccum = MCBinaryExpr::createMul(
803 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
804 OutStreamer->emitRawComment(
805 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
806 }
807
808 if (AMDGPU::isGFX1250(STM))
809 OutStreamer->emitRawComment(
810 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
811 false);
812
813 OutStreamer->emitRawComment(
814 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
815
816 OutStreamer->emitRawComment(
817 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
818
819 OutStreamer->emitRawComment(
820 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
821 getMCExprStr(CurrentProgramInfo.ScratchEnable),
822 false);
823 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
824 Twine(CurrentProgramInfo.UserSGPR),
825 false);
826 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
827 Twine(CurrentProgramInfo.TrapHandlerEnable),
828 false);
829 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
830 Twine(CurrentProgramInfo.TGIdXEnable),
831 false);
832 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
833 Twine(CurrentProgramInfo.TGIdYEnable),
834 false);
835 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
836 Twine(CurrentProgramInfo.TGIdZEnable),
837 false);
838 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
839 Twine(CurrentProgramInfo.TIdIGCompCount),
840 false);
841
842 [[maybe_unused]] int64_t PGMRSrc3;
844 STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) ||
845 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
846 static_cast<uint64_t>(PGMRSrc3) == 0));
847 if (STM.hasGFX90AInsts()) {
848 OutStreamer->emitRawComment(
849 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
850 getMCExprStr(MCKernelDescriptor::bits_get(
851 CurrentProgramInfo.ComputePGMRSrc3,
852 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
853 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
854 false);
855 OutStreamer->emitRawComment(
856 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
857 getMCExprStr(MCKernelDescriptor::bits_get(
858 CurrentProgramInfo.ComputePGMRSrc3,
859 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
860 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
861 false);
862 }
863 }
864
// -dumpcode: pair each recorded disassembly line with its hex encoding in a
// dedicated .AMDGPU.disasm section.
865 if (DumpCodeInstEmitter) {
866
867 OutStreamer->switchSection(
868 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
869
870 for (size_t i = 0; i < DisasmLines.size(); ++i) {
871 std::string Comment = "\n";
872 if (!HexLines[i].empty()) {
873 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
874 Comment += " ; " + HexLines[i] + "\n";
875 }
876
877 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
878 OutStreamer->emitBytes(StringRef(Comment));
879 }
880 }
881
882 return false;
883}
884
885// When appropriate, add a _dvgpr$ symbol, with the value of the function
886// symbol, plus an offset encoding one less than the number of VGPR blocks used
887// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
888// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
889// used by a front-end to have functions that are chained rather than called,
890// and a dispatcher that dynamically resizes the VGPR count before dispatching
891// to a function.
// NOTE(review): original lines 893, 895, 910 and 918 are missing from this
// listing (the MFI declaration, part of the enabling condition, the error
// call head, and the function-symbol operand of the add expression).
892void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
894 if (MFI.isDynamicVGPREnabled() &&
896 MCContext &Ctx = MF.getContext();
897 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
898 MCValue NumVGPRs;
899 if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
900 NumVGPRs, nullptr) ||
901 !NumVGPRs.isAbsolute()) {
902 llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
903 }
904 // Calculate number of VGPR blocks.
905 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
906 unsigned NumBlocks =
907 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
908
// Only 3 bits (5..3) are available, so at most 8 blocks can be encoded.
909 if (NumBlocks > 8) {
911 "too many DVGPR blocks for _dvgpr$ symbol for '" +
912 Twine(CurrentFnSym->getName()) + "'");
913 return;
914 }
915 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
916 // Add to function symbol to create _dvgpr$ symbol.
917 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
919 MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
920 MCSymbol *DVgprFuncSym =
921 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
922 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
923 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
924 emitLinkage(&MF.getFunction(), DVgprFuncSym);
925 }
926}
927
// Resolves the module-level target ID: starts from the global subtarget
// feature string, then scans functions to pin any still-'Any' xnack/sramecc
// setting to the first concrete On/Off value found.
// NOTE(review): original line 932 (the call that seeds the target ID from the
// global feature string) is missing from this listing.
928// TODO: Fold this into emitFunctionBodyStart.
929void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
930 // In the beginning all features are either 'Any' or 'NotSupported',
931 // depending on global target features. This will cover empty modules.
933 getGlobalSTI()->getFeatureString());
934
935 // If module is empty, we are done.
936 if (M.empty())
937 return;
938
939 // If module is not empty, need to find first 'Off' or 'On' feature
940 // setting per feature from functions in module.
941 for (auto &F : M) {
942 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Stop early once both features are either unsupported or fully resolved.
943 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
944 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
945 break;
946
947 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
948 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
949 if (TSTargetID->isXnackSupported())
950 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
951 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
952 if (TSTargetID->isSramEccSupported())
953 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
954 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
955 }
956}
957
958// AccumOffset computed for the MCExpr equivalent of:
959// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
960static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
961 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
962 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
963
964 // Can't be lower than 1 for subsequent alignTo.
965 const MCExpr *MaximumTaken =
966 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
967
968 // Practically, it's computing divideCeil(MaximumTaken, 4).
969 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
970 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
971 Ctx);
972
973 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
974}
975
// Populates ProgInfo for MF: register counts as MCExprs referencing the
// per-function resource-usage symbols, scratch/LDS block sizes, mode-register
// fields, and the COMPUTE_PGM_RSRC3 bits, diagnosing SGPR/LDS over-limits.
// NOTE(review): this extraction is truncated — the embedded source line
// numbers skip (1002, 1005, 1033, 1060, 1068, 1071, 1088, 1098-1099, 1108,
// 1110, 1117, 1125, 1143, 1147, 1150, 1201, 1226-1227, 1251, 1273, 1275),
// so several statements below are missing their leading sub-expressions;
// compare against upstream before editing.
976 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
977 const MachineFunction &MF) {
978 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
979 bool IsLocal = MF.getFunction().hasLocalLinkage();
980 MCContext &Ctx = MF.getContext();
981
// Shorthand for an absolute MCExpr constant in this function's context.
982 auto CreateExpr = [&Ctx](int64_t Value) {
983 return MCConstantExpr::create(Value, Ctx);
984 };
985
// Returns true and sets Res when Value folds to an absolute constant.
986 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
987 int64_t Val;
988 if (Value->evaluateAsAbsolute(Val)) {
989 Res = Val;
990 return true;
991 }
992 return false;
993 };
994
// Symbol reference into this function's resource-usage info entry.
995 auto GetSymRefExpr =
996 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
997 MCSymbol *Sym =
998 RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
999 return MCSymbolRefExpr::create(Sym, Ctx);
1000 };
1001
1003 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1004 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1006 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1007
1008 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1009 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1010 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1011 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1012 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1013 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
1014 ProgInfo.DynamicCallStack =
1015 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1016 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1017
// Named barriers are allocated in groups of 4; round up then divide.
1018 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1019 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1020 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1021 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1022
1023 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1024
1025 // The calculations related to SGPR/VGPR blocks are
1026 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1027 // unified.
1028 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1029 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1030 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1031
1032 // Check the addressable register limit before we add ExtraSGPRs.
1034 !STM.hasSGPRInitBug()) {
1035 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1036 uint64_t NumSgpr;
1037 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1038 NumSgpr > MaxAddressableNumSGPRs) {
1039 // This can happen due to a compiler bug or when using inline asm.
1040 LLVMContext &Ctx = MF.getFunction().getContext();
1041 DiagnosticInfoResourceLimit Diag(
1042 MF.getFunction(), "addressable scalar registers", NumSgpr,
1043 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
1044 Ctx.diagnose(Diag);
// Clamp to the last addressable SGPR so downstream encoding stays valid.
1045 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1046 }
1047 }
1048
1049 // Account for extra SGPRs and VGPRs reserved for debugger use.
1050 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1051
1052 const Function &F = MF.getFunction();
1053
1054 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1055 // dispatch registers as function args.
1056 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1057 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1058
1059 if (WaveDispatchNumSGPR) {
1061 {ProgInfo.NumSGPR,
1062 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1063 Ctx)},
1064 Ctx);
1065 }
1066
1067 if (WaveDispatchNumVGPR) {
1069 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1070
1072 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1073 }
1074
1075 // Adjust number of registers used to meet default/requested minimum/maximum
1076 // number of waves per execution unit request.
1077 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1078 ProgInfo.NumSGPRsForWavesPerEU =
1079 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1080 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1081 Ctx);
1082 ProgInfo.NumVGPRsForWavesPerEU =
1083 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1084 CreateExpr(STM.getMinNumVGPRs(
1085 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1086 Ctx);
1087
// Re-check the limit now that ExtraSGPRs were added (SGPR-init-bug parts).
1089 STM.hasSGPRInitBug()) {
1090 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1091 uint64_t NumSgpr;
1092 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1093 NumSgpr > MaxAddressableNumSGPRs) {
1094 // This can happen due to a compiler bug or when using inline asm to use
1095 // the registers which are usually reserved for vcc etc.
1096 LLVMContext &Ctx = MF.getFunction().getContext();
1097 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1098 NumSgpr, MaxAddressableNumSGPRs,
1100 Ctx.diagnose(Diag);
1101 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1102 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1103 }
1104 }
1105
1106 if (STM.hasSGPRInitBug()) {
1107 ProgInfo.NumSGPR =
1109 ProgInfo.NumSGPRsForWavesPerEU =
1111 }
1112
1113 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1114 LLVMContext &Ctx = MF.getFunction().getContext();
1115 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1116 MFI->getNumUserSGPRs(),
1118 Ctx.diagnose(Diag);
1119 }
1120
1121 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1122 LLVMContext &Ctx = MF.getFunction().getContext();
1123 DiagnosticInfoResourceLimit Diag(
1124 MF.getFunction(), "local memory", MFI->getLDSSize(),
1126 Ctx.diagnose(Diag);
1127 }
1128 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1129 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1130 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1131 unsigned Granule) {
1132 const MCExpr *OneConst = CreateExpr(1ul);
1133 const MCExpr *GranuleConst = CreateExpr(Granule);
1134 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1135 const MCExpr *AlignToGPR =
1136 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1137 const MCExpr *DivGPR =
1138 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1139 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1140 return SubGPR;
1141 };
1142 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1144 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1145 } else {
1146 ProgInfo.SGPRBlocks = GetNumGPRBlocks(
1148 }
1149 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1151
1152 const SIModeRegisterDefaults Mode = MFI->getMode();
1153
1154 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1155 // register.
1156 ProgInfo.FloatMode = getFPMode(Mode);
1157
1158 ProgInfo.IEEEMode = Mode.IEEE;
1159
1160 // Make clamp modifier on NaN input returns 0.
1161 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1162
// LDS block granularity depends on the addressable-LDS-size feature set.
1163 unsigned LDSAlignShift;
1164 if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
1165 // LDS is allocated in 256 dword blocks.
1166 LDSAlignShift = 10;
1167 } else if (STM.getFeatureBits().test(
1168 FeatureAddressableLocalMemorySize163840)) {
1169 // LDS is allocated in 320 dword blocks.
1170 LDSAlignShift = 11;
1171 } else if (STM.getFeatureBits().test(
1172 FeatureAddressableLocalMemorySize65536)) {
1173 // LDS is allocated in 128 dword blocks.
1174 LDSAlignShift = 9;
1175 } else {
1176 // LDS is allocated in 64 dword blocks.
1177 LDSAlignShift = 8;
1178 }
1179
1180 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1181 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1182
1183 ProgInfo.LDSSize = MFI->getLDSSize();
1184 ProgInfo.LDSBlocks =
1185 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1186
1187 // The MCExpr equivalent of divideCeil.
1188 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1189 const MCExpr *Ceil =
1190 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1191 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1192 };
1193
1194 // Scratch is allocated in 64-dword or 256-dword blocks.
1195 unsigned ScratchAlignShift =
1196 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1197 // We need to program the hardware with the amount of scratch memory that
1198 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1199 // scratch memory used per thread.
1200 ProgInfo.ScratchBlocks = DivideCeil(
1202 CreateExpr(STM.getWavefrontSize()), Ctx),
1203 CreateExpr(1ULL << ScratchAlignShift));
1204
1205 if (STM.supportsWGP()) {
1206 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1207 }
1208
1209 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1210 ProgInfo.MemOrdered = 1;
1211 ProgInfo.FwdProgress = 1;
1212 }
1213
1214 // 0 = X, 1 = XY, 2 = XYZ
1215 unsigned TIDIGCompCnt = 0;
1216 if (MFI->hasWorkItemIDZ())
1217 TIDIGCompCnt = 2;
1218 else if (MFI->hasWorkItemIDY())
1219 TIDIGCompCnt = 1;
1220
1221 // The private segment wave byte offset is the last of the system SGPRs. We
1222 // initially assumed it was allocated, and may have used it. It shouldn't harm
1223 // anything to disable it if we know the stack isn't used here. We may still
1224 // have emitted code reading it to initialize scratch, but if that's unused
1225 // reading garbage should be OK.
1228 MCConstantExpr::create(0, Ctx), Ctx),
1229 ProgInfo.DynamicCallStack, Ctx);
1230
1231 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1232 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1233 ProgInfo.TrapHandlerEnable =
1234 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1235 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1236 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1237 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1238 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1239 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1240 ProgInfo.EXCPEnMSB = 0;
1241 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1242 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1243 ProgInfo.EXCPEnable = 0;
1244
1245 // return ((Dst & ~Mask) | (Value << Shift))
1246 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1247 uint32_t Shift) {
1248 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1249 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1250 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1252 Ctx);
1253 return Dst;
1254 };
1255
// GFX90A carries ACCUM_OFFSET and TG_SPLIT in COMPUTE_PGM_RSRC3.
1256 if (STM.hasGFX90AInsts()) {
1257 ProgInfo.ComputePGMRSrc3 =
1258 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1259 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1260 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1261 ProgInfo.ComputePGMRSrc3 =
1262 SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1263 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1264 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1265 }
1266
1267 if (AMDGPU::isGFX1250(STM))
1268 ProgInfo.ComputePGMRSrc3 =
1269 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1270 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1271 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1272
1274 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1276 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1277
// Diagnose when the achieved occupancy falls short of the function's
// requested "amdgpu-waves-per-eu" minimum.
1278 const auto [MinWEU, MaxWEU] =
1279 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1280 uint64_t Occupancy;
1281 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1282 DiagnosticInfoOptimizationFailure Diag(
1283 F, F.getSubprogram(),
1284 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1285 "'" +
1286 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1287 ", final occupancy is " + Twine(Occupancy));
1288 F.getContext().diagnose(Diag);
1289 }
1290
// GFX11+: record instruction-prefetch size (code size in 128B lines, capped
// to the RSRC3 field width) in COMPUTE_PGM_RSRC3.
1291 if (isGFX11Plus(STM)) {
1292 uint32_t CodeSizeInBytes = (uint32_t)std::min(
1293 ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1294 (uint64_t)std::numeric_limits<uint32_t>::max());
1295 uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1296 uint32_t Field, Shift, Width;
1297 if (isGFX11(STM)) {
1298 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1299 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1300 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1301 } else {
1302 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1303 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1304 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1305 }
1306 uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1307 ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1308 CreateExpr(InstPrefSize), Field, Shift);
1309 }
1310}
1311
1324
// Emits the SI program info as raw (register, value) pairs into the output
// stream — the non-HSA/non-PAL path. Expressions that fold to constants are
// emitted as integers; otherwise as relocatable MCExprs.
// NOTE(review): extraction truncated — source line numbers skip (1336, 1349,
// 1354, 1357, 1385, 1405, 1410, 1412); several register-number emissions are
// missing. Compare against upstream before editing.
1325 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1326 const SIProgramInfo &CurrentProgramInfo) {
1327 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1328 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1329 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1330 MCContext &Ctx = MF.getContext();
1331
1332 // (((Value) & Mask) << Shift)
1333 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1334 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1335 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1337 shft, Ctx);
1338 };
1339
// Emit as a plain integer when the expression folds; else as an expression.
1340 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1341 int64_t Val;
1342 if (Value->evaluateAsAbsolute(Val))
1343 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1344 else
1345 OutStreamer->emitValue(Value, Size);
1346 };
1347
1348 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1350
1351 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1352 /*Size=*/4);
1353
1355 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1356
1358
1359 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1360 // appropriate generation.
1361 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1362 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1363 /*Mask=*/0x3FFFF, /*Shift=*/12),
1364 /*Size=*/4);
1365 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1366 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1367 /*Mask=*/0x7FFF, /*Shift=*/12),
1368 /*Size=*/4);
1369 } else {
1370 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1371 /*Mask=*/0x1FFF, /*Shift=*/12),
1372 /*Size=*/4);
1373 }
1374
1375 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1376 // 0" comment but I don't see a corresponding field in the register spec.
1377 } else {
// Non-compute (graphics) shaders: pack VGPR/SGPR block counts into RSRC1.
1378 OutStreamer->emitInt32(RsrcReg);
1379
1380 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1381 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1382 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1383 MF.getContext());
1384 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1386
1387 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1388 // appropriate generation.
1389 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1390 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1391 /*Mask=*/0x3FFFF, /*Shift=*/12),
1392 /*Size=*/4);
1393 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1394 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1395 /*Mask=*/0x7FFF, /*Shift=*/12),
1396 /*Size=*/4);
1397 } else {
1398 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1399 /*Mask=*/0x1FFF, /*Shift=*/12),
1400 /*Size=*/4);
1401 }
1402 }
1403
1404 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
// GFX11+ halves the extra-LDS granularity, hence the divideCeil by 2.
1406 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1407 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1408 : CurrentProgramInfo.LDSBlocks;
1409 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1411 OutStreamer->emitInt32(MFI->getPSInputEnable());
1413 OutStreamer->emitInt32(MFI->getPSInputAddr());
1414 }
1415
1416 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1417 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1418 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1419 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1420}
1421
1422 // Helper function to add common PAL Metadata 3.0+
// Records the per-hardware-stage flags (IEEE/WGP/mem-ordered/forward-progress,
// trap/exception enables, dynamic-VGPR flag) and the LDS size for stage CC.
// NOTE(review): the function signature (original line 1423,
// `static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, ...)`) and the
// `MD->setHwStage(` head of the final call (line 1443) are missing from this
// extraction; compare against upstream before editing.
1424 const SIProgramInfo &CurrentProgramInfo,
1425 CallingConv::ID CC, const GCNSubtarget &ST,
1426 unsigned DynamicVGPRBlockSize) {
1427 if (ST.hasIEEEMode())
1428 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1429
1430 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1431 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1432 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1433
1434 if (AMDGPU::isCompute(CC)) {
1435 MD->setHwStage(CC, ".trap_present",
1436 (bool)CurrentProgramInfo.TrapHandlerEnable);
1437 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1438
1439 if (DynamicVGPRBlockSize != 0)
1440 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1441 }
1442
// LdsSize is in LDS blocks; convert to bytes via the dword granularity.
1444 CC, ".lds_size",
1445 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1446 sizeof(uint32_t)));
1447}
1448
1449 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1450 // is AMDPAL. It stores each compute/SPI register setting and other PAL
1451 // metadata items into the PALMD::Metadata, combining with any provided by the
1452 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1453 // is then written as a single block in the .note section.
// NOTE(review): extraction truncated — source line numbers skip (1468, 1470,
// 1494, 1511); the dynamic-VGPR condition tail, the saved-count argument,
// the EmitPALMetadataCommon block-size argument, and one setRsrc2 argument
// are missing. Compare against upstream before editing.
1454 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1455 const SIProgramInfo &CurrentProgramInfo) {
1456 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1457 auto CC = MF.getFunction().getCallingConv();
1458 auto *MD = getTargetStreamer()->getPALMetadata();
1459 auto &Ctx = MF.getContext();
1460
1461 MD->setEntryPoint(CC, MF.getFunction().getName());
1462 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1463
1464 // For targets that support dynamic VGPRs, set the number of saved dynamic
1465 // VGPRs (if any) in the PAL metadata.
1466 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1467 if (MFI->isDynamicVGPREnabled() &&
1469 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1471
1472 // Only set AGPRs for supported devices
1473 if (STM.hasMAIInsts()) {
1474 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1475 }
1476
1477 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
// PAL metadata < 3.0 uses raw RSRC1/RSRC2 words; 3.0+ uses named fields.
1478 if (MD->getPALMajorVersion() < 3) {
1479 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1480 if (AMDGPU::isCompute(CC)) {
1481 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1482 } else {
1483 const MCExpr *HasScratchBlocks =
1484 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1485 MCConstantExpr::create(0, Ctx), Ctx);
1486 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1487 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1488 }
1489 } else {
1490 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1491 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1492 CurrentProgramInfo.ScratchEnable);
1493 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1495 }
1496
1497 // ScratchSize is in bytes, 16 aligned.
1498 MD->setScratchSize(
1499 CC,
1500 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1501 MCConstantExpr::create(16, Ctx), Ctx),
1502 Ctx);
1503
1504 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
// GFX11+ halves the extra-LDS granularity, hence the divideCeil by 2.
1505 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1506 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1507 : CurrentProgramInfo.LDSBlocks;
1508 if (MD->getPALMajorVersion() < 3) {
1509 MD->setRsrc2(
1510 CC,
1512 Ctx);
1513 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1514 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1515 } else {
1516 // Graphics registers
1517 const unsigned ExtraLdsDwGranularity =
1518 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1519 MD->setGraphicsRegisters(
1520 ".ps_extra_lds_size",
1521 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1522
1523 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// Field order mirrors the SPI_PS_INPUT_ENA bit layout, LSB first.
1524 static StringLiteral const PsInputFields[] = {
1525 ".persp_sample_ena", ".persp_center_ena",
1526 ".persp_centroid_ena", ".persp_pull_model_ena",
1527 ".linear_sample_ena", ".linear_center_ena",
1528 ".linear_centroid_ena", ".line_stipple_tex_ena",
1529 ".pos_x_float_ena", ".pos_y_float_ena",
1530 ".pos_z_float_ena", ".pos_w_float_ena",
1531 ".front_face_ena", ".ancillary_ena",
1532 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1533 unsigned PSInputEna = MFI->getPSInputEnable();
1534 unsigned PSInputAddr = MFI->getPSInputAddr();
1535 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1536 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1537 (bool)((PSInputEna >> Idx) & 1));
1538 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1539 (bool)((PSInputAddr >> Idx) & 1));
1540 }
1541 }
1542 }
1543
1544 // For version 3 and above the wave front size is already set in the metadata
1545 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1546 MD->setWave32(MF.getFunction().getCallingConv());
1547}
1548
// PAL metadata for a non-entry function: records its stack (scratch) size,
// RSRC1/RSRC2 (or 3.0+ named fields) under the AMDGPU_CS convention, and its
// LDS/VGPR/SGPR usage, keyed by function name.
// NOTE(review): extraction truncated — source line numbers skip (1560, 1565);
// the setRsrc1 calling-convention argument and the EmitPALMetadataCommon call
// head are missing. Compare against upstream before editing.
1549 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1550 auto *MD = getTargetStreamer()->getPALMetadata();
1551 const MachineFrameInfo &MFI = MF.getFrameInfo();
1552 StringRef FnName = MF.getFunction().getName();
1553 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1554 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1555 MCContext &Ctx = MF.getContext();
1556
1557 if (MD->getPALMajorVersion() < 3) {
1558 // Set compute registers
1559 MD->setRsrc1(
1561 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1562 MD->setRsrc2(CallingConv::AMDGPU_CS,
1563 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1564 } else {
1566 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1567 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1568 }
1569
1570 // Set optional info
1571 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1572 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1573 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1574}
1575
1576// This is supposed to be log2(Size)
1578 switch (Size) {
1579 case 4:
1580 return AMD_ELEMENT_4_BYTES;
1581 case 8:
1582 return AMD_ELEMENT_8_BYTES;
1583 case 16:
1584 return AMD_ELEMENT_16_BYTES;
1585 default:
1586 llvm_unreachable("invalid private_element_size");
1587 }
1588}
1589
// Fills in the amd_kernel_code_t (MC form) for a kernel from the computed
// program info: RSRC words, code properties for each enabled user SGPR,
// segment sizes, register counts, and kernarg alignment.
// NOTE(review): extraction heavily truncated — source line numbers skip
// (1603, 1605, 1607, 1611, 1616, 1620, 1623, 1626, 1629, 1632, 1635, 1638);
// the AMD_HSA_BITS_SET/code-property assignment bodies under each `if` are
// missing. Compare against upstream before editing.
1590 void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1591 const SIProgramInfo &CurrentProgramInfo,
1592 const MachineFunction &MF) const {
1593 const Function &F = MF.getFunction();
1594 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1595 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1596
1597 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1598 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1599 MCContext &Ctx = MF.getContext();
1600
1601 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1602
1604 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1606 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1608
1609 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1610
1612 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1613
// One code-property bit per preloaded user SGPR the kernel uses.
1614 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1615 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1617 }
1618
1619 if (UserSGPRInfo.hasDispatchPtr())
1621
1622 if (UserSGPRInfo.hasQueuePtr())
1624
1625 if (UserSGPRInfo.hasKernargSegmentPtr())
1627
1628 if (UserSGPRInfo.hasDispatchID())
1630
1631 if (UserSGPRInfo.hasFlatScratchInit())
1633
1634 if (UserSGPRInfo.hasPrivateSegmentSize())
1636
1637 if (STM.isXNACKEnabled())
1639
1640 Align MaxKernArgAlign;
1641 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1642 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1643 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1644 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1645 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1646
1647 // kernarg_segment_alignment is specified as log of the alignment.
1648 // The minimum alignment is 16.
1649 // FIXME: The metadata treats the minimum as 4?
1650 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1651}
1652
// Inline-asm operand printer. Returns false on success, true on an
// unrecognized modifier or unsupported operand kind (per the AsmPrinter
// PrintAsmOperand contract visible in the generic fallback call below).
// NOTE(review): extraction truncated — the signature head (original line
// 1653, `bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
// unsigned OpNo,`), the register-print call head (line 1674), and the
// `isInt<16>` guard opening the immediate chain (line 1680) are missing.
// Compare against upstream before editing.
1654 const char *ExtraCode, raw_ostream &O) {
1655 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1656 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1657 return false;
1658
1659 if (ExtraCode && ExtraCode[0]) {
1660 if (ExtraCode[1] != 0)
1661 return true; // Unknown modifier.
1662
// Only the 'r' (register) modifier is handled here.
1663 switch (ExtraCode[0]) {
1664 case 'r':
1665 break;
1666 default:
1667 return true;
1668 }
1669 }
1670
1671 // TODO: Should be able to support other operand types like globals.
1672 const MachineOperand &MO = MI->getOperand(OpNo);
1673 if (MO.isReg()) {
1675 *MF->getSubtarget().getRegisterInfo());
1676 return false;
1677 }
// Immediates: small values print as decimal, larger ones as hex sized to
// the narrowest of 16/32/64 bits that holds the value.
1678 if (MO.isImm()) {
1679 int64_t Val = MO.getImm();
1681 O << Val;
1682 } else if (isUInt<16>(Val)) {
1683 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1684 } else if (isUInt<32>(Val)) {
1685 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1686 } else {
1687 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1688 }
1689 return false;
1690 }
1691 return true;
1692}
1693
1701
// Emits the "kernel-resource-usage" analysis remarks (SGPR/VGPR/AGPR counts,
// scratch, dynamic stack, occupancy, spills, LDS) for MF, one remark per
// line since clang does not accept newlines in diagnostics (see FIXME below).
// NOTE(review): extraction truncated — source line numbers skip (1712, 1717,
// 1731); the LLVMContext binding for `Ctx`, the non-kernel early-exit
// condition, and one remark-constructor argument are missing. Compare
// against upstream before editing.
1702 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1703 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1704 bool isModuleEntryFunction, bool hasMAIInsts) {
1705 if (!ORE)
1706 return;
1707
1708 const char *Name = "kernel-resource-usage";
1709 const char *Indent = " ";
1710
1711 // If the remark is not specifically enabled, do not output to yaml
1713 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1714 return;
1715
1716 // Currently non-kernel functions have no resources to emit.
1718 return;
1719
1720 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1721 StringRef RemarkLabel, auto Argument) {
1722 // Add an indent for every line besides the line with the kernel name. This
1723 // makes it easier to tell which resource usage go with which kernel since
1724 // the kernel name will always be displayed first.
1725 std::string LabelStr = RemarkLabel.str() + ": ";
1726 if (RemarkName != "FunctionName")
1727 LabelStr = Indent + LabelStr;
1728
1729 ORE->emit([&]() {
1730 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1732 &MF.front())
1733 << LabelStr << ore::NV(RemarkName, Argument);
1734 });
1735 };
1736
1737 // FIXME: Formatting here is pretty nasty because clang does not accept
1738 // newlines from diagnostics. This forces us to emit multiple diagnostic
1739 // remarks to simulate newlines. If and when clang does accept newlines, this
1740 // formatting should be aggregated into one remark with newlines to avoid
1741 // printing multiple diagnostic location and diag opts.
1742 EmitResourceUsageRemark("FunctionName", "Function Name",
1743 MF.getFunction().getName());
1744 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1745 getMCExprStr(CurrentProgramInfo.NumSGPR));
1746 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1747 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1748 if (hasMAIInsts) {
1749 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1750 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1751 }
1752 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1753 getMCExprStr(CurrentProgramInfo.ScratchSize));
// DynamicCallStack only reports "True" when the expression folds to a
// nonzero constant; unresolved expressions report "False".
1754 int64_t DynStack;
1755 bool DynStackEvaluatable =
1756 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1757 StringRef DynamicStackStr =
1758 DynStackEvaluatable && DynStack ? "True" : "False";
1759 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1760 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1761 getMCExprStr(CurrentProgramInfo.Occupancy));
1762 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1763 CurrentProgramInfo.SGPRSpill);
1764 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1765 CurrentProgramInfo.VGPRSpill);
1766 if (isModuleEntryFunction)
1767 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1768 CurrentProgramInfo.LDSSize);
1769}
1770
// Unique pass identifier; the legacy pass manager keys on its address.
1771 char AMDGPUAsmPrinter::ID = 0;
1772
// Registers the pass under the "amdgpu-asm-printer" command-line name.
1773 INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1774 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1122
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1260
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1242
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1158
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1234
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1193
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1255
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1145
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1144
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1153
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1192
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1131
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1253
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1195
#define R_SPILLED_SGPRS
Definition SIDefines.h:1274
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1241
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1252
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1136
#define R_SPILLED_VGPRS
Definition SIDefines.h:1275
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1130
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1155
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1129
static const int BlockSize
Definition TarWriter.cpp:33
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
doFinalization - Virtual method overridden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:90
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:93
const MCAsmInfo * MAI
Target Asm Printer information.
Definition AsmPrinter.h:96
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:108
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:120
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:127
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:111
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:100
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:105
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:307
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
bool hasMAIInsts() const
bool hasSGPRInitBug() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool isCuModeEnabled() const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
bool dumpCode() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isTrapHandlerEnabled() const
bool isWave32() const
bool supportsWGP() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:316
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:343
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:408
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:378
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:398
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:363
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:353
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:413
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:416
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:496
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:583
bool hasInstructions() const
Definition MCSection.h:591
MCContext & getContext() const
Definition MCStreamer.h:314
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:101
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:273
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
MCSymbol * getSymbol(const GlobalValue *GV) const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
bool isGFX1250(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1140
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1422
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1685
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:126
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1869
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:208
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:851
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
uint64_t getFunctionCodeSize(const MachineFunction &MF, bool IsLowerBound=false)
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.