//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

42 "amdgpu-vgpr-index-mode",
43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
44 cl::init(false));
45
46static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47 cl::desc("Enable the use of AA during codegen."),
48 cl::init(true));
49
51 NSAThreshold("amdgpu-nsa-threshold",
52 cl::desc("Number of addresses from which to enable MIMG NSA."),
54
56
58 StringRef GPU,
59 StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }
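
  // Illustrative: with FS = "+wavefrontsize32", the checks above append
  // "-wavefrontsize16," and "-wavefrontsize64,", so only the explicitly
  // requested wave size remains enabled once FullFS is parsed below.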

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing; other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10;
    // those targets have FeatureWavefrontSize64 in their definition already.
    // For gfx10+ set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }
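
  // Net effect (illustrative): a target without ADDR64 MUBUF variants (e.g.
  // gfx9) has FlatForGlobal forced on, while a target without flat
  // instructions (e.g. SI) has it forced off, regardless of the HSA default
  // chosen earlier.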

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(this);

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

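  // The special cases below are the 64-bit shifts, which are still limited to
  // a single constant-bus (SGPR or literal) source operand even on gfx10+;
  // all other VALU instructions there may use two.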
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       const SchedRegion &Region) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                             const SchedRegion &Region) const {
  const Function &F = Region.RegionBegin->getMF()->getFunction();
  Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
  if (!PostRADirectionAttr.isValid())
    return;

  StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
  if (PostRADirectionStr == "topdown") {
    Policy.OnlyTopDown = true;
    Policy.OnlyBottomUp = false;
  } else if (PostRADirectionStr == "bottomup") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = true;
  } else if (PostRADirectionStr == "bidirectional") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = false;
  } else {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(), "invalid value for postRA direction attribute");
    F.getContext().diagnose(Diag);
  }

  LLVM_DEBUG({
    const char *DirStr = "default";
    if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "topdown";
    else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
      DirStr = "bottomup";
    else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "bidirectional";

    dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
           << '\n';
  });
}
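
// For illustration, the attribute handled above is attached in IR as, e.g.:
//
//   define void @f() "amdgpu-post-ra-direction"="bottomup" { ... }
//
// Any other value is diagnosed rather than silently ignored.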

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, VGPRs,
                                                       DynamicVGPRBlockSize);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now, assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}
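
// Worked example (illustrative numbers): if the workgroup-size bounds give
// {MinOcc, MaxOcc} = {4, 8} waves per EU while SGPR usage allows 5 waves and
// VGPR usage allows 4, then MaxOcc = min(8, min(5, 4)) = 4 and the function
// returns {min(4, 4), 4} = {4, 4}.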

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
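
// Illustrative use of the attribute handled above: "amdgpu-num-sgpr"="48"
// requests a 48-SGPR budget; a request that does not cover the reserved
// registers or violates the "amdgpu-waves-per-eu" bounds is dropped, and one
// smaller than the preloaded user/system SGPRs is raised to cover them.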

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
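
// For reference, with the usual field sizes (4 SGPRs for the private segment
// buffer resource descriptor, 2 for each of the six pointer/ID fields), this
// is 16 user + 5 system + 1 synthetic = 22 SGPRs.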

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto &[Min, Max] = NumVGPRBounds;

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure the requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}
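
// Note on the doubling above: gfx90a-class targets have a combined VGPR+AGPR
// register file twice the usual size, so an explicit "amdgpu-num-vgpr" request
// is scaled up to that budget (our reading; the code itself does not spell out
// the rationale).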

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

std::pair<unsigned, unsigned>
GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
  const unsigned MaxVectorRegs = getMaxNumVGPRs(F);

  unsigned MaxNumVGPRs = MaxVectorRegs;
  unsigned MaxNumAGPRs = 0;

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it should be possible to estimate maximum AGPR/VGPR pressure and
  // split the register file accordingly.
  if (hasGFX90AInsts()) {
    unsigned MinNumAGPRs = 0;
    const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
    const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

    const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};

    // TODO: The lower bound should probably force the number of required
    // registers up, overriding amdgpu-waves-per-eu.
    std::tie(MinNumAGPRs, MaxNumAGPRs) =
        AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
                                        /*OnlyFirstRequired=*/true);

    if (MinNumAGPRs == DefaultNumAGPR.first) {
      // Default to splitting half the registers if AGPRs are required.
      MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
    } else {
      // Align to accum_offset's allocation granularity.
      MinNumAGPRs = alignTo(MinNumAGPRs, 4);

      MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
    }

    // Clamp values to be inbounds of our limits, and ensure min <= max.

    MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
    MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);

    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
    MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);

    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
           "invalid register counts");
  } else if (hasMAIInsts()) {
    // On gfx908 the number of AGPRs always equals the number of VGPRs.
    MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
  }

  return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}
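
// Illustrative: on a gfx90a-class target with a 256-register budget and no
// "amdgpu-agpr-alloc" attribute, the logic above defaults to an even
// 128 VGPR / 128 AGPR split; with "amdgpu-agpr-alloc"="0" the whole budget
// goes to VGPRs and no AGPRs are allocated.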

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The def is a bundle: use the latency of the last instruction in the
    // bundle that writes the register, discounted by one for each bundled
    // instruction issued after it.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle: start from the def's latency and discount it by
    // the number of bundled instructions issued before the first reader.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}
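
// Resolution order above: an explicit -amdgpu-nsa-threshold command-line value
// wins, then the "amdgpu-nsa-threshold" function attribute, then the option's
// default; explicit values are clamped to a minimum of 2, since NSA is
// meaningful only from two addresses upward.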

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }
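
  // Illustrative: a function carrying "amdgpu-no-dispatch-ptr",
  // "amdgpu-no-queue-ptr", and "amdgpu-no-dispatch-id" (attributes that the
  // AMDGPUAttributor infers when the corresponding values are provably unused)
  // needs none of the three pointer/ID user SGPRs above.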

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}