SIMachineFunctionInfo.cpp
//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

enum { MAX_LANES = 64 };

using namespace llvm;

// TODO -- delete this flag once we have more robust mechanisms to allocate the
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
static cl::opt<bool> MFMAVGPRForm(
    "amdgpu-mfma-vgpr-form", cl::Hidden,
    cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
             "unspecified, default to compiler heuristics"),
    cl::init(false));

const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
      IsWholeWaveFunction(F.getCallingConv() ==
                          CallingConv::AMDGPU_Gfx_WholeWave) {
  const GCNSubtarget &ST = *STI;
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);

  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is completely removed.
  DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  MayNeedAGPRs = ST.hasMAIInsts();
  if (ST.hasGFX90AInsts()) {
    // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
    // should be separated from availability of AGPRs.
    if (MFMAVGPRForm ||
        (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
         !mayUseAGPRs(F)))
      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
  }

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx &&
        CC != CallingConv::AMDGPU_Gfx_WholeWave)
      ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now, other registers
      // required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign =
        std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
  }

  if (!AMDGPU::isGraphics(CC) ||
      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
       ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);

  MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
      "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve the highest available VGPR.
  // After RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }

  ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
}

MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}

void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
}

Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.PrivateSegmentSize.getRegister();
}

Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}

SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
  assert(Inserted && "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded, use that register; otherwise we need to use a set of SGPRs and
  // merge them.
  if (!ArgInfo.FirstKernArgPreloadReg)
    ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  auto &Regs = It->second.Regs;
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    Regs.reserve(AllocSizeDWord);
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &Regs;
}

void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers). We only need to do this if we have calls to
  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
  // chain functions do not return) and the function did not contain a call to
  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
  // when entering the function).
  if (isChainFunction() &&
      (isChainScratchRegister(VGPR) || !MF.getFrameInfo().hasTailCall() ||
       hasInitWholeWave()))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

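// WWM VGPRs are initially reserved from the high end of the VGPR file. After
// register allocation, remap them onto the lowest unused VGPRs, updating the
// reserved-register set, SpillPhysVGPRs, the callee-saved mask and the block
// live-ins to match.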
void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
    MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
    BitVector &SavedVGPRs) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
    Register Reg = WWMVGPRs[I];
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    WWMVGPRs[I] = NewReg;
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    MRI.reserveReg(NewReg, TRI);

    // Replace the register in SpillPhysVGPRs. This is needed to look for free
    // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
    auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
    if (RegItr != SpillPhysVGPRs.end()) {
      unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
      SpillPhysVGPRs[Idx] = NewReg;
    }

    // The generic `determineCalleeSaves` might have set the old register if it
    // is in the CSR range.
    SavedVGPRs.reset(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    Reg = NewReg;
  }
}

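// SGPR spills are lowered to lanes of a VGPR. The two helpers below pick (or
// create) the VGPR that backs a given spill frame index and record which lane
// each spilled SGPR word occupies.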
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    if (IsPrologEpilog)
      allocateWWMSpill(MF, LaneVGPR);

    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

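// Allocate one VGPR lane per 4-byte word of the spilled frame object, either
// in a virtual VGPR or in a reserved physical WWM VGPR, depending on
// SpillToPhysVGPRLane.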
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either an AGPR is spilled to a VGPR, or vice versa.
/// Returns true if a \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}

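// Drop frame indices that became dead once their SGPR spills were lowered to
// VGPR lanes. If ResetSGPRSpillStackIDs is true, any remaining SGPR-spill
// stack objects are moved back to the default stack; the return value reports
// whether such an SGPR-to-memory spill remains.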
bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from the function frame, but keep FP & BP since
  // spills for them haven't been inserted yet. Also make sure to remove the
  // frame indices from the `SGPRSpillsToVirtualVGPRLanes` data structure;
  // otherwise, re-mapping of the freed frame indices by later passes like
  // "stack slot coloring" could cause unexpected side effects and bugs.
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during the SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

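// MIR serialization: mirror the target-specific function state into the
// yaml::SIMachineFunctionInfo form used by the MIR printer and parser.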
yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
      IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
      NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
      IsWholeWaveFunction(MFI.isWholeWaveFunction()),
      DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
  for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
    SpillPhysVGPRS.push_back(regToString(Reg, TRI));

  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
  NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;
  IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", {}, {});
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}

bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  auto [MinNumAGPR, MaxNumAGPR] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                      /*OnlyFirstRequired=*/true);
  return MinNumAGPR != 0u;
}