//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

enum { MAX_LANES = 64 };

using namespace llvm;

// TODO -- delete this flag once we have more robust mechanisms to allocate the
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
static cl::opt<bool> MFMAVGPRForm(
    "amdgpu-mfma-vgpr-form", cl::Hidden,
    cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
             "unspecified, default to compiler heuristics"),
    cl::init(false));

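// Recover the GCNTargetMachine for this subtarget via its TargetLowering.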
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
      IsWholeWaveFunction(F.getCallingConv() ==
                          CallingConv::AMDGPU_Gfx_WholeWave) {
  const GCNSubtarget &ST = *STI;
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);

  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is completely removed.
  DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  MayNeedAGPRs = ST.hasMAIInsts();
  if (ST.hasGFX90AInsts()) {
    // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
    // should be separated from availability of AGPRs.
    if (MFMAVGPRForm ||
        (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
         !mayUseAGPRs(F)))
      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
  }

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx &&
        CC != CallingConv::AMDGPU_Gfx_WholeWave)
      ArgInfo = FixedABIFunctionInfo;

    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now; other registers
      // are required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign =
        std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
  }

  if (!AMDGPU::isGraphics(CC) ||
      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
       ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);

  MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
      "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve the highest available VGPR.
  // After RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }
}

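// Make a functionally equivalent copy of this MachineFunctionInfo for DestMF.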
MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}

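// Clamp the cached occupancy to the limits implied by the function's
// waves-per-EU and workgroup sizes.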
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
}

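// Each add* helper below reserves the next user SGPR (or aligned SGPR tuple)
// for one of the fixed ABI inputs and records the assignment in ArgInfo.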
Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register SIMachineFunctionInfo::addKernargSegmentPtr(
    const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register SIMachineFunctionInfo::addPrivateSegmentSize(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.PrivateSegmentSize.getRegister();
}

Register SIMachineFunctionInfo::addImplicitBufferPtr(
    const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}

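// Reserve user SGPRs into which the hardware preloads the kernel argument at
// KernArgIdx, preferring an aligned SGPR tuple of class RC and falling back to
// a list of individual SGPRs that get merged later.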
SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
  assert(Inserted && "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded, use that register; otherwise we need to use a set of SGPRs and
  // merge them.
  if (!ArgInfo.FirstKernArgPreloadReg)
    ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  auto &Regs = It->second.Regs;
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    Regs.reserve(AllocSizeDWord);
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &Regs;
}

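// Create a stack slot in which the inactive lanes of the WWM register VGPR can
// be saved, unless this function provably never needs to preserve them.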
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers). We only need to do this if we have calls to
  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
  // chain functions do not return) and the function did not contain a call to
  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
  // when entering the function).
  if (isChainFunction() &&
      (!MF.getFrameInfo().hasTailCall() || hasInitWholeWave()) &&
      isChainScratchRegister(VGPR))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

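// Shift the given WWM VGPRs down to the lowest unused physical VGPRs and
// update every structure that refers to them (reserved registers,
// SpillPhysVGPRs, block live-ins, and the saved-VGPR set).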
void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
    MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
    BitVector &SavedVGPRs) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
    Register Reg = WWMVGPRs[I];
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    WWMVGPRs[I] = NewReg;
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    MRI.reserveReg(NewReg, TRI);

    // Replace the register in SpillPhysVGPRs. This is needed to look for free
    // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
    auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
    if (RegItr != SpillPhysVGPRs.end()) {
      unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
      SpillPhysVGPRs[Idx] = NewReg;
    }

    // The generic `determineCalleeSaves` might have set the old register if it
    // is in the CSR range.
    SavedVGPRs.reset(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    Reg = NewReg;
  }
}

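// Assign one lane of a virtual VGPR to the SGPR spill described by frame index
// FI; a fresh virtual VGPR is created when LaneIndex is 0.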
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

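// Same as above, but uses a reserved physical VGPR so the lanes survive into
// the prolog/epilog; fails if no unused VGPR is left.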
bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    if (IsPrologEpilog)
      allocateWWMSpill(MF, LaneVGPR);

    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

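// Reserve one VGPR lane per dword of the SGPR spill slot FI, in either virtual
// or reserved physical VGPRs. Returns false (and rolls back) if the lanes
// cannot all be allocated.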
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}

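// Release dead spill frame indices. If ResetSGPRSpillStackIDs is true, any
// remaining SGPR spill slots are moved back to the default stack; returns true
// if such a slot was found, i.e. some SGPR will be spilled to memory.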
bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from the function frame, but keep FP & BP since
  // their spills haven't been inserted yet. Also remove the frame indices from
  // the `SGPRSpillsToVirtualVGPRLanes` map; otherwise later passes such as
  // "stack slot coloring" could re-map the freed indices and cause unexpected
  // side effects and bugs.
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during the SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

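// Lazily create and cache the frame index used by the register scavenger for
// an emergency SGPR spill slot.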
int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

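// Return the SGPR holding the low 32 bits of the GIT (global information
// table) pointer on AMDPAL; merged shaders on GFX9+ receive it in s8 instead
// of s0.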
Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
      IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
      NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
      IsWholeWaveFunction(MFI.isWholeWaveFunction()),
      DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
  for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
    SpillPhysVGPRS.push_back(regToString(Reg, TRI));

  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

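// Restore the fields serialized in MIR YAML back into this
// MachineFunctionInfo. Returns true and fills Error/SourceRange if a field
// (e.g. the scavenge frame index) cannot be parsed.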
bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
    SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
  NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;
  IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", {}, {});
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}

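// A function may use AGPRs unless its "amdgpu-agpr-alloc" attribute requests
// zero of them.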
bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  auto [MinNumAGPR, MaxNumAGPR] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                      /*OnlyFirstRequired=*/true);
  return MinNumAGPR != 0u;
}