LLVM 22.0.0git
SIMachineFunctionInfo.cpp
//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

enum { MAX_LANES = 64 };

using namespace llvm;

// TODO -- delete this flag once we have more robust mechanisms to allocate the
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
static cl::opt<bool> MFMAVGPRForm(
    "amdgpu-mfma-vgpr-form", cl::Hidden,
    cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
             "unspecified, default to compiler heuristics"),
    cl::init(false));
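
// For illustration only: being a cl::opt, the flag can be flipped on the llc
// command line for experiments, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < kernel.ll
// When it is left unset, the heuristic in the constructor below decides.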

const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
      IsWholeWaveFunction(F.getCallingConv() ==
                          CallingConv::AMDGPU_Gfx_WholeWave) {
  const GCNSubtarget &ST = *STI;
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);

  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is completely removed.
  DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  MayNeedAGPRs = ST.hasMAIInsts();
  if (ST.hasGFX90AInsts()) {
    // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
    // should be separated from availability of AGPRs
    if (MFMAVGPRForm ||
        (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
         !mayUseAGPRs(F)))
      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
  }

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx &&
        CC != CallingConv::AMDGPU_Gfx_WholeWave)
      ArgInfo = FixedABIFunctionInfo;

    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now, other registers
      // required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign =
        std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
  }

  if (!AMDGPU::isGraphics(CC) ||
      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
       ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);

  MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
      "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve the highest available VGPR.
  // After RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }
}
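
// For illustration only, the per-function attributes consulted above appear in
// IR roughly as follows (the values are made-up examples):
//
//   define amdgpu_kernel void @k(...) #0 { ... }
//   attributes #0 = { "amdgpu-git-ptr-high"="16"
//                     "amdgpu-32bit-address-high-bits"="0xffff8000"
//                     "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }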

MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}

void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
}

Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register SIMachineFunctionInfo::addKernargSegmentPtr(
    const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register SIMachineFunctionInfo::addPrivateSegmentSize(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.PrivateSegmentSize.getRegister();
}

Register SIMachineFunctionInfo::addImplicitBufferPtr(
    const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}
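
// Illustrative sketch (not a call sequence that appears verbatim anywhere;
// MFI and TRI stand for any SIMachineFunctionInfo and SIRegisterInfo in
// scope): starting from NumUserSGPRs == 0, calling the adders in the usual
// HSA ABI order hands out consecutive user SGPRs, because each call reads
// getNextUserSGPR() (SGPR0 + NumUserSGPRs) and then bumps NumUserSGPRs by the
// size of the argument in dwords:
//
//   MFI.addPrivateSegmentBuffer(TRI); // s[0:3]
//   MFI.addDispatchPtr(TRI);          // s[4:5]
//   MFI.addQueuePtr(TRI);             // s[6:7]
//   MFI.addKernargSegmentPtr(TRI);    // s[8:9]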

SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
  assert(Inserted && "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded, use that register; otherwise we need to use a set of SGPRs and
  // merge them.
  if (!ArgInfo.FirstKernArgPreloadReg)
    ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  auto &Regs = It->second.Regs;
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    Regs.reserve(AllocSizeDWord);
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &Regs;
}
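
// For illustration: a 64-bit preload lands in a single SReg_64 tuple only when
// the next user SGPR lines up with a tuple of that class (which is what the
// getMatchingSuperReg call above checks); otherwise the dwords are handed out
// as individual SGPRs and merged later. PaddingSGPRs accounts for any
// alignment gap the caller had to insert in front of the argument.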

void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers). We only need to do this if we have calls to
  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
  // chain functions do not return) and the function did not contain a call to
  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
  // when entering the function).
  if (isChainFunction() &&
      (isChainScratchRegister(VGPR) || !MF.getFrameInfo().hasTailCall() ||
       hasInitWholeWave()))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
    MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
    BitVector &SavedVGPRs) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
    Register Reg = WWMVGPRs[I];
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    WWMVGPRs[I] = NewReg;
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    MRI.reserveReg(NewReg, TRI);

    // Replace the register in SpillPhysVGPRs. This is needed to look for free
    // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
    auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
    if (RegItr != SpillPhysVGPRs.end()) {
      unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
      SpillPhysVGPRs[Idx] = NewReg;
    }

    // The generic `determineCalleeSaves` might have set the old register if it
    // is in the CSR range.
    SavedVGPRs.reset(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    Reg = NewReg;
  }
}

bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    if (IsPrologEpilog)
      allocateWWMSpill(MF, LaneVGPR);

    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}
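
// Worked example: a wave64 spill of an SGPR quadruple (e.g. an SReg_128) has a
// 16-byte frame object, so NumLanes = 16 / 4 = 4 and the spill occupies four
// lanes of the current spill VGPR; one VGPR provides up to WaveSize (64, see
// MAX_LANES above) such lanes before another spill VGPR is needed.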

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}
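
// Worked example: eliminating a 16-byte VGPR spill slot this way needs
// NumLanes = 4 AGPRs; the loop above scans AGPR_32 in the order the class
// enumerates its registers, skipping any that are unallocatable, already used,
// or covered by the call-preserved mask, and records the picks in
// Spill.Lanes[3] down to Spill.Lanes[0]. If the class runs out, FullyAllocated
// is cleared and the frame index cannot be eliminated completely.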

bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from the function frame, but keep FP & BP since
  // spills for them haven't been inserted yet. Also remove the frame indices
  // from the `SGPRSpillsToVirtualVGPRLanes` map, otherwise any re-mapping of
  // freed frame indices by later passes (like "stack slot coloring") could
  // cause unexpected side effects and bugs.
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

Register
SIMachineFunctionInfo::getGitPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
      IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
      NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
      IsWholeWaveFunction(MFI.isWholeWaveFunction()),
      DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
  for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
    SpillPhysVGPRS.push_back(regToString(Reg, TRI));

  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
  NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;
  IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", {}, {});
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}
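
// For illustration, a trimmed machineFunctionInfo block as it might appear in
// an MIR test file; field names follow the YAML mapping, values are examples
// only:
//
//   machineFunctionInfo:
//     isEntryFunction:   true
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     stackPtrOffsetReg: '$sgpr32'
//     occupancy:         8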

bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  auto [MinNumAGPR, MaxNumAGPR] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                      /*OnlyFirstRequired=*/true);
  return MinNumAGPR != 0u;
}
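
// For illustration, a kernel can opt out of AGPR allocation entirely with
//   attributes #0 = { "amdgpu-agpr-alloc"="0" }
// in which case mayUseAGPRs() returns false; when the attribute is absent the
// {~0u, ~0u} default keeps the conservative answer of true.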