SIMachineFunctionInfo.cpp
//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

enum { MAX_LANES = 64 };

using namespace llvm;

// TODO -- delete this flag once we have more robust mechanisms to allocate the
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
static cl::opt<bool> MFMAVGPRForm(
    "amdgpu-mfma-vgpr-form", cl::Hidden,
    cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
             "unspecified, default to compiler heuristics"),
    cl::init(false));
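// Illustrative use only (not part of this file): as a hidden codegen option,
// the flag would typically be passed to llc, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form foo.ll
// The triple and CPU above are just example values.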

static const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
      IsWholeWaveFunction(F.getCallingConv() ==
                          CallingConv::AMDGPU_Gfx_WholeWave) {
  const GCNSubtarget &ST = *STI;
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);
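  // Illustrative only (attribute values are hypothetical): these limits come
  // from IR function attributes such as
  //   attributes #0 = { "amdgpu-flat-work-group-size"="1,256"
  //                     "amdgpu-waves-per-eu"="2,4"
  //                     "amdgpu-max-num-workgroups"="16,1,1" }
  // falling back to subtarget defaults when an attribute is absent.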

  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is completely removed.
  DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  MayNeedAGPRs = ST.hasMAIInsts();
  if (ST.hasGFX90AInsts()) {
    // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
    // should be separated from availability of AGPRs
    if (MFMAVGPRForm ||
        (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
         !mayUseAGPRs(F)))
      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
  }

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx &&
        CC != CallingConv::AMDGPU_Gfx_WholeWave)
      ArgInfo = FixedABIFunctionInfo;

    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now, other registers
      // required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign =
        std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
  }

  if (!AMDGPU::isGraphics(CC) ||
      (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x") ||
        !F.hasFnAttribute("amdgpu-no-cluster-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y") ||
        !F.hasFnAttribute("amdgpu-no-cluster-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z") ||
        !F.hasFnAttribute("amdgpu-no-cluster-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);
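  // Illustrative IR only (attribute values are hypothetical): both attributes
  // carry string-encoded integers, e.g.
  //   attributes #0 = { "amdgpu-git-ptr-high"="16"
  //                     "amdgpu-32bit-address-high-bits"="0xffff8000" }
  // consumeInteger with radix 0 auto-detects the base, so decimal and
  // 0x-prefixed values both parse.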

  MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
      "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve highest available VGPR. After
  // RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }

  ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
}

MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}

void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
}

Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register SIMachineFunctionInfo::addKernargSegmentPtr(
    const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr
    = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register SIMachineFunctionInfo::addPrivateSegmentSize(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.PrivateSegmentSize.getRegister();
}

Register SIMachineFunctionInfo::addImplicitBufferPtr(
    const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}
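
// Illustrative only (assumes a kernel that requests all of these inputs, in
// the usual HSA allocation order): the helpers above then yield a user-SGPR
// layout along the lines of
//   s[0:3]   private segment buffer
//   s[4:5]   dispatch ptr
//   s[6:7]   queue ptr
//   s[8:9]   kernarg segment ptr
//   s[10:11] dispatch id
//   s[12:13] flat scratch init
//   s14      private segment size
// The exact set and order depend on which inputs the kernel actually enables.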

SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
  assert(Inserted && "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded use that register, otherwise we need to use a set of SGPRs and
  // merge them.
  if (!ArgInfo.FirstKernArgPreloadReg)
    ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  auto &Regs = It->second.Regs;
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    Regs.reserve(AllocSizeDWord);
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &Regs;
}

void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers). We only need to do this if we have calls to
  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
  // chain functions do not return) and the function did not contain a call to
  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
  // when entering the function).
  if (isChainFunction() &&
      (!MF.getFrameInfo().hasTailCall() || hasInitWholeWave() ||
       isChainScratchRegister(VGPR)))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
    MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
    BitVector &SavedVGPRs) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
    Register Reg = WWMVGPRs[I];
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    WWMVGPRs[I] = NewReg;
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    MRI.reserveReg(NewReg, TRI);

    // Replace the register in SpillPhysVGPRs. This is needed to look for free
    // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
    auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
    if (RegItr != SpillPhysVGPRs.end()) {
      unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
      SpillPhysVGPRs[Idx] = NewReg;
    }

    // The generic `determineCalleeSaves` might have set the old register if it
    // is in the CSR range.
    SavedVGPRs.reset(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    Reg = NewReg;
  }
}

bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    if (IsPrologEpilog)
      allocateWWMSpill(MF, LaneVGPR);

    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}
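
// Worked example (illustrative): a 16-byte SGPR_128 spill object gives
// NumLanes = 16 / 4 = 4, so it occupies four lanes of a spill VGPR; with a
// 64-wide wave a single VGPR can hold up to 64 spilled SGPR dwords before the
// lane index wraps and a new VGPR is taken.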

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}

bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from function frame, however keep FP & BP since
  // spills for them haven't been inserted yet. And also make sure to remove the
  // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
  // otherwise, it could result in an unexpected side effect and bug, in case of
  // any re-mapping of freed frame indices by later pass(es) like "stack slot
  // coloring".
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
      IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
      NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
      IsWholeWaveFunction(MFI.isWholeWaveFunction()),
      DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
  for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
    SpillPhysVGPRS.push_back(regToString(Reg, TRI));

  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
  NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;
  IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", {}, {});
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}
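
// Illustrative only: a hand-written sketch of how these fields typically
// appear in serialized MIR (register names and values below are made up):
//
//   machineFunctionInfo:
//     isEntryFunction: true
//     scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:  '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'
//     occupancy:       8
//
// initializeBaseYamlFields() above reads such values back when parsing .mir
// input.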

bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  auto [MinNumAGPR, MaxNumAGPR] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                      /*OnlyFirstRequired=*/true);
  return MinNumAGPR != 0u;
}
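
// Illustrative IR only (hypothetical values): a function carrying
//   attributes #0 = { "amdgpu-agpr-alloc"="0" }
// declares that it needs no AGPRs, so mayUseAGPRs() returns false and the
// constructor above can select the VGPR form of MFMA on gfx90a and newer.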