SIRegisterInfo.cpp (LLVM 22.0.0git)
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
32static cl::opt<bool> EnableSpillSGPRToVGPR(
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted by 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
47
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
50 Fn.getContext().diagnose(
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
78struct SGPRSpillBuilder {
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
86 Register SuperReg;
87 MachineBasicBlock::iterator MI;
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
107 RegScavenger *RS;
108 MachineBasicBlock *MBB;
109 MachineFunction &MF;
110 SIMachineFunctionInfo &MFI;
111 const SIInstrInfo &TII;
112 const SIRegisterInfo &TRI;
113 bool IsWave32;
114 Register ExecReg;
115 unsigned MovOpc;
116 unsigned NotOpc;
117
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
152 Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
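// Worked example (editorial illustration, not part of the original source;
// SB is a hypothetical SGPRSpillBuilder instance): for a wave64 spill of an
// SGPR quad (NumSubRegs == 4):
//   PerVGPRData Data = SB.getPerVGPRData();
//   // Data.PerVGPR   == 64
//   // Data.NumVGPRs  == (4 + 63) / 64 == 1
//   // Data.VGPRLanes == (1LL << 4) - 1 == 0xf
// i.e. only lanes 0-3 of the temporary VGPR carry spilled SGPR values and
// need to be covered by the exec mask while storing to scratch.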
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register is actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
215 // Set exec to needed lanes
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // Modifying and restoring exec clobbers SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Reload scavenged VGPR from emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Restore exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
262 // Add an implicit use of the load so it is not dead.
263 // FIXME This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // Modifying and restoring exec clobbers SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0, ST.getHwMode()),
332 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
333
334 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
335 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
336 (getSubRegIndexLaneMask(AMDGPU::lo16) |
337 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
338 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
339 "getNumCoveredRegs() will not work with generated subreg masks!");
340
341 RegPressureIgnoredUnits.resize(getNumRegUnits());
342 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
343 for (auto Reg : AMDGPU::VGPR_16RegClass) {
344 if (AMDGPU::isHi16Reg(Reg, *this))
345 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
346 }
347
348 // HACK: Until this is fully tablegen'd.
349 static llvm::once_flag InitializeRegSplitPartsFlag;
350
351 static auto InitializeRegSplitPartsOnce = [this]() {
352 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
353 unsigned Size = getSubRegIdxSize(Idx);
354 if (Size & 15)
355 continue;
356 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
357 unsigned Pos = getSubRegIdxOffset(Idx);
358 if (Pos % Size)
359 continue;
360 Pos /= Size;
361 if (Vec.empty()) {
362 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
363 Vec.resize(MaxNumParts);
364 }
365 Vec[Pos] = Idx;
366 }
367 };
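// Worked example (editorial illustration, assuming the usual AMDGPU
// sub-register indices): the 64-bit index sub2_sub3 has Size == 64 and
// offset 64, so it is stored at RegSplitParts[64 / 16 - 1][64 / 64], i.e.
// RegSplitParts[3][1]. Walking that row therefore yields the aligned 64-bit
// pieces of a wide register in order: sub0_sub1, sub2_sub3, sub4_sub5, ...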
368
369 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
370
371 static auto InitializeSubRegFromChannelTableOnce = [this]() {
372 for (auto &Row : SubRegFromChannelTable)
373 Row.fill(AMDGPU::NoSubRegister);
374 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
375 unsigned Width = getSubRegIdxSize(Idx) / 32;
376 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
378 Width = SubRegFromChannelTableWidthMap[Width];
379 if (Width == 0)
380 continue;
381 unsigned TableIdx = Width - 1;
382 assert(TableIdx < SubRegFromChannelTable.size());
383 assert(Offset < SubRegFromChannelTable[TableIdx].size());
384 SubRegFromChannelTable[TableIdx][Offset] = Idx;
385 }
386 };
387
388 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
389 llvm::call_once(InitializeSubRegFromChannelTableFlag,
390 InitializeSubRegFromChannelTableOnce);
391}
392
393void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
394 MCRegister Reg) const {
395 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
396 Reserved.set(*R);
397}
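// Editorial note (not part of the original source): MCRegAliasIterator with
// IncludeSelf == true visits the register itself plus every overlapping
// sub- and super-register, so e.g.
//   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
// marks TTMP0, TTMP1, TTMP0_TTMP1 and every wider TTMP tuple that contains
// them as reserved.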
398
399// Forced to be here by one .inc
400const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
401 const MachineFunction *MF) const {
402 CallingConv::ID CC = MF->getFunction().getCallingConv();
403 switch (CC) {
404 case CallingConv::C:
407 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
408 : CSR_AMDGPU_SaveList;
411 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
412 : CSR_AMDGPU_SI_Gfx_SaveList;
414 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
415 default: {
416 // Dummy to not crash RegisterClassInfo.
417 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
418 return &NoCalleeSavedReg;
419 }
420 }
421}
422
423const MCPhysReg *
424SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
425 return nullptr;
426}
427
428const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
429 CallingConv::ID CC) const {
430 switch (CC) {
431 case CallingConv::C:
434 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
435 : CSR_AMDGPU_RegMask;
438 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
439 : CSR_AMDGPU_SI_Gfx_RegMask;
442 // Calls to these functions never return, so we can pretend everything is
443 // preserved.
444 return AMDGPU_AllVGPRs_RegMask;
445 default:
446 return nullptr;
447 }
448}
449
450const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
451 return CSR_AMDGPU_NoRegs_RegMask;
452}
453
455 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
456}
457
460 const MachineFunction &MF) const {
461 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
462 // equivalent AV class. If one were used here, the verifier would crash after
463 // RegBankSelect in the GISel flow, because the aligned regclasses are not
464 // fully assigned until instruction selection.
465 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
466 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
467 return &AMDGPU::AV_32RegClass;
468 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
469 return &AMDGPU::AV_64RegClass;
470 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
471 RC == &AMDGPU::AReg_64_Align2RegClass)
472 return &AMDGPU::AV_64_Align2RegClass;
473 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
474 return &AMDGPU::AV_96RegClass;
475 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
476 RC == &AMDGPU::AReg_96_Align2RegClass)
477 return &AMDGPU::AV_96_Align2RegClass;
478 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
479 return &AMDGPU::AV_128RegClass;
480 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
481 RC == &AMDGPU::AReg_128_Align2RegClass)
482 return &AMDGPU::AV_128_Align2RegClass;
483 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
484 return &AMDGPU::AV_160RegClass;
485 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
486 RC == &AMDGPU::AReg_160_Align2RegClass)
487 return &AMDGPU::AV_160_Align2RegClass;
488 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
489 return &AMDGPU::AV_192RegClass;
490 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
491 RC == &AMDGPU::AReg_192_Align2RegClass)
492 return &AMDGPU::AV_192_Align2RegClass;
493 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
494 return &AMDGPU::AV_256RegClass;
495 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
496 RC == &AMDGPU::AReg_256_Align2RegClass)
497 return &AMDGPU::AV_256_Align2RegClass;
498 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
499 return &AMDGPU::AV_512RegClass;
500 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
501 RC == &AMDGPU::AReg_512_Align2RegClass)
502 return &AMDGPU::AV_512_Align2RegClass;
503 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
504 return &AMDGPU::AV_1024RegClass;
505 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
506 RC == &AMDGPU::AReg_1024_Align2RegClass)
507 return &AMDGPU::AV_1024_Align2RegClass;
508 }
509
511}
512
514 const SIFrameLowering *TFI = ST.getFrameLowering();
516
517 // During ISel lowering we always reserve the stack pointer in entry and chain
518 // functions, but never actually want to reference it when accessing our own
519 // frame. If we need a frame pointer we use it, but otherwise we can just use
520 // an immediate "0" which we represent by returning NoRegister.
521 if (FuncInfo->isBottomOfStack()) {
522 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
523 }
524 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
525 : FuncInfo->getStackPtrOffsetReg();
526}
527
529 // When we need stack realignment, we can't reference off of the
530 // stack pointer, so we reserve a base pointer.
531 return shouldRealignStack(MF);
532}
533
534Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
535
537 return AMDGPU_AllVGPRs_RegMask;
538}
539
541 return AMDGPU_AllAGPRs_RegMask;
542}
543
545 return AMDGPU_AllVectorRegs_RegMask;
546}
547
549 return AMDGPU_AllAllocatableSRegs_RegMask;
550}
551
552unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
553 unsigned NumRegs) {
554 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
555 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
556 assert(NumRegIndex && "Not implemented");
557 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
558 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
559}
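// Usage sketch (editorial illustration, assuming the usual AMDGPU
// sub-register naming):
//   unsigned Idx = SIRegisterInfo::getSubRegFromChannel(/*Channel=*/2,
//                                                       /*NumRegs=*/2);
//   // SubRegFromChannelTableWidthMap[2] == 2, so this reads
//   // SubRegFromChannelTable[1][2] and returns the 64-bit sub-register
//   // index starting at channel 2, i.e. AMDGPU::sub2_sub3.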
560
563 const unsigned Align,
564 const TargetRegisterClass *RC) const {
565 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
566 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
567 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
568}
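// Worked example (editorial illustration): if ST.getMaxNumSGPRs(MF) were 102
// and Align were 4, BaseIdx would be alignDown(102, 4) - 4 == 96, so for an
// SGPR_128 register class this returns the aligned tuple s[96:99].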
569
571 const MachineFunction &MF) const {
572 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
573}
574
576 BitVector Reserved(getNumRegs());
577 Reserved.set(AMDGPU::MODE);
578
579 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
580
581 // Reserve special purpose registers.
582 //
583 // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
584 // this seems likely to result in bugs, so I'm marking them as reserved.
585 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
586 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
587
588 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
589 reserveRegisterTuples(Reserved, AMDGPU::M0);
590
591 // Reserve src_vccz, src_execz, src_scc.
592 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
593 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
594 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
595
596 // Reserve the memory aperture registers
597 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
598 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
599 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
600 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
601 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
602 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
603
604 // Reserve async counters pseudo registers
605 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
606 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
607
608 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
609 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
610
611 // Reserve xnack_mask registers - support is not implemented in Codegen.
612 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
613
614 // Reserve lds_direct register - support is not implemented in Codegen.
615 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
616
617 // Reserve Trap Handler registers - support is not implemented in Codegen.
618 reserveRegisterTuples(Reserved, AMDGPU::TBA);
619 reserveRegisterTuples(Reserved, AMDGPU::TMA);
620 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
621 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
622 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
623 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
624 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
625 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
626 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
627 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
628
629 // Reserve null register - it shall never be allocated
630 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
631
632 // Reserve SGPRs.
633 //
634 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
635 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
636 for (const TargetRegisterClass *RC : regclasses()) {
637 if (RC->isBaseClass() && isSGPRClass(RC)) {
638 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
639 for (MCPhysReg Reg : *RC) {
640 unsigned Index = getHWRegIndex(Reg);
641 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
642 Reserved.set(Reg);
643 }
644 }
645 }
646
647 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
648 if (ScratchRSrcReg != AMDGPU::NoRegister) {
649 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
650 // need to spill.
651 // TODO: May need to reserve a VGPR if doing LDS spilling.
652 reserveRegisterTuples(Reserved, ScratchRSrcReg);
653 }
654
655 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
656 if (LongBranchReservedReg)
657 reserveRegisterTuples(Reserved, LongBranchReservedReg);
658
659 // We have to assume the SP is needed in case there are calls in the function,
660 // which is detected after the function is lowered. If we aren't really going
661 // to need SP, don't bother reserving it.
662 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
663 if (StackPtrReg) {
664 reserveRegisterTuples(Reserved, StackPtrReg);
665 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
666 }
667
668 MCRegister FrameReg = MFI->getFrameOffsetReg();
669 if (FrameReg) {
670 reserveRegisterTuples(Reserved, FrameReg);
671 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
672 }
673
674 if (hasBasePointer(MF)) {
675 MCRegister BasePtrReg = getBaseRegister();
676 reserveRegisterTuples(Reserved, BasePtrReg);
677 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
678 }
679
680 // FIXME: Use same reserved register introduced in D149775
681 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
682 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
683 if (ExecCopyReg)
684 reserveRegisterTuples(Reserved, ExecCopyReg);
685
686 // Reserve VGPRs/AGPRs.
687 //
688 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
689
690 for (const TargetRegisterClass *RC : regclasses()) {
691 if (RC->isBaseClass() && isVGPRClass(RC)) {
692 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
693 for (MCPhysReg Reg : *RC) {
694 unsigned Index = getHWRegIndex(Reg);
695 if (Index + NumRegs > MaxNumVGPRs)
696 Reserved.set(Reg);
697 }
698 }
699 }
700
701 // Reserve all the AGPRs if there are no instructions to use them.
702 if (!ST.hasMAIInsts())
703 MaxNumAGPRs = 0;
704 for (const TargetRegisterClass *RC : regclasses()) {
705 if (RC->isBaseClass() && isAGPRClass(RC)) {
706 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
707 for (MCPhysReg Reg : *RC) {
708 unsigned Index = getHWRegIndex(Reg);
709 if (Index + NumRegs > MaxNumAGPRs)
710 Reserved.set(Reg);
711 }
712 }
713 }
714
715 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
716 // VGPR available at all times.
717 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
718 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
719 }
720
721 // During wwm-regalloc, reserve the registers for per-lane VGPR allocation. The
722 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
723 // wwm-regalloc and will be empty otherwise.
724 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
725 if (!NonWWMRegMask.empty()) {
726 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
727 RegI < RegE; ++RegI) {
728 if (NonWWMRegMask.test(RegI))
729 reserveRegisterTuples(Reserved, RegI);
730 }
731 }
732
733 for (Register Reg : MFI->getWWMReservedRegs())
734 reserveRegisterTuples(Reserved, Reg);
735
736 // FIXME: Stop using reserved registers for this.
737 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
738 reserveRegisterTuples(Reserved, Reg);
739
740 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
741 reserveRegisterTuples(Reserved, Reg);
742
743 return Reserved;
744}
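// Editorial example (not part of the original source): on a target whose
// 32-bit SGPR class extends to s105, a function limited to 104 SGPRs gets
// s104 and s105 reserved by the SGPR loop above (Index + NumRegs > 104), and
// wider tuples that straddle the limit, such as s[104:105], are reserved as
// well.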
745
747 MCRegister PhysReg) const {
748 return !MF.getRegInfo().isReserved(PhysReg);
749}
750
753 // On entry or in chain functions, the base address is 0, so it can't possibly
754 // need any more alignment.
755
756 // FIXME: Should be able to specify the entry frame alignment per calling
757 // convention instead.
758 if (Info->isBottomOfStack())
759 return false;
760
762}
763
766 if (Info->isEntryFunction()) {
767 const MachineFrameInfo &MFI = Fn.getFrameInfo();
768 return MFI.hasStackObjects() || MFI.hasCalls();
769 }
770
771 // May need scavenger for dealing with callee saved registers.
772 return true;
773}
774
776 const MachineFunction &MF) const {
777 // Do not use frame virtual registers. They used to be used for SGPRs, but
778 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
779 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
780 // spill.
781 return false;
782}
783
785 const MachineFunction &MF) const {
786 const MachineFrameInfo &MFI = MF.getFrameInfo();
787 return MFI.hasStackObjects();
788}
789
791 const MachineFunction &) const {
792 // There are no special dedicated stack or frame pointers.
793 return true;
794}
795
798
799 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
800 AMDGPU::OpName::offset);
801 return MI->getOperand(OffIdx).getImm();
802}
803
805 int Idx) const {
806 switch (MI->getOpcode()) {
807 case AMDGPU::V_ADD_U32_e32:
808 case AMDGPU::V_ADD_U32_e64:
809 case AMDGPU::V_ADD_CO_U32_e32: {
810 int OtherIdx = Idx == 1 ? 2 : 1;
811 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
812 return OtherOp.isImm() ? OtherOp.getImm() : 0;
813 }
814 case AMDGPU::V_ADD_CO_U32_e64: {
815 int OtherIdx = Idx == 2 ? 3 : 2;
816 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
817 return OtherOp.isImm() ? OtherOp.getImm() : 0;
818 }
819 default:
820 break;
821 }
822
824 return 0;
825
826 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
827 AMDGPU::OpName::vaddr) ||
828 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
829 AMDGPU::OpName::saddr))) &&
830 "Should never see frame index on non-address operand");
831
833}
834
836 const MachineInstr &MI) {
837 assert(MI.getDesc().isAdd());
838 const MachineOperand &Src0 = MI.getOperand(1);
839 const MachineOperand &Src1 = MI.getOperand(2);
840
841 if (Src0.isFI()) {
842 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
843 Src1.getReg()));
844 }
845
846 if (Src1.isFI()) {
847 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
848 Src0.getReg()));
849 }
850
851 return false;
852}
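// Editorial note (not part of the original source): this predicate accepts an
// add whose sources are a frame index plus either an immediate or a VGPR, in
// either operand order; a frame index added to an SGPR (or to another frame
// index) does not qualify.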
853
855 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
856 switch (MI->getOpcode()) {
857 case AMDGPU::V_ADD_U32_e32: {
858 // TODO: We could handle this but it requires work to avoid violating
859 // operand restrictions.
860 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
861 !isFIPlusImmOrVGPR(*this, *MI))
862 return false;
863 [[fallthrough]];
864 }
865 case AMDGPU::V_ADD_U32_e64:
866 // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
867 //
868 // Much of the benefit with the MUBUF handling is we avoid duplicating the
869 // shift of the frame register, which isn't needed with scratch.
870 //
871 // materializeFrameBaseRegister doesn't know the register classes of the
872 // uses, and unconditionally uses an s_add_i32, which will end up using a
873 // copy for the vector uses.
874 return !ST.enableFlatScratch();
875 case AMDGPU::V_ADD_CO_U32_e32:
876 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
877 !isFIPlusImmOrVGPR(*this, *MI))
878 return false;
879 // We can't deal with the case where the carry out has a use (though this
880 // should never happen).
881 return MI->getOperand(3).isDead();
882 case AMDGPU::V_ADD_CO_U32_e64:
883 // TODO: Should we check use_empty instead?
884 return MI->getOperand(1).isDead();
885 default:
886 break;
887 }
888
890 return false;
891
892 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
893
894 const SIInstrInfo *TII = ST.getInstrInfo();
896 return !TII->isLegalMUBUFImmOffset(FullOffset);
897
898 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
900}
901
903 int FrameIdx,
904 int64_t Offset) const {
906 DebugLoc DL; // Defaults to "unknown"
907
908 if (Ins != MBB->end())
909 DL = Ins->getDebugLoc();
910
912 const SIInstrInfo *TII = ST.getInstrInfo();
914 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
915 : AMDGPU::V_MOV_B32_e32;
916
917 Register BaseReg = MRI.createVirtualRegister(
918 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
919 : &AMDGPU::VGPR_32RegClass);
920
921 if (Offset == 0) {
922 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
923 .addFrameIndex(FrameIdx);
924 return BaseReg;
925 }
926
927 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
928
929 Register FIReg = MRI.createVirtualRegister(
930 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
931 : &AMDGPU::VGPR_32RegClass);
932
933 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
934 .addImm(Offset);
935 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
936 .addFrameIndex(FrameIdx);
937
938 if (ST.enableFlatScratch() ) {
939 // FIXME: Make sure scc isn't live in.
940 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
941 .addReg(OffsetReg, RegState::Kill)
942 .addReg(FIReg)
943 .setOperandDead(3); // scc
944 return BaseReg;
945 }
946
947 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
948 .addReg(OffsetReg, RegState::Kill)
949 .addReg(FIReg)
950 .addImm(0); // clamp bit
951
952 return BaseReg;
953}
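// Editorial sketch (not part of the original source): with flat scratch
// enabled and a non-zero Offset, the sequence built above looks roughly like
//   %off:sreg_32_xm0       = S_MOV_B32 <Offset>
//   %fi:sreg_32_xm0        = S_MOV_B32 %stack.<FrameIdx>
//   %base:sreg_32_xexec_hi = S_ADD_I32 killed %off, %fi, implicit-def dead $scc
// while the MUBUF path instead materializes the frame index into a VGPR and
// combines it with the offset via an add-no-carry VALU instruction.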
954
956 int64_t Offset) const {
957 const SIInstrInfo *TII = ST.getInstrInfo();
958
959 switch (MI.getOpcode()) {
960 case AMDGPU::V_ADD_U32_e32:
961 case AMDGPU::V_ADD_CO_U32_e32: {
962 MachineOperand *FIOp = &MI.getOperand(2);
963 MachineOperand *ImmOp = &MI.getOperand(1);
964 if (!FIOp->isFI())
965 std::swap(FIOp, ImmOp);
966
967 if (!ImmOp->isImm()) {
968 assert(Offset == 0);
969 FIOp->ChangeToRegister(BaseReg, false);
970 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
971 return;
972 }
973
974 int64_t TotalOffset = ImmOp->getImm() + Offset;
975 if (TotalOffset == 0) {
976 MI.setDesc(TII->get(AMDGPU::COPY));
977 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
978 MI.removeOperand(I);
979
980 MI.getOperand(1).ChangeToRegister(BaseReg, false);
981 return;
982 }
983
984 ImmOp->setImm(TotalOffset);
985
986 MachineBasicBlock *MBB = MI.getParent();
989
990 // FIXME: materializeFrameBaseRegister does not know the register class of
991 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
992 // a copy so we have a legal operand and hope the register coalescer can
993 // clean it up.
994 if (isSGPRReg(MRI, BaseReg)) {
995 Register BaseRegVGPR =
996 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
997 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
998 .addReg(BaseReg);
999 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1000 } else {
1001 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1002 }
1003 return;
1004 }
1005 case AMDGPU::V_ADD_U32_e64:
1006 case AMDGPU::V_ADD_CO_U32_e64: {
1007 int Src0Idx = MI.getNumExplicitDefs();
1008 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1009 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1010 if (!FIOp->isFI())
1011 std::swap(FIOp, ImmOp);
1012
1013 if (!ImmOp->isImm()) {
1014 FIOp->ChangeToRegister(BaseReg, false);
1015 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1016 return;
1017 }
1018
1019 int64_t TotalOffset = ImmOp->getImm() + Offset;
1020 if (TotalOffset == 0) {
1021 MI.setDesc(TII->get(AMDGPU::COPY));
1022
1023 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1024 MI.removeOperand(I);
1025
1026 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1027 } else {
1028 FIOp->ChangeToRegister(BaseReg, false);
1029 ImmOp->setImm(TotalOffset);
1030 }
1031
1032 return;
1033 }
1034 default:
1035 break;
1036 }
1037
1038 bool IsFlat = TII->isFLATScratch(MI);
1039
1040#ifndef NDEBUG
1041 // FIXME: Is it possible to be storing a frame index to itself?
1042 bool SeenFI = false;
1043 for (const MachineOperand &MO: MI.operands()) {
1044 if (MO.isFI()) {
1045 if (SeenFI)
1046 llvm_unreachable("should not see multiple frame indices");
1047
1048 SeenFI = true;
1049 }
1050 }
1051#endif
1052
1053 MachineOperand *FIOp =
1054 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1055 : AMDGPU::OpName::vaddr);
1056
1057 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1058 int64_t NewOffset = OffsetOp->getImm() + Offset;
1059
1060 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1061 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1062
1063 if (IsFlat) {
1064 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1066 "offset should be legal");
1067 FIOp->ChangeToRegister(BaseReg, false);
1068 OffsetOp->setImm(NewOffset);
1069 return;
1070 }
1071
1072#ifndef NDEBUG
1073 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1074 assert(SOffset->isImm() && SOffset->getImm() == 0);
1075#endif
1076
1077 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1078
1079 FIOp->ChangeToRegister(BaseReg, false);
1080 OffsetOp->setImm(NewOffset);
1081}
1082
1084 Register BaseReg,
1085 int64_t Offset) const {
1086
1087 switch (MI->getOpcode()) {
1088 case AMDGPU::V_ADD_U32_e32:
1089 case AMDGPU::V_ADD_CO_U32_e32:
1090 return true;
1091 case AMDGPU::V_ADD_U32_e64:
1092 case AMDGPU::V_ADD_CO_U32_e64:
1094 default:
1095 break;
1096 }
1097
1099 return false;
1100
1101 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1102
1103 const SIInstrInfo *TII = ST.getInstrInfo();
1105 return TII->isLegalMUBUFImmOffset(NewOffset);
1106
1107 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1109}
1110
1112 const MachineFunction &MF, unsigned Kind) const {
1113 // This is inaccurate. It depends on the instruction and address space. The
1114 // only place where we should hit this is for dealing with frame indexes /
1115 // private accesses, so this is correct in that case.
1116 return &AMDGPU::VGPR_32RegClass;
1117}
1118
1119const TargetRegisterClass *
1121 if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
1122 return getEquivalentVGPRClass(RC);
1123 if (RC == &AMDGPU::SCC_CLASSRegClass)
1124 return getWaveMaskRegClass();
1125
1126 return RC;
1127}
1128
1130 const SIInstrInfo *TII) {
1131
1132 unsigned Op = MI.getOpcode();
1133 switch (Op) {
1134 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1135 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1136 // FIXME: This assumes the mask is statically known and not computed at
1137 // runtime. However, some ABIs may want to compute the mask dynamically and
1138 // this will need to be updated.
1139 return llvm::popcount(
1140 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1141 case AMDGPU::SI_SPILL_S1024_SAVE:
1142 case AMDGPU::SI_SPILL_S1024_RESTORE:
1143 case AMDGPU::SI_SPILL_V1024_SAVE:
1144 case AMDGPU::SI_SPILL_V1024_RESTORE:
1145 case AMDGPU::SI_SPILL_A1024_SAVE:
1146 case AMDGPU::SI_SPILL_A1024_RESTORE:
1147 case AMDGPU::SI_SPILL_AV1024_SAVE:
1148 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1149 return 32;
1150 case AMDGPU::SI_SPILL_S512_SAVE:
1151 case AMDGPU::SI_SPILL_S512_RESTORE:
1152 case AMDGPU::SI_SPILL_V512_SAVE:
1153 case AMDGPU::SI_SPILL_V512_RESTORE:
1154 case AMDGPU::SI_SPILL_A512_SAVE:
1155 case AMDGPU::SI_SPILL_A512_RESTORE:
1156 case AMDGPU::SI_SPILL_AV512_SAVE:
1157 case AMDGPU::SI_SPILL_AV512_RESTORE:
1158 return 16;
1159 case AMDGPU::SI_SPILL_S384_SAVE:
1160 case AMDGPU::SI_SPILL_S384_RESTORE:
1161 case AMDGPU::SI_SPILL_V384_SAVE:
1162 case AMDGPU::SI_SPILL_V384_RESTORE:
1163 case AMDGPU::SI_SPILL_A384_SAVE:
1164 case AMDGPU::SI_SPILL_A384_RESTORE:
1165 case AMDGPU::SI_SPILL_AV384_SAVE:
1166 case AMDGPU::SI_SPILL_AV384_RESTORE:
1167 return 12;
1168 case AMDGPU::SI_SPILL_S352_SAVE:
1169 case AMDGPU::SI_SPILL_S352_RESTORE:
1170 case AMDGPU::SI_SPILL_V352_SAVE:
1171 case AMDGPU::SI_SPILL_V352_RESTORE:
1172 case AMDGPU::SI_SPILL_A352_SAVE:
1173 case AMDGPU::SI_SPILL_A352_RESTORE:
1174 case AMDGPU::SI_SPILL_AV352_SAVE:
1175 case AMDGPU::SI_SPILL_AV352_RESTORE:
1176 return 11;
1177 case AMDGPU::SI_SPILL_S320_SAVE:
1178 case AMDGPU::SI_SPILL_S320_RESTORE:
1179 case AMDGPU::SI_SPILL_V320_SAVE:
1180 case AMDGPU::SI_SPILL_V320_RESTORE:
1181 case AMDGPU::SI_SPILL_A320_SAVE:
1182 case AMDGPU::SI_SPILL_A320_RESTORE:
1183 case AMDGPU::SI_SPILL_AV320_SAVE:
1184 case AMDGPU::SI_SPILL_AV320_RESTORE:
1185 return 10;
1186 case AMDGPU::SI_SPILL_S288_SAVE:
1187 case AMDGPU::SI_SPILL_S288_RESTORE:
1188 case AMDGPU::SI_SPILL_V288_SAVE:
1189 case AMDGPU::SI_SPILL_V288_RESTORE:
1190 case AMDGPU::SI_SPILL_A288_SAVE:
1191 case AMDGPU::SI_SPILL_A288_RESTORE:
1192 case AMDGPU::SI_SPILL_AV288_SAVE:
1193 case AMDGPU::SI_SPILL_AV288_RESTORE:
1194 return 9;
1195 case AMDGPU::SI_SPILL_S256_SAVE:
1196 case AMDGPU::SI_SPILL_S256_RESTORE:
1197 case AMDGPU::SI_SPILL_V256_SAVE:
1198 case AMDGPU::SI_SPILL_V256_RESTORE:
1199 case AMDGPU::SI_SPILL_A256_SAVE:
1200 case AMDGPU::SI_SPILL_A256_RESTORE:
1201 case AMDGPU::SI_SPILL_AV256_SAVE:
1202 case AMDGPU::SI_SPILL_AV256_RESTORE:
1203 return 8;
1204 case AMDGPU::SI_SPILL_S224_SAVE:
1205 case AMDGPU::SI_SPILL_S224_RESTORE:
1206 case AMDGPU::SI_SPILL_V224_SAVE:
1207 case AMDGPU::SI_SPILL_V224_RESTORE:
1208 case AMDGPU::SI_SPILL_A224_SAVE:
1209 case AMDGPU::SI_SPILL_A224_RESTORE:
1210 case AMDGPU::SI_SPILL_AV224_SAVE:
1211 case AMDGPU::SI_SPILL_AV224_RESTORE:
1212 return 7;
1213 case AMDGPU::SI_SPILL_S192_SAVE:
1214 case AMDGPU::SI_SPILL_S192_RESTORE:
1215 case AMDGPU::SI_SPILL_V192_SAVE:
1216 case AMDGPU::SI_SPILL_V192_RESTORE:
1217 case AMDGPU::SI_SPILL_A192_SAVE:
1218 case AMDGPU::SI_SPILL_A192_RESTORE:
1219 case AMDGPU::SI_SPILL_AV192_SAVE:
1220 case AMDGPU::SI_SPILL_AV192_RESTORE:
1221 return 6;
1222 case AMDGPU::SI_SPILL_S160_SAVE:
1223 case AMDGPU::SI_SPILL_S160_RESTORE:
1224 case AMDGPU::SI_SPILL_V160_SAVE:
1225 case AMDGPU::SI_SPILL_V160_RESTORE:
1226 case AMDGPU::SI_SPILL_A160_SAVE:
1227 case AMDGPU::SI_SPILL_A160_RESTORE:
1228 case AMDGPU::SI_SPILL_AV160_SAVE:
1229 case AMDGPU::SI_SPILL_AV160_RESTORE:
1230 return 5;
1231 case AMDGPU::SI_SPILL_S128_SAVE:
1232 case AMDGPU::SI_SPILL_S128_RESTORE:
1233 case AMDGPU::SI_SPILL_V128_SAVE:
1234 case AMDGPU::SI_SPILL_V128_RESTORE:
1235 case AMDGPU::SI_SPILL_A128_SAVE:
1236 case AMDGPU::SI_SPILL_A128_RESTORE:
1237 case AMDGPU::SI_SPILL_AV128_SAVE:
1238 case AMDGPU::SI_SPILL_AV128_RESTORE:
1239 return 4;
1240 case AMDGPU::SI_SPILL_S96_SAVE:
1241 case AMDGPU::SI_SPILL_S96_RESTORE:
1242 case AMDGPU::SI_SPILL_V96_SAVE:
1243 case AMDGPU::SI_SPILL_V96_RESTORE:
1244 case AMDGPU::SI_SPILL_A96_SAVE:
1245 case AMDGPU::SI_SPILL_A96_RESTORE:
1246 case AMDGPU::SI_SPILL_AV96_SAVE:
1247 case AMDGPU::SI_SPILL_AV96_RESTORE:
1248 return 3;
1249 case AMDGPU::SI_SPILL_S64_SAVE:
1250 case AMDGPU::SI_SPILL_S64_RESTORE:
1251 case AMDGPU::SI_SPILL_V64_SAVE:
1252 case AMDGPU::SI_SPILL_V64_RESTORE:
1253 case AMDGPU::SI_SPILL_A64_SAVE:
1254 case AMDGPU::SI_SPILL_A64_RESTORE:
1255 case AMDGPU::SI_SPILL_AV64_SAVE:
1256 case AMDGPU::SI_SPILL_AV64_RESTORE:
1257 return 2;
1258 case AMDGPU::SI_SPILL_S32_SAVE:
1259 case AMDGPU::SI_SPILL_S32_RESTORE:
1260 case AMDGPU::SI_SPILL_V32_SAVE:
1261 case AMDGPU::SI_SPILL_V32_RESTORE:
1262 case AMDGPU::SI_SPILL_A32_SAVE:
1263 case AMDGPU::SI_SPILL_A32_RESTORE:
1264 case AMDGPU::SI_SPILL_AV32_SAVE:
1265 case AMDGPU::SI_SPILL_AV32_RESTORE:
1266 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1267 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1268 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1269 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1270 case AMDGPU::SI_SPILL_V16_SAVE:
1271 case AMDGPU::SI_SPILL_V16_RESTORE:
1272 return 1;
1273 default: llvm_unreachable("Invalid spill opcode");
1274 }
1275}
1276
1277static int getOffsetMUBUFStore(unsigned Opc) {
1278 switch (Opc) {
1279 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1280 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1281 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1282 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1283 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1284 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1285 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1286 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1287 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1288 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1289 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1290 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1291 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1292 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1293 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1294 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1295 default:
1296 return -1;
1297 }
1298}
1299
1300static int getOffsetMUBUFLoad(unsigned Opc) {
1301 switch (Opc) {
1302 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1303 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1304 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1305 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1306 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1307 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1308 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1309 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1310 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1311 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1312 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1313 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1314 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1315 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1316 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1317 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1318 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1319 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1320 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1321 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1322 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1323 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1324 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1325 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1326 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1327 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1328 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1329 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1330 default:
1331 return -1;
1332 }
1333}
1334
1335static int getOffenMUBUFStore(unsigned Opc) {
1336 switch (Opc) {
1337 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1338 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1339 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1340 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1341 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1342 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1343 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1344 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1345 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1346 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1347 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1348 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1349 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1350 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1351 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1352 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1353 default:
1354 return -1;
1355 }
1356}
1357
1358static int getOffenMUBUFLoad(unsigned Opc) {
1359 switch (Opc) {
1360 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1361 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1362 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1363 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1364 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1365 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1366 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1367 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1368 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1369 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1370 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1371 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1372 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1373 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1374 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1375 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1376 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1377 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1378 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1379 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1380 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1381 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1382 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1383 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1384 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1385 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1386 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1387 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1388 default:
1389 return -1;
1390 }
1391}
1392
1396 int Index, unsigned Lane,
1397 unsigned ValueReg, bool IsKill) {
1400 const SIInstrInfo *TII = ST.getInstrInfo();
1401
1402 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1403
1404 if (Reg == AMDGPU::NoRegister)
1405 return MachineInstrBuilder();
1406
1407 bool IsStore = MI->mayStore();
1409 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1410
1411 unsigned Dst = IsStore ? Reg : ValueReg;
1412 unsigned Src = IsStore ? ValueReg : Reg;
1413 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1414 DebugLoc DL = MI->getDebugLoc();
1415 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1416 // The spiller during regalloc may restore a spilled register to its superclass.
1417 // This can result in AGPR spills restored to VGPRs or the other way around,
1418 // leaving the src and dst with identical regclasses at this point. A plain
1419 // copy suffices in such cases.
1420 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1421 .addReg(Src, getKillRegState(IsKill));
1423 return CopyMIB;
1424 }
1425 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1426 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1427
1428 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1429 .addReg(Src, getKillRegState(IsKill));
1431 return MIB;
1432}
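// Editorial note (not part of the original source): for a spill slot that was
// remapped to an AGPR, a store of a VGPR lane becomes roughly
//   $agprN = V_ACCVGPR_WRITE_B32_e64 $vgprM, implicit $exec
// and the reload uses V_ACCVGPR_READ_B32_e64; when source and destination end
// up in the same register class, a plain COPY is emitted instead.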
1433
1434// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1435// need to handle the case where an SGPR may need to be spilled while spilling.
1437 MachineFrameInfo &MFI,
1439 int Index,
1440 int64_t Offset) {
1441 const SIInstrInfo *TII = ST.getInstrInfo();
1442 MachineBasicBlock *MBB = MI->getParent();
1443 const DebugLoc &DL = MI->getDebugLoc();
1444 bool IsStore = MI->mayStore();
1445
1446 unsigned Opc = MI->getOpcode();
1447 int LoadStoreOp = IsStore ?
1449 if (LoadStoreOp == -1)
1450 return false;
1451
1452 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1453 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1454 return true;
1455
1456 MachineInstrBuilder NewMI =
1457 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1458 .add(*Reg)
1459 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1460 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1461 .addImm(Offset)
1462 .addImm(0) // cpol
1463 .addImm(0) // swz
1464 .cloneMemRefs(*MI);
1465
1466 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1467 AMDGPU::OpName::vdata_in);
1468 if (VDataIn)
1469 NewMI.add(*VDataIn);
1470 return true;
1471}
1472
1474 unsigned LoadStoreOp,
1475 unsigned EltSize) {
1476 bool IsStore = TII->get(LoadStoreOp).mayStore();
1477 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1478 bool UseST =
1479 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1480
1481 // Handle block load/store first.
1482 if (TII->isBlockLoadStore(LoadStoreOp))
1483 return LoadStoreOp;
1484
1485 switch (EltSize) {
1486 case 4:
1487 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1488 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1489 break;
1490 case 8:
1491 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1492 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1493 break;
1494 case 12:
1495 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1496 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1497 break;
1498 case 16:
1499 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1500 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1501 break;
1502 default:
1503 llvm_unreachable("Unexpected spill load/store size!");
1504 }
1505
1506 if (HasVAddr)
1507 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1508 else if (UseST)
1509 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1510
1511 return LoadStoreOp;
1512}
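// Editorial example (not part of the original source): a 16-byte flat-scratch
// spill starts from SCRATCH_STORE_DWORDX4_SADDR / SCRATCH_LOAD_DWORDX4_SADDR;
// if the access carries a VGPR address it is rewritten with
// getFlatScratchInstSVfromSS, and if it has neither a VGPR nor an SGPR
// address it is rewritten to the ST form with getFlatScratchInstSTfromSS.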
1513
1516 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1517 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1518 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1519 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1520
1522 const SIInstrInfo *TII = ST.getInstrInfo();
1523 const MachineFrameInfo &MFI = MF->getFrameInfo();
1524 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1525
1526 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1527 bool IsStore = Desc->mayStore();
1528 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1529 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1530
1531 bool CanClobberSCC = false;
1532 bool Scavenged = false;
1533 MCRegister SOffset = ScratchOffsetReg;
1534
1535 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1536 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1537 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1538 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1539
1540 // Always use 4 byte operations for AGPRs because we need to scavenge
1541 // a temporary VGPR.
1542 // If we're using a block operation, the element should be the whole block.
1543 unsigned EltSize = IsBlock ? RegWidth
1544 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1545 : 4u;
1546 unsigned NumSubRegs = RegWidth / EltSize;
1547 unsigned Size = NumSubRegs * EltSize;
1548 unsigned RemSize = RegWidth - Size;
1549 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1550 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1551 int64_t MaterializedOffset = Offset;
1552
1553 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1554 int64_t ScratchOffsetRegDelta = 0;
1555
1556 if (IsFlat && EltSize > 4) {
1557 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1558 Desc = &TII->get(LoadStoreOp);
1559 }
1560
1561 Align Alignment = MFI.getObjectAlign(Index);
1562 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1563
1564 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1565 "unexpected VGPR spill offset");
1566
1567 // Track a VGPR to use for a constant offset we need to materialize.
1568 Register TmpOffsetVGPR;
1569
1570 // Track a VGPR to use as an intermediate value.
1571 Register TmpIntermediateVGPR;
1572 bool UseVGPROffset = false;
1573
1574 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1575 // combination.
1576 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1577 int64_t VOffset) {
1578 // We are using a VGPR offset
1579 if (IsFlat && SGPRBase) {
1580 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1581 // SGPR, so perform the add as vector.
1582 // We don't need a base SGPR in the kernel.
1583
1584 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1585 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1586 .addReg(SGPRBase)
1587 .addImm(VOffset)
1588 .addImm(0); // clamp
1589 } else {
1590 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1591 .addReg(SGPRBase);
1592 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1593 .addImm(VOffset)
1594 .addReg(TmpOffsetVGPR);
1595 }
1596 } else {
1597 assert(TmpOffsetVGPR);
1598 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1599 .addImm(VOffset);
1600 }
1601 };
1602
1603 bool IsOffsetLegal =
1604 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1606 : TII->isLegalMUBUFImmOffset(MaxOffset);
1607 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1608 SOffset = MCRegister();
1609
1610 // We don't have access to the register scavenger if this function is called
1611 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1612 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1613 // entry.
1614 if (RS) {
1615 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1616
1617 // Piggy back on the liveness scan we just did to see if SCC is dead.
1618 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1619 } else if (LiveUnits) {
1620 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1621 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1622 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1623 SOffset = Reg;
1624 break;
1625 }
1626 }
1627 }
1628
1629 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1630 SOffset = Register();
1631
1632 if (!SOffset) {
1633 UseVGPROffset = true;
1634
1635 if (RS) {
1636 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1637 } else {
1638 assert(LiveUnits);
1639 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1640 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1641 TmpOffsetVGPR = Reg;
1642 break;
1643 }
1644 }
1645 }
1646
1647 assert(TmpOffsetVGPR);
1648 } else if (!SOffset && CanClobberSCC) {
1649 // There are no free SGPRs, and we are in the process of spilling
1650 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1651 // on SI/CI, and on VI it is true until we implement spilling using scalar
1652 // stores), we have no way to free up an SGPR. Our solution here is to
1653 // add the offset directly to the ScratchOffset or StackPtrOffset
1654 // register, and then subtract the offset after the spill to return the
1655 // register to its original value.
1656
1657 // TODO: If we don't have to do an emergency stack slot spill, converting
1658 // to use the VGPR offset takes fewer instructions.
1659 if (!ScratchOffsetReg)
1660 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1661 SOffset = ScratchOffsetReg;
1662 ScratchOffsetRegDelta = Offset;
1663 } else {
1664 Scavenged = true;
1665 }
1666
1667 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1668 // we can simplify the adjustment of Offset here to just scale with
1669 // WavefrontSize.
1670 if (!IsFlat && !UseVGPROffset)
1671 Offset *= ST.getWavefrontSize();
1672
1673 if (!UseVGPROffset && !SOffset)
1674 report_fatal_error("could not scavenge SGPR to spill in entry function");
1675
1676 if (UseVGPROffset) {
1677 // We are using a VGPR offset
1678 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1679 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1680 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1681 } else {
1682 assert(Offset != 0);
1683 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1684 .addReg(ScratchOffsetReg)
1685 .addImm(Offset);
1686 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1687 }
1688
1689 Offset = 0;
1690 }
1691
1692 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1693 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1694 && "Unexpected vaddr for flat scratch with a FI operand");
1695
1696 if (UseVGPROffset) {
1697 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1698 } else {
1700 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1701 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1702 }
1703
1704 Desc = &TII->get(LoadStoreOp);
1705 }
1706
1707 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1708 ++i, RegOffset += EltSize) {
1709 if (i == NumSubRegs) {
1710 EltSize = RemSize;
1711 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1712 }
1713 Desc = &TII->get(LoadStoreOp);
1714
1715 if (!IsFlat && UseVGPROffset) {
1716 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1717 : getOffenMUBUFLoad(LoadStoreOp);
1718 Desc = &TII->get(NewLoadStoreOp);
1719 }
1720
1721 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1722 // If we are spilling an AGPR beyond the range of the memory instruction
1723 // offset and need to use a VGPR offset, we ideally have at least 2
1724 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1725 // recycle the VGPR used for the offset, which then must be reset after
1726 // each subregister.
1727
1728 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1729 }
1730
1731 unsigned NumRegs = EltSize / 4;
1732 Register SubReg = e == 1
1733 ? ValueReg
1734 : Register(getSubReg(ValueReg,
1735 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1736
1737 unsigned SOffsetRegState = 0;
1738 unsigned SrcDstRegState = getDefRegState(!IsStore);
1739 const bool IsLastSubReg = i + 1 == e;
1740 const bool IsFirstSubReg = i == 0;
1741 if (IsLastSubReg) {
1742 SOffsetRegState |= getKillRegState(Scavenged);
1743 // The last implicit use carries the "Kill" flag.
1744 SrcDstRegState |= getKillRegState(IsKill);
1745 }
1746
1747 // Make sure the whole register is defined if there are undef components by
1748 // adding an implicit def of the super-reg on the first instruction.
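// e.g. for a 64-bit value split into two dwords, the first store would look
// roughly like:
//   BUFFER_STORE_DWORD $vgpr0, ..., implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1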
1749 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1750 bool NeedSuperRegImpOperand = e > 1;
1751
1752 // Remaining element size to spill into memory after some parts of it
1753 // have been spilled into either AGPRs or VGPRs.
1754 unsigned RemEltSize = EltSize;
1755
1756 // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse order,
1757 // starting from the last lane. If a register cannot be completely spilled
1758 // into another register, this ensures its alignment does not change. For
1759 // targets with a VGPR alignment requirement this is important when flat
1760 // scratch is used, as we might otherwise get a scratch_load or
1761 // scratch_store of an unaligned register.
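// For example (illustrative), a 64-bit VGPR pair moved into AGPRs is
// transferred starting from the high lane:
//   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1
//   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0
// so a partial transfer leaves an aligned remainder for the memory access.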
1762 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1763 LaneE = RegOffset / 4;
1764 Lane >= LaneE; --Lane) {
1765 bool IsSubReg = e > 1 || EltSize > 4;
1766 Register Sub = IsSubReg
1767 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1768 : ValueReg;
1769 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1770 if (!MIB.getInstr())
1771 break;
1772 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1773 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1774 NeedSuperRegDef = false;
1775 }
1776 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1777 NeedSuperRegImpOperand = true;
1778 unsigned State = SrcDstRegState;
1779 if (!IsLastSubReg || (Lane != LaneE))
1780 State &= ~RegState::Kill;
1781 if (!IsFirstSubReg || (Lane != LaneS))
1782 State &= ~RegState::Define;
1783 MIB.addReg(ValueReg, RegState::Implicit | State);
1784 }
1785 RemEltSize -= 4;
1786 }
1787
1788 if (!RemEltSize) // Fully spilled into AGPRs.
1789 continue;
1790
1791 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1792 assert(IsFlat && EltSize > 4);
1793
1794 unsigned NumRegs = RemEltSize / 4;
1795 SubReg = Register(getSubReg(ValueReg,
1796 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1797 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1798 Desc = &TII->get(Opc);
1799 }
1800
1801 unsigned FinalReg = SubReg;
1802
1803 if (IsAGPR) {
1804 assert(EltSize == 4);
1805
1806 if (!TmpIntermediateVGPR) {
1807 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1808 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1809 }
1810 if (IsStore) {
1811 auto AccRead = BuildMI(MBB, MI, DL,
1812 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1813 TmpIntermediateVGPR)
1814 .addReg(SubReg, getKillRegState(IsKill));
1815 if (NeedSuperRegDef)
1816 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1817 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1818 AccRead.addReg(ValueReg, RegState::Implicit);
1819 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1820 }
1821 SubReg = TmpIntermediateVGPR;
1822 } else if (UseVGPROffset) {
1823 if (!TmpOffsetVGPR) {
1824 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1825 MI, false, 0);
1826 RS->setRegUsed(TmpOffsetVGPR);
1827 }
1828 }
1829
1830 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1831 MachineMemOperand *NewMMO =
1832 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1833 commonAlignment(Alignment, RegOffset));
1834
1835 auto MIB =
1836 BuildMI(MBB, MI, DL, *Desc)
1837 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1838
1839 if (UseVGPROffset) {
1840 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1841 // intermediate accvgpr_write.
1842 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1843 }
1844
1845 if (!IsFlat)
1846 MIB.addReg(FuncInfo->getScratchRSrcReg());
1847
1848 if (SOffset == AMDGPU::NoRegister) {
1849 if (!IsFlat) {
1850 if (UseVGPROffset && ScratchOffsetReg) {
1851 MIB.addReg(ScratchOffsetReg);
1852 } else {
1853 assert(FuncInfo->isBottomOfStack());
1854 MIB.addImm(0);
1855 }
1856 }
1857 } else {
1858 MIB.addReg(SOffset, SOffsetRegState);
1859 }
1860
1861 MIB.addImm(Offset + RegOffset);
1862
1863 bool LastUse = MMO->getFlags() & MOLastUse;
1864 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1865
1866 if (!IsFlat)
1867 MIB.addImm(0); // swz
1868 MIB.addMemOperand(NewMMO);
1869
1870 if (!IsAGPR && NeedSuperRegDef)
1871 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1872
1873 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1874 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1875 FinalReg)
1876 .addReg(TmpIntermediateVGPR, RegState::Kill);
1877 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1878 }
1879
1880 bool IsSrcDstDef = SrcDstRegState & RegState::Define;
1881 if (NeedSuperRegImpOperand &&
1882 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef)))
1883 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1884
1885 // The epilog restore of a wwm-scratch register can cause an undesired
1886 // optimization during the machine-cp run after PrologEpilogInserter if the
1887 // same register was assigned for return value ABI lowering with a COPY
1888 // instruction. As shown below, with the epilog reload the earlier COPY
1889 // appears to be dead to machine-cp.
1890 // ...
1891 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1892 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1893 // ...
1894 // Epilog block:
1895 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1896 // ...
1897 // WWM spill restore to preserve the inactive lanes of v0.
1898 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1899 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1900 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1901 // ...
1902 // SI_RETURN implicit $vgpr0
1903 // ...
1904 // To fix it, mark the same reg as a tied op for such restore instructions
1905 // so that it marks a usage for the preceding COPY.
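// With the tied implicit use, the reload becomes (illustrative):
//   $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0,
//       implicit $vgpr0(tied-def 0)
// which keeps the preceding COPY into $vgpr0 alive.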
1906 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1907 MI->readsRegister(SubReg, this)) {
1908 MIB.addReg(SubReg, RegState::Implicit);
1909 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1910 }
1911
1912 // If we're building a block load, we should add artificial uses for the
1913 // CSR VGPRs that are *not* being transferred. This is because liveness
1914 // analysis is not aware of the mask, so we need to somehow inform it that
1915 // those registers are not available before the load and they should not be
1916 // scavenged.
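// For example (illustrative), with a transfer mask of 0b...0101 only v0 and
// v2 of the block are written by the load; the remaining callee-saved VGPRs
// of the block are kept live via implicit uses:
//   $vgpr0_..._vgpr31 = SCRATCH_LOAD_BLOCK_SADDR ..., implicit $m0,
//       implicit $vgpr1, implicit $vgpr3, ...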
1917 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1918 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1919 }
1920
1921 if (ScratchOffsetRegDelta != 0) {
1922 // Subtract the offset we added to the ScratchOffset register.
1923 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1924 .addReg(SOffset)
1925 .addImm(-ScratchOffsetRegDelta);
1926 }
1927}
1928
1929 void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
1930 Register BlockReg) const {
1931 const MachineFunction *MF = MIB->getParent()->getParent();
1932 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1933 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
1934 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
1935 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
1936 if (!(Mask & (1 << RegOffset)) &&
1937 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
1938 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
1939}
1940
1941 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1942 int Offset, bool IsLoad,
1943 bool IsKill) const {
1944 // Load/store VGPR
1945 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1946 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1947
1948 Register FrameReg =
1949 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1950 ? getBaseRegister()
1951 : getFrameRegister(SB.MF);
1952
1953 Align Alignment = FrameInfo.getObjectAlign(Index);
1954 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1955 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1956 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1957 SB.EltSize, Alignment);
1958
1959 if (IsLoad) {
1960 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1961 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1962 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1963 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1964 } else {
1965 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1966 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1967 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1968 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1969 // This only ever adds one VGPR spill
1970 SB.MFI.addToSpilledVGPRs(1);
1971 }
1972}
1973
1974 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1975 RegScavenger *RS, SlotIndexes *Indexes,
1976 LiveIntervals *LIS, bool OnlyToVGPR,
1977 bool SpillToPhysVGPRLane) const {
1978 assert(!MI->getOperand(0).isUndef() &&
1979 "undef spill should have been deleted earlier");
1980
1981 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1982
1983 ArrayRef<SpilledReg> VGPRSpills =
1984 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1985 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1986 bool SpillToVGPR = !VGPRSpills.empty();
1987 if (OnlyToVGPR && !SpillToVGPR)
1988 return false;
1989
1990 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1991 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1992
1993 if (SpillToVGPR) {
1994
1995 // Since the stack slot coloring pass tries to optimize SGPR spills, VGPR
1996 // lanes (mapped from the spill stack slot) may be shared by SGPR spills of
1997 // different sizes. The number of VGPR lanes allotted equals the size of the
1998 // largest SGPR spilled into them.
1999 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2000 "Num of SGPRs spilled should be less than or equal to num of "
2001 "the VGPR lanes.");
2002
2003 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2004 Register SubReg =
2005 SB.NumSubRegs == 1
2006 ? SB.SuperReg
2007 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2008 SpilledReg Spill = VGPRSpills[i];
2009
2010 bool IsFirstSubreg = i == 0;
2011 bool IsLastSubreg = i == SB.NumSubRegs - 1;
2012 bool UseKill = SB.IsKill && IsLastSubreg;
2013
2014
2015 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2016 // spill to this specific vgpr in the first basic block.
2017 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2018 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2019 .addReg(SubReg, getKillRegState(UseKill))
2020 .addImm(Spill.Lane)
2021 .addReg(Spill.VGPR);
2022 if (Indexes) {
2023 if (IsFirstSubreg)
2024 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2025 else
2026 Indexes->insertMachineInstrInMaps(*MIB);
2027 }
2028
2029 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2030 // We may be spilling a super-register which is only partially defined,
2031 // and need to ensure later spills think the value is defined.
2032 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2033 }
2034
2035 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2036 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2037
2038 // FIXME: Since this spills to another register instead of an actual
2039 // frame index, we should delete the frame index when all references to
2040 // it are fixed.
2041 }
2042 } else {
2043 SB.prepare();
2044
2045 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2046 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2047
2048 // Per VGPR helper data
2049 auto PVD = SB.getPerVGPRData();
2050
2051 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2052 unsigned TmpVGPRFlags = RegState::Undef;
2053
2054 // Write sub registers into the VGPR
2055 for (unsigned i = Offset * PVD.PerVGPR,
2056 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2057 i < e; ++i) {
2058 Register SubReg =
2059 SB.NumSubRegs == 1
2060 ? SB.SuperReg
2061 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2062
2063 MachineInstrBuilder WriteLane =
2064 BuildMI(*SB.MBB, MI, SB.DL,
2065 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2066 .addReg(SubReg, SubKillState)
2067 .addImm(i % PVD.PerVGPR)
2068 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2069 TmpVGPRFlags = 0;
2070
2071 if (Indexes) {
2072 if (i == 0)
2073 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2074 else
2075 Indexes->insertMachineInstrInMaps(*WriteLane);
2076 }
2077
2078 // There could be undef components of a spilled super register.
2079 // TODO: Can we detect this and skip the spill?
2080 if (SB.NumSubRegs > 1) {
2081 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2082 unsigned SuperKillState = 0;
2083 if (i + 1 == SB.NumSubRegs)
2084 SuperKillState |= getKillRegState(SB.IsKill);
2085 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2086 }
2087 }
2088
2089 // Write out VGPR
2090 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2091 }
2092
2093 SB.restore();
2094 }
2095
2096 MI->eraseFromParent();
2097 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2098
2099 if (LIS)
2100 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2101
2102 return true;
2103}
2104
2105 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2106 RegScavenger *RS, SlotIndexes *Indexes,
2107 LiveIntervals *LIS, bool OnlyToVGPR,
2108 bool SpillToPhysVGPRLane) const {
2109 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2110
2111 ArrayRef<SpilledReg> VGPRSpills =
2112 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2113 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2114 bool SpillToVGPR = !VGPRSpills.empty();
2115 if (OnlyToVGPR && !SpillToVGPR)
2116 return false;
2117
2118 if (SpillToVGPR) {
2119 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2120 Register SubReg =
2121 SB.NumSubRegs == 1
2122 ? SB.SuperReg
2123 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2124
2125 SpilledReg Spill = VGPRSpills[i];
2126 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2127 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2128 .addReg(Spill.VGPR)
2129 .addImm(Spill.Lane);
2130 if (SB.NumSubRegs > 1 && i == 0)
2131 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2132 if (Indexes) {
2133 if (i == e - 1)
2134 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2135 else
2136 Indexes->insertMachineInstrInMaps(*MIB);
2137 }
2138 }
2139 } else {
2140 SB.prepare();
2141
2142 // Per VGPR helper data
2143 auto PVD = SB.getPerVGPRData();
2144
2145 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2146 // Load in VGPR data
2147 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2148
2149 // Unpack lanes
2150 for (unsigned i = Offset * PVD.PerVGPR,
2151 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2152 i < e; ++i) {
2153 Register SubReg =
2154 SB.NumSubRegs == 1
2155 ? SB.SuperReg
2156 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2157
2158 bool LastSubReg = (i + 1 == e);
2159 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2160 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2161 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2162 .addImm(i);
2163 if (SB.NumSubRegs > 1 && i == 0)
2164 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2165 if (Indexes) {
2166 if (i == e - 1)
2167 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2168 else
2169 Indexes->insertMachineInstrInMaps(*MIB);
2170 }
2171 }
2172 }
2173
2174 SB.restore();
2175 }
2176
2177 MI->eraseFromParent();
2178
2179 if (LIS)
2180 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2181
2182 return true;
2183}
2184
2185 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2186 MachineBasicBlock &RestoreMBB,
2187 Register SGPR, RegScavenger *RS) const {
2188 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2189 RS);
2190 SB.prepare();
2191 // Generate the spill of SGPR to SB.TmpVGPR.
2192 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2193 auto PVD = SB.getPerVGPRData();
2194 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2195 unsigned TmpVGPRFlags = RegState::Undef;
2196 // Write sub registers into the VGPR
2197 for (unsigned i = Offset * PVD.PerVGPR,
2198 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2199 i < e; ++i) {
2200 Register SubReg =
2201 SB.NumSubRegs == 1
2202 ? SB.SuperReg
2203 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2204
2205 MachineInstrBuilder WriteLane =
2206 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2207 SB.TmpVGPR)
2208 .addReg(SubReg, SubKillState)
2209 .addImm(i % PVD.PerVGPR)
2210 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2211 TmpVGPRFlags = 0;
2212 // There could be undef components of a spilled super register.
2213 // TODO: Can we detect this and skip the spill?
2214 if (SB.NumSubRegs > 1) {
2215 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2216 unsigned SuperKillState = 0;
2217 if (i + 1 == SB.NumSubRegs)
2218 SuperKillState |= getKillRegState(SB.IsKill);
2219 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2220 }
2221 }
2222 // Don't need to write VGPR out.
2223 }
2224
2225 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2226
2227 // Restore clobbered registers in the specified restore block.
2228 MI = RestoreMBB.end();
2229 SB.setMI(&RestoreMBB, MI);
2230 // Generate the restore of SGPR from SB.TmpVGPR.
2231 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2232 // Don't need to load VGPR in.
2233 // Unpack lanes
2234 for (unsigned i = Offset * PVD.PerVGPR,
2235 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2236 i < e; ++i) {
2237 Register SubReg =
2238 SB.NumSubRegs == 1
2239 ? SB.SuperReg
2240 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2241 MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
2242 bool LastSubReg = (i + 1 == e);
2243 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2244 SubReg)
2245 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2246 .addImm(i);
2247 if (SB.NumSubRegs > 1 && i == 0)
2248 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2249 }
2250 }
2251 SB.restore();
2252
2254 return false;
2255}
2256
2257/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2258/// a VGPR and the stack slot can be safely eliminated when all other users are
2259/// handled.
2260 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2261 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2262 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2263 switch (MI->getOpcode()) {
2264 case AMDGPU::SI_SPILL_S1024_SAVE:
2265 case AMDGPU::SI_SPILL_S512_SAVE:
2266 case AMDGPU::SI_SPILL_S384_SAVE:
2267 case AMDGPU::SI_SPILL_S352_SAVE:
2268 case AMDGPU::SI_SPILL_S320_SAVE:
2269 case AMDGPU::SI_SPILL_S288_SAVE:
2270 case AMDGPU::SI_SPILL_S256_SAVE:
2271 case AMDGPU::SI_SPILL_S224_SAVE:
2272 case AMDGPU::SI_SPILL_S192_SAVE:
2273 case AMDGPU::SI_SPILL_S160_SAVE:
2274 case AMDGPU::SI_SPILL_S128_SAVE:
2275 case AMDGPU::SI_SPILL_S96_SAVE:
2276 case AMDGPU::SI_SPILL_S64_SAVE:
2277 case AMDGPU::SI_SPILL_S32_SAVE:
2278 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2279 case AMDGPU::SI_SPILL_S1024_RESTORE:
2280 case AMDGPU::SI_SPILL_S512_RESTORE:
2281 case AMDGPU::SI_SPILL_S384_RESTORE:
2282 case AMDGPU::SI_SPILL_S352_RESTORE:
2283 case AMDGPU::SI_SPILL_S320_RESTORE:
2284 case AMDGPU::SI_SPILL_S288_RESTORE:
2285 case AMDGPU::SI_SPILL_S256_RESTORE:
2286 case AMDGPU::SI_SPILL_S224_RESTORE:
2287 case AMDGPU::SI_SPILL_S192_RESTORE:
2288 case AMDGPU::SI_SPILL_S160_RESTORE:
2289 case AMDGPU::SI_SPILL_S128_RESTORE:
2290 case AMDGPU::SI_SPILL_S96_RESTORE:
2291 case AMDGPU::SI_SPILL_S64_RESTORE:
2292 case AMDGPU::SI_SPILL_S32_RESTORE:
2293 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2294 default:
2295 llvm_unreachable("not an SGPR spill instruction");
2296 }
2297}
2298
2299 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2300 int SPAdj, unsigned FIOperandNum,
2301 RegScavenger *RS) const {
2302 MachineFunction *MF = MI->getParent()->getParent();
2303 MachineBasicBlock *MBB = MI->getParent();
2304 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2305 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2306 const SIInstrInfo *TII = ST.getInstrInfo();
2307 const DebugLoc &DL = MI->getDebugLoc();
2308
2309 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2310
2311 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2312 "unreserved scratch RSRC register");
2313
2314 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2315 int Index = MI->getOperand(FIOperandNum).getIndex();
2316
2317 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2318 ? getBaseRegister()
2319 : getFrameRegister(*MF);
2320
2321 switch (MI->getOpcode()) {
2322 // SGPR register spill
2323 case AMDGPU::SI_SPILL_S1024_SAVE:
2324 case AMDGPU::SI_SPILL_S512_SAVE:
2325 case AMDGPU::SI_SPILL_S384_SAVE:
2326 case AMDGPU::SI_SPILL_S352_SAVE:
2327 case AMDGPU::SI_SPILL_S320_SAVE:
2328 case AMDGPU::SI_SPILL_S288_SAVE:
2329 case AMDGPU::SI_SPILL_S256_SAVE:
2330 case AMDGPU::SI_SPILL_S224_SAVE:
2331 case AMDGPU::SI_SPILL_S192_SAVE:
2332 case AMDGPU::SI_SPILL_S160_SAVE:
2333 case AMDGPU::SI_SPILL_S128_SAVE:
2334 case AMDGPU::SI_SPILL_S96_SAVE:
2335 case AMDGPU::SI_SPILL_S64_SAVE:
2336 case AMDGPU::SI_SPILL_S32_SAVE: {
2337 return spillSGPR(MI, Index, RS);
2338 }
2339
2340 // SGPR register restore
2341 case AMDGPU::SI_SPILL_S1024_RESTORE:
2342 case AMDGPU::SI_SPILL_S512_RESTORE:
2343 case AMDGPU::SI_SPILL_S384_RESTORE:
2344 case AMDGPU::SI_SPILL_S352_RESTORE:
2345 case AMDGPU::SI_SPILL_S320_RESTORE:
2346 case AMDGPU::SI_SPILL_S288_RESTORE:
2347 case AMDGPU::SI_SPILL_S256_RESTORE:
2348 case AMDGPU::SI_SPILL_S224_RESTORE:
2349 case AMDGPU::SI_SPILL_S192_RESTORE:
2350 case AMDGPU::SI_SPILL_S160_RESTORE:
2351 case AMDGPU::SI_SPILL_S128_RESTORE:
2352 case AMDGPU::SI_SPILL_S96_RESTORE:
2353 case AMDGPU::SI_SPILL_S64_RESTORE:
2354 case AMDGPU::SI_SPILL_S32_RESTORE: {
2355 return restoreSGPR(MI, Index, RS);
2356 }
2357
2358 // VGPR register spill
2359 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2360 // Put mask into M0.
2361 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2362 AMDGPU::M0)
2363 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2364 [[fallthrough]];
2365 }
2366 case AMDGPU::SI_SPILL_V1024_SAVE:
2367 case AMDGPU::SI_SPILL_V512_SAVE:
2368 case AMDGPU::SI_SPILL_V384_SAVE:
2369 case AMDGPU::SI_SPILL_V352_SAVE:
2370 case AMDGPU::SI_SPILL_V320_SAVE:
2371 case AMDGPU::SI_SPILL_V288_SAVE:
2372 case AMDGPU::SI_SPILL_V256_SAVE:
2373 case AMDGPU::SI_SPILL_V224_SAVE:
2374 case AMDGPU::SI_SPILL_V192_SAVE:
2375 case AMDGPU::SI_SPILL_V160_SAVE:
2376 case AMDGPU::SI_SPILL_V128_SAVE:
2377 case AMDGPU::SI_SPILL_V96_SAVE:
2378 case AMDGPU::SI_SPILL_V64_SAVE:
2379 case AMDGPU::SI_SPILL_V32_SAVE:
2380 case AMDGPU::SI_SPILL_V16_SAVE:
2381 case AMDGPU::SI_SPILL_A1024_SAVE:
2382 case AMDGPU::SI_SPILL_A512_SAVE:
2383 case AMDGPU::SI_SPILL_A384_SAVE:
2384 case AMDGPU::SI_SPILL_A352_SAVE:
2385 case AMDGPU::SI_SPILL_A320_SAVE:
2386 case AMDGPU::SI_SPILL_A288_SAVE:
2387 case AMDGPU::SI_SPILL_A256_SAVE:
2388 case AMDGPU::SI_SPILL_A224_SAVE:
2389 case AMDGPU::SI_SPILL_A192_SAVE:
2390 case AMDGPU::SI_SPILL_A160_SAVE:
2391 case AMDGPU::SI_SPILL_A128_SAVE:
2392 case AMDGPU::SI_SPILL_A96_SAVE:
2393 case AMDGPU::SI_SPILL_A64_SAVE:
2394 case AMDGPU::SI_SPILL_A32_SAVE:
2395 case AMDGPU::SI_SPILL_AV1024_SAVE:
2396 case AMDGPU::SI_SPILL_AV512_SAVE:
2397 case AMDGPU::SI_SPILL_AV384_SAVE:
2398 case AMDGPU::SI_SPILL_AV352_SAVE:
2399 case AMDGPU::SI_SPILL_AV320_SAVE:
2400 case AMDGPU::SI_SPILL_AV288_SAVE:
2401 case AMDGPU::SI_SPILL_AV256_SAVE:
2402 case AMDGPU::SI_SPILL_AV224_SAVE:
2403 case AMDGPU::SI_SPILL_AV192_SAVE:
2404 case AMDGPU::SI_SPILL_AV160_SAVE:
2405 case AMDGPU::SI_SPILL_AV128_SAVE:
2406 case AMDGPU::SI_SPILL_AV96_SAVE:
2407 case AMDGPU::SI_SPILL_AV64_SAVE:
2408 case AMDGPU::SI_SPILL_AV32_SAVE:
2409 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2410 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2411 const MachineOperand *VData = TII->getNamedOperand(*MI,
2412 AMDGPU::OpName::vdata);
2413 if (VData->isUndef()) {
2414 MI->eraseFromParent();
2415 return true;
2416 }
2417
2418 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2419 MFI->getStackPtrOffsetReg());
2420
2421 unsigned Opc;
2422 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2423 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2424 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2425 } else {
2426 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2427 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2428 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2429 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2430 }
2431
2432 auto *MBB = MI->getParent();
2433 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2434 if (IsWWMRegSpill) {
2435 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2436 RS->isRegUsed(AMDGPU::SCC));
2437 }
2438 buildSpillLoadStore(
2439 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2440 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2441 *MI->memoperands_begin(), RS);
2443 if (IsWWMRegSpill)
2444 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2445
2446 MI->eraseFromParent();
2447 return true;
2448 }
2449 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2450 // Put mask into M0.
2451 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2452 AMDGPU::M0)
2453 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2454 [[fallthrough]];
2455 }
2456 case AMDGPU::SI_SPILL_V16_RESTORE:
2457 case AMDGPU::SI_SPILL_V32_RESTORE:
2458 case AMDGPU::SI_SPILL_V64_RESTORE:
2459 case AMDGPU::SI_SPILL_V96_RESTORE:
2460 case AMDGPU::SI_SPILL_V128_RESTORE:
2461 case AMDGPU::SI_SPILL_V160_RESTORE:
2462 case AMDGPU::SI_SPILL_V192_RESTORE:
2463 case AMDGPU::SI_SPILL_V224_RESTORE:
2464 case AMDGPU::SI_SPILL_V256_RESTORE:
2465 case AMDGPU::SI_SPILL_V288_RESTORE:
2466 case AMDGPU::SI_SPILL_V320_RESTORE:
2467 case AMDGPU::SI_SPILL_V352_RESTORE:
2468 case AMDGPU::SI_SPILL_V384_RESTORE:
2469 case AMDGPU::SI_SPILL_V512_RESTORE:
2470 case AMDGPU::SI_SPILL_V1024_RESTORE:
2471 case AMDGPU::SI_SPILL_A32_RESTORE:
2472 case AMDGPU::SI_SPILL_A64_RESTORE:
2473 case AMDGPU::SI_SPILL_A96_RESTORE:
2474 case AMDGPU::SI_SPILL_A128_RESTORE:
2475 case AMDGPU::SI_SPILL_A160_RESTORE:
2476 case AMDGPU::SI_SPILL_A192_RESTORE:
2477 case AMDGPU::SI_SPILL_A224_RESTORE:
2478 case AMDGPU::SI_SPILL_A256_RESTORE:
2479 case AMDGPU::SI_SPILL_A288_RESTORE:
2480 case AMDGPU::SI_SPILL_A320_RESTORE:
2481 case AMDGPU::SI_SPILL_A352_RESTORE:
2482 case AMDGPU::SI_SPILL_A384_RESTORE:
2483 case AMDGPU::SI_SPILL_A512_RESTORE:
2484 case AMDGPU::SI_SPILL_A1024_RESTORE:
2485 case AMDGPU::SI_SPILL_AV32_RESTORE:
2486 case AMDGPU::SI_SPILL_AV64_RESTORE:
2487 case AMDGPU::SI_SPILL_AV96_RESTORE:
2488 case AMDGPU::SI_SPILL_AV128_RESTORE:
2489 case AMDGPU::SI_SPILL_AV160_RESTORE:
2490 case AMDGPU::SI_SPILL_AV192_RESTORE:
2491 case AMDGPU::SI_SPILL_AV224_RESTORE:
2492 case AMDGPU::SI_SPILL_AV256_RESTORE:
2493 case AMDGPU::SI_SPILL_AV288_RESTORE:
2494 case AMDGPU::SI_SPILL_AV320_RESTORE:
2495 case AMDGPU::SI_SPILL_AV352_RESTORE:
2496 case AMDGPU::SI_SPILL_AV384_RESTORE:
2497 case AMDGPU::SI_SPILL_AV512_RESTORE:
2498 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2499 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2500 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2501 const MachineOperand *VData = TII->getNamedOperand(*MI,
2502 AMDGPU::OpName::vdata);
2503 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2504 MFI->getStackPtrOffsetReg());
2505
2506 unsigned Opc;
2507 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2508 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2509 Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
2510 } else {
2511 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2512 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2513 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2514 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2515 }
2516
2517 auto *MBB = MI->getParent();
2518 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2519 if (IsWWMRegSpill) {
2520 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2521 RS->isRegUsed(AMDGPU::SCC));
2522 }
2523
2524 buildSpillLoadStore(
2525 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2526 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2527 *MI->memoperands_begin(), RS);
2528
2529 if (IsWWMRegSpill)
2530 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2531
2532 MI->eraseFromParent();
2533 return true;
2534 }
2535 case AMDGPU::V_ADD_U32_e32:
2536 case AMDGPU::V_ADD_U32_e64:
2537 case AMDGPU::V_ADD_CO_U32_e32:
2538 case AMDGPU::V_ADD_CO_U32_e64: {
2539 // TODO: Handle sub, and, or.
2540 unsigned NumDefs = MI->getNumExplicitDefs();
2541 unsigned Src0Idx = NumDefs;
2542
2543 bool HasClamp = false;
2544 MachineOperand *VCCOp = nullptr;
2545
2546 switch (MI->getOpcode()) {
2547 case AMDGPU::V_ADD_U32_e32:
2548 break;
2549 case AMDGPU::V_ADD_U32_e64:
2550 HasClamp = MI->getOperand(3).getImm();
2551 break;
2552 case AMDGPU::V_ADD_CO_U32_e32:
2553 VCCOp = &MI->getOperand(3);
2554 break;
2555 case AMDGPU::V_ADD_CO_U32_e64:
2556 VCCOp = &MI->getOperand(1);
2557 HasClamp = MI->getOperand(4).getImm();
2558 break;
2559 default:
2560 break;
2561 }
2562 bool DeadVCC = !VCCOp || VCCOp->isDead();
2563 MachineOperand &DstOp = MI->getOperand(0);
2564 Register DstReg = DstOp.getReg();
2565
2566 unsigned OtherOpIdx =
2567 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2568 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2569
2570 unsigned Src1Idx = Src0Idx + 1;
2571 Register MaterializedReg = FrameReg;
2572 Register ScavengedVGPR;
2573
2574 int64_t Offset = FrameInfo.getObjectOffset(Index);
2575 // For the non-immediate case, we could fall through to the default
2576 // handling, but we do an in-place update of the result register here to
2577 // avoid scavenging another register.
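// e.g. (illustrative): a 'V_ADD_U32 %dst, 16, %stack.0' with %stack.0 at
// object offset 32 becomes 'V_ADD_U32 %dst, 48, <materialized frame base>',
// folding the frame offset into the existing immediate.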
2578 if (OtherOp->isImm()) {
2579 int64_t TotalOffset = OtherOp->getImm() + Offset;
2580
2581 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2582 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2583 // If we can't support a VOP3 literal in the VALU instruction, we
2584 // can't specially fold into the add.
2585 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2586 break;
2587 }
2588
2589 OtherOp->setImm(TotalOffset);
2590 Offset = 0;
2591 }
2592
2593 if (FrameReg && !ST.enableFlatScratch()) {
2594 // We should just do an in-place update of the result register. However,
2595 // the value there may also be used by the add, in which case we need a
2596 // temporary register.
2597 //
2598 // FIXME: The scavenger is not finding the result register in the
2599 // common case where the add does not read the register.
2600
2601 ScavengedVGPR = RS->scavengeRegisterBackwards(
2602 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2603
2604 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2605 // shift.
2606 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2607 .addDef(ScavengedVGPR, RegState::Renamable)
2608 .addImm(ST.getWavefrontSizeLog2())
2609 .addReg(FrameReg);
2610 MaterializedReg = ScavengedVGPR;
2611 }
2612
2613 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2614 if (ST.enableFlatScratch() &&
2615 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2616 // We didn't need the shift above, so we have an SGPR for the frame
2617 // register, but may have a VGPR only operand.
2618 //
2619 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2620 // and use the higher constant bus restriction to avoid this copy.
2621
2622 if (!ScavengedVGPR) {
2623 ScavengedVGPR = RS->scavengeRegisterBackwards(
2624 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2625 /*SPAdj=*/0);
2626 }
2627
2628 assert(ScavengedVGPR != DstReg);
2629
2630 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2631 .addReg(MaterializedReg,
2632 MaterializedReg != FrameReg ? RegState::Kill : 0);
2633 MaterializedReg = ScavengedVGPR;
2634 }
2635
2636 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2637 // is not live, we could use a scalar add + vector add instead of 2
2638 // vector adds.
2639 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2640 .addDef(DstReg, RegState::Renamable);
2641 if (NumDefs == 2)
2642 AddI32.add(MI->getOperand(1));
2643
2644 unsigned MaterializedRegFlags =
2645 MaterializedReg != FrameReg ? RegState::Kill : 0;
2646
2647 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2648 // If we know we have a VGPR already, it's more likely the other
2649 // operand is a legal vsrc0.
2650 AddI32
2651 .add(*OtherOp)
2652 .addReg(MaterializedReg, MaterializedRegFlags);
2653 } else {
2654 // Commute operands to avoid violating VOP2 restrictions. This will
2655 // typically happen when using scratch.
2656 AddI32
2657 .addReg(MaterializedReg, MaterializedRegFlags)
2658 .add(*OtherOp);
2659 }
2660
2661 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2662 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2663 AddI32.addImm(0); // clamp
2664
2665 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2666 AddI32.setOperandDead(3); // Dead vcc
2667
2668 MaterializedReg = DstReg;
2669
2670 OtherOp->ChangeToRegister(MaterializedReg, false);
2671 OtherOp->setIsKill(true);
2673 Offset = 0;
2674 } else if (Offset != 0) {
2675 assert(!MaterializedReg);
2676 FIOp->ChangeToImmediate(Offset);
2677 Offset = 0;
2678 } else {
2679 if (DeadVCC && !HasClamp) {
2680 assert(Offset == 0);
2681
2682 // TODO: Losing kills and implicit operands. Just mutate to copy and
2683 // let lowerCopy deal with it?
2684 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2685 // Folded to an identity copy.
2686 MI->eraseFromParent();
2687 return true;
2688 }
2689
2690 // The immediate value should be in OtherOp
2691 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2692 MI->removeOperand(FIOperandNum);
2693
2694 unsigned NumOps = MI->getNumOperands();
2695 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2696 MI->removeOperand(I);
2697
2698 if (NumDefs == 2)
2699 MI->removeOperand(1);
2700
2701 // The code below can't deal with a mov.
2702 return true;
2703 }
2704
2705 // This folded to a constant, but we have to keep the add around for
2706 // pointless implicit defs or clamp modifier.
2707 FIOp->ChangeToImmediate(0);
2708 }
2709
2710 // Try to improve legality by commuting.
2711 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2712 std::swap(FIOp, OtherOp);
2713 std::swap(FIOperandNum, OtherOpIdx);
2714 }
2715
2716 // We need at most one mov to satisfy the operand constraints. Prefer to
2717 // move the FI operand first, as it may be a literal in a VOP3
2718 // instruction.
2719 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2720 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2721 // If commuting didn't make the operands legal, we need to materialize
2722 // in a register.
2723 // TODO: Can use SGPR on gfx10+ in some cases.
2724 if (!ScavengedVGPR) {
2725 ScavengedVGPR = RS->scavengeRegisterBackwards(
2726 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2727 /*SPAdj=*/0);
2728 }
2729
2730 assert(ScavengedVGPR != DstReg);
2731
2732 MachineOperand &Src = MI->getOperand(SrcIdx);
2733 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2734 .add(Src);
2735
2736 Src.ChangeToRegister(ScavengedVGPR, false);
2737 Src.setIsKill(true);
2738 break;
2739 }
2740 }
2741
2742 // Fold out add of 0 case that can appear in kernels.
2743 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2744 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2745 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2746 }
2747
2748 MI->eraseFromParent();
2749 }
2750
2751 return true;
2752 }
2753 case AMDGPU::S_ADD_I32:
2754 case AMDGPU::S_ADD_U32: {
2755 // TODO: Handle s_or_b32, s_and_b32.
2756 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2757 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2758
2759 assert(FrameReg || MFI->isBottomOfStack());
2760
2761 MachineOperand &DstOp = MI->getOperand(0);
2762 const DebugLoc &DL = MI->getDebugLoc();
2763 Register MaterializedReg = FrameReg;
2764
2765 // Defend against live scc, which should never happen in practice.
2766 bool DeadSCC = MI->getOperand(3).isDead();
2767
2768 Register TmpReg;
2769
2770 // FIXME: Scavenger should figure out that the result register is
2771 // available. Also should do this for the v_add case.
2772 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2773 TmpReg = DstOp.getReg();
2774
2775 if (FrameReg && !ST.enableFlatScratch()) {
2776 // FIXME: In the common case where the add does not also read its result
2777 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2778 // available.
2779 if (!TmpReg)
2780 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2781 MI, /*RestoreAfter=*/false, 0,
2782 /*AllowSpill=*/false);
2783 if (TmpReg) {
2784 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2785 .addDef(TmpReg, RegState::Renamable)
2786 .addReg(FrameReg)
2787 .addImm(ST.getWavefrontSizeLog2())
2788 .setOperandDead(3); // Set SCC dead
2789 }
2790 MaterializedReg = TmpReg;
2791 }
2792
2793 int64_t Offset = FrameInfo.getObjectOffset(Index);
2794
2795 // For the non-immediate case, we could fall through to the default
2796 // handling, but we do an in-place update of the result register here to
2797 // avoid scavenging another register.
2798 if (OtherOp.isImm()) {
2799 OtherOp.setImm(OtherOp.getImm() + Offset);
2800 Offset = 0;
2801
2802 if (MaterializedReg)
2803 FIOp->ChangeToRegister(MaterializedReg, false);
2804 else
2805 FIOp->ChangeToImmediate(0);
2806 } else if (MaterializedReg) {
2807 // If we can't fold the other operand, do another increment.
2808 Register DstReg = DstOp.getReg();
2809
2810 if (!TmpReg && MaterializedReg == FrameReg) {
2811 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2812 MI, /*RestoreAfter=*/false, 0,
2813 /*AllowSpill=*/false);
2814 DstReg = TmpReg;
2815 }
2816
2817 if (TmpReg) {
2818 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2819 .addDef(DstReg, RegState::Renamable)
2820 .addReg(MaterializedReg, RegState::Kill)
2821 .add(OtherOp);
2822 if (DeadSCC)
2823 AddI32.setOperandDead(3);
2824
2825 MaterializedReg = DstReg;
2826
2827 OtherOp.ChangeToRegister(MaterializedReg, false);
2828 OtherOp.setIsKill(true);
2829 OtherOp.setIsRenamable(true);
2830 }
2832 } else {
2833 // If we don't have any other offset to apply, we can just directly
2834 // interpret the frame index as the offset.
2835 FIOp->ChangeToImmediate(Offset);
2836 }
2837
2838 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2839 assert(Offset == 0);
2840 MI->removeOperand(3);
2841 MI->removeOperand(OtherOpIdx);
2842 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2843 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2844 assert(Offset == 0);
2845 MI->removeOperand(3);
2846 MI->removeOperand(FIOperandNum);
2847 MI->setDesc(
2848 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2849 }
2850
2851 assert(!FIOp->isFI());
2852 return true;
2853 }
2854 default: {
2855 break;
2856 }
2857 }
2858
2859 int64_t Offset = FrameInfo.getObjectOffset(Index);
2860 if (ST.enableFlatScratch()) {
2861 if (TII->isFLATScratch(*MI)) {
2862 assert(
2863 (int16_t)FIOperandNum ==
2864 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2865
2866 // The offset is always swizzled, just replace it
2867 if (FrameReg)
2868 FIOp->ChangeToRegister(FrameReg, false);
2869
2870 MachineOperand *OffsetOp =
2871 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2872 int64_t NewOffset = Offset + OffsetOp->getImm();
2873 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2874 SIInstrFlags::FlatScratch)) {
2875 OffsetOp->setImm(NewOffset);
2876 if (FrameReg)
2877 return false;
2878 Offset = 0;
2879 }
2880
2881 if (!Offset) {
2882 unsigned Opc = MI->getOpcode();
2883 int NewOpc = -1;
2884 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2885 NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
2886 } else if (ST.hasFlatScratchSTMode()) {
2887 // On GFX10 we have ST mode to use no registers for an address.
2888 // Otherwise we need to materialize 0 into an SGPR.
2889 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2890 }
2891
2892 if (NewOpc != -1) {
2893 // removeOperand doesn't fixup tied operand indexes as it goes, so
2894 // it asserts. Untie vdst_in for now and retie them afterwards.
2895 int VDstIn =
2896 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2897 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2898 MI->getOperand(VDstIn).isTied();
2899 if (TiedVDst)
2900 MI->untieRegOperand(VDstIn);
2901
2902 MI->removeOperand(
2903 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2904
2905 if (TiedVDst) {
2906 int NewVDst =
2907 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2908 int NewVDstIn =
2909 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2910 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2911 MI->tieOperands(NewVDst, NewVDstIn);
2912 }
2913 MI->setDesc(TII->get(NewOpc));
2914 return false;
2915 }
2916 }
2917 }
2918
2919 if (!FrameReg) {
2920 FIOp->ChangeToImmediate(Offset);
2921 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2922 return false;
2923 }
2924
2925 // We need to use a register here. Check if we can use an SGPR or need
2926 // a VGPR.
2927 FIOp->ChangeToRegister(AMDGPU::M0, false);
2928 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2929
2930 if (!Offset && FrameReg && UseSGPR) {
2931 FIOp->setReg(FrameReg);
2932 return false;
2933 }
2934
2935 const TargetRegisterClass *RC =
2936 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2937
2938 Register TmpReg =
2939 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2940 FIOp->setReg(TmpReg);
2941 FIOp->setIsKill();
2942
2943 if ((!FrameReg || !Offset) && TmpReg) {
2944 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2945 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2946 if (FrameReg)
2947 MIB.addReg(FrameReg);
2948 else
2949 MIB.addImm(Offset);
2950
2951 return false;
2952 }
2953
2954 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2955 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2956
2957 Register TmpSReg =
2958 UseSGPR ? TmpReg
2959 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2960 MI, false, 0, !UseSGPR);
2961
2962 // TODO: for flat scratch another attempt can be made with a VGPR index
2963 // if no SGPRs can be scavenged.
2964 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2965 report_fatal_error("Cannot scavenge register in FI elimination!");
2966
2967 if (!TmpSReg) {
2968 // Use frame register and restore it after.
2969 TmpSReg = FrameReg;
2970 FIOp->setReg(FrameReg);
2971 FIOp->setIsKill(false);
2972 }
2973
2974 if (NeedSaveSCC) {
2975 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
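// SCC is live, so a plain S_ADD_I32 cannot be used. Fold the current SCC
// value into bit 0 of the sum via S_ADDC_U32 (the offset is even), reload
// SCC from that bit with S_BITCMP1_B32, then clear the bit with
// S_BITSET0_B32, leaving the adjusted address in TmpSReg with SCC preserved.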
2976 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2977 .addReg(FrameReg)
2978 .addImm(Offset);
2979 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2980 .addReg(TmpSReg)
2981 .addImm(0);
2982 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2983 .addImm(0)
2984 .addReg(TmpSReg);
2985 } else {
2986 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2987 .addReg(FrameReg)
2988 .addImm(Offset);
2989 }
2990
2991 if (!UseSGPR)
2992 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2993 .addReg(TmpSReg, RegState::Kill);
2994
2995 if (TmpSReg == FrameReg) {
2996 // Undo frame register modification.
2997 if (NeedSaveSCC &&
2998 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2999 MachineBasicBlock::iterator I =
3000 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3001 TmpSReg)
3002 .addReg(FrameReg)
3003 .addImm(-Offset);
3004 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3005 .addReg(TmpSReg)
3006 .addImm(0);
3007 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3008 TmpSReg)
3009 .addImm(0)
3010 .addReg(TmpSReg);
3011 } else {
3012 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3013 FrameReg)
3014 .addReg(FrameReg)
3015 .addImm(-Offset);
3016 }
3017 }
3018
3019 return false;
3020 }
3021
3022 bool IsMUBUF = TII->isMUBUF(*MI);
3023
3024 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3025 // Convert to a swizzled stack address by scaling by the wave size.
3026 // In an entry function/kernel the offset is already swizzled.
3027 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
3028 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3029 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3030 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3031 ? &AMDGPU::SReg_32RegClass
3032 : &AMDGPU::VGPR_32RegClass;
3033 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3034 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3035 MI->getOpcode() == AMDGPU::S_MOV_B32;
3036 Register ResultReg =
3037 IsCopy ? MI->getOperand(0).getReg()
3038 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3039
3040 int64_t Offset = FrameInfo.getObjectOffset(Index);
3041 if (Offset == 0) {
3042 unsigned OpCode =
3043 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3044 Register TmpResultReg = ResultReg;
3045 if (IsSALU && LiveSCC) {
3046 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3047 MI, false, 0);
3048 }
3049
3050 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3051 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3052 // For V_LSHRREV, the operands are reversed (the shift count goes
3053 // first).
3054 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3055 else
3056 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3057 if (IsSALU && !LiveSCC)
3058 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3059 if (IsSALU && LiveSCC) {
3060 Register NewDest;
3061 if (IsCopy) {
3062 MF->getRegInfo().constrainRegClass(ResultReg,
3063 &AMDGPU::SReg_32_XM0RegClass);
3064 NewDest = ResultReg;
3065 } else {
3066 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3067 Shift, false, 0);
3068 }
3069 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3070 .addReg(TmpResultReg);
3071 ResultReg = NewDest;
3072 }
3073 } else {
3074 MachineInstrBuilder MIB;
3075 if (!IsSALU) {
3076 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3077 nullptr) {
3078 // Reuse ResultReg in intermediate step.
3079 Register ScaledReg = ResultReg;
3080
3081 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3082 ScaledReg)
3083 .addImm(ST.getWavefrontSizeLog2())
3084 .addReg(FrameReg);
3085
3086 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3087
3088 // TODO: Fold if use instruction is another add of a constant.
3089 if (IsVOP2 ||
3090 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3091 // FIXME: This can fail
3092 MIB.addImm(Offset);
3093 MIB.addReg(ScaledReg, RegState::Kill);
3094 if (!IsVOP2)
3095 MIB.addImm(0); // clamp bit
3096 } else {
3097 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3098 "Need to reuse carry out register");
3099
3100 // Use scavenged unused carry out as offset register.
3101 Register ConstOffsetReg;
3102 if (!isWave32)
3103 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3104 else
3105 ConstOffsetReg = MIB.getReg(1);
3106
3107 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3108 ConstOffsetReg)
3109 .addImm(Offset);
3110 MIB.addReg(ConstOffsetReg, RegState::Kill);
3111 MIB.addReg(ScaledReg, RegState::Kill);
3112 MIB.addImm(0); // clamp bit
3113 }
3114 }
3115 }
3116 if (!MIB || IsSALU) {
3117 // We have to produce a carry out, and there isn't a free SGPR pair
3118 // for it. We can keep the whole computation on the SALU to avoid
3119 // clobbering an additional register at the cost of an extra mov.
3120
3121 // We may have 1 free scratch SGPR even though a carry out is
3122 // unavailable. Only one additional mov is needed.
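// Illustrative SALU sequence (wave64, frame register s34, object offset
// 0x100):
//   s_lshr_b32 s5, s34, 6       ; scale the frame register to per-lane units
//   s_add_i32 s5, s5, 0x100     ; apply the object offset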
3123 Register TmpScaledReg = IsCopy && IsSALU
3124 ? ResultReg
3125 : RS->scavengeRegisterBackwards(
3126 AMDGPU::SReg_32_XM0RegClass, MI,
3127 false, 0, /*AllowSpill=*/false);
3128 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3129 Register TmpResultReg = ScaledReg;
3130
3131 if (!LiveSCC) {
3132 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3133 .addReg(FrameReg)
3134 .addImm(ST.getWavefrontSizeLog2());
3135 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3136 .addReg(TmpResultReg, RegState::Kill)
3137 .addImm(Offset);
3138 } else {
3139 TmpResultReg = RS->scavengeRegisterBackwards(
3140 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3141
3142 MachineInstrBuilder Add;
3143 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3144 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3145 TmpResultReg)
3146 .addImm(ST.getWavefrontSizeLog2())
3147 .addReg(FrameReg);
3148 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3149 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3150 .addImm(Offset);
3151 Add.addReg(ResultReg, RegState::Kill)
3152 .addReg(TmpResultReg, RegState::Kill)
3153 .addImm(0);
3154 } else
3155 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3156 } else {
3157 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3158 "offset is unsafe for v_mad_u32_u24");
3159
3160 // We start with a frame pointer with a wave space value, and
3161 // an offset in lane-space. We are materializing a lane space
3162 // value. We can either do a right shift of the frame pointer
3163 // to get to lane space, or a left shift of the offset to get
3164 // to wavespace. We can right shift after the computation to
3165 // get back to the desired per-lane value. We are using the
3166 // mad_u32_u24 primarily as an add with no carry out clobber.
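// Illustrative sequence (wave64, frame register s34, offset 0x100, which is
// not an inline literal):
//   v_mov_b32_e32 v1, 0x100
//   v_mad_u32_u24 v1, v1, 64, s34    ; offset * wave size + frame register
//   v_lshrrev_b32_e64 v1, 6, v1      ; shift back to the per-lane value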
3167 bool IsInlinableLiteral =
3168 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3169 if (!IsInlinableLiteral) {
3170 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3171 TmpResultReg)
3172 .addImm(Offset);
3173 }
3174
3175 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3176 TmpResultReg);
3177
3178 if (!IsInlinableLiteral) {
3179 Add.addReg(TmpResultReg, RegState::Kill);
3180 } else {
3181 // We fold the offset into the mad itself if it is inlinable.
3182 Add.addImm(Offset);
3183 }
3184 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3185 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3186 TmpResultReg)
3187 .addImm(ST.getWavefrontSizeLog2())
3188 .addReg(TmpResultReg);
3189 }
3190
3191 Register NewDest;
3192 if (IsCopy) {
3193 MF->getRegInfo().constrainRegClass(ResultReg,
3194 &AMDGPU::SReg_32_XM0RegClass);
3195 NewDest = ResultReg;
3196 } else {
3197 NewDest = RS->scavengeRegisterBackwards(
3198 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3199 /*AllowSpill=*/true);
3200 }
3201
3202 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3203 NewDest)
3204 .addReg(TmpResultReg);
3205 ResultReg = NewDest;
3206 }
3207 if (!IsSALU)
3208 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3209 .addReg(TmpResultReg, RegState::Kill);
3210 else
3211 ResultReg = TmpResultReg;
3212 // If there were truly no free SGPRs, we need to undo everything.
3213 if (!TmpScaledReg.isValid()) {
3214 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3215 .addReg(ScaledReg, RegState::Kill)
3216 .addImm(-Offset);
3217 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3218 .addReg(FrameReg)
3219 .addImm(ST.getWavefrontSizeLog2());
3220 }
3221 }
3222 }
3223
3224 // Don't introduce an extra copy if we're just materializing in a mov.
3225 if (IsCopy) {
3226 MI->eraseFromParent();
3227 return true;
3228 }
3229 FIOp->ChangeToRegister(ResultReg, false, false, true);
3230 return false;
3231 }
3232
3233 if (IsMUBUF) {
3234 // Disable offen so we don't need a 0 vgpr base.
3235 assert(
3236 static_cast<int>(FIOperandNum) ==
3237 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3238
3239 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3240 assert((SOffset.isImm() && SOffset.getImm() == 0));
3241
3242 if (FrameReg != AMDGPU::NoRegister)
3243 SOffset.ChangeToRegister(FrameReg, false);
3244
3245 int64_t Offset = FrameInfo.getObjectOffset(Index);
3246 int64_t OldImm =
3247 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3248 int64_t NewOffset = OldImm + Offset;
3249
3250 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3251 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3252 MI->eraseFromParent();
3253 return true;
3254 }
3255 }
3256
3257 // If the offset is simply too big, don't convert to a scratch wave offset
3258 // relative index.
3259
3260 FIOp->ChangeToImmediate(Offset);
3261 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3262 Register TmpReg =
3263 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3264 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3265 .addImm(Offset);
3266 FIOp->ChangeToRegister(TmpReg, false, false, true);
3267 }
3268
3269 return false;
3270}
3271
3274}
3275
3276 unsigned SIRegisterInfo::getRegSizeInBits(const TargetRegisterClass &RC) const {
3277 return getRegBitWidth(RC.getID());
3278}
3279
3280static const TargetRegisterClass *
3281 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3282 if (BitWidth == 64)
3283 return &AMDGPU::VReg_64RegClass;
3284 if (BitWidth == 96)
3285 return &AMDGPU::VReg_96RegClass;
3286 if (BitWidth == 128)
3287 return &AMDGPU::VReg_128RegClass;
3288 if (BitWidth == 160)
3289 return &AMDGPU::VReg_160RegClass;
3290 if (BitWidth == 192)
3291 return &AMDGPU::VReg_192RegClass;
3292 if (BitWidth == 224)
3293 return &AMDGPU::VReg_224RegClass;
3294 if (BitWidth == 256)
3295 return &AMDGPU::VReg_256RegClass;
3296 if (BitWidth == 288)
3297 return &AMDGPU::VReg_288RegClass;
3298 if (BitWidth == 320)
3299 return &AMDGPU::VReg_320RegClass;
3300 if (BitWidth == 352)
3301 return &AMDGPU::VReg_352RegClass;
3302 if (BitWidth == 384)
3303 return &AMDGPU::VReg_384RegClass;
3304 if (BitWidth == 512)
3305 return &AMDGPU::VReg_512RegClass;
3306 if (BitWidth == 1024)
3307 return &AMDGPU::VReg_1024RegClass;
3308
3309 return nullptr;
3310}
3311
3312static const TargetRegisterClass *
3313 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
3314 if (BitWidth == 64)
3315 return &AMDGPU::VReg_64_Align2RegClass;
3316 if (BitWidth == 96)
3317 return &AMDGPU::VReg_96_Align2RegClass;
3318 if (BitWidth == 128)
3319 return &AMDGPU::VReg_128_Align2RegClass;
3320 if (BitWidth == 160)
3321 return &AMDGPU::VReg_160_Align2RegClass;
3322 if (BitWidth == 192)
3323 return &AMDGPU::VReg_192_Align2RegClass;
3324 if (BitWidth == 224)
3325 return &AMDGPU::VReg_224_Align2RegClass;
3326 if (BitWidth == 256)
3327 return &AMDGPU::VReg_256_Align2RegClass;
3328 if (BitWidth == 288)
3329 return &AMDGPU::VReg_288_Align2RegClass;
3330 if (BitWidth == 320)
3331 return &AMDGPU::VReg_320_Align2RegClass;
3332 if (BitWidth == 352)
3333 return &AMDGPU::VReg_352_Align2RegClass;
3334 if (BitWidth == 384)
3335 return &AMDGPU::VReg_384_Align2RegClass;
3336 if (BitWidth == 512)
3337 return &AMDGPU::VReg_512_Align2RegClass;
3338 if (BitWidth == 1024)
3339 return &AMDGPU::VReg_1024_Align2RegClass;
3340
3341 return nullptr;
3342}
3343
3344const TargetRegisterClass *
3345 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3346 if (BitWidth == 1)
3347 return &AMDGPU::VReg_1RegClass;
3348 if (BitWidth == 16)
3349 return &AMDGPU::VGPR_16RegClass;
3350 if (BitWidth == 32)
3351 return &AMDGPU::VGPR_32RegClass;
3354}
3355
3356static const TargetRegisterClass *
3357 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3358 if (BitWidth == 64)
3359 return &AMDGPU::AReg_64RegClass;
3360 if (BitWidth == 96)
3361 return &AMDGPU::AReg_96RegClass;
3362 if (BitWidth == 128)
3363 return &AMDGPU::AReg_128RegClass;
3364 if (BitWidth == 160)
3365 return &AMDGPU::AReg_160RegClass;
3366 if (BitWidth == 192)
3367 return &AMDGPU::AReg_192RegClass;
3368 if (BitWidth == 224)
3369 return &AMDGPU::AReg_224RegClass;
3370 if (BitWidth == 256)
3371 return &AMDGPU::AReg_256RegClass;
3372 if (BitWidth == 288)
3373 return &AMDGPU::AReg_288RegClass;
3374 if (BitWidth == 320)
3375 return &AMDGPU::AReg_320RegClass;
3376 if (BitWidth == 352)
3377 return &AMDGPU::AReg_352RegClass;
3378 if (BitWidth == 384)
3379 return &AMDGPU::AReg_384RegClass;
3380 if (BitWidth == 512)
3381 return &AMDGPU::AReg_512RegClass;
3382 if (BitWidth == 1024)
3383 return &AMDGPU::AReg_1024RegClass;
3384
3385 return nullptr;
3386}
3387
3388static const TargetRegisterClass *
3389 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3390 if (BitWidth == 64)
3391 return &AMDGPU::AReg_64_Align2RegClass;
3392 if (BitWidth == 96)
3393 return &AMDGPU::AReg_96_Align2RegClass;
3394 if (BitWidth == 128)
3395 return &AMDGPU::AReg_128_Align2RegClass;
3396 if (BitWidth == 160)
3397 return &AMDGPU::AReg_160_Align2RegClass;
3398 if (BitWidth == 192)
3399 return &AMDGPU::AReg_192_Align2RegClass;
3400 if (BitWidth == 224)
3401 return &AMDGPU::AReg_224_Align2RegClass;
3402 if (BitWidth == 256)
3403 return &AMDGPU::AReg_256_Align2RegClass;
3404 if (BitWidth == 288)
3405 return &AMDGPU::AReg_288_Align2RegClass;
3406 if (BitWidth == 320)
3407 return &AMDGPU::AReg_320_Align2RegClass;
3408 if (BitWidth == 352)
3409 return &AMDGPU::AReg_352_Align2RegClass;
3410 if (BitWidth == 384)
3411 return &AMDGPU::AReg_384_Align2RegClass;
3412 if (BitWidth == 512)
3413 return &AMDGPU::AReg_512_Align2RegClass;
3414 if (BitWidth == 1024)
3415 return &AMDGPU::AReg_1024_Align2RegClass;
3416
3417 return nullptr;
3418}
3419
3420const TargetRegisterClass *
3421SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3422 if (BitWidth == 16)
3423 return &AMDGPU::AGPR_LO16RegClass;
3424 if (BitWidth == 32)
3425 return &AMDGPU::AGPR_32RegClass;
3426 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3427 : getAnyAGPRClassForBitWidth(BitWidth);
3428}
3429
3430static const TargetRegisterClass *
3431getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3432 if (BitWidth == 64)
3433 return &AMDGPU::AV_64RegClass;
3434 if (BitWidth == 96)
3435 return &AMDGPU::AV_96RegClass;
3436 if (BitWidth == 128)
3437 return &AMDGPU::AV_128RegClass;
3438 if (BitWidth == 160)
3439 return &AMDGPU::AV_160RegClass;
3440 if (BitWidth == 192)
3441 return &AMDGPU::AV_192RegClass;
3442 if (BitWidth == 224)
3443 return &AMDGPU::AV_224RegClass;
3444 if (BitWidth == 256)
3445 return &AMDGPU::AV_256RegClass;
3446 if (BitWidth == 288)
3447 return &AMDGPU::AV_288RegClass;
3448 if (BitWidth == 320)
3449 return &AMDGPU::AV_320RegClass;
3450 if (BitWidth == 352)
3451 return &AMDGPU::AV_352RegClass;
3452 if (BitWidth == 384)
3453 return &AMDGPU::AV_384RegClass;
3454 if (BitWidth == 512)
3455 return &AMDGPU::AV_512RegClass;
3456 if (BitWidth == 1024)
3457 return &AMDGPU::AV_1024RegClass;
3458
3459 return nullptr;
3460}
3461
3462static const TargetRegisterClass *
3463getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3464 if (BitWidth == 64)
3465 return &AMDGPU::AV_64_Align2RegClass;
3466 if (BitWidth == 96)
3467 return &AMDGPU::AV_96_Align2RegClass;
3468 if (BitWidth == 128)
3469 return &AMDGPU::AV_128_Align2RegClass;
3470 if (BitWidth == 160)
3471 return &AMDGPU::AV_160_Align2RegClass;
3472 if (BitWidth == 192)
3473 return &AMDGPU::AV_192_Align2RegClass;
3474 if (BitWidth == 224)
3475 return &AMDGPU::AV_224_Align2RegClass;
3476 if (BitWidth == 256)
3477 return &AMDGPU::AV_256_Align2RegClass;
3478 if (BitWidth == 288)
3479 return &AMDGPU::AV_288_Align2RegClass;
3480 if (BitWidth == 320)
3481 return &AMDGPU::AV_320_Align2RegClass;
3482 if (BitWidth == 352)
3483 return &AMDGPU::AV_352_Align2RegClass;
3484 if (BitWidth == 384)
3485 return &AMDGPU::AV_384_Align2RegClass;
3486 if (BitWidth == 512)
3487 return &AMDGPU::AV_512_Align2RegClass;
3488 if (BitWidth == 1024)
3489 return &AMDGPU::AV_1024_Align2RegClass;
3490
3491 return nullptr;
3492}
3493
3494const TargetRegisterClass *
3495SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3496 if (BitWidth == 32)
3497 return &AMDGPU::AV_32RegClass;
3498 return ST.needsAlignedVGPRs()
3499 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
3500 : getAnyVectorSuperClassForBitWidth(BitWidth);
3501}
3502
3503const TargetRegisterClass *
3504SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3505 if (BitWidth == 16 || BitWidth == 32)
3506 return &AMDGPU::SReg_32RegClass;
3507 if (BitWidth == 64)
3508 return &AMDGPU::SReg_64RegClass;
3509 if (BitWidth == 96)
3510 return &AMDGPU::SGPR_96RegClass;
3511 if (BitWidth == 128)
3512 return &AMDGPU::SGPR_128RegClass;
3513 if (BitWidth == 160)
3514 return &AMDGPU::SGPR_160RegClass;
3515 if (BitWidth == 192)
3516 return &AMDGPU::SGPR_192RegClass;
3517 if (BitWidth == 224)
3518 return &AMDGPU::SGPR_224RegClass;
3519 if (BitWidth == 256)
3520 return &AMDGPU::SGPR_256RegClass;
3521 if (BitWidth == 288)
3522 return &AMDGPU::SGPR_288RegClass;
3523 if (BitWidth == 320)
3524 return &AMDGPU::SGPR_320RegClass;
3525 if (BitWidth == 352)
3526 return &AMDGPU::SGPR_352RegClass;
3527 if (BitWidth == 384)
3528 return &AMDGPU::SGPR_384RegClass;
3529 if (BitWidth == 512)
3530 return &AMDGPU::SGPR_512RegClass;
3531 if (BitWidth == 1024)
3532 return &AMDGPU::SGPR_1024RegClass;
3533
3534 return nullptr;
3535}
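// Illustrative usage sketch (not part of this file): how callers pick a class
// by bit width. Assumes an SIRegisterInfo reference 'TRI' is in scope; all of
// the queried helpers are defined above.
//   const TargetRegisterClass *VRC = TRI.getVGPRClassForBitWidth(256);
//   const TargetRegisterClass *ARC = TRI.getAGPRClassForBitWidth(256);
//   const TargetRegisterClass *SRC = SIRegisterInfo::getSGPRClassForBitWidth(256);
// On subtargets where needsAlignedVGPRs() is true, VRC and ARC resolve to the
// *_Align2 variants; a bit width with no matching class yields nullptr.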
3536
3537bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3538 Register Reg) const {
3539 const TargetRegisterClass *RC;
3540 if (Reg.isVirtual())
3541 RC = MRI.getRegClass(Reg);
3542 else
3543 RC = getPhysRegBaseClass(Reg);
3544 return RC && isSGPRClass(RC);
3545}
3546
3547const TargetRegisterClass *
3548SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3549 unsigned Size = getRegSizeInBits(*SRC);
3550 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
3551 assert(VRC && "Invalid register class size");
3552 return VRC;
3553}
3554
3555const TargetRegisterClass *
3556SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3557 unsigned Size = getRegSizeInBits(*SRC);
3558 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3559 assert(ARC && "Invalid register class size");
3560 return ARC;
3561}
3562
3563const TargetRegisterClass *
3564SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3565 unsigned Size = getRegSizeInBits(*VRC);
3566 if (Size == 32)
3567 return &AMDGPU::SGPR_32RegClass;
3568 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3569 assert(SRC && "Invalid register class size");
3570 return SRC;
3571}
3572
3573const TargetRegisterClass *
3574SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3575 const TargetRegisterClass *SubRC,
3576 unsigned SubIdx) const {
3577 // Ensure this subregister index is aligned in the super register.
3578 const TargetRegisterClass *MatchRC =
3579 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3580 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3581}
3582
3583bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3584 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3585 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3586 return !ST.hasMFMAInlineLiteralBug();
3587
3588 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3589 OpType <= AMDGPU::OPERAND_SRC_LAST;
3590}
3591
3592bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3593 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3594 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3595 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3596}
3597
3598/// Returns the lowest register that is not used at any point in the function.
3599/// If all registers are used, then this function will return
3600/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
3601/// highest unused register.
3602MCRegister SIRegisterInfo::findUnusedRegister(
3603 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3604 const MachineFunction &MF, bool ReserveHighestRegister) const {
3605 if (ReserveHighestRegister) {
3606 for (MCRegister Reg : reverse(*RC))
3607 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3608 return Reg;
3609 } else {
3610 for (MCRegister Reg : *RC)
3611 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3612 return Reg;
3613 }
3614 return MCRegister();
3615}
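// Illustrative sketch of a call site (not part of this file), assuming 'TRI',
// 'MRI' and 'MF' are in scope:
//   MCRegister FreeSGPR =
//       TRI.findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass, MF);
//   if (!FreeSGPR) {
//     // Every allocatable SGPR_32 is used somewhere in the function.
//   }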
3616
3617bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3618 const RegisterBankInfo &RBI,
3619 Register Reg) const {
3620 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3621 if (!RB)
3622 return false;
3623
3624 return !RBI.isDivergentRegBank(RB);
3625}
3626
3627ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3628 unsigned EltSize) const {
3629 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3630 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3631
3632 const unsigned RegHalves = RegBitWidth / 16;
3633 const unsigned EltHalves = EltSize / 2;
3634 assert(RegSplitParts.size() + 1 >= EltHalves);
3635
3636 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3637 const unsigned NumParts = RegHalves / EltHalves;
3638
3639 return ArrayRef(Parts.data(), NumParts);
3640}
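// Worked example for the computation above: with a 128-bit register class and
// EltSize == 4 (one DWORD per part), RegHalves == 8 and EltHalves == 2, so the
// result is the first four entries of RegSplitParts[1], i.e. the 32-bit
// subregister indices sub0, sub1, sub2 and sub3.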
3641
3642const TargetRegisterClass *
3643SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3644 Register Reg) const {
3645 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3646}
3647
3648const TargetRegisterClass *
3649SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3650 const MachineOperand &MO) const {
3651 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3652 return getSubRegisterClass(SrcRC, MO.getSubReg());
3653}
3654
3655bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3656 Register Reg) const {
3657 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3658 // Registers without classes are unaddressable, SGPR-like registers.
3659 return RC && isVGPRClass(RC);
3660}
3661
3662bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3663 Register Reg) const {
3664 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3665
3666 // Registers without classes are unaddressable, SGPR-like registers.
3667 return RC && isAGPRClass(RC);
3668}
3669
3670bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3671 const TargetRegisterClass *SrcRC,
3672 unsigned SubReg,
3673 const TargetRegisterClass *DstRC,
3674 unsigned DstSubReg,
3675 const TargetRegisterClass *NewRC,
3676 LiveIntervals &LIS) const {
3677 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3678 unsigned DstSize = getRegSizeInBits(*DstRC);
3679 unsigned NewSize = getRegSizeInBits(*NewRC);
3680
3681 // Do not increase the size of registers beyond a dword; we would need to
3682 // allocate adjacent registers and constrain regalloc more than needed.
3683
3684 // Always allow dword coalescing.
3685 if (SrcSize <= 32 || DstSize <= 32)
3686 return true;
3687
3688 return NewSize <= DstSize || NewSize <= SrcSize;
3689}
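// Worked examples for the heuristic above: coalescing a 32-bit copy into a
// 128-bit tuple is always allowed (SrcSize <= 32). Merging two 64-bit
// registers into a 128-bit NewRC is rejected, since NewSize (128) exceeds both
// SrcSize and DstSize and would over-constrain the allocator.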
3690
3691unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3692 MachineFunction &MF) const {
3693 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3694 switch (RC->getID()) {
3695 default:
3696 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3697 case AMDGPU::VGPR_32RegClassID:
3698 return std::min(
3699 ST.getMaxNumVGPRs(
3700 MinOcc,
3701 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
3702 ST.getMaxNumVGPRs(MF));
3703 case AMDGPU::SGPR_32RegClassID:
3704 case AMDGPU::SGPR_LO16RegClassID:
3705 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3706 }
3707}
3708
3709unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3710 unsigned Idx) const {
3711 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3712 Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3713 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3714 const_cast<MachineFunction &>(MF));
3715
3716 if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3717 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3718 const_cast<MachineFunction &>(MF));
3719
3720 llvm_unreachable("Unexpected register pressure set!");
3721}
3722
3723const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3724 static const int Empty[] = { -1 };
3725
3726 if (RegPressureIgnoredUnits[RegUnit])
3727 return Empty;
3728
3729 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3730}
3731
3732bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
3733 ArrayRef<MCPhysReg> Order,
3734 SmallVectorImpl<MCPhysReg> &Hints,
3735 const MachineFunction &MF,
3736 const VirtRegMap *VRM,
3737 const LiveRegMatrix *Matrix) const {
3738
3739 const MachineRegisterInfo &MRI = MF.getRegInfo();
3740 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3741
3742 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3743
3744 switch (Hint.first) {
3745 case AMDGPURI::Size32: {
3746 Register Paired = Hint.second;
3747 assert(Paired);
3748 Register PairedPhys;
3749 if (Paired.isPhysical()) {
3750 PairedPhys =
3751 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3752 } else if (VRM && VRM->hasPhys(Paired)) {
3753 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3754 &AMDGPU::VGPR_32RegClass);
3755 }
3756
3757 // Prefer the paired physreg.
3758 if (PairedPhys)
3759 // isLo(Paired) is implicitly true here from the API of
3760 // getMatchingSuperReg.
3761 Hints.push_back(PairedPhys);
3762 return false;
3763 }
3764 case AMDGPURI::Size16: {
3765 Register Paired = Hint.second;
3766 assert(Paired);
3767 Register PairedPhys;
3768 if (Paired.isPhysical()) {
3769 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3770 } else if (VRM && VRM->hasPhys(Paired)) {
3771 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3772 }
3773
3774 // First prefer the paired physreg.
3775 if (PairedPhys)
3776 Hints.push_back(PairedPhys);
3777 else {
3778 // Add all the lo16 physregs.
3779 // When the Paired operand has not yet been assigned a physreg it is
3780 // better to try putting VirtReg in a lo16 register, because Paired may
3781 // later be assigned to the overlapping register, allowing the COPY to be
3782 // eliminated.
3783 for (MCPhysReg PhysReg : Order) {
3784 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3785 continue;
3786 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3787 !MRI.isReserved(PhysReg))
3788 Hints.push_back(PhysReg);
3789 }
3790 }
3791 return false;
3792 }
3793 default:
3794 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3795 VRM);
3796 }
3797}
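// Illustrative sketch (not part of this file) of how the Size16 hint above is
// consumed: if a 16-bit virtual register was paired (via
// MachineRegisterInfo::setRegAllocationHint) with a 32-bit register that later
// lands in $vgpr3, the code pushes $vgpr3_lo16 as the preferred candidate, so
// the COPY between the pair can be folded away.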
3798
3799MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3800 // Not a callee saved register.
3801 return AMDGPU::SGPR30_SGPR31;
3802}
3803
3804const TargetRegisterClass *
3805SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3806 const RegisterBank &RB) const {
3807 switch (RB.getID()) {
3808 case AMDGPU::VGPRRegBankID:
3809 return getVGPRClassForBitWidth(
3810 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3811 case AMDGPU::VCCRegBankID:
3812 assert(Size == 1);
3813 return getWaveMaskRegClass();
3814 case AMDGPU::SGPRRegBankID:
3815 return getSGPRClassForBitWidth(std::max(32u, Size));
3816 case AMDGPU::AGPRRegBankID:
3817 return getAGPRClassForBitWidth(std::max(32u, Size));
3818 default:
3819 llvm_unreachable("unknown register bank");
3820 }
3821}
3822
3823const TargetRegisterClass *
3824SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3825 const MachineRegisterInfo &MRI) const {
3826 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3827 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3828 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3829
3830 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3831 return getAllocatableClass(RC);
3832
3833 return nullptr;
3834}
3835
3836MCRegister SIRegisterInfo::getVCC() const {
3837 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3838}
3839
3840MCRegister SIRegisterInfo::getExec() const {
3841 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3842}
3843
3844const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3845 // VGPR tuples have an alignment requirement on gfx90a variants.
3846 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3847 : &AMDGPU::VReg_64RegClass;
3848}
3849
3850const TargetRegisterClass *
3851SIRegisterInfo::getRegClass(unsigned RCID) const {
3852 switch ((int)RCID) {
3853 case AMDGPU::SReg_1RegClassID:
3854 return getBoolRC();
3855 case AMDGPU::SReg_1_XEXECRegClassID:
3856 return getWaveMaskRegClass();
3857 case -1:
3858 return nullptr;
3859 default:
3860 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3861 }
3862}
3863
3864// Find reaching register definition
3865MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3866 MachineInstr &Use,
3867 MachineRegisterInfo &MRI,
3868 LiveIntervals *LIS) const {
3869 auto &MDT = LIS->getDomTree();
3870 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3871 SlotIndex DefIdx;
3872
3873 if (Reg.isVirtual()) {
3874 if (!LIS->hasInterval(Reg))
3875 return nullptr;
3876 LiveInterval &LI = LIS->getInterval(Reg);
3877 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3878 : MRI.getMaxLaneMaskForVReg(Reg);
3879 VNInfo *V = nullptr;
3880 if (LI.hasSubRanges()) {
3881 for (auto &S : LI.subranges()) {
3882 if ((S.LaneMask & SubLanes) == SubLanes) {
3883 V = S.getVNInfoAt(UseIdx);
3884 break;
3885 }
3886 }
3887 } else {
3888 V = LI.getVNInfoAt(UseIdx);
3889 }
3890 if (!V)
3891 return nullptr;
3892 DefIdx = V->def;
3893 } else {
3894 // Find last def.
3895 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3896 LiveRange &LR = LIS->getRegUnit(Unit);
3897 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3898 if (!DefIdx.isValid() ||
3899 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3900 LIS->getInstructionFromIndex(V->def)))
3901 DefIdx = V->def;
3902 } else {
3903 return nullptr;
3904 }
3905 }
3906 }
3907
3908 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3909
3910 if (!Def || !MDT.dominates(Def, &Use))
3911 return nullptr;
3912
3913 assert(Def->modifiesRegister(Reg, this));
3914
3915 return Def;
3916}
3917
3918MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3919 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3920
3921 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3922 AMDGPU::SReg_32RegClass,
3923 AMDGPU::AGPR_32RegClass } ) {
3924 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3925 return Super;
3926 }
3927 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3928 &AMDGPU::VGPR_32RegClass)) {
3929 return Super;
3930 }
3931
3932 return AMDGPU::NoRegister;
3933}
3934
3935bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3936 if (!ST.needsAlignedVGPRs())
3937 return true;
3938
3939 if (isVGPRClass(&RC))
3940 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3941 if (isAGPRClass(&RC))
3942 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3943 if (isVectorSuperClass(&RC))
3944 return RC.hasSuperClassEq(
3945 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
3946
3947 return true;
3948}
3949
3950const TargetRegisterClass *
3951SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3952 if (!RC || !ST.needsAlignedVGPRs())
3953 return RC;
3954
3955 unsigned Size = getRegSizeInBits(*RC);
3956 if (Size <= 32)
3957 return RC;
3958
3959 if (isVGPRClass(RC))
3960 return getVGPRClassForBitWidth(Size);
3961 if (isAGPRClass(RC))
3962 return getAGPRClassForBitWidth(Size);
3963 if (isVectorSuperClass(RC))
3964 return getVectorSuperClassForBitWidth(Size);
3965
3966 return RC;
3967}
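// Illustrative example: on a subtarget where needsAlignedVGPRs() is true (the
// gfx90a case noted above), getProperlyAlignedRC(&AMDGPU::VReg_128RegClass)
// returns &AMDGPU::VReg_128_Align2RegClass; elsewhere the class is returned
// unchanged.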
3968
3969ArrayRef<MCPhysReg>
3970SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
3971 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
3972}
3973
3974ArrayRef<MCPhysReg>
3975SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
3976 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
3977}
3978
3979ArrayRef<MCPhysReg>
3980SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
3981 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
3982}
3983
3984unsigned
3985SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
3986 unsigned SubReg) const {
3987 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
3988 case SIRCFlags::HasSGPR:
3989 return std::min(128u, getSubRegIdxSize(SubReg));
3990 case SIRCFlags::HasAGPR:
3991 case SIRCFlags::HasVGPR:
3992 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: // A case for AV registers.
3993 return std::min(32u, getSubRegIdxSize(SubReg));
3994 default:
3995 break;
3996 }
3997 return 0;
3998}
3999
4000unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
4001 const TargetRegisterClass &RC,
4002 bool IncludeCalls) const {
4003 for (MCPhysReg Reg : reverse(RC.getRegisters()))
4004 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4005 return getHWRegIndex(Reg) + 1;
4006 return 0;
4007}
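// Illustrative example (not part of this file), assuming the highest VGPR used
// anywhere in the function is v7:
//   unsigned NumVGPRs = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
//   // NumVGPRs == 8 (hardware index of v7 plus one).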
4008
4009SmallVector<StringLiteral>
4010SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
4011 const MachineFunction &MF) const {
4012 SmallVector<StringLiteral> RegFlags;
4013 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4014 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4015 RegFlags.push_back("WWM_REG");
4016 return RegFlags;
4017}