SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
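// Illustrative use (hypothetical invocation): passing a small value such as
// -amdgpu-s-branch-bits=5 to llc forces the long-branch expansion path in
// compact tests.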
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
66 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
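// Count the operands of \p Node, ignoring any trailing glue operands (glue,
// when present, is always at the end of the operand list).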
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p Op, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
85 AMDGPU::OpName OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is fine here since every VALU instruction has one.
133 // We want all of the generic logic for this case except for that check.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // This differs from the generic method, which does not allow
140 // rematerialization if there are virtual register uses. We allow it,
141 // so this method covers SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
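 // For example, a V_CMP whose result is only consumed by S_AND_SAVEEXEC or
 // by an S_AND that reads exec (the usual lane-mask pattern) is treated as
 // not depending on exec by the checks below.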
155 if (MI.isCompare()) {
156 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
157 if (!Dst)
158 return true;
159
160 Register DstReg = Dst->getReg();
161 if (!DstReg.isVirtual())
162 return true;
163
164 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
165 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
166 switch (Use.getOpcode()) {
167 case AMDGPU::S_AND_SAVEEXEC_B32:
168 case AMDGPU::S_AND_SAVEEXEC_B64:
169 break;
170 case AMDGPU::S_AND_B32:
171 case AMDGPU::S_AND_B64:
172 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
173 return true;
174 break;
175 default:
176 return true;
177 }
178 }
179 return false;
180 }
181
182 switch (MI.getOpcode()) {
183 default:
184 break;
185 case AMDGPU::V_READFIRSTLANE_B32:
186 return true;
187 }
188
189 return false;
190}
191
192bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
193 // Any implicit use of exec by VALU is not a real register read.
194 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
195 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
196}
197
198bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
199 MachineBasicBlock *SuccToSinkTo,
200 MachineCycleInfo *CI) const {
201 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
202 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
203 return true;
204
205 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
206 // Check if sinking MI would create a temporally divergent use.
207 for (auto Op : MI.uses()) {
208 if (Op.isReg() && Op.getReg().isVirtual() &&
209 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
210 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
211
212 // SgprDef defined inside cycle
213 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
214 if (FromCycle == nullptr)
215 continue;
216
217 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
218 // Check if there is a FromCycle that contains SgprDef's basic block but
219 // does not contain SuccToSinkTo and also has a divergent exit condition.
220 while (FromCycle && !FromCycle->contains(ToCycle)) {
221 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
222 FromCycle->getExitingBlocks(ExitingBlocks);
223
224 // FromCycle has divergent exit condition.
225 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
226 if (hasDivergentBranch(ExitingBlock))
227 return false;
228 }
229
230 FromCycle = FromCycle->getParentCycle();
231 }
232 }
233 }
234
235 return true;
236}
237
238bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
239 int64_t &Offset0,
240 int64_t &Offset1) const {
241 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
242 return false;
243
244 unsigned Opc0 = Load0->getMachineOpcode();
245 unsigned Opc1 = Load1->getMachineOpcode();
246
247 // Make sure both are actually loads.
248 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
249 return false;
250
251 // A mayLoad instruction without a def is not a load. Likely a prefetch.
252 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
253 return false;
254
255 if (isDS(Opc0) && isDS(Opc1)) {
256
257 // FIXME: Handle this case:
258 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
259 return false;
260
261 // Check base reg.
262 if (Load0->getOperand(0) != Load1->getOperand(0))
263 return false;
264
265 // Skip read2 / write2 variants for simplicity.
266 // TODO: We should report true if the used offsets are adjacent (excluding
267 // the st64 versions).
268 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
269 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
270 if (Offset0Idx == -1 || Offset1Idx == -1)
271 return false;
272
273 // XXX - be careful of dataless loads
274 // getNamedOperandIdx returns the index for MachineInstrs. Since they
275 // include the output in the operand list, but SDNodes don't, we need to
276 // subtract the index by one.
277 Offset0Idx -= get(Opc0).NumDefs;
278 Offset1Idx -= get(Opc1).NumDefs;
279 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
280 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
281 return true;
282 }
283
284 if (isSMRD(Opc0) && isSMRD(Opc1)) {
285 // Skip time and cache invalidation instructions.
286 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
287 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
288 return false;
289
290 unsigned NumOps = getNumOperandsNoGlue(Load0);
291 if (NumOps != getNumOperandsNoGlue(Load1))
292 return false;
293
294 // Check base reg.
295 if (Load0->getOperand(0) != Load1->getOperand(0))
296 return false;
297
298 // Match register offsets, if both register and immediate offsets present.
299 assert(NumOps == 4 || NumOps == 5);
300 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
301 return false;
302
303 const ConstantSDNode *Load0Offset =
304 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
305 const ConstantSDNode *Load1Offset =
306 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
307
308 if (!Load0Offset || !Load1Offset)
309 return false;
310
311 Offset0 = Load0Offset->getZExtValue();
312 Offset1 = Load1Offset->getZExtValue();
313 return true;
314 }
315
316 // MUBUF and MTBUF can access the same addresses.
317 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
318
319 // MUBUF and MTBUF have vaddr at different indices.
320 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
323 return false;
324
325 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
326 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
327
328 if (OffIdx0 == -1 || OffIdx1 == -1)
329 return false;
330
331 // getNamedOperandIdx returns the index for MachineInstrs. Since they
332 // include the output in the operand list, but SDNodes don't, we need to
333 // subtract the index by one.
334 OffIdx0 -= get(Opc0).NumDefs;
335 OffIdx1 -= get(Opc1).NumDefs;
336
337 SDValue Off0 = Load0->getOperand(OffIdx0);
338 SDValue Off1 = Load1->getOperand(OffIdx1);
339
340 // The offset might be a FrameIndexSDNode.
341 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
342 return false;
343
344 Offset0 = Off0->getAsZExtVal();
345 Offset1 = Off1->getAsZExtVal();
346 return true;
347 }
348
349 return false;
350}
351
352static bool isStride64(unsigned Opc) {
353 switch (Opc) {
354 case AMDGPU::DS_READ2ST64_B32:
355 case AMDGPU::DS_READ2ST64_B64:
356 case AMDGPU::DS_WRITE2ST64_B32:
357 case AMDGPU::DS_WRITE2ST64_B64:
358 return true;
359 default:
360 return false;
361 }
362}
363
364bool SIInstrInfo::getMemOperandsWithOffsetWidth(
365 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
367 const TargetRegisterInfo *TRI) const {
368 if (!LdSt.mayLoadOrStore())
369 return false;
370
371 unsigned Opc = LdSt.getOpcode();
372 OffsetIsScalable = false;
373 const MachineOperand *BaseOp, *OffsetOp;
374 int DataOpIdx;
375
376 if (isDS(LdSt)) {
377 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
378 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
379 if (OffsetOp) {
380 // Normal, single offset LDS instruction.
381 if (!BaseOp) {
382 // DS_CONSUME/DS_APPEND use M0 for the base address.
383 // TODO: find the implicit use operand for M0 and use that as BaseOp?
384 return false;
385 }
386 BaseOps.push_back(BaseOp);
387 Offset = OffsetOp->getImm();
388 // Get appropriate operand, and compute width accordingly.
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
390 if (DataOpIdx == -1)
391 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
392 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
393 Width = LocationSize::precise(64);
394 else
395 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
396 } else {
397 // The 2 offset instructions use offset0 and offset1 instead. We can treat
398 // these as a load with a single offset if the 2 offsets are consecutive.
399 // We will use this for some partially aligned loads.
400 const MachineOperand *Offset0Op =
401 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
402 const MachineOperand *Offset1Op =
403 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
404
405 unsigned Offset0 = Offset0Op->getImm() & 0xff;
406 unsigned Offset1 = Offset1Op->getImm() & 0xff;
407 if (Offset0 + 1 != Offset1)
408 return false;
409
410 // Each of these offsets is in element-sized units, so we need to convert
411 // to bytes for the individual reads.
412
413 unsigned EltSize;
414 if (LdSt.mayLoad())
415 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
416 else {
417 assert(LdSt.mayStore());
418 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
419 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
420 }
421
422 if (isStride64(Opc))
423 EltSize *= 64;
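// For illustration: a ds_read2_b32 has a 64-bit destination, so EltSize is
// 64 / 16 = 4 bytes, and offset0 = 2, offset1 = 3 yield Offset = 8 bytes
// below (the st64 variants additionally scale EltSize by 64).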
424
425 BaseOps.push_back(BaseOp);
426 Offset = EltSize * Offset0;
427 // Get appropriate operand(s), and compute width accordingly.
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
429 if (DataOpIdx == -1) {
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
431 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
433 Width = LocationSize::precise(
434 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
435 } else {
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 }
438 }
439 return true;
440 }
441
442 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
443 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
444 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
445 return false;
446 BaseOps.push_back(RSrc);
447 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
448 if (BaseOp && !BaseOp->isFI())
449 BaseOps.push_back(BaseOp);
450 const MachineOperand *OffsetImm =
451 getNamedOperand(LdSt, AMDGPU::OpName::offset);
452 Offset = OffsetImm->getImm();
453 const MachineOperand *SOffset =
454 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
455 if (SOffset) {
456 if (SOffset->isReg())
457 BaseOps.push_back(SOffset);
458 else
459 Offset += SOffset->getImm();
460 }
461 // Get appropriate operand, and compute width accordingly.
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
463 if (DataOpIdx == -1)
464 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
465 if (DataOpIdx == -1) // LDS DMA
466 return false;
467 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
468 return true;
469 }
470
471 if (isImage(LdSt)) {
472 auto RsrcOpName =
473 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
474 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
475 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
476 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
477 if (VAddr0Idx >= 0) {
478 // GFX10 possible NSA encoding.
479 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
480 BaseOps.push_back(&LdSt.getOperand(I));
481 } else {
482 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
483 }
484 Offset = 0;
485 // Get appropriate operand, and compute width accordingly.
486 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
487 if (DataOpIdx == -1)
488 return false; // no return sampler
489 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
490 return true;
491 }
492
493 if (isSMRD(LdSt)) {
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
495 if (!BaseOp) // e.g. S_MEMTIME
496 return false;
497 BaseOps.push_back(BaseOp);
498 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
499 Offset = OffsetOp ? OffsetOp->getImm() : 0;
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
502 if (DataOpIdx == -1)
503 return false;
504 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
505 return true;
506 }
507
508 if (isFLAT(LdSt)) {
509 // Instructions have either vaddr or saddr or both or none.
510 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
511 if (BaseOp)
512 BaseOps.push_back(BaseOp);
513 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
514 if (BaseOp)
515 BaseOps.push_back(BaseOp);
516 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
517 // Get appropriate operand, and compute width accordingly.
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
519 if (DataOpIdx == -1)
520 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
521 if (DataOpIdx == -1) // LDS DMA
522 return false;
523 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
524 return true;
525 }
526
527 return false;
528}
529
530static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531 ArrayRef<const MachineOperand *> BaseOps1,
532 const MachineInstr &MI2,
533 ArrayRef<const MachineOperand *> BaseOps2) {
534 // Only examine the first "base" operand of each instruction, on the
535 // assumption that it represents the real base address of the memory access.
536 // Other operands are typically offsets or indices from this base address.
537 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
538 return true;
539
540 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
541 return false;
542
543 auto *MO1 = *MI1.memoperands_begin();
544 auto *MO2 = *MI2.memoperands_begin();
545 if (MO1->getAddrSpace() != MO2->getAddrSpace())
546 return false;
547
548 const auto *Base1 = MO1->getValue();
549 const auto *Base2 = MO2->getValue();
550 if (!Base1 || !Base2)
551 return false;
552 Base1 = getUnderlyingObject(Base1);
553 Base2 = getUnderlyingObject(Base2);
554
555 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
556 return false;
557
558 return Base1 == Base2;
559}
560
561bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
562 int64_t Offset1, bool OffsetIsScalable1,
563 ArrayRef<const MachineOperand *> BaseOps2,
564 int64_t Offset2, bool OffsetIsScalable2,
565 unsigned ClusterSize,
566 unsigned NumBytes) const {
567 // If the mem ops (to be clustered) do not have the same base ptr, then they
568 // should not be clustered
569 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
570 if (!BaseOps1.empty() && !BaseOps2.empty()) {
571 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
572 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
573 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
574 return false;
575
576 const SIMachineFunctionInfo *MFI =
577 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
578 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
579 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
580 // If only one base op is empty, they do not have the same base ptr
581 return false;
582 }
583
584 // To avoid register pressure, on average the number of DWORDs loaded
585 // together by all clustered mem ops should not exceed
586 // MaxMemoryClusterDWords. This is an empirical value based on certain
587 // observations and performance-related experiments.
588 // The benefit of this heuristic is that it avoids clustering too many
589 // sub-word loads and also avoids clustering wide loads. Below is a brief
590 // summary of how the heuristic behaves for various `LoadSize` values when
591 // MaxMemoryClusterDWords is 8; a worked example follows the list.
592 //
593 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
594 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
595 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
596 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
597 // (5) LoadSize >= 17: do not cluster
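 // Worked example (assuming MaxMemoryClusterDWords == 8, as in the summary
 // above): clustering four 8-byte loads gives LoadSize = 32 / 4 = 8 and
 // NumDWords = 2 * 4 = 8, which is still accepted, whereas four 12-byte loads
 // give NumDWords = 3 * 4 = 12 and are rejected.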
598 const unsigned LoadSize = NumBytes / ClusterSize;
599 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
600 return NumDWords <= MaxMemoryClusterDWords;
601}
602
603// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
604// the first 16 loads will be interleaved with the stores, and the next 16 will
605// be clustered as expected. It should really split into 2 batches of 16 stores.
606//
607// Loads are clustered until this returns false, rather than trying to schedule
608// groups of stores. This also means we have to deal with saying different
609// address space loads should be clustered, and ones which might cause bank
610// conflicts.
611//
612// This might be deprecated so it might not be worth that much effort to fix.
613bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
614 int64_t Offset0, int64_t Offset1,
615 unsigned NumLoads) const {
616 assert(Offset1 > Offset0 &&
617 "Second offset should be larger than first offset!");
618 // If we have fewer than 16 loads in a row, and the offsets are within 64
619 // bytes, then schedule together.
620
621 // A cacheline is 64 bytes (for global memory).
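 // For illustration: with NumLoads <= 16, loads at offsets 0 and 48 would be
 // scheduled together, while loads at offsets 0 and 72 would not, since their
 // offsets differ by 64 bytes or more.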
622 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
623}
624
625static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 const char *Msg = "illegal VGPR to SGPR copy") {
630 MachineFunction *MF = MBB.getParent();
631
633 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
634
635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637}
638
639/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
640/// possible to have a direct copy in these cases on GFX908, so an intermediate
641/// VGPR copy is required.
642static void indirectCopyToAGPR(const SIInstrInfo &TII,
643 MachineBasicBlock &MBB,
644 MachineBasicBlock::iterator MI,
645 const DebugLoc &DL, MCRegister DestReg,
646 MCRegister SrcReg, bool KillSrc,
647 RegScavenger &RS, bool RegsOverlap,
648 Register ImpDefSuperReg = Register(),
649 Register ImpUseSuperReg = Register()) {
650 assert((TII.getSubtarget().hasMAIInsts() &&
651 !TII.getSubtarget().hasGFX90AInsts()) &&
652 "Expected GFX908 subtarget.");
653
654 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
655 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
656 "Source register of the copy should be either an SGPR or an AGPR.");
657
658 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
659 "Destination register of the copy should be an AGPR.");
660
661 const SIRegisterInfo &RI = TII.getRegisterInfo();
662
663 // First try to find defining accvgpr_write to avoid temporary registers.
664 // In the case of copies of overlapping AGPRs, we conservatively do not
665 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
666 // an accvgpr_write used for this same copy due to implicit-defs
667 if (!RegsOverlap) {
668 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
669 --Def;
670
671 if (!Def->modifiesRegister(SrcReg, &RI))
672 continue;
673
674 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
675 Def->getOperand(0).getReg() != SrcReg)
676 break;
677
678 MachineOperand &DefOp = Def->getOperand(1);
679 assert(DefOp.isReg() || DefOp.isImm());
680
681 if (DefOp.isReg()) {
682 bool SafeToPropagate = true;
683 // Check that register source operand is not clobbered before MI.
684 // Immediate operands are always safe to propagate.
685 for (auto I = Def; I != MI && SafeToPropagate; ++I)
686 if (I->modifiesRegister(DefOp.getReg(), &RI))
687 SafeToPropagate = false;
688
689 if (!SafeToPropagate)
690 break;
691
692 for (auto I = Def; I != MI; ++I)
693 I->clearRegisterKills(DefOp.getReg(), &RI);
694 }
695
696 MachineInstrBuilder Builder =
697 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698 .add(DefOp);
699 if (ImpDefSuperReg)
700 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
701
702 if (ImpUseSuperReg) {
703 Builder.addReg(ImpUseSuperReg,
704 getKillRegState(KillSrc) | RegState::Implicit);
705 }
706
707 return;
708 }
709 }
710
711 RS.enterBasicBlockEnd(MBB);
712 RS.backward(std::next(MI));
713
714 // Ideally we want to have three registers for a long reg_sequence copy
715 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
716 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
717 *MBB.getParent());
718
719 // Registers in the sequence are allocated contiguously so we can just
720 // use the register number to pick one of three round-robin temps.
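// For illustration: destination registers AGPR0, AGPR1 and AGPR2 give RegNo
// values of 0, 1 and 2, so neighbouring copies in a reg_sequence expansion
// tend to rotate through different temporaries.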
721 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
722 Register Tmp =
723 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
725 "VGPR used for an intermediate copy should have been reserved.");
726
727 // Only loop through if there are any free registers left. We don't want to
728 // spill.
729 while (RegNo--) {
730 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
731 /* RestoreAfter */ false, 0,
732 /* AllowSpill */ false);
733 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
734 break;
735 Tmp = Tmp2;
736 RS.setRegUsed(Tmp);
737 }
738
739 // Insert copy to temporary VGPR.
740 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
741 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
742 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
743 } else {
744 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
745 }
746
747 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
748 .addReg(SrcReg, getKillRegState(KillSrc));
749 if (ImpUseSuperReg) {
750 UseBuilder.addReg(ImpUseSuperReg,
751 getKillRegState(KillSrc) | RegState::Implicit);
752 }
753
754 MachineInstrBuilder DefBuilder
755 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
756 .addReg(Tmp, RegState::Kill);
757
758 if (ImpDefSuperReg)
759 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
760}
761
762static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
763 MachineBasicBlock::iterator MI, const DebugLoc &DL,
764 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
765 const TargetRegisterClass *RC, bool Forward) {
766 const SIRegisterInfo &RI = TII.getRegisterInfo();
767 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
768 MachineBasicBlock::iterator I = MI;
769 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
770
771 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
772 int16_t SubIdx = BaseIndices[Idx];
773 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
774 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
775 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
776 unsigned Opcode = AMDGPU::S_MOV_B32;
777
778 // Is SGPR aligned? If so try to combine with next.
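// For illustration: copying an aligned 128-bit SGPR tuple such as s[4:7]
// is then emitted as two S_MOV_B64s rather than four S_MOV_B32s.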
779 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
780 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
781 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782 // Can use SGPR64 copy
783 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
784 SubIdx = RI.getSubRegFromChannel(Channel, 2);
785 DestSubReg = RI.getSubReg(DestReg, SubIdx);
786 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
787 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
788 Opcode = AMDGPU::S_MOV_B64;
789 Idx++;
790 }
791
792 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
793 .addReg(SrcSubReg)
794 .addReg(SrcReg, RegState::Implicit);
795
796 if (!FirstMI)
797 FirstMI = LastMI;
798
799 if (!Forward)
800 I--;
801 }
802
803 assert(FirstMI && LastMI);
804 if (!Forward)
805 std::swap(FirstMI, LastMI);
806
807 FirstMI->addOperand(
808 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
809
810 if (KillSrc)
811 LastMI->addRegisterKilled(SrcReg, &RI);
812}
813
814void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
815 MachineBasicBlock::iterator MI,
816 const DebugLoc &DL, Register DestReg,
817 Register SrcReg, bool KillSrc, bool RenamableDest,
818 bool RenamableSrc) const {
819 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
820 unsigned Size = RI.getRegSizeInBits(*RC);
821 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
822 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
823
824 // The rest of copyPhysReg assumes Src and Dst are the same size.
825 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
826 // we remove Fix16BitCopies and this code block?
827 if (Fix16BitCopies) {
828 if (((Size == 16) != (SrcSize == 16))) {
829 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 assert(ST.useRealTrue16Insts());
831 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
832 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
833 RegToFix = SubReg;
834
835 if (DestReg == SrcReg) {
836 // Identity copy. Insert empty bundle since ExpandPostRA expects an
837 // instruction here.
838 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
839 return;
840 }
841 RC = RI.getPhysRegBaseClass(DestReg);
842 Size = RI.getRegSizeInBits(*RC);
843 SrcRC = RI.getPhysRegBaseClass(SrcReg);
844 SrcSize = RI.getRegSizeInBits(*SrcRC);
845 }
846 }
847
848 if (RC == &AMDGPU::VGPR_32RegClass) {
849 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
850 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
851 AMDGPU::AGPR_32RegClass.contains(SrcReg));
852 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
853 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
854 BuildMI(MBB, MI, DL, get(Opc), DestReg)
855 .addReg(SrcReg, getKillRegState(KillSrc));
856 return;
857 }
858
859 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
860 RC == &AMDGPU::SReg_32RegClass) {
861 if (SrcReg == AMDGPU::SCC) {
862 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
863 .addImm(1)
864 .addImm(0);
865 return;
866 }
867
868 if (DestReg == AMDGPU::VCC_LO) {
869 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
871 .addReg(SrcReg, getKillRegState(KillSrc));
872 } else {
873 // FIXME: Hack until VReg_1 removed.
874 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
875 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
876 .addImm(0)
877 .addReg(SrcReg, getKillRegState(KillSrc));
878 }
879
880 return;
881 }
882
883 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
884 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885 return;
886 }
887
888 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 return;
891 }
892
893 if (RC == &AMDGPU::SReg_64RegClass) {
894 if (SrcReg == AMDGPU::SCC) {
895 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
896 .addImm(1)
897 .addImm(0);
898 return;
899 }
900
901 if (DestReg == AMDGPU::VCC) {
902 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
903 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
904 .addReg(SrcReg, getKillRegState(KillSrc));
905 } else {
906 // FIXME: Hack until VReg_1 removed.
907 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
908 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
909 .addImm(0)
910 .addReg(SrcReg, getKillRegState(KillSrc));
911 }
912
913 return;
914 }
915
916 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
917 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
918 return;
919 }
920
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (DestReg == AMDGPU::SCC) {
927 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
928 // but SelectionDAG emits such copies for i1 sources.
929 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
930 // This copy can only be produced by patterns
931 // with explicit SCC, which are known to be enabled
932 // only for subtargets with S_CMP_LG_U64 present.
933 assert(ST.hasScalarCompareEq64());
934 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
935 .addReg(SrcReg, getKillRegState(KillSrc))
936 .addImm(0);
937 } else {
938 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
939 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
940 .addReg(SrcReg, getKillRegState(KillSrc))
941 .addImm(0);
942 }
943
944 return;
945 }
946
947 if (RC == &AMDGPU::AGPR_32RegClass) {
948 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
949 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
950 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
951 .addReg(SrcReg, getKillRegState(KillSrc));
952 return;
953 }
954
955 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
956 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
957 .addReg(SrcReg, getKillRegState(KillSrc));
958 return;
959 }
960
961 // FIXME: Pass should maintain scavenger to avoid scan through the block on
962 // every AGPR spill.
963 RegScavenger RS;
964 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
965 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
966 return;
967 }
968
969 if (Size == 16) {
970 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
971 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
972 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
973
974 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
975 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
976 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
977 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
978 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
979 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
980 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
981 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
982
983 if (IsSGPRDst) {
984 if (!IsSGPRSrc) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
986 return;
987 }
988
989 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
990 .addReg(NewSrcReg, getKillRegState(KillSrc));
991 return;
992 }
993
994 if (IsAGPRDst || IsAGPRSrc) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg with an AGPR!");
998 }
999
1000 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1001 return;
1002 }
1003
1004 if (ST.useRealTrue16Insts()) {
1005 if (IsSGPRSrc) {
1006 assert(SrcLow);
1007 SrcReg = NewSrcReg;
1008 }
1009 // Use the smaller instruction encoding if possible.
1010 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1011 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1012 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1013 .addReg(SrcReg);
1014 } else {
1015 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1016 .addImm(0) // src0_modifiers
1017 .addReg(SrcReg)
1018 .addImm(0); // op_sel
1019 }
1020 return;
1021 }
1022
1023 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1024 if (!DstLow || !SrcLow) {
1025 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1026 "Cannot use hi16 subreg on VI!");
1027 }
1028
1029 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1030 .addReg(NewSrcReg, getKillRegState(KillSrc));
1031 return;
1032 }
1033
1034 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1035 .addImm(0) // src0_modifiers
1036 .addReg(NewSrcReg)
1037 .addImm(0) // clamp
1038 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1039 : AMDGPU::SDWA::SdwaSel::WORD_1)
1040 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1041 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1042 : AMDGPU::SDWA::SdwaSel::WORD_1)
1043 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1044 // First implicit operand is $exec.
1045 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1046 return;
1047 }
1048
1049 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1050 if (ST.hasMovB64()) {
1051 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1052 .addReg(SrcReg, getKillRegState(KillSrc));
1053 return;
1054 }
1055 if (ST.hasPkMovB32()) {
1056 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1057 .addImm(SISrcMods::OP_SEL_1)
1058 .addReg(SrcReg)
1059 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1060 .addReg(SrcReg)
1061 .addImm(0) // op_sel_lo
1062 .addImm(0) // op_sel_hi
1063 .addImm(0) // neg_lo
1064 .addImm(0) // neg_hi
1065 .addImm(0) // clamp
1066 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1067 return;
1068 }
1069 }
1070
1071 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1072 if (RI.isSGPRClass(RC)) {
1073 if (!RI.isSGPRClass(SrcRC)) {
1074 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1075 return;
1076 }
1077 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1078 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1079 Forward);
1080 return;
1081 }
1082
1083 unsigned EltSize = 4;
1084 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1085 if (RI.isAGPRClass(RC)) {
1086 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1087 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1088 else if (RI.hasVGPRs(SrcRC) ||
1089 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1090 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1091 else
1092 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1093 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1094 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1095 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1096 (RI.isProperlyAlignedRC(*RC) &&
1097 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1098 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1099 if (ST.hasMovB64()) {
1100 Opcode = AMDGPU::V_MOV_B64_e32;
1101 EltSize = 8;
1102 } else if (ST.hasPkMovB32()) {
1103 Opcode = AMDGPU::V_PK_MOV_B32;
1104 EltSize = 8;
1105 }
1106 }
1107
1108 // For the cases where we need an intermediate instruction/temporary register
1109 // (destination is an AGPR), we need a scavenger.
1110 //
1111 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1112 // whole block for every handled copy.
1113 std::unique_ptr<RegScavenger> RS;
1114 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1115 RS = std::make_unique<RegScavenger>();
1116
1117 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1118
1119 // If there is an overlap, we can't kill the super-register on the last
1120 // instruction, since it will also kill the components made live by this def.
1121 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1122 const bool CanKillSuperReg = KillSrc && !Overlap;
1123
1124 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1125 unsigned SubIdx;
1126 if (Forward)
1127 SubIdx = SubIndices[Idx];
1128 else
1129 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1130 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1131 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1132 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1133
1134 bool IsFirstSubreg = Idx == 0;
1135 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1136
1137 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1138 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1139 Register ImpUseSuper = SrcReg;
1140 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1141 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1142 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1143 MachineInstrBuilder MIB =
1144 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1145 .addImm(SISrcMods::OP_SEL_1)
1146 .addReg(SrcSubReg)
1147 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1148 .addReg(SrcSubReg)
1149 .addImm(0) // op_sel_lo
1150 .addImm(0) // op_sel_hi
1151 .addImm(0) // neg_lo
1152 .addImm(0) // neg_hi
1153 .addImm(0) // clamp
1154 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1155 if (IsFirstSubreg)
1156 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1157 } else {
1158 MachineInstrBuilder Builder =
1159 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1160 if (IsFirstSubreg)
1161 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1162
1163 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1164 }
1165 }
1166}
1167
1168int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1169 int NewOpc;
1170
1171 // Try to map original to commuted opcode
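 // (For example, a *_REV opcode such as V_SUBREV_F32 is the commuted form of
 // V_SUB_F32; getCommuteRev and getCommuteOrig map between the two.)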
1172 NewOpc = AMDGPU::getCommuteRev(Opcode);
1173 if (NewOpc != -1)
1174 // Check if the commuted (REV) opcode exists on the target.
1175 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1176
1177 // Try to map commuted to original opcode
1178 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1179 if (NewOpc != -1)
1180 // Check if the original (non-REV) opcode exists on the target.
1181 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1182
1183 return Opcode;
1184}
1185
1186const TargetRegisterClass *
1187SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1188 return &AMDGPU::VGPR_32RegClass;
1189}
1190
1191void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1192 MachineBasicBlock::iterator I,
1193 const DebugLoc &DL, Register DstReg,
1194 ArrayRef<MachineOperand> Cond,
1195 Register TrueReg,
1196 Register FalseReg) const {
1197 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1198 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1200 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1201 "Not a VGPR32 reg");
1202
1203 if (Cond.size() == 1) {
1204 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1205 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1206 .add(Cond[0]);
1207 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1208 .addImm(0)
1209 .addReg(FalseReg)
1210 .addImm(0)
1211 .addReg(TrueReg)
1212 .addReg(SReg);
1213 } else if (Cond.size() == 2) {
1214 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1215 switch (Cond[0].getImm()) {
1216 case SIInstrInfo::SCC_TRUE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::SCC_FALSE: {
1228 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1229 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1230 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1231 .addImm(0)
1232 .addReg(FalseReg)
1233 .addImm(0)
1234 .addReg(TrueReg)
1235 .addReg(SReg);
1236 break;
1237 }
1238 case SIInstrInfo::VCCNZ: {
1239 MachineOperand RegOp = Cond[1];
1240 RegOp.setImplicit(false);
1241 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1242 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1243 .add(RegOp);
1244 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1245 .addImm(0)
1246 .addReg(FalseReg)
1247 .addImm(0)
1248 .addReg(TrueReg)
1249 .addReg(SReg);
1250 break;
1251 }
1252 case SIInstrInfo::VCCZ: {
1253 MachineOperand RegOp = Cond[1];
1254 RegOp.setImplicit(false);
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1257 .add(RegOp);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(TrueReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::EXECNZ: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1269 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1270 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1278 }
1279 case SIInstrInfo::EXECZ: {
1280 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1281 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1282 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1283 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1284 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1285 .addImm(0)
1286 .addReg(FalseReg)
1287 .addImm(0)
1288 .addReg(TrueReg)
1289 .addReg(SReg);
1290 llvm_unreachable("Unhandled branch predicate EXECZ");
1291 break;
1292 }
1293 default:
1294 llvm_unreachable("invalid branch predicate");
1295 }
1296 } else {
1297 llvm_unreachable("Can only handle Cond size 1 or 2");
1298 }
1299}
1300
1301Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1302 MachineBasicBlock::iterator I,
1303 const DebugLoc &DL,
1304 Register SrcReg, int Value) const {
1305 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1306 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1307 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1308 .addImm(Value)
1309 .addReg(SrcReg);
1310
1311 return Reg;
1312}
1313
1314Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1315 MachineBasicBlock::iterator I,
1316 const DebugLoc &DL,
1317 Register SrcReg, int Value) const {
1318 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1319 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1320 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1321 .addImm(Value)
1322 .addReg(SrcReg);
1323
1324 return Reg;
1325}
1326
1327bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1328 const Register Reg,
1329 int64_t &ImmVal) const {
1330 switch (MI.getOpcode()) {
1331 case AMDGPU::V_MOV_B32_e32:
1332 case AMDGPU::S_MOV_B32:
1333 case AMDGPU::S_MOVK_I32:
1334 case AMDGPU::S_MOV_B64:
1335 case AMDGPU::V_MOV_B64_e32:
1336 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1337 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1338 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1339 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1340 case AMDGPU::V_MOV_B64_PSEUDO: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = Src0.getImm();
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_BREV_B32:
1350 case AMDGPU::V_BFREV_B32_e32:
1351 case AMDGPU::V_BFREV_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 case AMDGPU::S_NOT_B32:
1361 case AMDGPU::V_NOT_B32_e32:
1362 case AMDGPU::V_NOT_B32_e64: {
1363 const MachineOperand &Src0 = MI.getOperand(1);
1364 if (Src0.isImm()) {
1365 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1366 return MI.getOperand(0).getReg() == Reg;
1367 }
1368
1369 return false;
1370 }
1371 default:
1372 return false;
1373 }
1374}
1375
1376unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1377
1378 if (RI.isAGPRClass(DstRC))
1379 return AMDGPU::COPY;
1380 if (RI.getRegSizeInBits(*DstRC) == 16) {
1381 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1382 // before RA.
1383 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1384 }
1385 if (RI.getRegSizeInBits(*DstRC) == 32)
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1387 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1388 return AMDGPU::S_MOV_B64;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1390 return AMDGPU::V_MOV_B64_PSEUDO;
1391 return AMDGPU::COPY;
1392}
1393
1394const MCInstrDesc &
1395SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1396 bool IsIndirectSrc) const {
1397 if (IsIndirectSrc) {
1398 if (VecSize <= 32) // 4 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1400 if (VecSize <= 64) // 8 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1402 if (VecSize <= 96) // 12 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1404 if (VecSize <= 128) // 16 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1406 if (VecSize <= 160) // 20 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1408 if (VecSize <= 256) // 32 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1410 if (VecSize <= 288) // 36 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1412 if (VecSize <= 320) // 40 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1414 if (VecSize <= 352) // 44 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1416 if (VecSize <= 384) // 48 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1418 if (VecSize <= 512) // 64 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1420 if (VecSize <= 1024) // 128 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1422
1423 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1424 }
1425
1426 if (VecSize <= 32) // 4 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1428 if (VecSize <= 64) // 8 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1430 if (VecSize <= 96) // 12 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1432 if (VecSize <= 128) // 16 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1434 if (VecSize <= 160) // 20 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1436 if (VecSize <= 256) // 32 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1438 if (VecSize <= 288) // 36 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1440 if (VecSize <= 320) // 40 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1442 if (VecSize <= 352) // 44 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1444 if (VecSize <= 384) // 48 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1446 if (VecSize <= 512) // 64 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1448 if (VecSize <= 1024) // 128 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1450
1451 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1452}
1453
1454static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1455 if (VecSize <= 32) // 4 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1457 if (VecSize <= 64) // 8 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1459 if (VecSize <= 96) // 12 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1461 if (VecSize <= 128) // 16 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1463 if (VecSize <= 160) // 20 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1465 if (VecSize <= 256) // 32 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1467 if (VecSize <= 288) // 36 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1469 if (VecSize <= 320) // 40 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1471 if (VecSize <= 352) // 44 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1473 if (VecSize <= 384) // 48 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1475 if (VecSize <= 512) // 64 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1477 if (VecSize <= 1024) // 128 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1479
1480 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1481}
1482
1483static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1484 if (VecSize <= 32) // 4 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1486 if (VecSize <= 64) // 8 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1488 if (VecSize <= 96) // 12 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1490 if (VecSize <= 128) // 16 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1492 if (VecSize <= 160) // 20 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1513 if (VecSize <= 64) // 8 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1515 if (VecSize <= 128) // 16 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1517 if (VecSize <= 256) // 32 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527const MCInstrDesc &
1528SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1529 bool IsSGPR) const {
1530 if (IsSGPR) {
1531 switch (EltSize) {
1532 case 32:
1533 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1534 case 64:
1535 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1536 default:
1537 llvm_unreachable("invalid reg indexing elt size");
1538 }
1539 }
1540
1541 assert(EltSize == 32 && "invalid reg indexing elt size");
1542 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1543}
1544
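// Map a spill size in bytes to the corresponding SGPR spill pseudo, e.g.
// 4 bytes (one SGPR) gives SI_SPILL_S32_SAVE and 8 bytes (an SGPR pair)
// gives SI_SPILL_S64_SAVE.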
1545static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1546 switch (Size) {
1547 case 4:
1548 return AMDGPU::SI_SPILL_S32_SAVE;
1549 case 8:
1550 return AMDGPU::SI_SPILL_S64_SAVE;
1551 case 12:
1552 return AMDGPU::SI_SPILL_S96_SAVE;
1553 case 16:
1554 return AMDGPU::SI_SPILL_S128_SAVE;
1555 case 20:
1556 return AMDGPU::SI_SPILL_S160_SAVE;
1557 case 24:
1558 return AMDGPU::SI_SPILL_S192_SAVE;
1559 case 28:
1560 return AMDGPU::SI_SPILL_S224_SAVE;
1561 case 32:
1562 return AMDGPU::SI_SPILL_S256_SAVE;
1563 case 36:
1564 return AMDGPU::SI_SPILL_S288_SAVE;
1565 case 40:
1566 return AMDGPU::SI_SPILL_S320_SAVE;
1567 case 44:
1568 return AMDGPU::SI_SPILL_S352_SAVE;
1569 case 48:
1570 return AMDGPU::SI_SPILL_S384_SAVE;
1571 case 64:
1572 return AMDGPU::SI_SPILL_S512_SAVE;
1573 case 128:
1574 return AMDGPU::SI_SPILL_S1024_SAVE;
1575 default:
1576 llvm_unreachable("unknown register size");
1577 }
1578}
1579
1580static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1581 switch (Size) {
1582 case 2:
1583 return AMDGPU::SI_SPILL_V16_SAVE;
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getAVSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_AV32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_AV64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_AV96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_AV128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_AV160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_AV192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_AV224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_AV256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_AV288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_AV320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_AV352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_AV384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_AV512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_AV1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1649 }
1650}
1651
1652static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1653 bool IsVectorSuperClass) {
1654 // Currently, only 32-bit WWM register spills are needed.
1655 if (Size != 4)
1656 llvm_unreachable("unknown wwm register spill size");
1657
1658 if (IsVectorSuperClass)
1659 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1660
1661 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1662}
1663
1664unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1665 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1666 const SIMachineFunctionInfo &MFI) const {
1667 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1668
1669 // Choose the right opcode if spilling a WWM register.
1670 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1671 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1672
1673 // TODO: Check if AGPRs are available
1674 if (ST.hasMAIInsts())
1675 return getAVSpillSaveOpcode(Size);
1676
1677 return getVGPRSpillSaveOpcode(Size);
1678}
1679
1680void SIInstrInfo::storeRegToStackSlot(
1681 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1682 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1683 const TargetRegisterInfo *TRI, Register VReg,
1684 MachineInstr::MIFlag Flags) const {
1685 MachineFunction *MF = MBB.getParent();
1686 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1687 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1688 const DebugLoc &DL = MBB.findDebugLoc(MI);
1689
1690 MachinePointerInfo PtrInfo
1691 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1692 MachineMemOperand *MMO = MF->getMachineMemOperand(
1693 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1694 FrameInfo.getObjectAlign(FrameIndex));
1695 unsigned SpillSize = TRI->getSpillSize(*RC);
1696
1698 if (RI.isSGPRClass(RC)) {
1699 MFI->setHasSpilledSGPRs();
1700 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1701 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1702 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1703
1704 // We are only allowed to create one new instruction when spilling
1705 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1706 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1707
1708 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1709 // to make sure we are using the correct register class.
1710 if (SrcReg.isVirtual() && SpillSize == 4) {
1711 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1712 }
1713
1714 BuildMI(MBB, MI, DL, OpDesc)
1715 .addReg(SrcReg, getKillRegState(isKill)) // data
1716 .addFrameIndex(FrameIndex) // addr
1717 .addMemOperand(MMO)
1718 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1719
1720 if (RI.spillSGPRToVGPR())
1721 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1722 return;
1723 }
1724
1725 unsigned Opcode =
1726 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1727 MFI->setHasSpilledVGPRs();
1728
1729 BuildMI(MBB, MI, DL, get(Opcode))
1730 .addReg(SrcReg, getKillRegState(isKill)) // data
1731 .addFrameIndex(FrameIndex) // addr
1732 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1733 .addImm(0) // offset
1734 .addMemOperand(MMO);
1735}
1736
1737static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1738 switch (Size) {
1739 case 4:
1740 return AMDGPU::SI_SPILL_S32_RESTORE;
1741 case 8:
1742 return AMDGPU::SI_SPILL_S64_RESTORE;
1743 case 12:
1744 return AMDGPU::SI_SPILL_S96_RESTORE;
1745 case 16:
1746 return AMDGPU::SI_SPILL_S128_RESTORE;
1747 case 20:
1748 return AMDGPU::SI_SPILL_S160_RESTORE;
1749 case 24:
1750 return AMDGPU::SI_SPILL_S192_RESTORE;
1751 case 28:
1752 return AMDGPU::SI_SPILL_S224_RESTORE;
1753 case 32:
1754 return AMDGPU::SI_SPILL_S256_RESTORE;
1755 case 36:
1756 return AMDGPU::SI_SPILL_S288_RESTORE;
1757 case 40:
1758 return AMDGPU::SI_SPILL_S320_RESTORE;
1759 case 44:
1760 return AMDGPU::SI_SPILL_S352_RESTORE;
1761 case 48:
1762 return AMDGPU::SI_SPILL_S384_RESTORE;
1763 case 64:
1764 return AMDGPU::SI_SPILL_S512_RESTORE;
1765 case 128:
1766 return AMDGPU::SI_SPILL_S1024_RESTORE;
1767 default:
1768 llvm_unreachable("unknown register size");
1769 }
1770}
1771
1772static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1773 switch (Size) {
1774 case 2:
1775 return AMDGPU::SI_SPILL_V16_RESTORE;
1776 case 4:
1777 return AMDGPU::SI_SPILL_V32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_V64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_V96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_V128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_V160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_V192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_V224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_V256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_V288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_V320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_V352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_V384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_V512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_V1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_AV32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_AV64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_AV96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_AV128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_AV160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_AV192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_AV224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_AV256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_AV288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_AV320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_AV352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_AV384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_AV512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1845 bool IsVectorSuperClass) {
1846 // Currently, only 32-bit WWM register spills are needed.
1847 if (Size != 4)
1848 llvm_unreachable("unknown wwm register spill size");
1849
1850 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1851 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1852
1853 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1854}
1855
1856 unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1857 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1858 const SIMachineFunctionInfo &MFI) const {
1859 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1860
1861 // Choose the right opcode if restoring a WWM register.
1862 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1863 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1864
1865 // TODO: Check if AGPRs are available
1866 if (ST.hasMAIInsts())
1867 return getAVSpillRestoreOpcode(Size);
1868
1869 assert(!RI.isAGPRClass(RC));
1870 return getVGPRSpillRestoreOpcode(Size);
1871}
1872
1873 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1874 MachineBasicBlock::iterator MI,
1875 Register DestReg, int FrameIndex,
1876 const TargetRegisterClass *RC,
1877 const TargetRegisterInfo *TRI,
1878 Register VReg,
1879 MachineInstr::MIFlag Flags) const {
1880 MachineFunction *MF = MBB.getParent();
1881 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1882 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1883 const DebugLoc &DL = MBB.findDebugLoc(MI);
1884 unsigned SpillSize = TRI->getSpillSize(*RC);
1885
1886 MachinePointerInfo PtrInfo
1887 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1888
1889 MachineMemOperand *MMO = MF->getMachineMemOperand(
1890 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1891 FrameInfo.getObjectAlign(FrameIndex));
1892
1893 if (RI.isSGPRClass(RC)) {
1894 MFI->setHasSpilledSGPRs();
1895 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1896 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1897 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1898
1899 // FIXME: Maybe this should not include a memoperand because it will be
1900 // lowered to non-memory instructions.
1901 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1902 if (DestReg.isVirtual() && SpillSize == 4) {
1903 MachineRegisterInfo &MRI = MF->getRegInfo();
1904 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1905 }
1906
1907 if (RI.spillSGPRToVGPR())
1908 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1909 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1910 .addFrameIndex(FrameIndex) // addr
1911 .addMemOperand(MMO)
1912 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1913
1914 return;
1915 }
1916
1917 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1918 SpillSize, *MFI);
1919 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1920 .addFrameIndex(FrameIndex) // vaddr
1921 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1922 .addImm(0) // offset
1923 .addMemOperand(MMO);
1924}
1925
1926 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1927 MachineBasicBlock::iterator MI) const {
1928 insertNoops(MBB, MI, 1);
1929 }
1930
1931 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1932 MachineBasicBlock::iterator MI,
1933 unsigned Quantity) const {
1934 DebugLoc DL = MBB.findDebugLoc(MI);
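// Note that S_NOP's immediate encodes one fewer than the number of nop wait
// states it represents (see getNumWaitStates below), so each S_NOP emitted
// here covers up to 8 nops.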
1935 while (Quantity > 0) {
1936 unsigned Arg = std::min(Quantity, 8u);
1937 Quantity -= Arg;
1938 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1939 }
1940}
1941
1942 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1943 auto *MF = MBB.getParent();
1944 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1945
1946 assert(Info->isEntryFunction());
1947
1948 if (MBB.succ_empty()) {
1949 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1950 if (HasNoTerminator) {
1951 if (Info->returnsVoid()) {
1952 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1953 } else {
1954 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1955 }
1956 }
1957 }
1958}
1959
1960 MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1961 MachineBasicBlock &MBB,
1962 MachineInstr &MI,
1963 const DebugLoc &DL) const {
1964 MachineFunction *MF = MBB.getParent();
1965 constexpr unsigned DoorbellIDMask = 0x3ff;
1966 constexpr unsigned ECQueueWaveAbort = 0x400;
1967
1968 MachineBasicBlock *TrapBB = &MBB;
1969 MachineBasicBlock *ContBB = &MBB;
1970 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1971
1972 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1973 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1974 TrapBB = MF->CreateMachineBasicBlock();
1975 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1976 MF->push_back(TrapBB);
1977 MBB.addSuccessor(TrapBB);
1978 }
1979
1980 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround, this
1981 // will be a nop.
1982 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1983 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1984 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1985 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1986 DoorbellReg)
1988 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1989 .addUse(AMDGPU::M0);
1990 Register DoorbellRegMasked =
1991 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1992 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1993 .addUse(DoorbellReg)
1994 .addImm(DoorbellIDMask);
1995 Register SetWaveAbortBit =
1996 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1997 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1998 .addUse(DoorbellRegMasked)
1999 .addImm(ECQueueWaveAbort);
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2001 .addUse(SetWaveAbortBit);
2002 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2004 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2005 .addUse(AMDGPU::TTMP2);
2006 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2007 TrapBB->addSuccessor(HaltLoopBB);
2008
2009 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2010 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2011 .addMBB(HaltLoopBB);
2012 MF->push_back(HaltLoopBB);
2013 HaltLoopBB->addSuccessor(HaltLoopBB);
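// The halt loop branches back to itself, so a wave that gets un-halted (e.g.
// by the trap handler) simply halts again instead of running ahead.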
2014
2015 return ContBB;
2016}
2017
2018 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2019 switch (MI.getOpcode()) {
2020 default:
2021 if (MI.isMetaInstruction())
2022 return 0;
2023 return 1; // FIXME: Do wait states equal cycles?
2024
2025 case AMDGPU::S_NOP:
2026 return MI.getOperand(0).getImm() + 1;
2027 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2028 // hazard, even if one exists, won't really be visible. Should we handle it?
2029 }
2030}
2031
2032 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2033 MachineBasicBlock &MBB = *MI.getParent();
2034 DebugLoc DL = MBB.findDebugLoc(MI);
2036 switch (MI.getOpcode()) {
2037 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2038 case AMDGPU::S_MOV_B64_term:
2039 // This is only a terminator to get the correct spill code placement during
2040 // register allocation.
2041 MI.setDesc(get(AMDGPU::S_MOV_B64));
2042 break;
2043
2044 case AMDGPU::S_MOV_B32_term:
2045 // This is only a terminator to get the correct spill code placement during
2046 // register allocation.
2047 MI.setDesc(get(AMDGPU::S_MOV_B32));
2048 break;
2049
2050 case AMDGPU::S_XOR_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_XOR_B64));
2054 break;
2055
2056 case AMDGPU::S_XOR_B32_term:
2057 // This is only a terminator to get the correct spill code placement during
2058 // register allocation.
2059 MI.setDesc(get(AMDGPU::S_XOR_B32));
2060 break;
2061 case AMDGPU::S_OR_B64_term:
2062 // This is only a terminator to get the correct spill code placement during
2063 // register allocation.
2064 MI.setDesc(get(AMDGPU::S_OR_B64));
2065 break;
2066 case AMDGPU::S_OR_B32_term:
2067 // This is only a terminator to get the correct spill code placement during
2068 // register allocation.
2069 MI.setDesc(get(AMDGPU::S_OR_B32));
2070 break;
2071
2072 case AMDGPU::S_ANDN2_B64_term:
2073 // This is only a terminator to get the correct spill code placement during
2074 // register allocation.
2075 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2076 break;
2077
2078 case AMDGPU::S_ANDN2_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2082 break;
2083
2084 case AMDGPU::S_AND_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_AND_B64));
2088 break;
2089
2090 case AMDGPU::S_AND_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_AND_B32));
2094 break;
2095
2096 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2100 break;
2101
2102 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2106 break;
2107
2108 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2109 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2110 break;
2111
2112 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2113 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2114 MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2115 &AMDGPU::SReg_32_XM0RegClass);
2116 break;
2117 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2118 Register Dst = MI.getOperand(0).getReg();
2119 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2120 MI.setDesc(
2121 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2122 break;
2123 }
2124 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2125 Register Dst = MI.getOperand(0).getReg();
2126 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2127 int64_t Imm = MI.getOperand(1).getImm();
2128
2129 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2130 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2131 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2132 .addImm(SignExtend64<32>(Imm))
2133 .addReg(Dst, RegState::Implicit | RegState::Define);
2134 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2135 .addImm(SignExtend64<32>(Imm >> 32))
2137 MI.eraseFromParent();
2138 break;
2139 }
2140
2141 [[fallthrough]];
2142 }
2143 case AMDGPU::V_MOV_B64_PSEUDO: {
2144 Register Dst = MI.getOperand(0).getReg();
2145 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2146 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2147
2148 const MachineOperand &SrcOp = MI.getOperand(1);
2149 // FIXME: Will this work for 64-bit floating point immediates?
2150 assert(!SrcOp.isFPImm());
2151 if (ST.hasMovB64()) {
2152 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2153 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2154 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2155 break;
2156 }
2157 if (SrcOp.isImm()) {
2158 APInt Imm(64, SrcOp.getImm());
2159 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2160 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2161 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2162 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2163 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2164 .addImm(Lo.getSExtValue())
2165 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2166 .addImm(Lo.getSExtValue())
2167 .addImm(0) // op_sel_lo
2168 .addImm(0) // op_sel_hi
2169 .addImm(0) // neg_lo
2170 .addImm(0) // neg_hi
2171 .addImm(0); // clamp
2172 } else {
2173 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2174 .addImm(Lo.getSExtValue())
2176 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2177 .addImm(Hi.getSExtValue())
2179 }
2180 } else {
2181 assert(SrcOp.isReg());
2182 if (ST.hasPkMovB32() &&
2183 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2184 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2185 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2186 .addReg(SrcOp.getReg())
2187 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2188 .addReg(SrcOp.getReg())
2189 .addImm(0) // op_sel_lo
2190 .addImm(0) // op_sel_hi
2191 .addImm(0) // neg_lo
2192 .addImm(0) // neg_hi
2193 .addImm(0); // clamp
2194 } else {
2195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2196 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2198 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2199 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2201 }
2202 }
2203 MI.eraseFromParent();
2204 break;
2205 }
2206 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2207 expandMovDPP64(MI);
2208 break;
2209 }
2210 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2211 const MachineOperand &SrcOp = MI.getOperand(1);
2212 assert(!SrcOp.isFPImm());
2213
2214 if (ST.has64BitLiterals()) {
2215 MI.setDesc(get(AMDGPU::S_MOV_B64));
2216 break;
2217 }
2218
2219 APInt Imm(64, SrcOp.getImm());
2220 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2221 MI.setDesc(get(AMDGPU::S_MOV_B64));
2222 break;
2223 }
2224
2225 Register Dst = MI.getOperand(0).getReg();
2226 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2227 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2228
2229 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2230 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2231 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2232 .addImm(Lo.getSExtValue())
2234 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2235 .addImm(Hi.getSExtValue())
2237 MI.eraseFromParent();
2238 break;
2239 }
2240 case AMDGPU::V_SET_INACTIVE_B32: {
2241 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2242 Register DstReg = MI.getOperand(0).getReg();
2243 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2244 .add(MI.getOperand(3))
2245 .add(MI.getOperand(4))
2246 .add(MI.getOperand(1))
2247 .add(MI.getOperand(2))
2248 .add(MI.getOperand(5));
2249 MI.eraseFromParent();
2250 break;
2251 }
2252 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2253 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2254 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2255 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2256 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2257 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2258 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2259 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2260 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2268 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2269 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2270 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2271 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2272 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2273 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2274 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2281 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2282
2283 unsigned Opc;
2284 if (RI.hasVGPRs(EltRC)) {
2285 Opc = AMDGPU::V_MOVRELD_B32_e32;
2286 } else {
2287 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2288 : AMDGPU::S_MOVRELD_B32;
2289 }
2290
2291 const MCInstrDesc &OpDesc = get(Opc);
2292 Register VecReg = MI.getOperand(0).getReg();
2293 bool IsUndef = MI.getOperand(1).isUndef();
2294 unsigned SubReg = MI.getOperand(3).getImm();
2295 assert(VecReg == MI.getOperand(1).getReg());
2296
2297 MachineInstrBuilder MIB =
2298 BuildMI(MBB, MI, DL, OpDesc)
2299 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2300 .add(MI.getOperand(2))
2301 .addReg(VecReg, RegState::ImplicitDefine)
2302 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2303
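// Tie the trailing implicit def of VecReg to its implicit use so the partial
// write above is treated as a read-modify-write of the whole vector register.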
2304 const int ImpDefIdx =
2305 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2306 const int ImpUseIdx = ImpDefIdx + 1;
2307 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2308 MI.eraseFromParent();
2309 break;
2310 }
2311 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2321 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2322 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2323 assert(ST.useVGPRIndexMode());
2324 Register VecReg = MI.getOperand(0).getReg();
2325 bool IsUndef = MI.getOperand(1).isUndef();
2326 MachineOperand &Idx = MI.getOperand(3);
2327 Register SubReg = MI.getOperand(4).getImm();
2328
2329 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2330 .add(Idx)
2332 SetOn->getOperand(3).setIsUndef();
2333
2334 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2335 MachineInstrBuilder MIB =
2336 BuildMI(MBB, MI, DL, OpDesc)
2337 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2338 .add(MI.getOperand(2))
2339 .addReg(VecReg, RegState::ImplicitDefine)
2340 .addReg(VecReg,
2341 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2342
2343 const int ImpDefIdx =
2344 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2345 const int ImpUseIdx = ImpDefIdx + 1;
2346 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2347
2348 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2349
2350 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2351
2352 MI.eraseFromParent();
2353 break;
2354 }
2355 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2356 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2357 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2358 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2359 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2360 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2361 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2362 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2363 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2364 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2365 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2366 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2367 assert(ST.useVGPRIndexMode());
2368 Register Dst = MI.getOperand(0).getReg();
2369 Register VecReg = MI.getOperand(1).getReg();
2370 bool IsUndef = MI.getOperand(1).isUndef();
2371 Register Idx = MI.getOperand(2).getReg();
2372 Register SubReg = MI.getOperand(3).getImm();
2373
2374 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2375 .addReg(Idx)
2377 SetOn->getOperand(3).setIsUndef();
2378
2379 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2380 .addDef(Dst)
2381 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2382 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2383
2384 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2385
2386 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2387
2388 MI.eraseFromParent();
2389 break;
2390 }
2391 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2392 MachineFunction &MF = *MBB.getParent();
2393 Register Reg = MI.getOperand(0).getReg();
2394 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2395 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2396 MachineOperand OpLo = MI.getOperand(1);
2397 MachineOperand OpHi = MI.getOperand(2);
2398
2399 // Create a bundle so these instructions won't be re-ordered by the
2400 // post-RA scheduler.
2401 MIBundleBuilder Bundler(MBB, MI);
2402 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2403
2404 // What we want here is an offset from the value returned by s_getpc (which
2405 // is the address of the s_add_u32 instruction) to the global variable, but
2406 // since the encoding of $symbol starts 4 bytes after the start of the
2407 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2408 // small. This requires us to add 4 to the global variable offset in order
2409 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2410 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2411 // instruction.
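// In other words, the low-half symbol operand ends up with (offset + Adjust + 4)
// and the high-half symbol operand with (offset + Adjust + 12), where Adjust
// accounts for any hardware-workaround instructions inserted below.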
2412
2413 int64_t Adjust = 0;
2414 if (ST.hasGetPCZeroExtension()) {
2415 // Fix up hardware that does not sign-extend the 48-bit PC value by
2416 // inserting: s_sext_i32_i16 reghi, reghi
2417 Bundler.append(
2418 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2419 Adjust += 4;
2420 }
2421
2422 if (OpLo.isGlobal())
2423 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2424 Bundler.append(
2425 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2426
2427 if (OpHi.isGlobal())
2428 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2429 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2430 .addReg(RegHi)
2431 .add(OpHi));
2432
2433 finalizeBundle(MBB, Bundler.begin());
2434
2435 MI.eraseFromParent();
2436 break;
2437 }
2438 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2439 MachineFunction &MF = *MBB.getParent();
2440 Register Reg = MI.getOperand(0).getReg();
2441 MachineOperand Op = MI.getOperand(1);
2442
2443 // Create a bundle so these instructions won't be re-ordered by the
2444 // post-RA scheduler.
2445 MIBundleBuilder Bundler(MBB, MI);
2446 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2447 if (Op.isGlobal())
2448 Op.setOffset(Op.getOffset() + 4);
2449 Bundler.append(
2450 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2451
2452 finalizeBundle(MBB, Bundler.begin());
2453
2454 MI.eraseFromParent();
2455 break;
2456 }
2457 case AMDGPU::ENTER_STRICT_WWM: {
2458 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2459 // Whole Wave Mode is entered.
2460 MI.setDesc(get(LMC.OrSaveExecOpc));
2461 break;
2462 }
2463 case AMDGPU::ENTER_STRICT_WQM: {
2464 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2465 // STRICT_WQM is entered.
2466 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2467 .addReg(LMC.ExecReg);
2468 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2469
2470 MI.eraseFromParent();
2471 break;
2472 }
2473 case AMDGPU::EXIT_STRICT_WWM:
2474 case AMDGPU::EXIT_STRICT_WQM: {
2475 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2476 // WWM/STRICT_WQM is exited.
2477 MI.setDesc(get(LMC.MovOpc));
2478 break;
2479 }
2480 case AMDGPU::SI_RETURN: {
2481 const MachineFunction *MF = MBB.getParent();
2482 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2483 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2484 // Hiding the return address use with SI_RETURN may lead to extra kills in
2485 // the function and missing live-ins. We are fine in practice because callee
2486 // saved register handling ensures the register value is restored before
2487 // RET, but we need the undef flag here to appease the MachineVerifier
2488 // liveness checks.
2489 MachineInstrBuilder MIB =
2490 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2491 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2492
2493 MIB.copyImplicitOps(MI);
2494 MI.eraseFromParent();
2495 break;
2496 }
2497
2498 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2499 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2500 MI.setDesc(get(AMDGPU::S_MUL_U64));
2501 break;
2502
2503 case AMDGPU::S_GETPC_B64_pseudo:
2504 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2505 if (ST.hasGetPCZeroExtension()) {
2506 Register Dst = MI.getOperand(0).getReg();
2507 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2508 // Fix up hardware that does not sign-extend the 48-bit PC value by
2509 // inserting: s_sext_i32_i16 dsthi, dsthi
2510 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2511 DstHi)
2512 .addReg(DstHi);
2513 }
2514 break;
2515
2516 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2517 assert(ST.hasBF16PackedInsts());
2518 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2519 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2520 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2521 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2522 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2523 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2524 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2525 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2526 break;
2527 }
2528
2529 return true;
2530}
2531
2532 void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2533 MachineBasicBlock::iterator I, Register DestReg,
2534 unsigned SubIdx, const MachineInstr &Orig,
2535 const TargetRegisterInfo &RI) const {
2536
2537 // Try shrinking the instruction to remat only the part needed for current
2538 // context.
2539 // TODO: Handle more cases.
2540 unsigned Opcode = Orig.getOpcode();
2541 switch (Opcode) {
2542 case AMDGPU::S_LOAD_DWORDX16_IMM:
2543 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2544 if (SubIdx != 0)
2545 break;
2546
2547 if (I == MBB.end())
2548 break;
2549
2550 if (I->isBundled())
2551 break;
2552
2553 // Look for a single use of the register that is also a subreg.
2554 Register RegToFind = Orig.getOperand(0).getReg();
2555 MachineOperand *UseMO = nullptr;
2556 for (auto &CandMO : I->operands()) {
2557 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2558 continue;
2559 if (UseMO) {
2560 UseMO = nullptr;
2561 break;
2562 }
2563 UseMO = &CandMO;
2564 }
2565 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2566 break;
2567
2568 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2569 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
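// Offset and SubregSize are in bits; the load's offset operand and the memory
// operand size are in bytes, hence the divisions by 8 below.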
2570
2571 MachineFunction *MF = MBB.getParent();
2572 MachineRegisterInfo &MRI = MF->getRegInfo();
2573 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2574
2575 unsigned NewOpcode = -1;
2576 if (SubregSize == 256)
2577 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2578 else if (SubregSize == 128)
2579 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2580 else
2581 break;
2582
2583 const MCInstrDesc &TID = get(NewOpcode);
2584 const TargetRegisterClass *NewRC =
2585 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2586 MRI.setRegClass(DestReg, NewRC);
2587
2588 UseMO->setReg(DestReg);
2589 UseMO->setSubReg(AMDGPU::NoSubRegister);
2590
2591 // Use a smaller load with the desired size, possibly with updated offset.
2592 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2593 MI->setDesc(TID);
2594 MI->getOperand(0).setReg(DestReg);
2595 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2596 if (Offset) {
2597 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2598 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2599 OffsetMO->setImm(FinalOffset);
2600 }
2601 SmallVector<MachineMemOperand *> NewMMOs;
2602 for (const MachineMemOperand *MemOp : Orig.memoperands())
2603 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2604 SubregSize / 8));
2605 MI->setMemRefs(*MF, NewMMOs);
2606
2607 MBB.insert(I, MI);
2608 return;
2609 }
2610
2611 default:
2612 break;
2613 }
2614
2615 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2616}
2617
2618 std::pair<MachineInstr*, MachineInstr*>
2619 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2620 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2621
2622 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2624 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2625 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2626 return std::pair(&MI, nullptr);
2627 }
2628
2629 MachineBasicBlock &MBB = *MI.getParent();
2630 DebugLoc DL = MBB.findDebugLoc(MI);
2631 MachineFunction *MF = MBB.getParent();
2632 MachineRegisterInfo &MRI = MF->getRegInfo();
2633 Register Dst = MI.getOperand(0).getReg();
2634 unsigned Part = 0;
2635 MachineInstr *Split[2];
2636
2637 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2638 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2639 if (Dst.isPhysical()) {
2640 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2641 } else {
2642 assert(MRI.isSSA());
2643 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2644 MovDPP.addDef(Tmp);
2645 }
2646
2647 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2648 const MachineOperand &SrcOp = MI.getOperand(I);
2649 assert(!SrcOp.isFPImm());
2650 if (SrcOp.isImm()) {
2651 APInt Imm(64, SrcOp.getImm());
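// Shift so that the 32-bit half of the immediate belonging to this half of
// the 64-bit destination ends up in the low bits.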
2652 Imm.ashrInPlace(Part * 32);
2653 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2654 } else {
2655 assert(SrcOp.isReg());
2656 Register Src = SrcOp.getReg();
2657 if (Src.isPhysical())
2658 MovDPP.addReg(RI.getSubReg(Src, Sub));
2659 else
2660 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2661 }
2662 }
2663
2664 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2665 MovDPP.addImm(MO.getImm());
2666
2667 Split[Part] = MovDPP;
2668 ++Part;
2669 }
2670
2671 if (Dst.isVirtual())
2672 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2673 .addReg(Split[0]->getOperand(0).getReg())
2674 .addImm(AMDGPU::sub0)
2675 .addReg(Split[1]->getOperand(0).getReg())
2676 .addImm(AMDGPU::sub1);
2677
2678 MI.eraseFromParent();
2679 return std::pair(Split[0], Split[1]);
2680}
2681
2682 std::optional<DestSourcePair>
2683 SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2684 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2685 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2686
2687 return std::nullopt;
2688}
2689
2690 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2691 AMDGPU::OpName Src0OpName,
2692 MachineOperand &Src1,
2693 AMDGPU::OpName Src1OpName) const {
2694 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2695 if (!Src0Mods)
2696 return false;
2697
2698 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2699 assert(Src1Mods &&
2700 "All commutable instructions have both src0 and src1 modifiers");
2701
2702 int Src0ModsVal = Src0Mods->getImm();
2703 int Src1ModsVal = Src1Mods->getImm();
2704
2705 Src1Mods->setImm(Src0ModsVal);
2706 Src0Mods->setImm(Src1ModsVal);
2707 return true;
2708}
2709
2710 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2711 MachineOperand &RegOp,
2712 MachineOperand &NonRegOp) {
2713 Register Reg = RegOp.getReg();
2714 unsigned SubReg = RegOp.getSubReg();
2715 bool IsKill = RegOp.isKill();
2716 bool IsDead = RegOp.isDead();
2717 bool IsUndef = RegOp.isUndef();
2718 bool IsDebug = RegOp.isDebug();
2719
2720 if (NonRegOp.isImm())
2721 RegOp.ChangeToImmediate(NonRegOp.getImm());
2722 else if (NonRegOp.isFI())
2723 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2724 else if (NonRegOp.isGlobal()) {
2725 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2726 NonRegOp.getTargetFlags());
2727 } else
2728 return nullptr;
2729
2730 // Make sure we don't reinterpret a subreg index in the target flags.
2731 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2732
2733 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2734 NonRegOp.setSubReg(SubReg);
2735
2736 return &MI;
2737}
2738
2739 static MachineInstr *swapImmOperands(MachineInstr &MI,
2740 MachineOperand &NonRegOp1,
2741 MachineOperand &NonRegOp2) {
2742 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2743 int64_t NonRegVal = NonRegOp1.getImm();
2744
2745 NonRegOp1.setImm(NonRegOp2.getImm());
2746 NonRegOp2.setImm(NonRegVal);
2747 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2748 NonRegOp2.setTargetFlags(TargetFlags);
2749 return &MI;
2750}
2751
2752bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2753 unsigned OpIdx1) const {
2754 const MCInstrDesc &InstDesc = MI.getDesc();
2755 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2756 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2757
2758 unsigned Opc = MI.getOpcode();
2759 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2760
2761 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2762 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2763
2764 // The swap must not breach the constant bus or literal limits.
2765 // It may move a literal to a position other than src0, which is not allowed
2766 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2767 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2768 if (isVALU(MI)) {
2769 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2770 !isInlineConstant(MO0, OpInfo1))
2771 return false;
2772 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2773 !isInlineConstant(MO1, OpInfo0))
2774 return false;
2775 }
2776
2777 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2778 if (OpInfo1.RegClass == -1)
2779 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2780 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2781 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2782 }
2783 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2784 if (OpInfo0.RegClass == -1)
2785 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2786 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2787 isLegalRegOperand(MI, OpIdx0, MO1);
2788 }
2789
2790 // No need to check 64-bit literals, since swapping does not bring new
2791 // 64-bit literals into the current instruction to fold to 32-bit.
2792
2793 return isImmOperandLegal(MI, OpIdx1, MO0);
2794}
2795
2796 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2797 unsigned Src0Idx,
2798 unsigned Src1Idx) const {
2799 assert(!NewMI && "this should never be used");
2800
2801 unsigned Opc = MI.getOpcode();
2802 int CommutedOpcode = commuteOpcode(Opc);
2803 if (CommutedOpcode == -1)
2804 return nullptr;
2805
2806 if (Src0Idx > Src1Idx)
2807 std::swap(Src0Idx, Src1Idx);
2808
2809 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2810 static_cast<int>(Src0Idx) &&
2811 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2812 static_cast<int>(Src1Idx) &&
2813 "inconsistency with findCommutedOpIndices");
2814
2815 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2816 return nullptr;
2817
2818 MachineInstr *CommutedMI = nullptr;
2819 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2820 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2821 if (Src0.isReg() && Src1.isReg()) {
2822 // Be sure to copy the source modifiers to the right place.
2823 CommutedMI =
2824 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2825 } else if (Src0.isReg() && !Src1.isReg()) {
2826 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2827 } else if (!Src0.isReg() && Src1.isReg()) {
2828 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2829 } else if (Src0.isImm() && Src1.isImm()) {
2830 CommutedMI = swapImmOperands(MI, Src0, Src1);
2831 } else {
2832 // FIXME: Found two non registers to commute. This does happen.
2833 return nullptr;
2834 }
2835
2836 if (CommutedMI) {
2837 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2838 Src1, AMDGPU::OpName::src1_modifiers);
2839
2840 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2841 AMDGPU::OpName::src1_sel);
2842
2843 CommutedMI->setDesc(get(CommutedOpcode));
2844 }
2845
2846 return CommutedMI;
2847}
2848
2849// This needs to be implemented because the source modifiers may be inserted
2850// between the true commutable operands, and the base
2851 // TargetInstrInfo::commuteInstruction uses it.
2852 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2853 unsigned &SrcOpIdx0,
2854 unsigned &SrcOpIdx1) const {
2855 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2856}
2857
2858 bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2859 unsigned &SrcOpIdx0,
2860 unsigned &SrcOpIdx1) const {
2861 if (!Desc.isCommutable())
2862 return false;
2863
2864 unsigned Opc = Desc.getOpcode();
2865 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2866 if (Src0Idx == -1)
2867 return false;
2868
2869 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2870 if (Src1Idx == -1)
2871 return false;
2872
2873 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2874}
2875
2876 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2877 int64_t BrOffset) const {
2878 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2879 // because its dest block is unanalyzable.
2880 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2881
2882 // Convert to dwords.
2883 BrOffset /= 4;
2884
2885 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2886 // from the next instruction.
2887 BrOffset -= 1;
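// For example, a branch whose target is the very next instruction (a byte
// offset of 4) is encoded as SIMM16 == 0.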
2888
2889 return isIntN(BranchOffsetBits, BrOffset);
2890}
2891
2892 MachineBasicBlock *
2893 SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2894 return MI.getOperand(0).getMBB();
2895}
2896
2898 for (const MachineInstr &MI : MBB->terminators()) {
2899 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2900 MI.getOpcode() == AMDGPU::SI_LOOP)
2901 return true;
2902 }
2903 return false;
2904}
2905
2906 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2907 MachineBasicBlock &DestBB,
2908 MachineBasicBlock &RestoreBB,
2909 const DebugLoc &DL, int64_t BrOffset,
2910 RegScavenger *RS) const {
2911 assert(MBB.empty() &&
2912 "new block should be inserted for expanding unconditional branch");
2913 assert(MBB.pred_size() == 1);
2914 assert(RestoreBB.empty() &&
2915 "restore block should be inserted for restoring clobbered registers");
2916
2917 MachineFunction *MF = MBB.getParent();
2918 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2919 MachineRegisterInfo &MRI = MF->getRegInfo();
2920 auto I = MBB.end();
2921 auto &MCCtx = MF->getContext();
2922
2923 if (ST.hasAddPC64Inst()) {
2924 MCSymbol *Offset =
2925 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2926 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2927 .addSym(Offset, MO_FAR_BRANCH_OFFSET);
2928 MCSymbol *PostAddPCLabel =
2929 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2930 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2931 auto *OffsetExpr = MCBinaryExpr::createSub(
2932 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2933 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2934 Offset->setVariableValue(OffsetExpr);
2935 return;
2936 }
2937
2938 assert(RS && "RegScavenger required for long branching");
2939
2940 // FIXME: Virtual register workaround for RegScavenger not working with empty
2941 // blocks.
2942 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2943
2944 // Note: as this is used after hazard recognizer we need to apply some hazard
2945 // workarounds directly.
2946 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2947 ST.hasVALUReadSGPRHazard();
2948 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2949 if (FlushSGPRWrites)
2950 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2952 };
2953
2954 // We need to compute the offset relative to the instruction immediately after
2955 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2956 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2957 ApplyHazardWorkarounds();
2958
2959 MCSymbol *PostGetPCLabel =
2960 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2961 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2962
2963 MCSymbol *OffsetLo =
2964 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2965 MCSymbol *OffsetHi =
2966 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2967 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2968 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2969 .addReg(PCReg, 0, AMDGPU::sub0)
2970 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2971 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2972 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2973 .addReg(PCReg, 0, AMDGPU::sub1)
2974 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2975 ApplyHazardWorkarounds();
2976
2977 // Insert the indirect branch after the other terminator.
2978 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2979 .addReg(PCReg);
2980
2981 // If a spill is needed for the pc register pair, we need to insert a spill
2982 // restore block right before the destination block, and insert a short branch
2983 // into the old destination block's fallthrough predecessor.
2984 // e.g.:
2985 //
2986 // s_cbranch_scc0 skip_long_branch:
2987 //
2988 // long_branch_bb:
2989 // spill s[8:9]
2990 // s_getpc_b64 s[8:9]
2991 // s_add_u32 s8, s8, restore_bb
2992 // s_addc_u32 s9, s9, 0
2993 // s_setpc_b64 s[8:9]
2994 //
2995 // skip_long_branch:
2996 // foo;
2997 //
2998 // .....
2999 //
3000 // dest_bb_fallthrough_predecessor:
3001 // bar;
3002 // s_branch dest_bb
3003 //
3004 // restore_bb:
3005 // restore s[8:9]
3006 // fallthrough dest_bb
3007 ///
3008 // dest_bb:
3009 // buzz;
3010
3011 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3012 Register Scav;
3013
3014 // If we've previously reserved a register for long branches,
3015 // avoid running the scavenger and just use that register.
3016 if (LongBranchReservedReg) {
3017 RS->enterBasicBlock(MBB);
3018 Scav = LongBranchReservedReg;
3019 } else {
3020 RS->enterBasicBlockEnd(MBB);
3021 Scav = RS->scavengeRegisterBackwards(
3022 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3023 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3024 }
3025 if (Scav) {
3026 RS->setRegUsed(Scav);
3027 MRI.replaceRegWith(PCReg, Scav);
3028 MRI.clearVirtRegs();
3029 } else {
3030 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3031 // SGPR spill.
3032 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3033 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3034 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3035 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3036 MRI.clearVirtRegs();
3037 }
3038
3039 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3040 // Now, the distance can be defined.
3041 auto *Offset = MCBinaryExpr::createSub(
3042 MCSymbolRefExpr::create(DestLabel, MCCtx),
3043 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3044 // Add offset assignments.
3045 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3046 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3047 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3048 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3049}
3050
3051unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3052 switch (Cond) {
3053 case SIInstrInfo::SCC_TRUE:
3054 return AMDGPU::S_CBRANCH_SCC1;
3055 case SIInstrInfo::SCC_FALSE:
3056 return AMDGPU::S_CBRANCH_SCC0;
3057 case SIInstrInfo::VCCNZ:
3058 return AMDGPU::S_CBRANCH_VCCNZ;
3059 case SIInstrInfo::VCCZ:
3060 return AMDGPU::S_CBRANCH_VCCZ;
3061 case SIInstrInfo::EXECNZ:
3062 return AMDGPU::S_CBRANCH_EXECNZ;
3063 case SIInstrInfo::EXECZ:
3064 return AMDGPU::S_CBRANCH_EXECZ;
3065 default:
3066 llvm_unreachable("invalid branch predicate");
3067 }
3068}
3069
3070SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3071 switch (Opcode) {
3072 case AMDGPU::S_CBRANCH_SCC0:
3073 return SCC_FALSE;
3074 case AMDGPU::S_CBRANCH_SCC1:
3075 return SCC_TRUE;
3076 case AMDGPU::S_CBRANCH_VCCNZ:
3077 return VCCNZ;
3078 case AMDGPU::S_CBRANCH_VCCZ:
3079 return VCCZ;
3080 case AMDGPU::S_CBRANCH_EXECNZ:
3081 return EXECNZ;
3082 case AMDGPU::S_CBRANCH_EXECZ:
3083 return EXECZ;
3084 default:
3085 return INVALID_BR;
3086 }
3087}
3088
3089 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3090 MachineBasicBlock::iterator I,
3091 MachineBasicBlock *&TBB,
3092 MachineBasicBlock *&FBB,
3093 SmallVectorImpl<MachineOperand> &Cond,
3094 bool AllowModify) const {
3095 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3096 // Unconditional Branch
3097 TBB = I->getOperand(0).getMBB();
3098 return false;
3099 }
3100
3101 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3102 if (Pred == INVALID_BR)
3103 return true;
3104
3105 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3106 Cond.push_back(MachineOperand::CreateImm(Pred));
3107 Cond.push_back(I->getOperand(1)); // Save the branch register.
3108
3109 ++I;
3110
3111 if (I == MBB.end()) {
3112 // Conditional branch followed by fall-through.
3113 TBB = CondBB;
3114 return false;
3115 }
3116
3117 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3118 TBB = CondBB;
3119 FBB = I->getOperand(0).getMBB();
3120 return false;
3121 }
3122
3123 return true;
3124}
3125
3126 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3127 MachineBasicBlock *&FBB,
3128 SmallVectorImpl<MachineOperand> &Cond,
3129 bool AllowModify) const {
3130 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3131 auto E = MBB.end();
3132 if (I == E)
3133 return false;
3134
3135 // Skip over the instructions that are artificially terminators for special
3136 // exec management.
3137 while (I != E && !I->isBranch() && !I->isReturn()) {
3138 switch (I->getOpcode()) {
3139 case AMDGPU::S_MOV_B64_term:
3140 case AMDGPU::S_XOR_B64_term:
3141 case AMDGPU::S_OR_B64_term:
3142 case AMDGPU::S_ANDN2_B64_term:
3143 case AMDGPU::S_AND_B64_term:
3144 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3145 case AMDGPU::S_MOV_B32_term:
3146 case AMDGPU::S_XOR_B32_term:
3147 case AMDGPU::S_OR_B32_term:
3148 case AMDGPU::S_ANDN2_B32_term:
3149 case AMDGPU::S_AND_B32_term:
3150 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3151 break;
3152 case AMDGPU::SI_IF:
3153 case AMDGPU::SI_ELSE:
3154 case AMDGPU::SI_KILL_I1_TERMINATOR:
3155 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3156 // FIXME: It's messy that these need to be considered here at all.
3157 return true;
3158 default:
3159 llvm_unreachable("unexpected non-branch terminator inst");
3160 }
3161
3162 ++I;
3163 }
3164
3165 if (I == E)
3166 return false;
3167
3168 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3169}
3170
3171 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3172 int *BytesRemoved) const {
3173 unsigned Count = 0;
3174 unsigned RemovedSize = 0;
3175 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3176 // Skip over artificial terminators when removing instructions.
3177 if (MI.isBranch() || MI.isReturn()) {
3178 RemovedSize += getInstSizeInBytes(MI);
3179 MI.eraseFromParent();
3180 ++Count;
3181 }
3182 }
3183
3184 if (BytesRemoved)
3185 *BytesRemoved = RemovedSize;
3186
3187 return Count;
3188}
3189
3190 // Copy the flags onto the implicit condition register operand.
3191 static void preserveCondRegFlags(MachineOperand &CondReg,
3192 const MachineOperand &OrigCond) {
3193 CondReg.setIsUndef(OrigCond.isUndef());
3194 CondReg.setIsKill(OrigCond.isKill());
3195}
3196
3197 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3198 MachineBasicBlock *TBB,
3199 MachineBasicBlock *FBB,
3200 ArrayRef<MachineOperand> Cond,
3201 const DebugLoc &DL,
3202 int *BytesAdded) const {
3203 if (!FBB && Cond.empty()) {
3204 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3205 .addMBB(TBB);
3206 if (BytesAdded)
3207 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3208 return 1;
3209 }
3210
3211 assert(TBB && Cond[0].isImm());
3212
3213 unsigned Opcode
3214 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3215
3216 if (!FBB) {
3217 MachineInstr *CondBr =
3218 BuildMI(&MBB, DL, get(Opcode))
3219 .addMBB(TBB);
3220
3221 // Copy the flags onto the implicit condition register operand.
3222 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3223 fixImplicitOperands(*CondBr);
3224
3225 if (BytesAdded)
3226 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3227 return 1;
3228 }
3229
3230 assert(TBB && FBB);
3231
3232 MachineInstr *CondBr =
3233 BuildMI(&MBB, DL, get(Opcode))
3234 .addMBB(TBB);
3235 fixImplicitOperands(*CondBr);
3236 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3237 .addMBB(FBB);
3238
3239 MachineOperand &CondReg = CondBr->getOperand(1);
3240 CondReg.setIsUndef(Cond[1].isUndef());
3241 CondReg.setIsKill(Cond[1].isKill());
3242
3243 if (BytesAdded)
3244 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3245
3246 return 2;
3247}
3248
3249 bool SIInstrInfo::reverseBranchCondition(
3250 SmallVectorImpl<MachineOperand> &Cond) const {
3251 if (Cond.size() != 2) {
3252 return true;
3253 }
3254
3255 if (Cond[0].isImm()) {
3256 Cond[0].setImm(-Cond[0].getImm());
3257 return false;
3258 }
3259
3260 return true;
3261}
3262
3263 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3264 ArrayRef<MachineOperand> Cond,
3265 Register DstReg, Register TrueReg,
3266 Register FalseReg, int &CondCycles,
3267 int &TrueCycles, int &FalseCycles) const {
3268 switch (Cond[0].getImm()) {
3269 case VCCNZ:
3270 case VCCZ: {
3271 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3272 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3273 if (MRI.getRegClass(FalseReg) != RC)
3274 return false;
3275
3276 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3277 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3278
3279 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3280 return RI.hasVGPRs(RC) && NumInsts <= 6;
3281 }
3282 case SCC_TRUE:
3283 case SCC_FALSE: {
3284 // FIXME: We could insert for VGPRs if we could replace the original compare
3285 // with a vector one.
3286 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3287 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3288 if (MRI.getRegClass(FalseReg) != RC)
3289 return false;
3290
3291 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3292
3293 // Register sizes that are a multiple of 64 bits (8 bytes) can use s_cselect_b64.
3294 if (NumInsts % 2 == 0)
3295 NumInsts /= 2;
3296
3297 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3298 return RI.isSGPRClass(RC);
3299 }
3300 default:
3301 return false;
3302 }
3303}
3304
3305 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3306 MachineBasicBlock::iterator I, const DebugLoc &DL,
3307 Register DstReg, ArrayRef<MachineOperand> Cond,
3308 Register TrueReg, Register FalseReg) const {
3309 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3310 if (Pred == VCCZ || Pred == SCC_FALSE) {
3311 Pred = static_cast<BranchPredicate>(-Pred);
3312 std::swap(TrueReg, FalseReg);
3313 }
3314
3315 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3316 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3317 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3318
3319 if (DstSize == 32) {
3320 MachineInstr *Select;
3321 if (Pred == SCC_TRUE) {
3322 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3323 .addReg(TrueReg)
3324 .addReg(FalseReg);
3325 } else {
3326 // Instruction's operands are backwards from what is expected.
3327 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3328 .addReg(FalseReg)
3329 .addReg(TrueReg);
3330 }
3331
3332 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3333 return;
3334 }
3335
3336 if (DstSize == 64 && Pred == SCC_TRUE) {
3337 MachineInstr *Select =
3338 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3339 .addReg(TrueReg)
3340 .addReg(FalseReg);
3341
3342 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3343 return;
3344 }
3345
3346 static const int16_t Sub0_15[] = {
3347 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3348 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3349 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3350 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3351 };
3352
3353 static const int16_t Sub0_15_64[] = {
3354 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3355 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3356 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3357 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3358 };
3359
3360 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3361 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3362 const int16_t *SubIndices = Sub0_15;
3363 int NElts = DstSize / 32;
3364
3365 // 64-bit select is only available for SALU.
3366 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3367 if (Pred == SCC_TRUE) {
3368 if (NElts % 2) {
3369 SelOp = AMDGPU::S_CSELECT_B32;
3370 EltRC = &AMDGPU::SGPR_32RegClass;
3371 } else {
3372 SelOp = AMDGPU::S_CSELECT_B64;
3373 EltRC = &AMDGPU::SGPR_64RegClass;
3374 SubIndices = Sub0_15_64;
3375 NElts /= 2;
3376 }
3377 }
3378
3379 MachineInstrBuilder MIB = BuildMI(
3380 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3381
3382 I = MIB->getIterator();
3383
3384 SmallVector<Register, 8> Regs;
3385 for (int Idx = 0; Idx != NElts; ++Idx) {
3386 Register DstElt = MRI.createVirtualRegister(EltRC);
3387 Regs.push_back(DstElt);
3388
3389 unsigned SubIdx = SubIndices[Idx];
3391 MachineInstr *Select;
3392 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3393 Select =
3394 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3395 .addReg(FalseReg, 0, SubIdx)
3396 .addReg(TrueReg, 0, SubIdx);
3397 } else {
3398 Select =
3399 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3400 .addReg(TrueReg, 0, SubIdx)
3401 .addReg(FalseReg, 0, SubIdx);
3402 }
3403
3404 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3406
3407 MIB.addReg(DstElt)
3408 .addImm(SubIdx);
3409 }
3410}
3411
3413 switch (MI.getOpcode()) {
3414 case AMDGPU::V_MOV_B16_t16_e32:
3415 case AMDGPU::V_MOV_B16_t16_e64:
3416 case AMDGPU::V_MOV_B32_e32:
3417 case AMDGPU::V_MOV_B32_e64:
3418 case AMDGPU::V_MOV_B64_PSEUDO:
3419 case AMDGPU::V_MOV_B64_e32:
3420 case AMDGPU::V_MOV_B64_e64:
3421 case AMDGPU::S_MOV_B32:
3422 case AMDGPU::S_MOV_B64:
3423 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3424 case AMDGPU::COPY:
3425 case AMDGPU::WWM_COPY:
3426 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3427 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3428 case AMDGPU::V_ACCVGPR_MOV_B32:
3429 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3430 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3431 return true;
3432 default:
3433 return false;
3434 }
3435}
3436
3437static constexpr AMDGPU::OpName ModifierOpNames[] = {
3438 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3439 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3440 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3441
3443 unsigned Opc = MI.getOpcode();
3444 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3445 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3446 if (Idx >= 0)
3447 MI.removeOperand(Idx);
3448 }
3449}
3450
3451std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3452 unsigned SubRegIndex) {
3453 switch (SubRegIndex) {
3454 case AMDGPU::NoSubRegister:
3455 return Imm;
3456 case AMDGPU::sub0:
3457 return SignExtend64<32>(Imm);
3458 case AMDGPU::sub1:
3459 return SignExtend64<32>(Imm >> 32);
3460 case AMDGPU::lo16:
3461 return SignExtend64<16>(Imm);
3462 case AMDGPU::hi16:
3463 return SignExtend64<16>(Imm >> 16);
3464 case AMDGPU::sub1_lo16:
3465 return SignExtend64<16>(Imm >> 32);
3466 case AMDGPU::sub1_hi16:
3467 return SignExtend64<16>(Imm >> 48);
3468 default:
3469 return std::nullopt;
3470 }
3471
3472 llvm_unreachable("covered subregister switch");
3473}
3474
3475static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3476 switch (Opc) {
3477 case AMDGPU::V_MAC_F16_e32:
3478 case AMDGPU::V_MAC_F16_e64:
3479 case AMDGPU::V_MAD_F16_e64:
3480 return AMDGPU::V_MADAK_F16;
3481 case AMDGPU::V_MAC_F32_e32:
3482 case AMDGPU::V_MAC_F32_e64:
3483 case AMDGPU::V_MAD_F32_e64:
3484 return AMDGPU::V_MADAK_F32;
3485 case AMDGPU::V_FMAC_F32_e32:
3486 case AMDGPU::V_FMAC_F32_e64:
3487 case AMDGPU::V_FMA_F32_e64:
3488 return AMDGPU::V_FMAAK_F32;
3489 case AMDGPU::V_FMAC_F16_e32:
3490 case AMDGPU::V_FMAC_F16_e64:
3491 case AMDGPU::V_FMAC_F16_t16_e64:
3492 case AMDGPU::V_FMAC_F16_fake16_e64:
3493 case AMDGPU::V_FMA_F16_e64:
3494 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3495 ? AMDGPU::V_FMAAK_F16_t16
3496 : AMDGPU::V_FMAAK_F16_fake16
3497 : AMDGPU::V_FMAAK_F16;
3498 case AMDGPU::V_FMAC_F64_e32:
3499 case AMDGPU::V_FMAC_F64_e64:
3500 case AMDGPU::V_FMA_F64_e64:
3501 return AMDGPU::V_FMAAK_F64;
3502 default:
3503 llvm_unreachable("invalid instruction");
3504 }
3505}
3506
3507static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3508 switch (Opc) {
3509 case AMDGPU::V_MAC_F16_e32:
3510 case AMDGPU::V_MAC_F16_e64:
3511 case AMDGPU::V_MAD_F16_e64:
3512 return AMDGPU::V_MADMK_F16;
3513 case AMDGPU::V_MAC_F32_e32:
3514 case AMDGPU::V_MAC_F32_e64:
3515 case AMDGPU::V_MAD_F32_e64:
3516 return AMDGPU::V_MADMK_F32;
3517 case AMDGPU::V_FMAC_F32_e32:
3518 case AMDGPU::V_FMAC_F32_e64:
3519 case AMDGPU::V_FMA_F32_e64:
3520 return AMDGPU::V_FMAMK_F32;
3521 case AMDGPU::V_FMAC_F16_e32:
3522 case AMDGPU::V_FMAC_F16_e64:
3523 case AMDGPU::V_FMAC_F16_t16_e64:
3524 case AMDGPU::V_FMAC_F16_fake16_e64:
3525 case AMDGPU::V_FMA_F16_e64:
3526 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3527 ? AMDGPU::V_FMAMK_F16_t16
3528 : AMDGPU::V_FMAMK_F16_fake16
3529 : AMDGPU::V_FMAMK_F16;
3530 case AMDGPU::V_FMAC_F64_e32:
3531 case AMDGPU::V_FMAC_F64_e64:
3532 case AMDGPU::V_FMA_F64_e64:
3533 return AMDGPU::V_FMAMK_F64;
3534 default:
3535 llvm_unreachable("invalid instruction");
3536 }
3537}
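// As a reminder of the two VOP2 literal forms these helpers pick between,
// where K is the 32-bit literal constant:
//   v_fmaak_f32 vdst, src0, vsrc1, K   ; vdst = src0 * vsrc1 + K
//   v_fmamk_f32 vdst, src0, K, vsrc1   ; vdst = src0 * K + vsrc1
// i.e. FMAAK folds the constant into the addend and FMAMK into one of the
// multiplicands.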
3538
3539 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3540 Register Reg, MachineRegisterInfo *MRI) const {
3541 int64_t Imm;
3542 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3543 return false;
3544
3545 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3546
3547 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3548
3549 unsigned Opc = UseMI.getOpcode();
3550 if (Opc == AMDGPU::COPY) {
3551 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3552
3553 Register DstReg = UseMI.getOperand(0).getReg();
3554 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3555
3556 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3557
3558 if (HasMultipleUses) {
3559 // TODO: This should fold in more cases with multiple use, but we need to
3560 // more carefully consider what those uses are.
3561 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3562
3563 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3564 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3565 return false;
3566
3567 // Most of the time folding a 32-bit inline constant is free (though this
3568 // might not be true if we can't later fold it into a real user).
3569 //
3570 // FIXME: This isInlineConstant check is imprecise if
3571 // getConstValDefinedInReg handled the tricky non-mov cases.
3572 if (ImmDefSize == 32 &&
3574 return false;
3575 }
3576
3577 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3578 RI.getSubRegIdxSize(UseSubReg) == 16;
3579
3580 if (Is16Bit) {
3581 if (RI.hasVGPRs(DstRC))
3582 return false; // Do not clobber vgpr_hi16
3583
3584 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3585 return false;
3586 }
3587
3588 MachineFunction *MF = UseMI.getMF();
3589
3590 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3591 MCRegister MovDstPhysReg =
3592 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3593
3594 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3595
3596 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3597 for (unsigned MovOp :
3598 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3599 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3600 const MCInstrDesc &MovDesc = get(MovOp);
3601
3602 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3603 if (Is16Bit) {
3604 // We just need to find a correctly sized register class, so the
3605 // subregister index compatibility doesn't matter since we're statically
3606 // extracting the immediate value.
3607 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3608 if (!MovDstRC)
3609 continue;
3610
3611 if (MovDstPhysReg) {
3612 // FIXME: We probably should not do this. If there is a live value in
3613 // the high half of the register, it will be corrupted.
3614 MovDstPhysReg =
3615 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3616 if (!MovDstPhysReg)
3617 continue;
3618 }
3619 }
3620
3621 // Result class isn't the right size, try the next instruction.
3622 if (MovDstPhysReg) {
3623 if (!MovDstRC->contains(MovDstPhysReg))
3624 return false;
3625 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3626 // TODO: This will be overly conservative in the case of 16-bit virtual
3627 // SGPRs. We could hack up the virtual register uses to use a compatible
3628 // 32-bit class.
3629 continue;
3630 }
3631
3632 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3633
3634 // Ensure the interpreted immediate value is a valid operand in the new
3635 // mov.
3636 //
3637 // FIXME: isImmOperandLegal should have form that doesn't require existing
3638 // MachineInstr or MachineOperand
3639 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3640 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3641 break;
3642
3643 NewOpc = MovOp;
3644 break;
3645 }
3646
3647 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3648 return false;
3649
3650 if (Is16Bit) {
3651 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3652 if (MovDstPhysReg)
3653 UseMI.getOperand(0).setReg(MovDstPhysReg);
3654 assert(UseMI.getOperand(1).getReg().isVirtual());
3655 }
3656
3657 const MCInstrDesc &NewMCID = get(NewOpc);
3658 UseMI.setDesc(NewMCID);
3659 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3660 UseMI.addImplicitDefUseOperands(*MF);
3661 return true;
3662 }
3663
3664 if (HasMultipleUses)
3665 return false;
3666
3667 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3668 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3669 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3670 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3671 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3672 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3673 Opc == AMDGPU::V_FMAC_F64_e64) {
3674 // Don't fold if we are using source or output modifiers. The new VOP2
3675 // instructions don't have them.
3676 if (hasAnyModifiersSet(UseMI))
3677 return false;
3678
3679 // If this is a free constant, there's no reason to do this.
3680 // TODO: We could fold this here instead of letting SIFoldOperands do it
3681 // later.
3682 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3683
3684 // Any src operand can be used for the legality check.
3685 if (isInlineConstant(UseMI, Src0Idx, Imm))
3686 return false;
3687
3688 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3689
3690 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3691 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3692
3693 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3694 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3695 (Src1->isReg() && Src1->getReg() == Reg)) {
3696 MachineOperand *RegSrc =
3697 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3698 if (!RegSrc->isReg())
3699 return false;
3700 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3701 ST.getConstantBusLimit(Opc) < 2)
3702 return false;
3703
3704 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3705 return false;
3706
3707 // If src2 is also a literal constant then we have to choose which one to
3708 // fold. In general it is better to choose madak so that the other literal
3709 // can be materialized in an sgpr instead of a vgpr:
3710 // s_mov_b32 s0, literal
3711 // v_madak_f32 v0, s0, v0, literal
3712 // Instead of:
3713 // v_mov_b32 v1, literal
3714 // v_madmk_f32 v0, v0, literal, v1
3715 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3716 if (Def && Def->isMoveImmediate() &&
3717 !isInlineConstant(Def->getOperand(1)))
3718 return false;
3719
3720 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3721 if (pseudoToMCOpcode(NewOpc) == -1)
3722 return false;
3723
3724 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3725 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3726 // restricting their register classes. For now just bail out.
3727 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3728 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3729 return false;
3730
3731 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3732 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3733
3734 // FIXME: This would be a lot easier if we could return a new instruction
3735 // instead of having to modify in place.
3736
3737 Register SrcReg = RegSrc->getReg();
3738 unsigned SrcSubReg = RegSrc->getSubReg();
3739 Src0->setReg(SrcReg);
3740 Src0->setSubReg(SrcSubReg);
3741 Src0->setIsKill(RegSrc->isKill());
3742
3743 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3744 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3745 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3746 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3747 UseMI.untieRegOperand(
3748 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3749
3750 Src1->ChangeToImmediate(*SubRegImm);
3751
3752 removeModOperands(UseMI);
3753 UseMI.setDesc(get(NewOpc));
3754
3755 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3756 if (DeleteDef)
3757 DefMI.eraseFromParent();
3758
3759 return true;
3760 }
3761
3762 // Added part is the constant: Use v_madak_{f16, f32}.
3763 if (Src2->isReg() && Src2->getReg() == Reg) {
3764 if (ST.getConstantBusLimit(Opc) < 2) {
3765 // Not allowed to use constant bus for another operand.
3766 // We can however allow an inline immediate as src0.
3767 bool Src0Inlined = false;
3768 if (Src0->isReg()) {
3769 // Try to inline constant if possible.
3770 // If the Def is a move-immediate with a single use,
3771 // we save a VGPR here.
3772 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3773 if (Def && Def->isMoveImmediate() &&
3774 isInlineConstant(Def->getOperand(1)) &&
3775 MRI->hasOneUse(Src0->getReg())) {
3776 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3777 Src0Inlined = true;
3778 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3779 RI.isSGPRReg(*MRI, Src0->getReg())) {
3780 return false;
3781 }
3782 // VGPR is okay as Src0 - fallthrough
3783 }
3784
3785 if (Src1->isReg() && !Src0Inlined) {
3786 // We have one slot for inlinable constant so far - try to fill it
3787 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3788 if (Def && Def->isMoveImmediate() &&
3789 isInlineConstant(Def->getOperand(1)) &&
3790 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3791 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3792 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3793 return false;
3794 // VGPR is okay as Src1 - fallthrough
3795 }
3796 }
3797
3798 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3799 if (pseudoToMCOpcode(NewOpc) == -1)
3800 return false;
3801
3802 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3803 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3804 // restricting their register classes. For now just bail out.
3805 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3806 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3807 return false;
3808
3809 // FIXME: This would be a lot easier if we could return a new instruction
3810 // instead of having to modify in place.
3811
3812 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3813 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3814 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3815 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3816 UseMI.untieRegOperand(
3817 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3818
3819 const std::optional<int64_t> SubRegImm =
3820 extractSubregFromImm(Imm, Src2->getSubReg());
3821
3822 // ChangeToImmediate adds Src2 back to the instruction.
3823 Src2->ChangeToImmediate(*SubRegImm);
3824
3825 // These come before src2.
3826 removeModOperands(UseMI);
3827 UseMI.setDesc(get(NewOpc));
3828 // It might happen that UseMI was commuted and we now have an SGPR as
3829 // src1. If so, two inline constants plus an SGPR would violate the
3830 // constant bus restriction, so re-legalize the operands.
3831 legalizeOperands(UseMI);
3832
3833 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3834 if (DeleteDef)
3835 DefMI.eraseFromParent();
3836
3837 return true;
3838 }
3839 }
3840
3841 return false;
3842}
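// Illustrative effect of the COPY case above (virtual registers are made up):
//   %imm:sreg_64 = S_MOV_B64_IMM_PSEUDO 42
//   %use:sgpr_32 = COPY %imm.sub0
// is rewritten in place, using extractSubregFromImm() to pick the matching
// 32-bit half of the constant, into
//   %use:sgpr_32 = S_MOV_B32 42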
3843
3844static bool
3845 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3846 ArrayRef<const MachineOperand *> BaseOps2) {
3847 if (BaseOps1.size() != BaseOps2.size())
3848 return false;
3849 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3850 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3851 return false;
3852 }
3853 return true;
3854}
3855
3856static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3857 LocationSize WidthB, int OffsetB) {
3858 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3859 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3860 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3861 return LowWidth.hasValue() &&
3862 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3863}
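// For example, WidthA = 4 at OffsetA = 0 and WidthB = 8 at OffsetB = 4 do not
// overlap (0 + 4 <= 4), while WidthA = 8 at OffsetA = 0 and WidthB = 4 at
// OffsetB = 4 do (0 + 8 > 4). Numbers are purely illustrative of the check
// above.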
3864
3865bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3866 const MachineInstr &MIb) const {
3867 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3868 int64_t Offset0, Offset1;
3869 LocationSize Dummy0 = LocationSize::precise(0);
3870 LocationSize Dummy1 = LocationSize::precise(0);
3871 bool Offset0IsScalable, Offset1IsScalable;
3872 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3873 Dummy0, &RI) ||
3874 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3875 Dummy1, &RI))
3876 return false;
3877
3878 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3879 return false;
3880
3881 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3882 // FIXME: Handle ds_read2 / ds_write2.
3883 return false;
3884 }
3885 LocationSize Width0 = MIa.memoperands().front()->getSize();
3886 LocationSize Width1 = MIb.memoperands().front()->getSize();
3887 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3888}
3889
3890 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3891 const MachineInstr &MIb) const {
3892 assert(MIa.mayLoadOrStore() &&
3893 "MIa must load from or modify a memory location");
3894 assert(MIb.mayLoadOrStore() &&
3895 "MIb must load from or modify a memory location");
3896
3897 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3898 return false;
3899
3900 // XXX - Can we relax this between address spaces?
3901 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3902 return false;
3903
3904 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3905 return false;
3906
3907 // TODO: Should we check the address space from the MachineMemOperand? That
3908 // would allow us to distinguish objects we know don't alias based on the
3909 // underlying address space, even if it was lowered to a different one,
3910 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3911 // buffer.
3912 if (isDS(MIa)) {
3913 if (isDS(MIb))
3914 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3915
3916 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3917 }
3918
3919 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3920 if (isMUBUF(MIb) || isMTBUF(MIb))
3921 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3922
3923 if (isFLAT(MIb))
3924 return isFLATScratch(MIb);
3925
3926 return !isSMRD(MIb);
3927 }
3928
3929 if (isSMRD(MIa)) {
3930 if (isSMRD(MIb))
3931 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3932
3933 if (isFLAT(MIb))
3934 return isFLATScratch(MIb);
3935
3936 return !isMUBUF(MIb) && !isMTBUF(MIb);
3937 }
3938
3939 if (isFLAT(MIa)) {
3940 if (isFLAT(MIb)) {
3941 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3942 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3943 return true;
3944
3945 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3946 }
3947
3948 return false;
3949 }
3950
3951 return false;
3952}
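// Two quick consequences of the classification above: a DS (LDS) access is
// reported disjoint from any segment-specific FLAT access (scratch or
// global), since those address distinct memories, but not from a generic
// flat access, whose pointer might still resolve into the LDS aperture.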
3953
3954 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3955 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3956 if (Reg.isPhysical())
3957 return false;
3958 auto *Def = MRI.getUniqueVRegDef(Reg);
3959 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3960 Imm = Def->getOperand(1).getImm();
3961 if (DefMI)
3962 *DefMI = Def;
3963 return true;
3964 }
3965 return false;
3966}
3967
3968static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3969 MachineInstr **DefMI = nullptr) {
3970 if (!MO->isReg())
3971 return false;
3972 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3973 const MachineRegisterInfo &MRI = MF->getRegInfo();
3974 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3975}
3976
3977 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3978 MachineInstr &NewMI) {
3979 if (LV) {
3980 unsigned NumOps = MI.getNumOperands();
3981 for (unsigned I = 1; I < NumOps; ++I) {
3982 MachineOperand &Op = MI.getOperand(I);
3983 if (Op.isReg() && Op.isKill())
3984 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3985 }
3986 }
3987}
3988
3989static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3990 switch (Opc) {
3991 case AMDGPU::V_MAC_F16_e32:
3992 case AMDGPU::V_MAC_F16_e64:
3993 return AMDGPU::V_MAD_F16_e64;
3994 case AMDGPU::V_MAC_F32_e32:
3995 case AMDGPU::V_MAC_F32_e64:
3996 return AMDGPU::V_MAD_F32_e64;
3997 case AMDGPU::V_MAC_LEGACY_F32_e32:
3998 case AMDGPU::V_MAC_LEGACY_F32_e64:
3999 return AMDGPU::V_MAD_LEGACY_F32_e64;
4000 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4001 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4002 return AMDGPU::V_FMA_LEGACY_F32_e64;
4003 case AMDGPU::V_FMAC_F16_e32:
4004 case AMDGPU::V_FMAC_F16_e64:
4005 case AMDGPU::V_FMAC_F16_t16_e64:
4006 case AMDGPU::V_FMAC_F16_fake16_e64:
4007 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4008 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4009 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4010 : AMDGPU::V_FMA_F16_gfx9_e64;
4011 case AMDGPU::V_FMAC_F32_e32:
4012 case AMDGPU::V_FMAC_F32_e64:
4013 return AMDGPU::V_FMA_F32_e64;
4014 case AMDGPU::V_FMAC_F64_e32:
4015 case AMDGPU::V_FMAC_F64_e64:
4016 return AMDGPU::V_FMA_F64_e64;
4017 default:
4018 llvm_unreachable("invalid instruction");
4019 }
4020}
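// This table feeds convertToThreeAddress() below: a two-address MAC/FMAC,
// whose destination is tied to the accumulator src2, is replaced by the
// untied three-address MAD/FMA form. Roughly (made-up virtual registers):
//   %acc:vgpr_32 = V_FMAC_F32_e32 %a, %b, %acc(tied)
//     --> %dst:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %acc, 0, 0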
4021
4022 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4023 LiveVariables *LV,
4024 LiveIntervals *LIS) const {
4025 MachineBasicBlock &MBB = *MI.getParent();
4026 unsigned Opc = MI.getOpcode();
4027
4028 // Handle MFMA.
4029 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4030 if (NewMFMAOpc != -1) {
4031 MachineInstrBuilder MIB =
4032 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4033 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4034 MIB.add(MI.getOperand(I));
4035 updateLiveVariables(LV, MI, *MIB);
4036 if (LIS) {
4037 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4038 // SlotIndex of defs needs to be updated when converting to early-clobber
4039 MachineOperand &Def = MIB->getOperand(0);
4040 if (Def.isEarlyClobber() && Def.isReg() &&
4041 LIS->hasInterval(Def.getReg())) {
4042 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4043 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4044 auto &LI = LIS->getInterval(Def.getReg());
4045 auto UpdateDefIndex = [&](LiveRange &LR) {
4046 auto *S = LR.find(OldIndex);
4047 if (S != LR.end() && S->start == OldIndex) {
4048 assert(S->valno && S->valno->def == OldIndex);
4049 S->start = NewIndex;
4050 S->valno->def = NewIndex;
4051 }
4052 };
4053 UpdateDefIndex(LI);
4054 for (auto &SR : LI.subranges())
4055 UpdateDefIndex(SR);
4056 }
4057 }
4058 return MIB;
4059 }
4060
4061 if (SIInstrInfo::isWMMA(MI)) {
4062 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4063 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4064 .setMIFlags(MI.getFlags());
4065 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4066 MIB->addOperand(MI.getOperand(I));
4067
4068 updateLiveVariables(LV, MI, *MIB);
4069 if (LIS)
4070 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4071
4072 return MIB;
4073 }
4074
4075 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4076 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4077 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4078 "present pre-RA");
4079
4080 // Handle MAC/FMAC.
4081 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4082 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4083 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4084 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4085 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4086 bool Src0Literal = false;
4087
4088 switch (Opc) {
4089 default:
4090 return nullptr;
4091 case AMDGPU::V_MAC_F16_e64:
4092 case AMDGPU::V_FMAC_F16_e64:
4093 case AMDGPU::V_FMAC_F16_t16_e64:
4094 case AMDGPU::V_FMAC_F16_fake16_e64:
4095 case AMDGPU::V_MAC_F32_e64:
4096 case AMDGPU::V_MAC_LEGACY_F32_e64:
4097 case AMDGPU::V_FMAC_F32_e64:
4098 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4099 case AMDGPU::V_FMAC_F64_e64:
4100 break;
4101 case AMDGPU::V_MAC_F16_e32:
4102 case AMDGPU::V_FMAC_F16_e32:
4103 case AMDGPU::V_MAC_F32_e32:
4104 case AMDGPU::V_MAC_LEGACY_F32_e32:
4105 case AMDGPU::V_FMAC_F32_e32:
4106 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4107 case AMDGPU::V_FMAC_F64_e32: {
4108 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4109 AMDGPU::OpName::src0);
4110 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4111 if (!Src0->isReg() && !Src0->isImm())
4112 return nullptr;
4113
4114 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4115 Src0Literal = true;
4116
4117 break;
4118 }
4119 }
4120
4121 MachineInstrBuilder MIB;
4122 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4123 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4124 const MachineOperand *Src0Mods =
4125 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4126 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4127 const MachineOperand *Src1Mods =
4128 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4129 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4130 const MachineOperand *Src2Mods =
4131 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4132 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4133 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4134 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4135
4136 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4137 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4138 // If we have an SGPR input, we will violate the constant bus restriction.
4139 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4140 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4141 MachineInstr *DefMI;
4142 const auto killDef = [&]() -> void {
4143 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4144 // The only user is the instruction which will be killed.
4145 Register DefReg = DefMI->getOperand(0).getReg();
4146
4147 if (MRI.hasOneNonDBGUse(DefReg)) {
4148 // We cannot just remove the DefMI here; the calling pass would crash.
4149 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4150 DefMI->getOperand(0).setIsDead(true);
4151 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4152 DefMI->removeOperand(I);
4153 if (LV)
4154 LV->getVarInfo(DefReg).AliveBlocks.clear();
4155 }
4156
4157 if (LIS) {
4158 LiveInterval &DefLI = LIS->getInterval(DefReg);
4159
4160 // We cannot delete the original instruction here, so hack out the use
4161 // in the original instruction with a dummy register so we can use
4162 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4163 // not have the complexity of deleting a use to consider here.
4164 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4165 for (MachineOperand &MIOp : MI.uses()) {
4166 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4167 MIOp.setIsUndef(true);
4168 MIOp.setReg(DummyReg);
4169 }
4170 }
4171
4172 LIS->shrinkToUses(&DefLI);
4173 }
4174 };
4175
4176 int64_t Imm;
4177 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4178 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4179 if (pseudoToMCOpcode(NewOpc) != -1) {
4180 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4181 .add(*Dst)
4182 .add(*Src0)
4183 .add(*Src1)
4184 .addImm(Imm)
4185 .setMIFlags(MI.getFlags());
4186 updateLiveVariables(LV, MI, *MIB);
4187 if (LIS)
4188 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4189 killDef();
4190 return MIB;
4191 }
4192 }
4193 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4194 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4195 if (pseudoToMCOpcode(NewOpc) != -1) {
4196 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4197 .add(*Dst)
4198 .add(*Src0)
4199 .addImm(Imm)
4200 .add(*Src2)
4201 .setMIFlags(MI.getFlags());
4202 updateLiveVariables(LV, MI, *MIB);
4203
4204 if (LIS)
4205 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4206 killDef();
4207 return MIB;
4208 }
4209 }
4210 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4211 if (Src0Literal) {
4212 Imm = Src0->getImm();
4213 DefMI = nullptr;
4214 }
4215 if (pseudoToMCOpcode(NewOpc) != -1 &&
4216 isOperandLegal(
4217 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4218 Src1)) {
4219 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4220 .add(*Dst)
4221 .add(*Src1)
4222 .addImm(Imm)
4223 .add(*Src2)
4224 .setMIFlags(MI.getFlags());
4225 updateLiveVariables(LV, MI, *MIB);
4226
4227 if (LIS)
4228 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4229 if (DefMI)
4230 killDef();
4231 return MIB;
4232 }
4233 }
4234 }
4235
4236 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4237 // if VOP3 does not allow a literal operand.
4238 if (Src0Literal && !ST.hasVOP3Literal())
4239 return nullptr;
4240
4241 unsigned NewOpc = getNewFMAInst(ST, Opc);
4242
4243 if (pseudoToMCOpcode(NewOpc) == -1)
4244 return nullptr;
4245
4246 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4247 .add(*Dst)
4248 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4249 .add(*Src0)
4250 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4251 .add(*Src1)
4252 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4253 .add(*Src2)
4254 .addImm(Clamp ? Clamp->getImm() : 0)
4255 .addImm(Omod ? Omod->getImm() : 0)
4256 .setMIFlags(MI.getFlags());
4257 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4258 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4259 updateLiveVariables(LV, MI, *MIB);
4260 if (LIS)
4261 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4262 return MIB;
4263}
4264
4265// It's not generally safe to move VALU instructions across these since it will
4266// start using the register as a base index rather than directly.
4267// XXX - Why isn't hasSideEffects sufficient for these?
4269 switch (MI.getOpcode()) {
4270 case AMDGPU::S_SET_GPR_IDX_ON:
4271 case AMDGPU::S_SET_GPR_IDX_MODE:
4272 case AMDGPU::S_SET_GPR_IDX_OFF:
4273 return true;
4274 default:
4275 return false;
4276 }
4277}
4278
4280 const MachineBasicBlock *MBB,
4281 const MachineFunction &MF) const {
4282 // Skip the check for SP writes done in the base implementation; it was
4283 // apparently added there due to compile-time concerns.
4284 //
4285 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4286 // but is probably avoidable.
4287
4288 // Copied from base implementation.
4289 // Terminators and labels can't be scheduled around.
4290 if (MI.isTerminator() || MI.isPosition())
4291 return true;
4292
4293 // INLINEASM_BR can jump to another block
4294 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4295 return true;
4296
4297 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4298 return true;
4299
4300 // Target-independent instructions do not have an implicit-use of EXEC, even
4301 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4302 // boundaries prevents incorrect movements of such instructions.
4303 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4304 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4305 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4306 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4307 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4308 changesVGPRIndexingMode(MI);
4309 }
4310
4311 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4312 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4313 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4314 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4315}
4316
4317 bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4318 if (!isFLAT(MI) || isFLATGlobal(MI))
4319 return false;
4320
4321 // If scratch is not initialized, we can never access it.
4322 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4323 return false;
4324
4325 // SCRATCH instructions always access scratch.
4326 if (isFLATScratch(MI))
4327 return true;
4328
4329 // If there are no memory operands then conservatively assume the flat
4330 // operation may access scratch.
4331 if (MI.memoperands_empty())
4332 return true;
4333
4334 // See if any memory operand specifies an address space that involves scratch.
4335 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4336 unsigned AS = Memop->getAddrSpace();
4337 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4338 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4339 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4340 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4341 }
4342 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4343 });
4344}
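// For instance, a FLAT load whose memory operand is tagged with
// AMDGPUAS::GLOBAL_ADDRESS is known not to touch scratch, while one tagged
// with FLAT_ADDRESS is conservatively assumed to do so unless its
// !noalias.addrspace metadata excludes PRIVATE_ADDRESS. (Illustrative
// summary of the predicate above.)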
4345
4346 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4347 // Skip the full operand and register alias search modifiesRegister
4348 // does. There's only a handful of instructions that touch this, it's only an
4349 // implicit def, and doesn't alias any other registers.
4350 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4351}
4352
4353 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4354 unsigned Opcode = MI.getOpcode();
4355
4356 if (MI.mayStore() && isSMRD(MI))
4357 return true; // scalar store or atomic
4358
4359 // This will terminate the function when other lanes may need to continue.
4360 if (MI.isReturn())
4361 return true;
4362
4363 // These instructions cause shader I/O that may cause hardware lockups
4364 // when executed with an empty EXEC mask.
4365 //
4366 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4367 // EXEC = 0, but checking for that case here seems not worth it
4368 // given the typical code patterns.
4369 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4370 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4371 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4372 return true;
4373
4374 if (MI.isCall() || MI.isInlineAsm())
4375 return true; // conservative assumption
4376
4377 // Assume that barrier interactions are only intended with active lanes.
4378 if (isBarrier(Opcode))
4379 return true;
4380
4381 // A mode change is a scalar operation that influences vector instructions.
4383 return true;
4384
4385 // These are like SALU instructions in terms of effects, so it's questionable
4386 // whether we should return true for those.
4387 //
4388 // However, executing them with EXEC = 0 causes them to operate on undefined
4389 // data, which we avoid by returning true here.
4390 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4391 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4392 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4393 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4394 return true;
4395
4396 return false;
4397}
4398
4399 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4400 const MachineInstr &MI) const {
4401 if (MI.isMetaInstruction())
4402 return false;
4403
4404 // This won't read exec if this is an SGPR->SGPR copy.
4405 if (MI.isCopyLike()) {
4406 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4407 return true;
4408
4409 // Make sure this isn't copying exec as a normal operand
4410 return MI.readsRegister(AMDGPU::EXEC, &RI);
4411 }
4412
4413 // Make a conservative assumption about the callee.
4414 if (MI.isCall())
4415 return true;
4416
4417 // Be conservative with any unhandled generic opcodes.
4418 if (!isTargetSpecificOpcode(MI.getOpcode()))
4419 return true;
4420
4421 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4422}
4423
4424bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4425 switch (Imm.getBitWidth()) {
4426 case 1: // This likely will be a condition code mask.
4427 return true;
4428
4429 case 32:
4430 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4431 ST.hasInv2PiInlineImm());
4432 case 64:
4433 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4434 ST.hasInv2PiInlineImm());
4435 case 16:
4436 return ST.has16BitInsts() &&
4437 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4438 ST.hasInv2PiInlineImm());
4439 default:
4440 llvm_unreachable("invalid bitwidth");
4441 }
4442}
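// For reference, the inline encodings accepted here are the integers -16..64
// plus a small set of floats (+-0.5, +-1.0, +-2.0, +-4.0, 0.0 and, when the
// subtarget supports it, 1/(2*pi)); any other value must be emitted as a
// literal dword alongside the instruction.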
4443
4444 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4445 APInt IntImm = Imm.bitcastToAPInt();
4446 int64_t IntImmVal = IntImm.getSExtValue();
4447 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4448 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4449 default:
4450 llvm_unreachable("invalid fltSemantics");
4451 case APFloat::S_IEEEsingle:
4452 case APFloat::S_IEEEdouble:
4453 return isInlineConstant(IntImm);
4454 case APFloat::S_BFloat:
4455 return ST.has16BitInsts() &&
4456 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4457 case APFloat::S_IEEEhalf:
4458 return ST.has16BitInsts() &&
4459 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4460 }
4461}
4462
4463bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4464 // MachineOperand provides no way to tell the true operand size, since it only
4465 // records a 64-bit value. We need to know the size to determine if a 32-bit
4466 // floating point immediate bit pattern is legal for an integer immediate. It
4467 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4468 switch (OperandType) {
4478 int32_t Trunc = static_cast<int32_t>(Imm);
4479 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4480 }
4486 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4489 // We would expect inline immediates to not be concerned with an integer/fp
4490 // distinction. However, in the case of 16-bit integer operations, the
4491 // "floating point" values appear to not work. It seems to read the low 16-bits
4492 // of 32-bit immediates, which happens to always work for the integer
4493 // values.
4494 //
4495 // See llvm bugzilla 46302.
4496 //
4497 // TODO: Theoretically we could use op-sel to use the high bits of the
4498 // 32-bit FP values.
4510 return false;
4513 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4514 // A few special case instructions have 16-bit operands on subtargets
4515 // where 16-bit instructions are not legal.
4516 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4517 // constants in these cases
4518 int16_t Trunc = static_cast<int16_t>(Imm);
4519 return ST.has16BitInsts() &&
4520 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4521 }
4522
4523 return false;
4524 }
4527 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4528 int16_t Trunc = static_cast<int16_t>(Imm);
4529 return ST.has16BitInsts() &&
4530 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4531 }
4532 return false;
4533 }
4537 return false;
4539 return isLegalAV64PseudoImm(Imm);
4542 // Always embedded in the instruction for free.
4543 return true;
4553 // Just ignore anything else.
4554 return true;
4555 default:
4556 llvm_unreachable("invalid operand type");
4557 }
4558}
4559
4560static bool compareMachineOp(const MachineOperand &Op0,
4561 const MachineOperand &Op1) {
4562 if (Op0.getType() != Op1.getType())
4563 return false;
4564
4565 switch (Op0.getType()) {
4566 case MachineOperand::MO_Register:
4567 return Op0.getReg() == Op1.getReg();
4568 case MachineOperand::MO_Immediate:
4569 return Op0.getImm() == Op1.getImm();
4570 default:
4571 llvm_unreachable("Didn't expect to be comparing these operand types");
4572 }
4573}
4574
4575 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4576 const MCOperandInfo &OpInfo) const {
4577 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4578 return true;
4579
4580 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4581 return false;
4582
4583 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4584 return true;
4585
4586 return ST.hasVOP3Literal();
4587}
4588
4589bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4590 int64_t ImmVal) const {
4591 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4592 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4593 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4594 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4595 AMDGPU::OpName::src2))
4596 return false;
4597 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4598 }
4599
4600 return isLiteralOperandLegal(InstDesc, OpInfo);
4601}
4602
4603bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4604 const MachineOperand &MO) const {
4605 if (MO.isImm())
4606 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4607
4608 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4609 "unexpected imm-like operand kind");
4610 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4611 return isLiteralOperandLegal(InstDesc, OpInfo);
4612}
4613
4614 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4615 // 2 32-bit inline constants packed into one.
4616 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4617 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4618}
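// Example (constants are illustrative only): 0x0000004000000001 splits into
// the inlinable halves 64 and 1 and is therefore legal, while
// 0x0000123400000001 is not, because 0x1234 has no inline encoding.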
4619
4620bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4621 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4622 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4623 return false;
4624
4625 int Op32 = AMDGPU::getVOPe32(Opcode);
4626 if (Op32 == -1)
4627 return false;
4628
4629 return pseudoToMCOpcode(Op32) != -1;
4630}
4631
4632bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4633 // The src0_modifier operand is present on all instructions
4634 // that have modifiers.
4635
4636 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4637}
4638
4639 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4640 AMDGPU::OpName OpName) const {
4641 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4642 return Mods && Mods->getImm();
4643}
4644
4645 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4646 return any_of(ModifierOpNames,
4647 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4648}
4649
4650 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4651 const MachineRegisterInfo &MRI) const {
4652 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4653 // Can't shrink instruction with three operands.
4654 if (Src2) {
4655 switch (MI.getOpcode()) {
4656 default: return false;
4657
4658 case AMDGPU::V_ADDC_U32_e64:
4659 case AMDGPU::V_SUBB_U32_e64:
4660 case AMDGPU::V_SUBBREV_U32_e64: {
4661 const MachineOperand *Src1
4662 = getNamedOperand(MI, AMDGPU::OpName::src1);
4663 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4664 return false;
4665 // Additional verification is needed for sdst/src2.
4666 return true;
4667 }
4668 case AMDGPU::V_MAC_F16_e64:
4669 case AMDGPU::V_MAC_F32_e64:
4670 case AMDGPU::V_MAC_LEGACY_F32_e64:
4671 case AMDGPU::V_FMAC_F16_e64:
4672 case AMDGPU::V_FMAC_F16_t16_e64:
4673 case AMDGPU::V_FMAC_F16_fake16_e64:
4674 case AMDGPU::V_FMAC_F32_e64:
4675 case AMDGPU::V_FMAC_F64_e64:
4676 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4677 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4678 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4679 return false;
4680 break;
4681
4682 case AMDGPU::V_CNDMASK_B32_e64:
4683 break;
4684 }
4685 }
4686
4687 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4688 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4689 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4690 return false;
4691
4692 // We don't need to check src0, all input types are legal, so just make sure
4693 // src0 isn't using any modifiers.
4694 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4695 return false;
4696
4697 // Can it be shrunk to a valid 32 bit opcode?
4698 if (!hasVALU32BitEncoding(MI.getOpcode()))
4699 return false;
4700
4701 // Check output modifiers
4702 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4703 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4704 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4705 // TODO: Can we avoid checking bound_ctrl/fi here?
4706 // They are only used by permlane*_swap special case.
4707 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4708 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4709}
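// A rough example of what passes this test: %d = V_ADD_F32_e64 0, %a, 0, %b,
// 0, 0 with %b in a VGPR and no clamp/omod can be shrunk to the VOP2 form
// V_ADD_F32_e32 %d, %a, %b, whereas any source or output modifier forces the
// 64-bit encoding to stay.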
4710
4711// Set VCC operand with all flags from \p Orig, except for setting it as
4712// implicit.
4713 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4714 const MachineOperand &Orig) {
4715
4716 for (MachineOperand &Use : MI.implicit_operands()) {
4717 if (Use.isUse() &&
4718 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4719 Use.setIsUndef(Orig.isUndef());
4720 Use.setIsKill(Orig.isKill());
4721 return;
4722 }
4723 }
4724}
4725
4726 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4727 unsigned Op32) const {
4728 MachineBasicBlock *MBB = MI.getParent();
4729
4730 const MCInstrDesc &Op32Desc = get(Op32);
4731 MachineInstrBuilder Inst32 =
4732 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4733 .setMIFlags(MI.getFlags());
4734
4735 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4736 // For VOPC instructions, this is replaced by an implicit def of vcc.
4737
4738 // We assume the defs of the shrunk opcode are in the same order, and the
4739 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4740 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4741 Inst32.add(MI.getOperand(I));
4742
4743 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4744
4745 int Idx = MI.getNumExplicitDefs();
4746 for (const MachineOperand &Use : MI.explicit_uses()) {
4747 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4749 continue;
4750
4751 if (&Use == Src2) {
4752 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4753 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4754 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4755 // of vcc was already added during the initial BuildMI, but we
4756 // 1) may need to change vcc to vcc_lo to preserve the original register
4757 // 2) have to preserve the original flags.
4758 copyFlagsToImplicitVCC(*Inst32, *Src2);
4759 continue;
4760 }
4761 }
4762
4763 Inst32.add(Use);
4764 }
4765
4766 // FIXME: Losing implicit operands
4767 fixImplicitOperands(*Inst32);
4768 return Inst32;
4769}
4770
4771 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4772 // Null is free
4773 Register Reg = RegOp.getReg();
4774 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4775 return false;
4776
4777 // SGPRs use the constant bus
4778
4779 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4780 // physical register operands should also count, except for exec.
4781 if (RegOp.isImplicit())
4782 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4783
4784 // SGPRs use the constant bus
4785 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4786 AMDGPU::SReg_64RegClass.contains(Reg);
4787}
4788
4790 const MachineRegisterInfo &MRI) const {
4791 Register Reg = RegOp.getReg();
4792 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4793 : physRegUsesConstantBus(RegOp);
4794}
4795
4796 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4797 const MachineOperand &MO,
4798 const MCOperandInfo &OpInfo) const {
4799 // Literal constants use the constant bus.
4800 if (!MO.isReg())
4801 return !isInlineConstant(MO, OpInfo);
4802
4803 Register Reg = MO.getReg();
4804 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4805 : physRegUsesConstantBus(MO);
4806 }
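// In other words, the constant-bus readers a VALU instruction can have are
// SGPRs (including VCC and M0) and non-inline literal constants. For example
// (assembly is illustrative):
//   v_add_f32 v0, s0, v1          ; one constant-bus read (s0)
//   v_add_f32 v0, 0x40490fdb, v1  ; one constant-bus read (the literal)
// Pre-GFX10 subtargets permit only a single such read per instruction;
// later ones generally allow two (see getConstantBusLimit).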
4807
4808 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4809 for (const MachineOperand &MO : MI.implicit_operands()) {
4810 // We only care about reads.
4811 if (MO.isDef())
4812 continue;
4813
4814 switch (MO.getReg()) {
4815 case AMDGPU::VCC:
4816 case AMDGPU::VCC_LO:
4817 case AMDGPU::VCC_HI:
4818 case AMDGPU::M0:
4819 case AMDGPU::FLAT_SCR:
4820 return MO.getReg();
4821
4822 default:
4823 break;
4824 }
4825 }
4826
4827 return Register();
4828}
4829
4830static bool shouldReadExec(const MachineInstr &MI) {
4831 if (SIInstrInfo::isVALU(MI)) {
4832 switch (MI.getOpcode()) {
4833 case AMDGPU::V_READLANE_B32:
4834 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4835 case AMDGPU::V_WRITELANE_B32:
4836 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4837 return false;
4838 }
4839
4840 return true;
4841 }
4842
4843 if (MI.isPreISelOpcode() ||
4844 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4845 SIInstrInfo::isSALU(MI) ||
4846 SIInstrInfo::isSMRD(MI))
4847 return false;
4848
4849 return true;
4850}
4851
4852static bool isRegOrFI(const MachineOperand &MO) {
4853 return MO.isReg() || MO.isFI();
4854}
4855
4856static bool isSubRegOf(const SIRegisterInfo &TRI,
4857 const MachineOperand &SuperVec,
4858 const MachineOperand &SubReg) {
4859 if (SubReg.getReg().isPhysical())
4860 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4861
4862 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4863 SubReg.getReg() == SuperVec.getReg();
4864}
4865
4866// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4867bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4868 const MachineRegisterInfo &MRI,
4869 StringRef &ErrInfo) const {
4870 Register DstReg = MI.getOperand(0).getReg();
4871 Register SrcReg = MI.getOperand(1).getReg();
4872 // This is a check for copy from vector register to SGPR
4873 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4874 ErrInfo = "illegal copy from vector register to SGPR";
4875 return false;
4876 }
4877 return true;
4878}
4879
4880 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4881 StringRef &ErrInfo) const {
4882 uint16_t Opcode = MI.getOpcode();
4883 const MachineFunction *MF = MI.getParent()->getParent();
4884 const MachineRegisterInfo &MRI = MF->getRegInfo();
4885
4886 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4887 // Find a better property to recognize the point where instruction selection
4888 // is just done.
4889 // We can only enforce this check after SIFixSGPRCopies pass so that the
4890 // illegal copies are legalized and thereafter we don't expect a pass
4891 // inserting similar copies.
4892 if (!MRI.isSSA() && MI.isCopy())
4893 return verifyCopy(MI, MRI, ErrInfo);
4894
4895 if (SIInstrInfo::isGenericOpcode(Opcode))
4896 return true;
4897
4898 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4899 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4900 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4901 int Src3Idx = -1;
4902 if (Src0Idx == -1) {
4903 // VOPD V_DUAL_* instructions use different operand names.
4904 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4905 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4906 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4907 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4908 }
4909
4910 // Make sure the number of operands is correct.
4911 const MCInstrDesc &Desc = get(Opcode);
4912 if (!Desc.isVariadic() &&
4913 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4914 ErrInfo = "Instruction has wrong number of operands.";
4915 return false;
4916 }
4917
4918 if (MI.isInlineAsm()) {
4919 // Verify register classes for inlineasm constraints.
4920 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4921 I != E; ++I) {
4922 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4923 if (!RC)
4924 continue;
4925
4926 const MachineOperand &Op = MI.getOperand(I);
4927 if (!Op.isReg())
4928 continue;
4929
4930 Register Reg = Op.getReg();
4931 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4932 ErrInfo = "inlineasm operand has incorrect register class.";
4933 return false;
4934 }
4935 }
4936
4937 return true;
4938 }
4939
4940 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4941 ErrInfo = "missing memory operand from image instruction.";
4942 return false;
4943 }
4944
4945 // Make sure the register classes are correct.
4946 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4947 const MachineOperand &MO = MI.getOperand(i);
4948 if (MO.isFPImm()) {
4949 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4950 "all fp values to integers.";
4951 return false;
4952 }
4953
4954 int RegClass = Desc.operands()[i].RegClass;
4955
4956 const MCOperandInfo &OpInfo = Desc.operands()[i];
4957 switch (OpInfo.OperandType) {
4959 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4960 ErrInfo = "Illegal immediate value for operand.";
4961 return false;
4962 }
4963 break;
4976 break;
4978 break;
4979 break;
4993 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4994 ErrInfo = "Illegal immediate value for operand.";
4995 return false;
4996 }
4997 break;
4998 }
5000 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5001 ErrInfo = "Expected inline constant for operand.";
5002 return false;
5003 }
5004 break;
5008 break;
5013 // Check if this operand is an immediate.
5014 // FrameIndex operands will be replaced by immediates, so they are
5015 // allowed.
5016 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5017 ErrInfo = "Expected immediate, but got non-immediate";
5018 return false;
5019 }
5020 break;
5024 break;
5025 default:
5026 if (OpInfo.isGenericType())
5027 continue;
5028 break;
5029 }
5030
5031 if (!MO.isReg())
5032 continue;
5033 Register Reg = MO.getReg();
5034 if (!Reg)
5035 continue;
5036
5037 // FIXME: Ideally we would have separate instruction definitions with the
5038 // aligned register constraint.
5039 // FIXME: We do not verify inline asm operands, but custom inline asm
5040 // verification is broken anyway
5041 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5042 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5043 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5044 if (const TargetRegisterClass *SubRC =
5045 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5046 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5047 if (RC)
5048 RC = SubRC;
5049 }
5050 }
5051
5052 // Check that this is the aligned version of the class.
5053 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5054 ErrInfo = "Subtarget requires even aligned vector registers";
5055 return false;
5056 }
5057 }
5058
5059 if (RegClass != -1) {
5060 if (Reg.isVirtual())
5061 continue;
5062
5063 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5064 if (!RC->contains(Reg)) {
5065 ErrInfo = "Operand has incorrect register class.";
5066 return false;
5067 }
5068 }
5069 }
5070
5071 // Verify SDWA
5072 if (isSDWA(MI)) {
5073 if (!ST.hasSDWA()) {
5074 ErrInfo = "SDWA is not supported on this target";
5075 return false;
5076 }
5077
5078 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5079 AMDGPU::OpName::dst_sel}) {
5080 const MachineOperand *MO = getNamedOperand(MI, Op);
5081 if (!MO)
5082 continue;
5083 int64_t Imm = MO->getImm();
5084 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5085 ErrInfo = "Invalid SDWA selection";
5086 return false;
5087 }
5088 }
5089
5090 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5091
5092 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5093 if (OpIdx == -1)
5094 continue;
5095 const MachineOperand &MO = MI.getOperand(OpIdx);
5096
5097 if (!ST.hasSDWAScalar()) {
5098 // Only VGPRS on VI
5099 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5100 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5101 return false;
5102 }
5103 } else {
5104 // No immediates on GFX9
5105 if (!MO.isReg()) {
5106 ErrInfo =
5107 "Only reg allowed as operands in SDWA instructions on GFX9+";
5108 return false;
5109 }
5110 }
5111 }
5112
5113 if (!ST.hasSDWAOmod()) {
5114 // No omod allowed on VI
5115 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5116 if (OMod != nullptr &&
5117 (!OMod->isImm() || OMod->getImm() != 0)) {
5118 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5119 return false;
5120 }
5121 }
5122
5123 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5124 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5125 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5126 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5127 const MachineOperand *Src0ModsMO =
5128 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5129 unsigned Mods = Src0ModsMO->getImm();
5130 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5131 Mods & SISrcMods::SEXT) {
5132 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5133 return false;
5134 }
5135 }
5136
5137 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5138 if (isVOPC(BasicOpcode)) {
5139 if (!ST.hasSDWASdst() && DstIdx != -1) {
5140 // Only vcc allowed as dst on VI for VOPC
5141 const MachineOperand &Dst = MI.getOperand(DstIdx);
5142 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5143 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5144 return false;
5145 }
5146 } else if (!ST.hasSDWAOutModsVOPC()) {
5147 // No clamp allowed on GFX9 for VOPC
5148 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5149 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5150 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5151 return false;
5152 }
5153
5154 // No omod allowed on GFX9 for VOPC
5155 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5156 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5157 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5158 return false;
5159 }
5160 }
5161 }
5162
5163 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5164 if (DstUnused && DstUnused->isImm() &&
5165 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5166 const MachineOperand &Dst = MI.getOperand(DstIdx);
5167 if (!Dst.isReg() || !Dst.isTied()) {
5168 ErrInfo = "Dst register should have tied register";
5169 return false;
5170 }
5171
5172 const MachineOperand &TiedMO =
5173 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5174 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5175 ErrInfo =
5176 "Dst register should be tied to implicit use of preserved register";
5177 return false;
5178 }
5179 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5180 ErrInfo = "Dst register should use same physical register as preserved";
5181 return false;
5182 }
5183 }
5184 }
5185
5186 // Verify MIMG / VIMAGE / VSAMPLE
5187 if (isImage(Opcode) && !MI.mayStore()) {
5188 // Ensure that the return type used is large enough for all the options
5189 // being used. TFE/LWE require an extra result register.
5190 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5191 if (DMask) {
5192 uint64_t DMaskImm = DMask->getImm();
5193 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5194 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5195 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5196 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5197
5198 // Adjust for packed 16 bit values
5199 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5200 RegCount = divideCeil(RegCount, 2);
5201
5202 // Adjust if using LWE or TFE
5203 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5204 RegCount += 1;
5205
5206 const uint32_t DstIdx =
5207 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5208 const MachineOperand &Dst = MI.getOperand(DstIdx);
5209 if (Dst.isReg()) {
5210 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5211 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5212 if (RegCount > DstSize) {
5213 ErrInfo = "Image instruction returns too many registers for dst "
5214 "register class";
5215 return false;
5216 }
5217 }
5218 }
5219 }
5220
5221 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5222 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5223 unsigned ConstantBusCount = 0;
5224 bool UsesLiteral = false;
5225 const MachineOperand *LiteralVal = nullptr;
5226
5227 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5228 if (ImmIdx != -1) {
5229 ++ConstantBusCount;
5230 UsesLiteral = true;
5231 LiteralVal = &MI.getOperand(ImmIdx);
5232 }
5233
5234 SmallVector<Register, 2> SGPRsUsed;
5235 Register SGPRUsed;
5236
5237 // Only look at the true operands. Only a real operand can use the constant
5238 // bus, and we don't want to check pseudo-operands like the source modifier
5239 // flags.
5240 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5241 if (OpIdx == -1)
5242 continue;
5243 const MachineOperand &MO = MI.getOperand(OpIdx);
5244 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5245 if (MO.isReg()) {
5246 SGPRUsed = MO.getReg();
5247 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5248 ++ConstantBusCount;
5249 SGPRsUsed.push_back(SGPRUsed);
5250 }
5251 } else if (!MO.isFI()) { // Treat FI like a register.
5252 if (!UsesLiteral) {
5253 ++ConstantBusCount;
5254 UsesLiteral = true;
5255 LiteralVal = &MO;
5256 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5257 assert(isVOP2(MI) || isVOP3(MI));
5258 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5259 return false;
5260 }
5261 }
5262 }
5263 }
5264
5265 SGPRUsed = findImplicitSGPRRead(MI);
5266 if (SGPRUsed) {
5267 // Implicit uses may safely overlap true operands
5268 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5269 return !RI.regsOverlap(SGPRUsed, SGPR);
5270 })) {
5271 ++ConstantBusCount;
5272 SGPRsUsed.push_back(SGPRUsed);
5273 }
5274 }
5275
5276 // v_writelane_b32 is an exception to the constant bus restriction:
5277 // vsrc0 can be an sgpr, const or m0, and the lane select an sgpr, m0 or inline-const.
5278 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5279 Opcode != AMDGPU::V_WRITELANE_B32) {
5280 ErrInfo = "VOP* instruction violates constant bus restriction";
5281 return false;
5282 }
5283
5284 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5285 ErrInfo = "VOP3 instruction uses literal";
5286 return false;
5287 }
5288 }
5289
5290 // Special case for writelane - it is exempt from the generic constant bus
5291 // check above, but still can't use more than one SGPR register.
5292 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5293 unsigned SGPRCount = 0;
5294 Register SGPRUsed;
5295
5296 for (int OpIdx : {Src0Idx, Src1Idx}) {
5297 if (OpIdx == -1)
5298 break;
5299
5300 const MachineOperand &MO = MI.getOperand(OpIdx);
5301
5302 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5303 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5304 if (MO.getReg() != SGPRUsed)
5305 ++SGPRCount;
5306 SGPRUsed = MO.getReg();
5307 }
5308 }
5309 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5310 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5311 return false;
5312 }
5313 }
5314 }
5315
5316 // Verify misc. restrictions on specific instructions.
5317 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5318 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5319 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5320 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5321 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5322 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5323 if (!compareMachineOp(Src0, Src1) &&
5324 !compareMachineOp(Src0, Src2)) {
5325 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5326 return false;
5327 }
5328 }
5329 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5330 SISrcMods::ABS) ||
5331 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5332 SISrcMods::ABS) ||
5333 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5334 SISrcMods::ABS)) {
5335 ErrInfo = "ABS not allowed in VOP3B instructions";
5336 return false;
5337 }
5338 }
5339
5340 if (isSOP2(MI) || isSOPC(MI)) {
5341 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5342 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5343
5344 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5345 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5346 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5347 !Src0.isIdenticalTo(Src1)) {
5348 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5349 return false;
5350 }
5351 }
5352
5353 if (isSOPK(MI)) {
5354 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5355 if (Desc.isBranch()) {
5356 if (!Op->isMBB()) {
5357 ErrInfo = "invalid branch target for SOPK instruction";
5358 return false;
5359 }
5360 } else {
5361 uint64_t Imm = Op->getImm();
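// Opcodes for which sopkIsZext() is true zero-extend the 16-bit immediate,
// so it must fit in an unsigned 16-bit field; all other SOPK immediates are
// sign-extended.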
5362 if (sopkIsZext(Opcode)) {
5363 if (!isUInt<16>(Imm)) {
5364 ErrInfo = "invalid immediate for SOPK instruction";
5365 return false;
5366 }
5367 } else {
5368 if (!isInt<16>(Imm)) {
5369 ErrInfo = "invalid immediate for SOPK instruction";
5370 return false;
5371 }
5372 }
5373 }
5374 }
5375
5376 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5377 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5378 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5379 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5380 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5381 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5382
5383 const unsigned StaticNumOps =
5384 Desc.getNumOperands() + Desc.implicit_uses().size();
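// V_MOVRELD carries an implicit def of the whole vector plus a tied implicit
// use of it; V_MOVRELS only carries the implicit use.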
5385 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5386
5387 // Allow additional implicit operands. This allows a fixup done by the post
5388 // RA scheduler where the main implicit operand is killed and implicit-defs
5389 // are added for sub-registers that remain live after this instruction.
5390 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5391 ErrInfo = "missing implicit register operands";
5392 return false;
5393 }
5394
5395 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5396 if (IsDst) {
5397 if (!Dst->isUse()) {
5398 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5399 return false;
5400 }
5401
5402 unsigned UseOpIdx;
5403 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5404 UseOpIdx != StaticNumOps + 1) {
5405 ErrInfo = "movrel implicit operands should be tied";
5406 return false;
5407 }
5408 }
5409
5410 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5411 const MachineOperand &ImpUse
5412 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5413 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5414 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5415 ErrInfo = "src0 should be subreg of implicit vector use";
5416 return false;
5417 }
5418 }
5419
5420 // Make sure we aren't losing exec uses in the td files. This mostly requires
5421 // being careful when using let Uses to try to add other use registers.
5422 if (shouldReadExec(MI)) {
5423 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5424 ErrInfo = "VALU instruction does not implicitly read exec mask";
5425 return false;
5426 }
5427 }
5428
5429 if (isSMRD(MI)) {
5430 if (MI.mayStore() &&
5431 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5432 // The register offset form of scalar stores may only use m0 as the
5433 // soffset register.
5434 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5435 if (Soff && Soff->getReg() != AMDGPU::M0) {
5436 ErrInfo = "scalar stores must use m0 as offset register";
5437 return false;
5438 }
5439 }
5440 }
5441
5442 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5443 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5444 if (Offset->getImm() != 0) {
5445 ErrInfo = "subtarget does not support offsets in flat instructions";
5446 return false;
5447 }
5448 }
5449
5450 if (isDS(MI) && !ST.hasGDS()) {
5451 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5452 if (GDSOp && GDSOp->getImm() != 0) {
5453 ErrInfo = "GDS is not supported on this subtarget";
5454 return false;
5455 }
5456 }
5457
5458 if (isImage(MI)) {
5459 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5460 if (DimOp) {
5461 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5462 AMDGPU::OpName::vaddr0);
5463 AMDGPU::OpName RSrcOpName =
5464 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5465 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5466 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5467 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5468 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5469 const AMDGPU::MIMGDimInfo *Dim =
5470 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5471
5472 if (!Dim) {
5473 ErrInfo = "dim is out of range";
5474 return false;
5475 }
5476
5477 bool IsA16 = false;
5478 if (ST.hasR128A16()) {
5479 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5480 IsA16 = R128A16->getImm() != 0;
5481 } else if (ST.hasA16()) {
5482 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5483 IsA16 = A16->getImm() != 0;
5484 }
5485
5486 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
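// With the NSA encoding each address component has its own vaddr operand, so
// the operand count gives the address size directly; with partial NSA the
// last vaddr operand holds the remaining contiguous address words.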
5487
5488 unsigned AddrWords =
5489 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5490
5491 unsigned VAddrWords;
5492 if (IsNSA) {
5493 VAddrWords = RsrcIdx - VAddr0Idx;
5494 if (ST.hasPartialNSAEncoding() &&
5495 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5496 unsigned LastVAddrIdx = RsrcIdx - 1;
5497 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5498 }
5499 } else {
5500 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5501 if (AddrWords > 12)
5502 AddrWords = 16;
5503 }
5504
5505 if (VAddrWords != AddrWords) {
5506 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5507 << " but got " << VAddrWords << "\n");
5508 ErrInfo = "bad vaddr size";
5509 return false;
5510 }
5511 }
5512 }
5513
5514 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5515 if (DppCt) {
5516 using namespace AMDGPU::DPP;
5517
5518 unsigned DC = DppCt->getImm();
5519 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5520 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5521 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5522 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5523 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5524 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5525 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5526 ErrInfo = "Invalid dpp_ctrl value";
5527 return false;
5528 }
5529 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5530 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5531 ErrInfo = "Invalid dpp_ctrl value: "
5532 "wavefront shifts are not supported on GFX10+";
5533 return false;
5534 }
5535 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5536 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5537 ErrInfo = "Invalid dpp_ctrl value: "
5538 "broadcasts are not supported on GFX10+";
5539 return false;
5540 }
5541 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5542 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5543 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5544 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5545 !ST.hasGFX90AInsts()) {
5546 ErrInfo = "Invalid dpp_ctrl value: "
5547 "row_newbroadcast/row_share is not supported before "
5548 "GFX90A/GFX10";
5549 return false;
5550 }
5551 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5552 ErrInfo = "Invalid dpp_ctrl value: "
5553 "row_share and row_xmask are not supported before GFX10";
5554 return false;
5555 }
5556 }
5557
5558 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5561 ErrInfo = "Invalid dpp_ctrl value: "
5562 "DP ALU dpp only support row_newbcast";
5563 return false;
5564 }
5565 }
5566
5567 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5568 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5569 AMDGPU::OpName DataName =
5570 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5571 const MachineOperand *Data = getNamedOperand(MI, DataName);
5572 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5573 if (Data && !Data->isReg())
5574 Data = nullptr;
5575
5576 if (ST.hasGFX90AInsts()) {
5577 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5578 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5579 ErrInfo = "Invalid register class: "
5580 "vdata and vdst should be both VGPR or AGPR";
5581 return false;
5582 }
5583 if (Data && Data2 &&
5584 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5585 ErrInfo = "Invalid register class: "
5586 "both data operands should be VGPR or AGPR";
5587 return false;
5588 }
5589 } else {
5590 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5591 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5592 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5593 ErrInfo = "Invalid register class: "
5594 "agpr loads and stores not supported on this GPU";
5595 return false;
5596 }
5597 }
5598 }
5599
5600 if (ST.needsAlignedVGPRs()) {
5601 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5602 const MachineOperand *Op = getNamedOperand(MI, OpName);
5603 if (!Op)
5604 return true;
5605 Register Reg = Op->getReg();
5606 if (Reg.isPhysical())
5607 return !(RI.getHWRegIndex(Reg) & 1);
5608 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5609 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5610 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5611 };
5612
5613 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5614 Opcode == AMDGPU::DS_GWS_BARRIER) {
5615
5616 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5617 ErrInfo = "Subtarget requires even aligned vector registers "
5618 "for DS_GWS instructions";
5619 return false;
5620 }
5621 }
5622
5623 if (isMIMG(MI)) {
5624 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5625 ErrInfo = "Subtarget requires even aligned vector registers "
5626 "for vaddr operand of image instructions";
5627 return false;
5628 }
5629 }
5630 }
5631
5632 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5633 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5634 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5635 ErrInfo = "Invalid register class: "
5636 "v_accvgpr_write with an SGPR is not supported on this GPU";
5637 return false;
5638 }
5639 }
5640
5641 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5642 const MachineOperand &SrcOp = MI.getOperand(1);
5643 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5644 ErrInfo = "pseudo expects only physical SGPRs";
5645 return false;
5646 }
5647 }
5648
5649 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5650 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5651 if (!ST.hasScaleOffset()) {
5652 ErrInfo = "Subtarget does not support offset scaling";
5653 return false;
5654 }
5655 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5656 ErrInfo = "Instruction does not support offset scaling";
5657 return false;
5658 }
5659 }
5660 }
5661
5662 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5663 // information.
5664 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5665 for (unsigned I = 0; I < 3; ++I) {
5666 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5667 return false;
5668 }
5669 }
5670
5671 return true;
5672}
5673
5674// It is more readable to list mapped opcodes on the same line.
5675// clang-format off
5676
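// Return the VALU opcode that corresponds to the given scalar opcode, or
// AMDGPU::INSTRUCTION_LIST_END if there is no direct equivalent. This is used
// when an SALU instruction has to be moved to the VALU.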
5677 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5678 switch (MI.getOpcode()) {
5679 default: return AMDGPU::INSTRUCTION_LIST_END;
5680 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5681 case AMDGPU::COPY: return AMDGPU::COPY;
5682 case AMDGPU::PHI: return AMDGPU::PHI;
5683 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5684 case AMDGPU::WQM: return AMDGPU::WQM;
5685 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5686 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5687 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5688 case AMDGPU::S_MOV_B32: {
5689 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5690 return MI.getOperand(1).isReg() ||
5691 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5692 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5693 }
5694 case AMDGPU::S_ADD_I32:
5695 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5696 case AMDGPU::S_ADDC_U32:
5697 return AMDGPU::V_ADDC_U32_e32;
5698 case AMDGPU::S_SUB_I32:
5699 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5700 // FIXME: These are not consistently handled, and selected when the carry is
5701 // used.
5702 case AMDGPU::S_ADD_U32:
5703 return AMDGPU::V_ADD_CO_U32_e32;
5704 case AMDGPU::S_SUB_U32:
5705 return AMDGPU::V_SUB_CO_U32_e32;
5706 case AMDGPU::S_ADD_U64_PSEUDO:
5707 return AMDGPU::V_ADD_U64_PSEUDO;
5708 case AMDGPU::S_SUB_U64_PSEUDO:
5709 return AMDGPU::V_SUB_U64_PSEUDO;
5710 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5711 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5712 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5713 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5714 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5715 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5716 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5717 case AMDGPU::S_XNOR_B32:
5718 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5719 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5720 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5721 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5722 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5723 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5724 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5725 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5726 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5727 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5728 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5729 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5730 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5731 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5732 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5733 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5734 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5735 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
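// 64-bit scalar bit operations are split into two 32-bit halves when moved to
// the VALU, which is why S_NOT_B64 maps to the 32-bit V_NOT.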
5736 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5737 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5738 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5739 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5740 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5741 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5742 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5743 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5744 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5745 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5746 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5747 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5748 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5749 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5750 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5751 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5752 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5753 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5754 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5755 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5756 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5757 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5758 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5759 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5760 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5761 case AMDGPU::S_CVT_F32_F16:
5762 case AMDGPU::S_CVT_HI_F32_F16:
5763 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5764 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5765 case AMDGPU::S_CVT_F16_F32:
5766 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5767 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5768 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5769 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5770 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5771 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5772 case AMDGPU::S_CEIL_F16:
5773 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5774 : AMDGPU::V_CEIL_F16_fake16_e64;
5775 case AMDGPU::S_FLOOR_F16:
5776 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5777 : AMDGPU::V_FLOOR_F16_fake16_e64;
5778 case AMDGPU::S_TRUNC_F16:
5779 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5780 : AMDGPU::V_TRUNC_F16_fake16_e64;
5781 case AMDGPU::S_RNDNE_F16:
5782 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5783 : AMDGPU::V_RNDNE_F16_fake16_e64;
5784 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5785 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5786 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5787 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5788 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5789 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5790 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5791 case AMDGPU::S_ADD_F16:
5792 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5793 : AMDGPU::V_ADD_F16_fake16_e64;
5794 case AMDGPU::S_SUB_F16:
5795 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5796 : AMDGPU::V_SUB_F16_fake16_e64;
5797 case AMDGPU::S_MIN_F16:
5798 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5799 : AMDGPU::V_MIN_F16_fake16_e64;
5800 case AMDGPU::S_MAX_F16:
5801 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5802 : AMDGPU::V_MAX_F16_fake16_e64;
5803 case AMDGPU::S_MINIMUM_F16:
5804 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5805 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5806 case AMDGPU::S_MAXIMUM_F16:
5807 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5808 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5809 case AMDGPU::S_MUL_F16:
5810 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5811 : AMDGPU::V_MUL_F16_fake16_e64;
5812 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5813 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5814 case AMDGPU::S_FMAC_F16:
5815 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5816 : AMDGPU::V_FMAC_F16_fake16_e64;
5817 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5818 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5819 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5820 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5821 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5822 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5823 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5824 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5825 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5826 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5827 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5828 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5829 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5830 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5831 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5832 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5833 case AMDGPU::S_CMP_LT_F16:
5834 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5835 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5836 case AMDGPU::S_CMP_EQ_F16:
5837 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5838 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5839 case AMDGPU::S_CMP_LE_F16:
5840 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5841 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5842 case AMDGPU::S_CMP_GT_F16:
5843 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5844 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5845 case AMDGPU::S_CMP_LG_F16:
5846 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5847 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5848 case AMDGPU::S_CMP_GE_F16:
5849 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5850 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5851 case AMDGPU::S_CMP_O_F16:
5852 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5853 : AMDGPU::V_CMP_O_F16_fake16_e64;
5854 case AMDGPU::S_CMP_U_F16:
5855 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5856 : AMDGPU::V_CMP_U_F16_fake16_e64;
5857 case AMDGPU::S_CMP_NGE_F16:
5858 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5859 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5860 case AMDGPU::S_CMP_NLG_F16:
5861 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5862 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5863 case AMDGPU::S_CMP_NGT_F16:
5864 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5865 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5866 case AMDGPU::S_CMP_NLE_F16:
5867 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5868 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5869 case AMDGPU::S_CMP_NEQ_F16:
5870 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5871 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5872 case AMDGPU::S_CMP_NLT_F16:
5873 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5874 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5875 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5876 case AMDGPU::V_S_EXP_F16_e64:
5877 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5878 : AMDGPU::V_EXP_F16_fake16_e64;
5879 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5880 case AMDGPU::V_S_LOG_F16_e64:
5881 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5882 : AMDGPU::V_LOG_F16_fake16_e64;
5883 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5884 case AMDGPU::V_S_RCP_F16_e64:
5885 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5886 : AMDGPU::V_RCP_F16_fake16_e64;
5887 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5888 case AMDGPU::V_S_RSQ_F16_e64:
5889 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5890 : AMDGPU::V_RSQ_F16_fake16_e64;
5891 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5892 case AMDGPU::V_S_SQRT_F16_e64:
5893 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5894 : AMDGPU::V_SQRT_F16_fake16_e64;
5895 }
5897 "Unexpected scalar opcode without corresponding vector one!");
5898}
5899
5900// clang-format on
5901
5902 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5903 MachineBasicBlock &MBB,
5904 MachineBasicBlock::iterator MBBI,
5905 const DebugLoc &DL, Register Reg,
5906 bool IsSCCLive,
5907 SlotIndexes *Indexes) const {
5908 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5909 const SIInstrInfo *TII = ST.getInstrInfo();
5910 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5911 if (IsSCCLive) {
5912 // Insert two move instructions, one to save the original value of EXEC and
5913 // the other to turn on all bits in EXEC. This is required as we can't use
5914 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5915 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
5916 .addReg(LMC.ExecReg);
5917 auto FlipExecMI =
5918 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5919 if (Indexes) {
5920 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5921 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5922 }
5923 } else {
5924 auto SaveExec =
5925 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
5926 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5927 if (Indexes)
5928 Indexes->insertMachineInstrInMaps(*SaveExec);
5929 }
5930}
5931
5932 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5933 MachineBasicBlock::iterator MBBI,
5934 const DebugLoc &DL, Register Reg,
5935 SlotIndexes *Indexes) const {
5936 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5937 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
5938 .addReg(Reg, RegState::Kill);
5939 if (Indexes)
5940 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5941}
5942
5946 "Not a whole wave func");
5947 MachineBasicBlock &MBB = *MF.begin();
5948 for (MachineInstr &MI : MBB)
5949 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
5950 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
5951 return &MI;
5952
5953 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
5954}
5955
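// Before gfx90a, vector memory instructions cannot operate on AGPRs directly,
// so narrow the combined AV_* classes to their VGPR-only counterparts for
// loads and stores.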
5956static const TargetRegisterClass *
5957 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5958 const MCInstrDesc &TID, unsigned RCID) {
5959 if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) {
5960 switch (RCID) {
5961 case AMDGPU::AV_32RegClassID:
5962 RCID = AMDGPU::VGPR_32RegClassID;
5963 break;
5964 case AMDGPU::AV_64RegClassID:
5965 RCID = AMDGPU::VReg_64RegClassID;
5966 break;
5967 case AMDGPU::AV_96RegClassID:
5968 RCID = AMDGPU::VReg_96RegClassID;
5969 break;
5970 case AMDGPU::AV_128RegClassID:
5971 RCID = AMDGPU::VReg_128RegClassID;
5972 break;
5973 case AMDGPU::AV_160RegClassID:
5974 RCID = AMDGPU::VReg_160RegClassID;
5975 break;
5976 case AMDGPU::AV_512RegClassID:
5977 RCID = AMDGPU::VReg_512RegClassID;
5978 break;
5979 default:
5980 break;
5981 }
5982 }
5983
5984 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5985}
5986
5987const TargetRegisterClass *
5988SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
5989 const TargetRegisterInfo *TRI) const {
5990 if (OpNum >= TID.getNumOperands())
5991 return nullptr;
5992 auto RegClass = TID.operands()[OpNum].RegClass;
5993 // Special pseudos have no alignment requirement.
5994 if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID))
5995 return RI.getRegClass(RegClass);
5996
5997 return adjustAllocatableRegClass(ST, RI, TID, RegClass);
5998}
5999
6000 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6001 unsigned OpNo) const {
6002 const MCInstrDesc &Desc = get(MI.getOpcode());
6003 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6004 Desc.operands()[OpNo].RegClass == -1) {
6005 Register Reg = MI.getOperand(OpNo).getReg();
6006
6007 if (Reg.isVirtual()) {
6008 const MachineRegisterInfo &MRI =
6009 MI.getParent()->getParent()->getRegInfo();
6010 return MRI.getRegClass(Reg);
6011 }
6012 return RI.getPhysRegBaseClass(Reg);
6013 }
6014
6015 unsigned RCID = Desc.operands()[OpNo].RegClass;
6016 return adjustAllocatableRegClass(ST, RI, Desc, RCID);
6017}
6018
6019 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6020 MachineBasicBlock::iterator I = MI;
6021 MachineBasicBlock *MBB = MI.getParent();
6022 MachineOperand &MO = MI.getOperand(OpIdx);
6023 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6024 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
6025 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6026 unsigned Size = RI.getRegSizeInBits(*RC);
6027 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6028 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6029 : AMDGPU::V_MOV_B32_e32;
6030 if (MO.isReg())
6031 Opcode = AMDGPU::COPY;
6032 else if (RI.isSGPRClass(RC))
6033 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6034
6035 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6036 Register Reg = MRI.createVirtualRegister(VRC);
6037 DebugLoc DL = MBB->findDebugLoc(I);
6038 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6039 MO.ChangeToRegister(Reg, false);
6040}
6041
6042 Register SIInstrInfo::buildExtractSubReg(
6043 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6044 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6045 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6046 if (!SuperReg.getReg().isVirtual())
6047 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6048
6049 MachineBasicBlock *MBB = MI->getParent();
6050 const DebugLoc &DL = MI->getDebugLoc();
6051 Register SubReg = MRI.createVirtualRegister(SubRC);
6052
6053 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6054 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6055 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6056 return SubReg;
6057}
6058
6059 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6060 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6061 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6062 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6063 if (Op.isImm()) {
6064 if (SubIdx == AMDGPU::sub0)
6065 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6066 if (SubIdx == AMDGPU::sub1)
6067 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6068
6069 llvm_unreachable("Unhandled register index for immediate");
6070 }
6071
6072 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6073 SubIdx, SubRC);
6074 return MachineOperand::CreateReg(SubReg, false);
6075}
6076
6077// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6078void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6079 assert(Inst.getNumExplicitOperands() == 3);
6080 MachineOperand Op1 = Inst.getOperand(1);
6081 Inst.removeOperand(1);
6082 Inst.addOperand(Op1);
6083}
6084
6085 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6086 const MCOperandInfo &OpInfo,
6087 const MachineOperand &MO) const {
6088 if (!MO.isReg())
6089 return false;
6090
6091 Register Reg = MO.getReg();
6092
6093 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
6094 if (Reg.isPhysical())
6095 return DRC->contains(Reg);
6096
6097 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6098
6099 if (MO.getSubReg()) {
6100 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6101 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6102 if (!SuperRC)
6103 return false;
6104 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6105 }
6106
6107 return RI.getCommonSubClass(DRC, RC) != nullptr;
6108}
6109
6110 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6111 const MachineOperand &MO) const {
6112 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6113 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6114 unsigned Opc = MI.getOpcode();
6115
6116 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6117 // information.
6118 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6119 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6120 constexpr const AMDGPU::OpName OpNames[] = {
6121 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6122
6123 for (auto [I, OpName] : enumerate(OpNames)) {
6124 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6125 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6126 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6127 return false;
6128 }
6129 }
6130
6131 if (!isLegalRegOperand(MRI, OpInfo, MO))
6132 return false;
6133
6134 // check Accumulate GPR operand
6135 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6136 if (IsAGPR && !ST.hasMAIInsts())
6137 return false;
6138 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6139 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6140 return false;
6141 // Atomics should have both vdst and vdata either vgpr or agpr.
6142 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6143 const int DataIdx = AMDGPU::getNamedOperandIdx(
6144 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6145 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6146 MI.getOperand(DataIdx).isReg() &&
6147 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6148 return false;
6149 if ((int)OpIdx == DataIdx) {
6150 if (VDstIdx != -1 &&
6151 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6152 return false;
6153 // DS instructions with 2 src operands also must have tied RC.
6154 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6155 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6156 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6157 return false;
6158 }
6159
6160 // Check V_ACCVGPR_WRITE_B32_e64
6161 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6162 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6163 RI.isSGPRReg(MRI, MO.getReg()))
6164 return false;
6165 return true;
6166}
6167
6168 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6169 const MCOperandInfo &OpInfo,
6170 const MachineOperand &MO) const {
6171 if (MO.isReg())
6172 return isLegalRegOperand(MRI, OpInfo, MO);
6173
6174 // Handle non-register types that are treated like immediates.
6175 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6176 return true;
6177}
6178
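// An SGPR source of a packed FP32 instruction on gfx12+ is only legal when the
// source's op_sel and op_sel_hi modifier bits are both clear.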
6179 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6180 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6181 const MachineOperand *MO) const {
6182 constexpr const unsigned NumOps = 3;
6183 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6184 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6185 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6186 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6187
6188 assert(SrcN < NumOps);
6189
6190 if (!MO) {
6191 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6192 if (SrcIdx == -1)
6193 return true;
6194 MO = &MI.getOperand(SrcIdx);
6195 }
6196
6197 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6198 return true;
6199
6200 int ModsIdx =
6201 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6202 if (ModsIdx == -1)
6203 return true;
6204
6205 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6206 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6207 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6208
6209 return !OpSel && !OpSelHi;
6210}
6211
6212 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6213 const MachineOperand *MO) const {
6214 const MachineFunction &MF = *MI.getParent()->getParent();
6215 const MachineRegisterInfo &MRI = MF.getRegInfo();
6216 const MCInstrDesc &InstDesc = MI.getDesc();
6217 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6218 const TargetRegisterClass *DefinedRC =
6219 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6220 if (!MO)
6221 MO = &MI.getOperand(OpIdx);
6222
6223 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6224
6225 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6226 const MachineOperand *UsedLiteral = nullptr;
6227
6228 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6229 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6230
6231 // TODO: Be more permissive with frame indexes.
6232 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6233 if (!LiteralLimit--)
6234 return false;
6235
6236 UsedLiteral = MO;
6237 }
6238
6239 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6240 if (MO->isReg())
6241 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6242
6243 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6244 if (i == OpIdx)
6245 continue;
6246 const MachineOperand &Op = MI.getOperand(i);
6247 if (Op.isReg()) {
6248 if (Op.isUse()) {
6249 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6250 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6251 if (--ConstantBusLimit <= 0)
6252 return false;
6253 }
6254 }
6255 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6256 !isInlineConstant(Op, InstDesc.operands()[i])) {
6257 // The same literal may be used multiple times.
6258 if (!UsedLiteral)
6259 UsedLiteral = &Op;
6260 else if (UsedLiteral->isIdenticalTo(Op))
6261 continue;
6262
6263 if (!LiteralLimit--)
6264 return false;
6265 if (--ConstantBusLimit <= 0)
6266 return false;
6267 }
6268 }
6269 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6270 // There can be at most one literal operand, but it can be repeated.
6271 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6272 if (i == OpIdx)
6273 continue;
6274 const MachineOperand &Op = MI.getOperand(i);
6275 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6276 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6277 !Op.isIdenticalTo(*MO))
6278 return false;
6279
6280 // Do not fold a non-inlineable and non-register operand into an
6281 // instruction that already has a frame index. The frame index handling
6282 // code cannot cope with a frame index that co-exists with another
6283 // non-register operand, unless that operand is an inlineable immediate.
6284 if (Op.isFI())
6285 return false;
6286 }
6287 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6288 isF16PseudoScalarTrans(MI.getOpcode())) {
6289 return false;
6290 }
6291
6292 if (MO->isReg()) {
6293 if (!DefinedRC)
6294 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6295 return isLegalRegOperand(MI, OpIdx, *MO);
6296 }
6297
6298 if (MO->isImm()) {
6299 uint64_t Imm = MO->getImm();
6300 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6301 bool Is64BitOp = Is64BitFPOp ||
6302 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6303 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6304 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6305 if (Is64BitOp &&
6306 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6307 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6308 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6309 return false;
6310
6311 // FIXME: We can use sign extended 64-bit literals, but only for signed
6312 // operands. At the moment we do not know if an operand is signed.
6313 // Such an operand will be encoded as its low 32 bits and then either
6314 // correctly sign extended or incorrectly zero extended by HW.
6315 // If 64-bit literals are supported and the literal will be encoded
6316 // as a full 64 bits, we can still use it.
6317 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6318 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6319 return false;
6320 }
6321 }
6322
6323 // Handle non-register types that are treated like immediates.
6324 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6325
6326 if (!DefinedRC) {
6327 // This operand expects an immediate.
6328 return true;
6329 }
6330
6331 return isImmOperandLegal(MI, OpIdx, *MO);
6332}
6333
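// Legalize the operands of a VOP2 instruction. Apart from special cases such
// as v_writelane/v_readlane, src1 of a VOP2 must be a VGPR; an illegal src1 is
// fixed either by commuting it into the more permissive src0 slot or by
// materializing it in a VGPR.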
6334 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6335 MachineInstr &MI) const {
6336 unsigned Opc = MI.getOpcode();
6337 const MCInstrDesc &InstrDesc = get(Opc);
6338
6339 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6340 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6341
6342 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6343 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6344
6345 // If there is an implicit SGPR use such as the VCC use of v_addc_u32/v_subb_u32,
6346 // we may only have one constant bus use before GFX10.
6347 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6348 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6349 RI.isSGPRReg(MRI, Src0.getReg()))
6350 legalizeOpWithMove(MI, Src0Idx);
6351
6352 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6353 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6354 // src0/src1 with V_READFIRSTLANE.
6355 if (Opc == AMDGPU::V_WRITELANE_B32) {
6356 const DebugLoc &DL = MI.getDebugLoc();
6357 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6358 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6359 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6360 .add(Src0);
6361 Src0.ChangeToRegister(Reg, false);
6362 }
6363 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6364 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6365 const DebugLoc &DL = MI.getDebugLoc();
6366 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6367 .add(Src1);
6368 Src1.ChangeToRegister(Reg, false);
6369 }
6370 return;
6371 }
6372
6373 // No VOP2 instructions support AGPRs.
6374 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6375 legalizeOpWithMove(MI, Src0Idx);
6376
6377 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6378 legalizeOpWithMove(MI, Src1Idx);
6379
6380 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6381 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6382 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6383 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6384 legalizeOpWithMove(MI, Src2Idx);
6385 }
6386
6387 // The src0 slot of a VOP2 instruction supports all operand types, so we don't
6388 // need to check its legality. If src1 is already legal, we don't need to do anything.
6389 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6390 return;
6391
6392 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6393 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6394 // select is uniform.
6395 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6396 RI.isVGPR(MRI, Src1.getReg())) {
6397 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6398 const DebugLoc &DL = MI.getDebugLoc();
6399 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6400 .add(Src1);
6401 Src1.ChangeToRegister(Reg, false);
6402 return;
6403 }
6404
6405 // We do not use commuteInstruction here because it is too aggressive and will
6406 // commute if it is possible. We only want to commute here if it improves
6407 // legality. This can be called a fairly large number of times so don't waste
6408 // compile time pointlessly swapping and checking legality again.
6409 if (HasImplicitSGPR || !MI.isCommutable()) {
6410 legalizeOpWithMove(MI, Src1Idx);
6411 return;
6412 }
6413
6414 // If src0 can be used as src1, commuting will make the operands legal.
6415 // Otherwise we have to give up and insert a move.
6416 //
6417 // TODO: Other immediate-like operand kinds could be commuted if there was a
6418 // MachineOperand::ChangeTo* for them.
6419 if ((!Src1.isImm() && !Src1.isReg()) ||
6420 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6421 legalizeOpWithMove(MI, Src1Idx);
6422 return;
6423 }
6424
6425 int CommutedOpc = commuteOpcode(MI);
6426 if (CommutedOpc == -1) {
6427 legalizeOpWithMove(MI, Src1Idx);
6428 return;
6429 }
6430
6431 MI.setDesc(get(CommutedOpc));
6432
6433 Register Src0Reg = Src0.getReg();
6434 unsigned Src0SubReg = Src0.getSubReg();
6435 bool Src0Kill = Src0.isKill();
6436
6437 if (Src1.isImm())
6438 Src0.ChangeToImmediate(Src1.getImm());
6439 else if (Src1.isReg()) {
6440 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6441 Src0.setSubReg(Src1.getSubReg());
6442 } else
6443 llvm_unreachable("Should only have register or immediate operands");
6444
6445 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6446 Src1.setSubReg(Src0SubReg);
6447 fixImplicitOperands(MI);
6448}
6449
6450 // Legalize VOP3 operands. Any operand type is allowed in any source slot, but
6451 // at most one literal constant may be used, and only starting from GFX10.
6452 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6453 MachineInstr &MI) const {
6454 unsigned Opc = MI.getOpcode();
6455
6456 int VOP3Idx[3] = {
6457 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6458 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6459 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6460 };
6461
6462 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6463 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6464 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6465 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6466 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6467 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6468 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6469 // src1 and src2 must be scalar
6470 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6471 const DebugLoc &DL = MI.getDebugLoc();
6472 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6473 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6474 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6475 .add(Src1);
6476 Src1.ChangeToRegister(Reg, false);
6477 }
6478 if (VOP3Idx[2] != -1) {
6479 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6480 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6481 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6482 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6483 .add(Src2);
6484 Src2.ChangeToRegister(Reg, false);
6485 }
6486 }
6487 }
6488
6489 // Find the one SGPR operand we are allowed to use.
6490 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6491 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6492 SmallDenseSet<unsigned> SGPRsUsed;
6493 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6494 if (SGPRReg) {
6495 SGPRsUsed.insert(SGPRReg);
6496 --ConstantBusLimit;
6497 }
6498
6499 for (int Idx : VOP3Idx) {
6500 if (Idx == -1)
6501 break;
6502 MachineOperand &MO = MI.getOperand(Idx);
6503
6504 if (!MO.isReg()) {
6505 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6506 continue;
6507
6508 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6509 --LiteralLimit;
6510 --ConstantBusLimit;
6511 continue;
6512 }
6513
6514 --LiteralLimit;
6515 --ConstantBusLimit;
6516 legalizeOpWithMove(MI, Idx);
6517 continue;
6518 }
6519
6520 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6521 !isOperandLegal(MI, Idx, &MO)) {
6522 legalizeOpWithMove(MI, Idx);
6523 continue;
6524 }
6525
6526 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6527 continue; // VGPRs are legal
6528
6529 // We can use one SGPR in each VOP3 instruction prior to GFX10
6530 // and two starting from GFX10.
6531 if (SGPRsUsed.count(MO.getReg()))
6532 continue;
6533 if (ConstantBusLimit > 0) {
6534 SGPRsUsed.insert(MO.getReg());
6535 --ConstantBusLimit;
6536 continue;
6537 }
6538
6539 // If we make it this far, then the operand is not legal and we must
6540 // legalize it.
6541 legalizeOpWithMove(MI, Idx);
6542 }
6543
6544 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6545 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6546 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6547 legalizeOpWithMove(MI, VOP3Idx[2]);
6548
6549 if (isWMMA(MI)) {
6550 // scale_src has a register class restricted to the low 256 VGPRs, so we may
6551 // need to insert a copy to the restricted VGPR class.
6552 int ScaleSrc0Idx =
6553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
6554 if (ScaleSrc0Idx != -1) {
6555 int ScaleSrc1Idx =
6556 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
6557 if (!isOperandLegal(MI, ScaleSrc0Idx))
6558 legalizeOpWithMove(MI, ScaleSrc0Idx);
6559 if (!isOperandLegal(MI, ScaleSrc1Idx))
6560 legalizeOpWithMove(MI, ScaleSrc1Idx);
6561 }
6562 }
6563
6564 // Fix the register class of packed FP32 instructions on gfx12+. See
6565 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6566 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6567 for (unsigned I = 0; I < 3; ++I) {
6568 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6569 legalizeOpWithMove(MI, VOP3Idx[I]);
6570 }
6571 }
6572}
6573
6574 Register SIInstrInfo::readlaneVGPRToSGPR(
6575 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6576 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6577 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6578 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6579 if (DstRC)
6580 SRC = RI.getCommonSubClass(SRC, DstRC);
6581
6582 Register DstReg = MRI.createVirtualRegister(SRC);
6583 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6584
6585 if (RI.hasAGPRs(VRC)) {
6586 VRC = RI.getEquivalentVGPRClass(VRC);
6587 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6588 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6589 get(TargetOpcode::COPY), NewSrcReg)
6590 .addReg(SrcReg);
6591 SrcReg = NewSrcReg;
6592 }
6593
6594 if (SubRegs == 1) {
6595 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6596 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6597 .addReg(SrcReg);
6598 return DstReg;
6599 }
6600
6601 SmallVector<Register, 8> SRegs;
6602 for (unsigned i = 0; i < SubRegs; ++i) {
6603 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6604 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6605 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6606 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6607 SRegs.push_back(SGPR);
6608 }
6609
6610 MachineInstrBuilder MIB =
6611 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6612 get(AMDGPU::REG_SEQUENCE), DstReg);
6613 for (unsigned i = 0; i < SubRegs; ++i) {
6614 MIB.addReg(SRegs[i]);
6615 MIB.addImm(RI.getSubRegFromChannel(i));
6616 }
6617 return DstReg;
6618}
6619
6620 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6621 MachineInstr &MI) const {
6622
6623 // If the pointer is stored in VGPRs, then we need to move it to
6624 // SGPRs using v_readfirstlane. This is safe because we only select
6625 // loads with uniform pointers to SMRD instruction so we know the
6626 // pointer value is uniform.
6627 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6628 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6629 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6630 SBase->setReg(SGPR);
6631 }
6632 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6633 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6634 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6635 SOff->setReg(SGPR);
6636 }
6637}
6638
6639 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6640 unsigned Opc = Inst.getOpcode();
6641 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6642 if (OldSAddrIdx < 0)
6643 return false;
6644
6645 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6646
6647 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6648 if (NewOpc < 0)
6649 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6650 if (NewOpc < 0)
6651 return false;
6652
6653 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6654 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6655 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6656 return false;
6657
6658 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6659 if (NewVAddrIdx < 0)
6660 return false;
6661
6662 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6663
6664 // Check vaddr; it must be zero or absent.
6665 MachineInstr *VAddrDef = nullptr;
6666 if (OldVAddrIdx >= 0) {
6667 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6668 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6669 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6670 !VAddrDef->getOperand(1).isImm() ||
6671 VAddrDef->getOperand(1).getImm() != 0)
6672 return false;
6673 }
6674
6675 const MCInstrDesc &NewDesc = get(NewOpc);
6676 Inst.setDesc(NewDesc);
6677
6678 // Callers expect iterator to be valid after this call, so modify the
6679 // instruction in place.
6680 if (OldVAddrIdx == NewVAddrIdx) {
6681 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6682 // Clear use list from the old vaddr holding a zero register.
6683 MRI.removeRegOperandFromUseList(&NewVAddr);
6684 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6685 Inst.removeOperand(OldSAddrIdx);
6686 // Update the use list with the pointer we have just moved from vaddr to
6687 // saddr position. Otherwise new vaddr will be missing from the use list.
6688 MRI.removeRegOperandFromUseList(&NewVAddr);
6689 MRI.addRegOperandToUseList(&NewVAddr);
6690 } else {
6691 assert(OldSAddrIdx == NewVAddrIdx);
6692
6693 if (OldVAddrIdx >= 0) {
6694 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6695 AMDGPU::OpName::vdst_in);
6696
6697 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6698 // it asserts. Untie the operands for now and retie them afterwards.
6699 if (NewVDstIn != -1) {
6700 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6701 Inst.untieRegOperand(OldVDstIn);
6702 }
6703
6704 Inst.removeOperand(OldVAddrIdx);
6705
6706 if (NewVDstIn != -1) {
6707 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6708 Inst.tieOperands(NewVDst, NewVDstIn);
6709 }
6710 }
6711 }
6712
6713 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6714 VAddrDef->eraseFromParent();
6715
6716 return true;
6717}
6718
6719// FIXME: Remove this when SelectionDAG is obsoleted.
6720 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6721 MachineInstr &MI) const {
6722 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6723 return;
6724
6725 // Fix up SGPR operands that were placed in VGPRs. We only select these when
6726 // DAG divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6727 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6728 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6729 return;
6730
6731 if (moveFlatAddrToVGPR(MI))
6732 return;
6733
6734 const TargetRegisterClass *DeclaredRC =
6735 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6736
6737 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6738 SAddr->setReg(ToSGPR);
6739}
6740
6741 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6742 MachineBasicBlock::iterator I,
6743 const TargetRegisterClass *DstRC,
6744 MachineOperand &Op,
6745 MachineRegisterInfo &MRI,
6746 const DebugLoc &DL) const {
6747 Register OpReg = Op.getReg();
6748 unsigned OpSubReg = Op.getSubReg();
6749
6750 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6751 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6752
6753 // Check if operand is already the correct register class.
6754 if (DstRC == OpRC)
6755 return;
6756
6757 Register DstReg = MRI.createVirtualRegister(DstRC);
6758 auto Copy =
6759 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6760 Op.setReg(DstReg);
6761
6762 MachineInstr *Def = MRI.getVRegDef(OpReg);
6763 if (!Def)
6764 return;
6765
6766 // Try to eliminate the copy if it is copying an immediate value.
6767 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6768 foldImmediate(*Copy, *Def, OpReg, &MRI);
6769
6770 bool ImpDef = Def->isImplicitDef();
6771 while (!ImpDef && Def && Def->isCopy()) {
6772 if (Def->getOperand(1).getReg().isPhysical())
6773 break;
6774 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6775 ImpDef = Def && Def->isImplicitDef();
6776 }
6777 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6778 !ImpDef)
6779 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6780}
6781
6782// Emit the actual waterfall loop, executing the wrapped instruction for each
6783 // unique value of \p ScalarOps across all lanes. In the best case we execute
6784 // one iteration; in the worst case we execute 64 (once per lane).
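// Each iteration readfirstlanes the current value of every divergent scalar
// operand, compares it against the per-lane values, ANDs the comparisons
// together, and uses and-saveexec to restrict EXEC to the matching lanes
// before the wrapped instruction runs; the terminators inserted after the
// original instruction loop back until every lane has been handled.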
6785static void
6786 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6787 MachineRegisterInfo &MRI,
6788 MachineBasicBlock &LoopBB,
6789 MachineBasicBlock &BodyBB,
6790 const DebugLoc &DL,
6791 ArrayRef<MachineOperand *> ScalarOps) {
6792 MachineFunction &MF = *LoopBB.getParent();
6793 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6794 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6795 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6796 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6797 MachineBasicBlock::iterator I = LoopBB.begin();
6798 Register CondReg;
6799 Register CondReg;
6800
6801 for (MachineOperand *ScalarOp : ScalarOps) {
6802 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6803 unsigned NumSubRegs = RegSize / 32;
6804 Register VScalarOp = ScalarOp->getReg();
6805
6806 if (NumSubRegs == 1) {
6807 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6808
6809 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6810 .addReg(VScalarOp);
6811
6812 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6813
6814 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6815 .addReg(CurReg)
6816 .addReg(VScalarOp);
6817
6818 // Combine the comparison results with AND.
6819 if (!CondReg) // First.
6820 CondReg = NewCondReg;
6821 else { // If not the first, we create an AND.
6822 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6823 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6824 .addReg(CondReg)
6825 .addReg(NewCondReg);
6826 CondReg = AndReg;
6827 }
6828
6829 // Update ScalarOp operand to use the SGPR ScalarOp.
6830 ScalarOp->setReg(CurReg);
6831 ScalarOp->setIsKill();
6832 } else {
6833 SmallVector<Register, 8> ReadlanePieces;
6834 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6835 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6836 "Unhandled register size");
6837
6838 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6839 Register CurRegLo =
6840 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6841 Register CurRegHi =
6842 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6843
6844 // Read the next variant <- also loop target.
6845 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6846 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6847
6848 // Read the next variant <- also loop target.
6849 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6850 .addReg(VScalarOp, VScalarOpUndef,
6851 TRI->getSubRegFromChannel(Idx + 1));
6852
6853 ReadlanePieces.push_back(CurRegLo);
6854 ReadlanePieces.push_back(CurRegHi);
6855
6856 // Comparison is to be done as 64-bit.
6857 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6858 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6859 .addReg(CurRegLo)
6860 .addImm(AMDGPU::sub0)
6861 .addReg(CurRegHi)
6862 .addImm(AMDGPU::sub1);
6863
6864 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6865 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6866 NewCondReg)
6867 .addReg(CurReg);
6868 if (NumSubRegs <= 2)
6869 Cmp.addReg(VScalarOp);
6870 else
6871 Cmp.addReg(VScalarOp, VScalarOpUndef,
6872 TRI->getSubRegFromChannel(Idx, 2));
6873
6874 // Combine the comparison results with AND.
6875 if (!CondReg) // First.
6876 CondReg = NewCondReg;
6877 else { // If not the first, we create an AND.
6878 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6879 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6880 .addReg(CondReg)
6881 .addReg(NewCondReg);
6882 CondReg = AndReg;
6883 }
6884 } // End for loop.
6885
6886 const auto *SScalarOpRC =
6887 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6888 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6889
6890 // Build scalar ScalarOp.
6891 auto Merge =
6892 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6893 unsigned Channel = 0;
6894 for (Register Piece : ReadlanePieces) {
6895 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6896 }
6897
6898 // Update ScalarOp operand to use the SGPR ScalarOp.
6899 ScalarOp->setReg(SScalarOp);
6900 ScalarOp->setIsKill();
6901 }
6902 }
6903
6904 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6905 MRI.setSimpleHint(SaveExec, CondReg);
6906
6907 // Update EXEC to matching lanes, saving original to SaveExec.
6908 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
6909 .addReg(CondReg, RegState::Kill);
6910
6911 // The original instruction is here; we insert the terminators after it.
6912 I = BodyBB.end();
6913
6914 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6915 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
6916 .addReg(LMC.ExecReg)
6917 .addReg(SaveExec);
6918
6919 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6920}
6921
6922// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6923// with SGPRs by iterating over all unique values across all lanes.
6924// Returns the loop basic block that now contains \p MI.
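// The emitted structure is roughly:
//
//   MBB:         save EXEC (and SCC if it is live)
//   LoopBB:      readfirstlane the scalar operands, compare them against the
//                VGPR values, and s_and_saveexec the matching lanes
//   BodyBB:      the original instruction, then a terminator xor that clears
//                the lanes just processed and a SI_WATERFALL_LOOP branch back
//                to LoopBB
//   RemainderBB: restore SCC and EXEC and fall through to the rest of MBB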
6925static MachineBasicBlock *
6926 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6927 ArrayRef<MachineOperand *> ScalarOps,
6928 MachineDominatorTree *MDT,
6929 MachineBasicBlock::iterator Begin = nullptr,
6930 MachineBasicBlock::iterator End = nullptr) {
6931 MachineBasicBlock &MBB = *MI.getParent();
6932 MachineFunction &MF = *MBB.getParent();
6933 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6934 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6935 MachineRegisterInfo &MRI = MF.getRegInfo();
6936 if (!Begin.isValid())
6937 Begin = &MI;
6938 if (!End.isValid()) {
6939 End = &MI;
6940 ++End;
6941 }
6942 const DebugLoc &DL = MI.getDebugLoc();
6944 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6945
6946 // Save SCC. Waterfall Loop may overwrite SCC.
6947 Register SaveSCCReg;
6948
6949 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6950 // rather than an unlimited scan everywhere.
6951 bool SCCNotDead =
6952 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6953 std::numeric_limits<unsigned>::max()) !=
6954 MachineBasicBlock::LQR_Dead;
6955 if (SCCNotDead) {
6956 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
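 // S_CSELECT_B32 1, 0 materializes the current SCC value as 0 or 1; it is
 // turned back into SCC with the S_CMP_LG_U32 emitted into RemainderBB.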
6957 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6958 .addImm(1)
6959 .addImm(0);
6960 }
6961
6962 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6963
6964 // Save the EXEC mask
6965 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
6966
6967 // Killed uses in the instruction we are waterfalling around will be
6968 // incorrect due to the added control-flow.
6969 MachineBasicBlock::iterator AfterMI = MI;
6970 ++AfterMI;
6971 for (auto I = Begin; I != AfterMI; I++) {
6972 for (auto &MO : I->all_uses())
6973 MRI.clearKillFlags(MO.getReg());
6974 }
6975
6976 // To insert the loop we need to split the block. Move everything after this
6977 // point to a new block, and insert a new empty block between the two.
6978 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6979 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6980 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6981 MachineFunction::iterator MBBI(MBB);
6982 ++MBBI;
6983
6984 MF.insert(MBBI, LoopBB);
6985 MF.insert(MBBI, BodyBB);
6986 MF.insert(MBBI, RemainderBB);
6987
6988 LoopBB->addSuccessor(BodyBB);
6989 BodyBB->addSuccessor(LoopBB);
6990 BodyBB->addSuccessor(RemainderBB);
6991
6992 // Move the range [Begin, End) into BodyBB, and the remainder of the block
6993 // into RemainderBB.
6994 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6995 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6996 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6997
6998 MBB.addSuccessor(LoopBB);
6999
7000 // Update dominators. We know that MBB immediately dominates LoopBB, that
7001 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7002 // RemainderBB. RemainderBB immediately dominates all of the successors
7003 // transferred to it from MBB that MBB used to properly dominate.
7004 if (MDT) {
7005 MDT->addNewBlock(LoopBB, &MBB);
7006 MDT->addNewBlock(BodyBB, LoopBB);
7007 MDT->addNewBlock(RemainderBB, BodyBB);
7008 for (auto &Succ : RemainderBB->successors()) {
7009 if (MDT->properlyDominates(&MBB, Succ)) {
7010 MDT->changeImmediateDominator(Succ, RemainderBB);
7011 }
7012 }
7013 }
7014
7015 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7016
7017 MachineBasicBlock::iterator First = RemainderBB->begin();
7018 // Restore SCC
7019 if (SCCNotDead) {
7020 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7021 .addReg(SaveSCCReg, RegState::Kill)
7022 .addImm(0);
7023 }
7024
7025 // Restore the EXEC mask
7026 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7027 .addReg(SaveExec);
7028 return BodyBB;
7029}
7030
7031// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7032static std::tuple<unsigned, unsigned>
7033 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7034 MachineBasicBlock &MBB = *MI.getParent();
7035 MachineFunction &MF = *MBB.getParent();
7036 MachineRegisterInfo &MRI = MF.getRegInfo();
7037
7038 // Extract the ptr from the resource descriptor.
7039 unsigned RsrcPtr =
7040 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7041 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7042
7043 // Create an empty resource descriptor
7044 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7045 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7046 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7047 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7048 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7049
7050 // Zero64 = 0
7051 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7052 .addImm(0);
7053
7054 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7055 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7056 .addImm(Lo_32(RsrcDataFormat));
7057
7058 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7059 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7060 .addImm(Hi_32(RsrcDataFormat));
7061
7062 // NewSRsrc = {Zero64, SRsrcFormat}
7063 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7064 .addReg(Zero64)
7065 .addImm(AMDGPU::sub0_sub1)
7066 .addReg(SRsrcFormatLo)
7067 .addImm(AMDGPU::sub2)
7068 .addReg(SRsrcFormatHi)
7069 .addImm(AMDGPU::sub3);
7070
7071 return std::tuple(RsrcPtr, NewSRsrc);
7072}
7073
7074 MachineBasicBlock *
7075 SIInstrInfo::legalizeOperands(MachineInstr &MI,
7076 MachineDominatorTree *MDT) const {
7077 MachineFunction &MF = *MI.getParent()->getParent();
7078 MachineRegisterInfo &MRI = MF.getRegInfo();
7079 MachineBasicBlock *CreatedBB = nullptr;
7080
7081 // Legalize VOP2
7082 if (isVOP2(MI) || isVOPC(MI)) {
7083 legalizeOperandsVOP2(MRI, MI);
7084 return CreatedBB;
7085 }
7086
7087 // Legalize VOP3
7088 if (isVOP3(MI)) {
7089 legalizeOperandsVOP3(MRI, MI);
7090 return CreatedBB;
7091 }
7092
7093 // Legalize SMRD
7094 if (isSMRD(MI)) {
7095 legalizeOperandsSMRD(MRI, MI);
7096 return CreatedBB;
7097 }
7098
7099 // Legalize FLAT
7100 if (isFLAT(MI)) {
7101 legalizeOperandsFLAT(MRI, MI);
7102 return CreatedBB;
7103 }
7104
7105 // Legalize REG_SEQUENCE and PHI
7106 // The register class of the operands must be the same type as the register
7107 // class of the output.
7108 if (MI.getOpcode() == AMDGPU::PHI) {
7109 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7110 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7111 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7112 continue;
7113 const TargetRegisterClass *OpRC =
7114 MRI.getRegClass(MI.getOperand(i).getReg());
7115 if (RI.hasVectorRegisters(OpRC)) {
7116 VRC = OpRC;
7117 } else {
7118 SRC = OpRC;
7119 }
7120 }
7121
7122 // If any of the operands are VGPR registers, then they all must be VGPRs,
7123 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7124 // them.
7125 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7126 if (!VRC) {
7127 assert(SRC);
7128 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7129 VRC = &AMDGPU::VReg_1RegClass;
7130 } else
7131 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7132 ? RI.getEquivalentAGPRClass(SRC)
7133 : RI.getEquivalentVGPRClass(SRC);
7134 } else {
7135 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7136 ? RI.getEquivalentAGPRClass(VRC)
7137 : RI.getEquivalentVGPRClass(VRC);
7138 }
7139 RC = VRC;
7140 } else {
7141 RC = SRC;
7142 }
7143
7144 // Update all the operands so they have the same type.
7145 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7146 MachineOperand &Op = MI.getOperand(I);
7147 if (!Op.isReg() || !Op.getReg().isVirtual())
7148 continue;
7149
7150 // MI is a PHI instruction.
7151 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7152 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7153
7154 // Avoid creating no-op copies with the same src and dst reg class. These
7155 // confuse some of the machine passes.
7156 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7157 }
7158 }
7159
7160 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7161 // VGPR dest type and SGPR sources, insert copies so all operands are
7162 // VGPRs. This seems to help operand folding / the register coalescer.
7163 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7164 MachineBasicBlock *MBB = MI.getParent();
7165 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7166 if (RI.hasVGPRs(DstRC)) {
7167 // Update all the operands so they are VGPR register classes. These may
7168 // not be the same register class because REG_SEQUENCE supports mixing
7169 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7170 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7171 MachineOperand &Op = MI.getOperand(I);
7172 if (!Op.isReg() || !Op.getReg().isVirtual())
7173 continue;
7174
7175 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7176 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7177 if (VRC == OpRC)
7178 continue;
7179
7180 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7181 Op.setIsKill();
7182 }
7183 }
7184
7185 return CreatedBB;
7186 }
7187
7188 // Legalize INSERT_SUBREG
7189 // src0 must have the same register class as dst
7190 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7191 Register Dst = MI.getOperand(0).getReg();
7192 Register Src0 = MI.getOperand(1).getReg();
7193 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7194 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7195 if (DstRC != Src0RC) {
7196 MachineBasicBlock *MBB = MI.getParent();
7197 MachineOperand &Op = MI.getOperand(1);
7198 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7199 }
7200 return CreatedBB;
7201 }
7202
7203 // Legalize SI_INIT_M0
7204 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7205 MachineOperand &Src = MI.getOperand(0);
7206 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7207 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7208 return CreatedBB;
7209 }
7210
7211 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7212 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7213 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7214 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7215 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7216 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7217 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7218 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7219 MachineOperand &Src = MI.getOperand(1);
7220 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7221 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7222 return CreatedBB;
7223 }
7224
7225 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7226 //
7227 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7228 // scratch memory access. In both cases, the legalization never involves
7229 // conversion to the addr64 form.
7230 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7231 (isMUBUF(MI) || isMTBUF(MI)))) {
7232 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7233 ? AMDGPU::OpName::rsrc
7234 : AMDGPU::OpName::srsrc;
7235 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7236 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7237 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7238
7239 AMDGPU::OpName SampOpName =
7240 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7241 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7242 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7243 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7244
7245 return CreatedBB;
7246 }
7247
7248 // Legalize SI_CALL
7249 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7250 MachineOperand *Dest = &MI.getOperand(0);
7251 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7252 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as
7253 // the copies from and to physical registers that surround the call, into
7254 // the loop block.
7255 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7256 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7257
7258 // Also move the copies to physical registers into the loop block
7259 MachineBasicBlock &MBB = *MI.getParent();
7260 MachineBasicBlock::iterator Start(&MI);
7261 while (Start->getOpcode() != FrameSetupOpcode)
7262 --Start;
7263 MachineBasicBlock::iterator End(&MI);
7264 while (End->getOpcode() != FrameDestroyOpcode)
7265 ++End;
7266 // Also include following copies of the return value
7267 ++End;
7268 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7269 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7270 ++End;
7271 CreatedBB =
7272 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7273 }
7274 }
7275
7276 // Legalize s_sleep_var.
7277 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7278 const DebugLoc &DL = MI.getDebugLoc();
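 // S_SLEEP_VAR requires a scalar operand, so read the first lane of src0.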
7279 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7280 int Src0Idx =
7281 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7282 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7283 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7284 .add(Src0);
7285 Src0.ChangeToRegister(Reg, false);
7286 return nullptr;
7287 }
7288
7289 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7290 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7291 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7292 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7293 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7294 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7295 for (MachineOperand &Src : MI.explicit_operands()) {
7296 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7297 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7298 }
7299 return CreatedBB;
7300 }
7301
7302 // Legalize MUBUF instructions.
7303 bool isSoffsetLegal = true;
7304 int SoffsetIdx =
7305 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7306 if (SoffsetIdx != -1) {
7307 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7308 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7309 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7310 isSoffsetLegal = false;
7311 }
7312 }
7313
7314 bool isRsrcLegal = true;
7315 int RsrcIdx =
7316 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7317 if (RsrcIdx != -1) {
7318 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7319 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7320 isRsrcLegal = false;
7321 }
7322
7323 // The operands are legal.
7324 if (isRsrcLegal && isSoffsetLegal)
7325 return CreatedBB;
7326
7327 if (!isRsrcLegal) {
7328 // Legalize a VGPR Rsrc
7329 //
7330 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7331 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7332 // a zero-value SRsrc.
7333 //
7334 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7335 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7336 // above.
7337 //
7338 // Otherwise we are on non-ADDR64 hardware, and/or we have
7339 // idxen/offen/bothen and we fall back to a waterfall loop.
7340
7341 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7342 MachineBasicBlock &MBB = *MI.getParent();
7343
7344 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7345 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7346 // This is already an ADDR64 instruction so we need to add the pointer
7347 // extracted from the resource descriptor to the current value of VAddr.
7348 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7349 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7350 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7351
7352 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7353 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7354 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7355
7356 unsigned RsrcPtr, NewSRsrc;
7357 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7358
7359 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7360 const DebugLoc &DL = MI.getDebugLoc();
7361 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7362 .addDef(CondReg0)
7363 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7364 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7365 .addImm(0);
7366
7367 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7368 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7369 .addDef(CondReg1, RegState::Dead)
7370 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7371 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7372 .addReg(CondReg0, RegState::Kill)
7373 .addImm(0);
7374
7375 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7376 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7377 .addReg(NewVAddrLo)
7378 .addImm(AMDGPU::sub0)
7379 .addReg(NewVAddrHi)
7380 .addImm(AMDGPU::sub1);
7381
7382 VAddr->setReg(NewVAddr);
7383 Rsrc->setReg(NewSRsrc);
7384 } else if (!VAddr && ST.hasAddr64()) {
7385 // This instruction is the _OFFSET variant, so we need to convert it to
7386 // ADDR64.
7387 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7388 "FIXME: Need to emit flat atomics here");
7389
7390 unsigned RsrcPtr, NewSRsrc;
7391 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7392
7393 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7394 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7395 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7396 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7397 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7398
7399 // Atomics with return have an additional tied operand and are
7400 // missing some of the special bits.
7401 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7402 MachineInstr *Addr64;
7403
7404 if (!VDataIn) {
7405 // Regular buffer load / store.
7406 MachineInstrBuilder MIB =
7407 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7408 .add(*VData)
7409 .addReg(NewVAddr)
7410 .addReg(NewSRsrc)
7411 .add(*SOffset)
7412 .add(*Offset);
7413
7414 if (const MachineOperand *CPol =
7415 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7416 MIB.addImm(CPol->getImm());
7417 }
7418
7419 if (const MachineOperand *TFE =
7420 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7421 MIB.addImm(TFE->getImm());
7422 }
7423
7424 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7425
7426 MIB.cloneMemRefs(MI);
7427 Addr64 = MIB;
7428 } else {
7429 // Atomics with return.
7430 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7431 .add(*VData)
7432 .add(*VDataIn)
7433 .addReg(NewVAddr)
7434 .addReg(NewSRsrc)
7435 .add(*SOffset)
7436 .add(*Offset)
7437 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7438 .cloneMemRefs(MI);
7439 }
7440
7441 MI.removeFromParent();
7442
7443 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7444 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7445 NewVAddr)
7446 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7447 .addImm(AMDGPU::sub0)
7448 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7449 .addImm(AMDGPU::sub1);
7450 } else {
7451 // Legalize a VGPR Rsrc and soffset together.
7452 if (!isSoffsetLegal) {
7453 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7454 CreatedBB =
7455 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7456 return CreatedBB;
7457 }
7458 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7459 return CreatedBB;
7460 }
7461 }
7462
7463 // Legalize a VGPR soffset.
7464 if (!isSoffsetLegal) {
7465 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7466 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7467 return CreatedBB;
7468 }
7469 return CreatedBB;
7470}
7471
7472 void SIInstrWorklist::insert(MachineInstr *MI) {
7473 InstrList.insert(MI);
7474 // Add MBUF instructions to the deferred list.
7475 int RsrcIdx =
7476 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7477 if (RsrcIdx != -1) {
7478 DeferredList.insert(MI);
7479 }
7480}
7481
7482 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7483 return DeferredList.contains(MI);
7484}
7485
7486// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7487// lowering (change sgpr to vgpr).
7488// This is mainly caused by 16bit SALU and 16bit VALU using regs with different
7489// sizes. The operand sizes need to be legalized during the vgpr lowering
7490// chain. This can be removed after we have sgpr16 in place.
7491 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7492 MachineRegisterInfo &MRI) const {
7493 if (!ST.useRealTrue16Insts())
7494 return;
7495
7496 unsigned Opcode = MI.getOpcode();
7497 MachineBasicBlock *MBB = MI.getParent();
7498 // Legalize operands and check for size mismatch
7499 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7500 OpIdx >= get(Opcode).getNumOperands() ||
7501 get(Opcode).operands()[OpIdx].RegClass == -1)
7502 return;
7503
7504 MachineOperand &Op = MI.getOperand(OpIdx);
7505 if (!Op.isReg() || !Op.getReg().isVirtual())
7506 return;
7507
7508 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7509 if (!RI.isVGPRClass(CurrRC))
7510 return;
7511
7512 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7513 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
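 // If the operand is a 32-bit VGPR but the instruction expects 16 bits, use
 // its lo16 subregister; if it is 16-bit and 32 bits are expected, widen it
 // with a REG_SEQUENCE whose hi16 half is undef.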
7514 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7515 Op.setSubReg(AMDGPU::lo16);
7516 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7517 const DebugLoc &DL = MI.getDebugLoc();
7518 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7519 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7520 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7521 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7522 .addReg(Op.getReg())
7523 .addImm(AMDGPU::lo16)
7524 .addReg(Undef)
7525 .addImm(AMDGPU::hi16);
7526 Op.setReg(NewDstReg);
7527 }
7528}
7529 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7530 MachineRegisterInfo &MRI) const {
7531 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7532 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7533}
7534
7535 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7536 MachineDominatorTree *MDT) const {
7537
7538 while (!Worklist.empty()) {
7539 MachineInstr &Inst = *Worklist.top();
7540 Worklist.erase_top();
7541 // Skip MachineInstr in the deferred list.
7542 if (Worklist.isDeferred(&Inst))
7543 continue;
7544 moveToVALUImpl(Worklist, MDT, Inst);
7545 }
7546
7547 // The deferred list of instructions is processed once all the
7548 // MachineInstrs in the worklist are done.
7549 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7550 moveToVALUImpl(Worklist, MDT, *Inst);
7551 assert(Worklist.empty() &&
7552 "Deferred MachineInstr are not supposed to re-populate worklist");
7553 }
7554}
7555
7556 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7557 MachineDominatorTree *MDT,
7558 MachineInstr &Inst) const {
7559
7560 MachineBasicBlock *MBB = Inst.getParent();
7561 if (!MBB)
7562 return;
7563 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7564 unsigned Opcode = Inst.getOpcode();
7565 unsigned NewOpcode = getVALUOp(Inst);
7566 // Handle some special cases
7567 switch (Opcode) {
7568 default:
7569 break;
7570 case AMDGPU::S_ADD_I32:
7571 case AMDGPU::S_SUB_I32: {
7572 // FIXME: The u32 versions currently selected use the carry.
7573 bool Changed;
7574 MachineBasicBlock *CreatedBBTmp = nullptr;
7575 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7576 if (Changed)
7577 return;
7578
7579 // Default handling
7580 break;
7581 }
7582
7583 case AMDGPU::S_MUL_U64:
7584 if (ST.hasVectorMulU64()) {
7585 NewOpcode = AMDGPU::V_MUL_U64_e64;
7586 break;
7587 }
7588 // Split s_mul_u64 in 32-bit vector multiplications.
7589 splitScalarSMulU64(Worklist, Inst, MDT);
7590 Inst.eraseFromParent();
7591 return;
7592
7593 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7594 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7595 // This is a special case of s_mul_u64 where all the operands are either
7596 // zero extended or sign extended.
7597 splitScalarSMulPseudo(Worklist, Inst, MDT);
7598 Inst.eraseFromParent();
7599 return;
7600
7601 case AMDGPU::S_AND_B64:
7602 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7603 Inst.eraseFromParent();
7604 return;
7605
7606 case AMDGPU::S_OR_B64:
7607 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7608 Inst.eraseFromParent();
7609 return;
7610
7611 case AMDGPU::S_XOR_B64:
7612 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7613 Inst.eraseFromParent();
7614 return;
7615
7616 case AMDGPU::S_NAND_B64:
7617 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7618 Inst.eraseFromParent();
7619 return;
7620
7621 case AMDGPU::S_NOR_B64:
7622 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7623 Inst.eraseFromParent();
7624 return;
7625
7626 case AMDGPU::S_XNOR_B64:
7627 if (ST.hasDLInsts())
7628 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7629 else
7630 splitScalar64BitXnor(Worklist, Inst, MDT);
7631 Inst.eraseFromParent();
7632 return;
7633
7634 case AMDGPU::S_ANDN2_B64:
7635 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7636 Inst.eraseFromParent();
7637 return;
7638
7639 case AMDGPU::S_ORN2_B64:
7640 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7641 Inst.eraseFromParent();
7642 return;
7643
7644 case AMDGPU::S_BREV_B64:
7645 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7646 Inst.eraseFromParent();
7647 return;
7648
7649 case AMDGPU::S_NOT_B64:
7650 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7651 Inst.eraseFromParent();
7652 return;
7653
7654 case AMDGPU::S_BCNT1_I32_B64:
7655 splitScalar64BitBCNT(Worklist, Inst);
7656 Inst.eraseFromParent();
7657 return;
7658
7659 case AMDGPU::S_BFE_I64:
7660 splitScalar64BitBFE(Worklist, Inst);
7661 Inst.eraseFromParent();
7662 return;
7663
7664 case AMDGPU::S_FLBIT_I32_B64:
7665 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7666 Inst.eraseFromParent();
7667 return;
7668 case AMDGPU::S_FF1_I32_B64:
7669 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7670 Inst.eraseFromParent();
7671 return;
7672
7673 case AMDGPU::S_LSHL_B32:
7674 if (ST.hasOnlyRevVALUShifts()) {
7675 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7676 swapOperands(Inst);
7677 }
7678 break;
7679 case AMDGPU::S_ASHR_I32:
7680 if (ST.hasOnlyRevVALUShifts()) {
7681 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7682 swapOperands(Inst);
7683 }
7684 break;
7685 case AMDGPU::S_LSHR_B32:
7686 if (ST.hasOnlyRevVALUShifts()) {
7687 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7688 swapOperands(Inst);
7689 }
7690 break;
7691 case AMDGPU::S_LSHL_B64:
7692 if (ST.hasOnlyRevVALUShifts()) {
7693 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7694 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7695 : AMDGPU::V_LSHLREV_B64_e64;
7696 swapOperands(Inst);
7697 }
7698 break;
7699 case AMDGPU::S_ASHR_I64:
7700 if (ST.hasOnlyRevVALUShifts()) {
7701 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7702 swapOperands(Inst);
7703 }
7704 break;
7705 case AMDGPU::S_LSHR_B64:
7706 if (ST.hasOnlyRevVALUShifts()) {
7707 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7708 swapOperands(Inst);
7709 }
7710 break;
7711
7712 case AMDGPU::S_ABS_I32:
7713 lowerScalarAbs(Worklist, Inst);
7714 Inst.eraseFromParent();
7715 return;
7716
7717 case AMDGPU::S_CBRANCH_SCC0:
7718 case AMDGPU::S_CBRANCH_SCC1: {
7719 // Clear unused bits of vcc
7720 Register CondReg = Inst.getOperand(1).getReg();
7721 bool IsSCC = CondReg == AMDGPU::SCC;
7723 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7724 .addReg(LMC.ExecReg)
7725 .addReg(IsSCC ? LMC.VccReg : CondReg);
7726 Inst.removeOperand(1);
7727 } break;
7728
7729 case AMDGPU::S_BFE_U64:
7730 case AMDGPU::S_BFM_B64:
7731 llvm_unreachable("Moving this op to VALU not implemented");
7732
7733 case AMDGPU::S_PACK_LL_B32_B16:
7734 case AMDGPU::S_PACK_LH_B32_B16:
7735 case AMDGPU::S_PACK_HL_B32_B16:
7736 case AMDGPU::S_PACK_HH_B32_B16:
7737 movePackToVALU(Worklist, MRI, Inst);
7738 Inst.eraseFromParent();
7739 return;
7740
7741 case AMDGPU::S_XNOR_B32:
7742 lowerScalarXnor(Worklist, Inst);
7743 Inst.eraseFromParent();
7744 return;
7745
7746 case AMDGPU::S_NAND_B32:
7747 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7748 Inst.eraseFromParent();
7749 return;
7750
7751 case AMDGPU::S_NOR_B32:
7752 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7753 Inst.eraseFromParent();
7754 return;
7755
7756 case AMDGPU::S_ANDN2_B32:
7757 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7758 Inst.eraseFromParent();
7759 return;
7760
7761 case AMDGPU::S_ORN2_B32:
7762 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7763 Inst.eraseFromParent();
7764 return;
7765
7766 // TODO: remove as soon as everything is ready
7767 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7768 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7769 // can only be selected from the uniform SDNode.
7770 case AMDGPU::S_ADD_CO_PSEUDO:
7771 case AMDGPU::S_SUB_CO_PSEUDO: {
7772 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7773 ? AMDGPU::V_ADDC_U32_e64
7774 : AMDGPU::V_SUBB_U32_e64;
7775 const auto *CarryRC = RI.getWaveMaskRegClass();
7776
7777 Register CarryInReg = Inst.getOperand(4).getReg();
7778 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7779 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7780 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7781 .addReg(CarryInReg);
7782 }
7783
7784 Register CarryOutReg = Inst.getOperand(1).getReg();
7785
7786 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7787 MRI.getRegClass(Inst.getOperand(0).getReg())));
7788 MachineInstr *CarryOp =
7789 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7790 .addReg(CarryOutReg, RegState::Define)
7791 .add(Inst.getOperand(2))
7792 .add(Inst.getOperand(3))
7793 .addReg(CarryInReg)
7794 .addImm(0);
7795 legalizeOperands(*CarryOp);
7796 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7797 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7798 Inst.eraseFromParent();
7799 }
7800 return;
7801 case AMDGPU::S_UADDO_PSEUDO:
7802 case AMDGPU::S_USUBO_PSEUDO: {
7803 const DebugLoc &DL = Inst.getDebugLoc();
7804 MachineOperand &Dest0 = Inst.getOperand(0);
7805 MachineOperand &Dest1 = Inst.getOperand(1);
7806 MachineOperand &Src0 = Inst.getOperand(2);
7807 MachineOperand &Src1 = Inst.getOperand(3);
7808
7809 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7810 ? AMDGPU::V_ADD_CO_U32_e64
7811 : AMDGPU::V_SUB_CO_U32_e64;
7812 const TargetRegisterClass *NewRC =
7813 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7814 Register DestReg = MRI.createVirtualRegister(NewRC);
7815 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7816 .addReg(Dest1.getReg(), RegState::Define)
7817 .add(Src0)
7818 .add(Src1)
7819 .addImm(0); // clamp bit
7820
7821 legalizeOperands(*NewInstr, MDT);
7822 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7823 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7824 Worklist);
7825 Inst.eraseFromParent();
7826 }
7827 return;
7828
7829 case AMDGPU::S_CSELECT_B32:
7830 case AMDGPU::S_CSELECT_B64:
7831 lowerSelect(Worklist, Inst, MDT);
7832 Inst.eraseFromParent();
7833 return;
7834 case AMDGPU::S_CMP_EQ_I32:
7835 case AMDGPU::S_CMP_LG_I32:
7836 case AMDGPU::S_CMP_GT_I32:
7837 case AMDGPU::S_CMP_GE_I32:
7838 case AMDGPU::S_CMP_LT_I32:
7839 case AMDGPU::S_CMP_LE_I32:
7840 case AMDGPU::S_CMP_EQ_U32:
7841 case AMDGPU::S_CMP_LG_U32:
7842 case AMDGPU::S_CMP_GT_U32:
7843 case AMDGPU::S_CMP_GE_U32:
7844 case AMDGPU::S_CMP_LT_U32:
7845 case AMDGPU::S_CMP_LE_U32:
7846 case AMDGPU::S_CMP_EQ_U64:
7847 case AMDGPU::S_CMP_LG_U64:
7848 case AMDGPU::S_CMP_LT_F32:
7849 case AMDGPU::S_CMP_EQ_F32:
7850 case AMDGPU::S_CMP_LE_F32:
7851 case AMDGPU::S_CMP_GT_F32:
7852 case AMDGPU::S_CMP_LG_F32:
7853 case AMDGPU::S_CMP_GE_F32:
7854 case AMDGPU::S_CMP_O_F32:
7855 case AMDGPU::S_CMP_U_F32:
7856 case AMDGPU::S_CMP_NGE_F32:
7857 case AMDGPU::S_CMP_NLG_F32:
7858 case AMDGPU::S_CMP_NGT_F32:
7859 case AMDGPU::S_CMP_NLE_F32:
7860 case AMDGPU::S_CMP_NEQ_F32:
7861 case AMDGPU::S_CMP_NLT_F32: {
7862 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7863 auto NewInstr =
7864 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7865 .setMIFlags(Inst.getFlags());
7866 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7867 0) {
7868 NewInstr
7869 .addImm(0) // src0_modifiers
7870 .add(Inst.getOperand(0)) // src0
7871 .addImm(0) // src1_modifiers
7872 .add(Inst.getOperand(1)) // src1
7873 .addImm(0); // clamp
7874 } else {
7875 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7876 }
7877 legalizeOperands(*NewInstr, MDT);
7878 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7879 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7880 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7881 Inst.eraseFromParent();
7882 return;
7883 }
7884 case AMDGPU::S_CMP_LT_F16:
7885 case AMDGPU::S_CMP_EQ_F16:
7886 case AMDGPU::S_CMP_LE_F16:
7887 case AMDGPU::S_CMP_GT_F16:
7888 case AMDGPU::S_CMP_LG_F16:
7889 case AMDGPU::S_CMP_GE_F16:
7890 case AMDGPU::S_CMP_O_F16:
7891 case AMDGPU::S_CMP_U_F16:
7892 case AMDGPU::S_CMP_NGE_F16:
7893 case AMDGPU::S_CMP_NLG_F16:
7894 case AMDGPU::S_CMP_NGT_F16:
7895 case AMDGPU::S_CMP_NLE_F16:
7896 case AMDGPU::S_CMP_NEQ_F16:
7897 case AMDGPU::S_CMP_NLT_F16: {
7898 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7899 auto NewInstr =
7900 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7901 .setMIFlags(Inst.getFlags());
7902 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7903 NewInstr
7904 .addImm(0) // src0_modifiers
7905 .add(Inst.getOperand(0)) // src0
7906 .addImm(0) // src1_modifiers
7907 .add(Inst.getOperand(1)) // src1
7908 .addImm(0); // clamp
7909 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7910 NewInstr.addImm(0); // op_sel0
7911 } else {
7912 NewInstr
7913 .add(Inst.getOperand(0))
7914 .add(Inst.getOperand(1));
7915 }
7916 legalizeOperandsVALUt16(*NewInstr, MRI);
7917 legalizeOperands(*NewInstr, MDT);
7918 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7919 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7920 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7921 Inst.eraseFromParent();
7922 return;
7923 }
7924 case AMDGPU::S_CVT_HI_F32_F16: {
7925 const DebugLoc &DL = Inst.getDebugLoc();
7926 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7927 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
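 // Convert the high half of the 32-bit source: with true16 instructions the
 // hi16 subregister is used directly, otherwise the source is shifted right
 // by 16 first.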
7928 if (ST.useRealTrue16Insts()) {
7929 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7930 .add(Inst.getOperand(1));
7931 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7932 .addImm(0) // src0_modifiers
7933 .addReg(TmpReg, 0, AMDGPU::hi16)
7934 .addImm(0) // clamp
7935 .addImm(0) // omod
7936 .addImm(0); // op_sel0
7937 } else {
7938 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7939 .addImm(16)
7940 .add(Inst.getOperand(1));
7941 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7942 .addImm(0) // src0_modifiers
7943 .addReg(TmpReg)
7944 .addImm(0) // clamp
7945 .addImm(0); // omod
7946 }
7947
7948 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7949 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7950 Inst.eraseFromParent();
7951 return;
7952 }
7953 case AMDGPU::S_MINIMUM_F32:
7954 case AMDGPU::S_MAXIMUM_F32: {
7955 const DebugLoc &DL = Inst.getDebugLoc();
7956 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7957 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7958 .addImm(0) // src0_modifiers
7959 .add(Inst.getOperand(1))
7960 .addImm(0) // src1_modifiers
7961 .add(Inst.getOperand(2))
7962 .addImm(0) // clamp
7963 .addImm(0); // omod
7964 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7965
7966 legalizeOperands(*NewInstr, MDT);
7967 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7968 Inst.eraseFromParent();
7969 return;
7970 }
7971 case AMDGPU::S_MINIMUM_F16:
7972 case AMDGPU::S_MAXIMUM_F16: {
7973 const DebugLoc &DL = Inst.getDebugLoc();
7974 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7975 ? &AMDGPU::VGPR_16RegClass
7976 : &AMDGPU::VGPR_32RegClass);
7977 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7978 .addImm(0) // src0_modifiers
7979 .add(Inst.getOperand(1))
7980 .addImm(0) // src1_modifiers
7981 .add(Inst.getOperand(2))
7982 .addImm(0) // clamp
7983 .addImm(0) // omod
7984 .addImm(0); // opsel0
7985 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7986 legalizeOperandsVALUt16(*NewInstr, MRI);
7987 legalizeOperands(*NewInstr, MDT);
7988 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7989 Inst.eraseFromParent();
7990 return;
7991 }
7992 case AMDGPU::V_S_EXP_F16_e64:
7993 case AMDGPU::V_S_LOG_F16_e64:
7994 case AMDGPU::V_S_RCP_F16_e64:
7995 case AMDGPU::V_S_RSQ_F16_e64:
7996 case AMDGPU::V_S_SQRT_F16_e64: {
7997 const DebugLoc &DL = Inst.getDebugLoc();
7998 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7999 ? &AMDGPU::VGPR_16RegClass
8000 : &AMDGPU::VGPR_32RegClass);
8001 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8002 .add(Inst.getOperand(1)) // src0_modifiers
8003 .add(Inst.getOperand(2))
8004 .add(Inst.getOperand(3)) // clamp
8005 .add(Inst.getOperand(4)) // omod
8006 .setMIFlags(Inst.getFlags());
8007 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8008 NewInstr.addImm(0); // opsel0
8009 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8010 legalizeOperandsVALUt16(*NewInstr, MRI);
8011 legalizeOperands(*NewInstr, MDT);
8012 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8013 Inst.eraseFromParent();
8014 return;
8015 }
8016 }
8017
8018 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8019 // We cannot move this instruction to the VALU, so we should try to
8020 // legalize its operands instead.
8021 legalizeOperands(Inst, MDT);
8022 return;
8023 }
8024 // Handle converting generic instructions like COPY-to-SGPR into
8025 // COPY-to-VGPR.
8026 if (NewOpcode == Opcode) {
8027 Register DstReg = Inst.getOperand(0).getReg();
8028 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8029
8030 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8031 // hope for the best.
8032 if (Inst.isCopy() && DstReg.isPhysical() &&
8033 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8034 // TODO: Only works for 32 bit registers.
8035 if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
8036 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8037 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
8038 .add(Inst.getOperand(1));
8039 } else {
8040 Register NewDst =
8041 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8042 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8043 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8044 .add(Inst.getOperand(1));
8045 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8046 DstReg)
8047 .addReg(NewDst);
8048 }
8049 Inst.eraseFromParent();
8050 return;
8051 }
8052
8053 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8054 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8055 // Instead of creating a copy where src and dst are the same register
8056 // class, we just replace all uses of dst with src. These kinds of
8057 // copies interfere with the heuristics MachineSink uses to decide
8058 // whether or not to split a critical edge, since the pass assumes
8059 // that copies will end up as machine instructions and not be
8060 // eliminated.
8061 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8062 Register NewDstReg = Inst.getOperand(1).getReg();
8063 MRI.replaceRegWith(DstReg, NewDstReg);
8064 MRI.clearKillFlags(NewDstReg);
8065 Inst.getOperand(0).setReg(DstReg);
8066 Inst.eraseFromParent();
8067 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8068 for (MachineOperand &MO :
8069 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8070 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8071 }
8072 return;
8073 }
8074
8075 // If this is a v2s copy between a 16-bit and a 32-bit reg, replace the
8076 // vgpr copy with a reg_sequence/extract_subreg.
8077 // This can be removed after we have sgpr16 in place.
8078 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8079 Inst.getOperand(1).getReg().isVirtual() &&
8080 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8081 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8082 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8083 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8084 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8085 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8086 get(AMDGPU::IMPLICIT_DEF), Undef);
8087 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8088 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8089 .addReg(Inst.getOperand(1).getReg())
8090 .addImm(AMDGPU::lo16)
8091 .addReg(Undef)
8092 .addImm(AMDGPU::hi16);
8093 Inst.eraseFromParent();
8094 MRI.replaceRegWith(DstReg, NewDstReg);
8095 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8096 return;
8097 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8098 AMDGPU::lo16)) {
8099 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8100 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8101 MRI.replaceRegWith(DstReg, NewDstReg);
8102 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8103 return;
8104 }
8105 }
8106
8107 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8108 MRI.replaceRegWith(DstReg, NewDstReg);
8109 legalizeOperands(Inst, MDT);
8110 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8111 return;
8112 }
8113
8114 // Use the new VALU Opcode.
8115 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8116 .setMIFlags(Inst.getFlags());
8117 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8118 // Intersperse VOP3 modifiers among the SALU operands.
8119 NewInstr->addOperand(Inst.getOperand(0));
8120 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8121 AMDGPU::OpName::src0_modifiers) >= 0)
8122 NewInstr.addImm(0);
8123 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8124 MachineOperand Src = Inst.getOperand(1);
8125 NewInstr->addOperand(Src);
8126 }
8127
8128 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8129 // We are converting these to a BFE, so we need to add the missing
8130 // operands for the size and offset.
8131 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8132 NewInstr.addImm(0);
8133 NewInstr.addImm(Size);
8134 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8135 // The VALU version adds the second operand to the result, so insert an
8136 // extra 0 operand.
8137 NewInstr.addImm(0);
8138 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8139 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8140 // If we need to move this to VGPRs, we need to unpack the second
8141 // operand back into the 2 separate ones for bit offset and width.
8142 assert(OffsetWidthOp.isImm() &&
8143 "Scalar BFE is only implemented for constant width and offset");
8144 uint32_t Imm = OffsetWidthOp.getImm();
8145
8146 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8147 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8148 NewInstr.addImm(Offset);
8149 NewInstr.addImm(BitWidth);
8150 } else {
8151 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8152 AMDGPU::OpName::src1_modifiers) >= 0)
8153 NewInstr.addImm(0);
8154 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8155 NewInstr->addOperand(Inst.getOperand(2));
8156 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8157 AMDGPU::OpName::src2_modifiers) >= 0)
8158 NewInstr.addImm(0);
8159 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8160 NewInstr->addOperand(Inst.getOperand(3));
8161 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8162 NewInstr.addImm(0);
8163 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8164 NewInstr.addImm(0);
8165 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8166 NewInstr.addImm(0);
8167 }
8168 } else {
8169 // Just copy the SALU operands.
8170 for (const MachineOperand &Op : Inst.explicit_operands())
8171 NewInstr->addOperand(Op);
8172 }
8173
8174 // Remove any references to SCC. Vector instructions can't read from it, and
8175 // we're just about to add the implicit use / defs of VCC, and we don't want
8176 // both.
8177 for (MachineOperand &Op : Inst.implicit_operands()) {
8178 if (Op.getReg() == AMDGPU::SCC) {
8179 // Only propagate through live-def of SCC.
8180 if (Op.isDef() && !Op.isDead())
8181 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8182 if (Op.isUse())
8183 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8184 }
8185 }
8186 Inst.eraseFromParent();
8187 Register NewDstReg;
8188 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8189 Register DstReg = NewInstr->getOperand(0).getReg();
8190 assert(DstReg.isVirtual());
8191 // Update the destination register class.
8192 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8193 assert(NewDstRC);
8194 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8195 MRI.replaceRegWith(DstReg, NewDstReg);
8196 }
8197 fixImplicitOperands(*NewInstr);
8198
8199 legalizeOperandsVALUt16(*NewInstr, MRI);
8200
8201 // Legalize the operands
8202 legalizeOperands(*NewInstr, MDT);
8203 if (NewDstReg)
8204 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8205}
8206
8207// Add/sub require special handling to deal with carry outs.
8208std::pair<bool, MachineBasicBlock *>
8209SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8210 MachineDominatorTree *MDT) const {
8211 if (ST.hasAddNoCarry()) {
8212 // Assume there is no user of scc since we don't select this in that case.
8213 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8214 // is used.
8215
8216 MachineBasicBlock &MBB = *Inst.getParent();
8217 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8218
8219 Register OldDstReg = Inst.getOperand(0).getReg();
8220 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8221
8222 unsigned Opc = Inst.getOpcode();
8223 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8224
8225 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8226 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8227
8228 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8229 Inst.removeOperand(3);
8230
8231 Inst.setDesc(get(NewOpc));
8232 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8233 Inst.addImplicitDefUseOperands(*MBB.getParent());
8234 MRI.replaceRegWith(OldDstReg, ResultReg);
8235 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8236
8237 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8238 return std::pair(true, NewBB);
8239 }
8240
8241 return std::pair(false, nullptr);
8242}
8243
8244void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8245 MachineDominatorTree *MDT) const {
8246
8247 MachineBasicBlock &MBB = *Inst.getParent();
8248 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8249 MachineBasicBlock::iterator MII = Inst;
8250 DebugLoc DL = Inst.getDebugLoc();
8251
8252 MachineOperand &Dest = Inst.getOperand(0);
8253 MachineOperand &Src0 = Inst.getOperand(1);
8254 MachineOperand &Src1 = Inst.getOperand(2);
8255 MachineOperand &Cond = Inst.getOperand(3);
8256
8257 Register CondReg = Cond.getReg();
8258 bool IsSCC = (CondReg == AMDGPU::SCC);
8259
8260 // If this is a trivial select where the condition is effectively not SCC
8261 // (CondReg is a source of copy to SCC), then the select is semantically
8262 // equivalent to copying CondReg. Hence, there is no need to create
8263 // V_CNDMASK, we can just use that and bail out.
8264 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8265 (Src1.getImm() == 0)) {
8266 MRI.replaceRegWith(Dest.getReg(), CondReg);
8267 return;
8268 }
8269
8270 Register NewCondReg = CondReg;
8271 if (IsSCC) {
8272 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8273 NewCondReg = MRI.createVirtualRegister(TC);
8274
8275 // Now look for the closest SCC def; if it is a copy, replace CondReg
8276 // with the COPY source register.
8277 bool CopyFound = false;
8278 for (MachineInstr &CandI :
8279 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8280 Inst.getParent()->rend())) {
8281 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8282 -1) {
8283 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8284 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8285 .addReg(CandI.getOperand(1).getReg());
8286 CopyFound = true;
8287 }
8288 break;
8289 }
8290 }
8291 if (!CopyFound) {
8292 // SCC def is not a copy
8293 // Insert a trivial select instead of creating a copy, because a copy from
8294 // SCC would semantically mean just copying a single bit, but we may need
8295 // the result to be a vector condition mask that needs preserving.
8296 unsigned Opcode =
8297 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8298 auto NewSelect =
8299 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8300 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8301 }
8302 }
8303
8304 Register NewDestReg = MRI.createVirtualRegister(
8305 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8306 MachineInstr *NewInst;
8307 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8308 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8309 .addImm(0)
8310 .add(Src1) // False
8311 .addImm(0)
8312 .add(Src0) // True
8313 .addReg(NewCondReg);
8314 } else {
8315 NewInst =
8316 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8317 .add(Src1) // False
8318 .add(Src0) // True
8319 .addReg(NewCondReg);
8320 }
8321 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8322 legalizeOperands(*NewInst, MDT);
8323 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8324}
8325
8326void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8327 MachineInstr &Inst) const {
8328 MachineBasicBlock &MBB = *Inst.getParent();
8329 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8330 MachineBasicBlock::iterator MII = Inst;
8331 DebugLoc DL = Inst.getDebugLoc();
8332
8333 MachineOperand &Dest = Inst.getOperand(0);
8334 MachineOperand &Src = Inst.getOperand(1);
8335 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8336 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8337
8338 unsigned SubOp = ST.hasAddNoCarry() ?
8339 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8340
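 // abs(x) is lowered as max(x, 0 - x).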
8341 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8342 .addImm(0)
8343 .addReg(Src.getReg());
8344
8345 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8346 .addReg(Src.getReg())
8347 .addReg(TmpReg);
8348
8349 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8350 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8351}
8352
8353void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8354 MachineInstr &Inst) const {
8355 MachineBasicBlock &MBB = *Inst.getParent();
8356 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8357 MachineBasicBlock::iterator MII = Inst;
8358 const DebugLoc &DL = Inst.getDebugLoc();
8359
8360 MachineOperand &Dest = Inst.getOperand(0);
8361 MachineOperand &Src0 = Inst.getOperand(1);
8362 MachineOperand &Src1 = Inst.getOperand(2);
8363
8364 if (ST.hasDLInsts()) {
8365 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8366 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8367 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8368
8369 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8370 .add(Src0)
8371 .add(Src1);
8372
8373 MRI.replaceRegWith(Dest.getReg(), NewDest);
8374 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8375 } else {
8376 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8377 // invert either source and then perform the XOR. If either source is a
8378 // scalar register, then we can leave the inversion on the scalar unit to
8379 // achieve a better distribution of scalar and vector instructions.
8380 bool Src0IsSGPR = Src0.isReg() &&
8381 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8382 bool Src1IsSGPR = Src1.isReg() &&
8383 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8384 MachineInstr *Xor;
8385 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8386 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8387
8388 // Build a pair of scalar instructions and add them to the work list.
8389 // The next iteration over the work list will lower these to the vector
8390 // unit as necessary.
8391 if (Src0IsSGPR) {
8392 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8393 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8394 .addReg(Temp)
8395 .add(Src1);
8396 } else if (Src1IsSGPR) {
8397 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8398 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8399 .add(Src0)
8400 .addReg(Temp);
8401 } else {
8402 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8403 .add(Src0)
8404 .add(Src1);
8405 MachineInstr *Not =
8406 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8407 Worklist.insert(Not);
8408 }
8409
8410 MRI.replaceRegWith(Dest.getReg(), NewDest);
8411
8412 Worklist.insert(Xor);
8413
8414 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8415 }
8416}
8417
8418void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8419 MachineInstr &Inst,
8420 unsigned Opcode) const {
8421 MachineBasicBlock &MBB = *Inst.getParent();
8422 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8423 MachineBasicBlock::iterator MII = Inst;
8424 const DebugLoc &DL = Inst.getDebugLoc();
8425
8426 MachineOperand &Dest = Inst.getOperand(0);
8427 MachineOperand &Src0 = Inst.getOperand(1);
8428 MachineOperand &Src1 = Inst.getOperand(2);
8429
8430 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8431 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8432
8433 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8434 .add(Src0)
8435 .add(Src1);
8436
8437 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8438 .addReg(Interm);
8439
8440 Worklist.insert(&Op);
8441 Worklist.insert(&Not);
8442
8443 MRI.replaceRegWith(Dest.getReg(), NewDest);
8444 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8445}
8446
8447void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8448 MachineInstr &Inst,
8449 unsigned Opcode) const {
8450 MachineBasicBlock &MBB = *Inst.getParent();
8451 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8452 MachineBasicBlock::iterator MII = Inst;
8453 const DebugLoc &DL = Inst.getDebugLoc();
8454
8455 MachineOperand &Dest = Inst.getOperand(0);
8456 MachineOperand &Src0 = Inst.getOperand(1);
8457 MachineOperand &Src1 = Inst.getOperand(2);
8458
8459 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8460 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8461
8462 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8463 .add(Src1);
8464
8465 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8466 .add(Src0)
8467 .addReg(Interm);
8468
8469 Worklist.insert(&Not);
8470 Worklist.insert(&Op);
8471
8472 MRI.replaceRegWith(Dest.getReg(), NewDest);
8473 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8474}
8475
8476void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8477 MachineInstr &Inst, unsigned Opcode,
8478 bool Swap) const {
8479 MachineBasicBlock &MBB = *Inst.getParent();
8480 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8481
8482 MachineOperand &Dest = Inst.getOperand(0);
8483 MachineOperand &Src0 = Inst.getOperand(1);
8484 DebugLoc DL = Inst.getDebugLoc();
8485
8486 MachineBasicBlock::iterator MII = Inst;
8487
8488 const MCInstrDesc &InstDesc = get(Opcode);
8489 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8490 MRI.getRegClass(Src0.getReg()) :
8491 &AMDGPU::SGPR_32RegClass;
8492
8493 const TargetRegisterClass *Src0SubRC =
8494 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8495
8496 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8497 AMDGPU::sub0, Src0SubRC);
8498
8499 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8500 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8501 const TargetRegisterClass *NewDestSubRC =
8502 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8503
8504 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8505 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8506
8507 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8508 AMDGPU::sub1, Src0SubRC);
8509
8510 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8511 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8512
8513 if (Swap)
8514 std::swap(DestSub0, DestSub1);
8515
8516 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8517 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8518 .addReg(DestSub0)
8519 .addImm(AMDGPU::sub0)
8520 .addReg(DestSub1)
8521 .addImm(AMDGPU::sub1);
8522
8523 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8524
8525 Worklist.insert(&LoHalf);
8526 Worklist.insert(&HiHalf);
8527
8528 // We don't need to legalizeOperands here because for a single operand, src0
8529 // will support any kind of input.
8530
8531 // Move all users of this moved value.
8532 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8533}
8534
8535// There is no vector equivalent of s_mul_u64. For this reason, we need to
8536// split the s_mul_u64 into 32-bit vector multiplications.
8537void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8538 MachineInstr &Inst,
8539 MachineDominatorTree *MDT) const {
8540 MachineBasicBlock &MBB = *Inst.getParent();
8541 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8542
8543 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8544 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8545 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8546
8547 MachineOperand &Dest = Inst.getOperand(0);
8548 MachineOperand &Src0 = Inst.getOperand(1);
8549 MachineOperand &Src1 = Inst.getOperand(2);
8550 const DebugLoc &DL = Inst.getDebugLoc();
8551 MachineBasicBlock::iterator MII = Inst;
8552
8553 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8554 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8555 const TargetRegisterClass *Src0SubRC =
8556 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8557 if (RI.isSGPRClass(Src0SubRC))
8558 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8559 const TargetRegisterClass *Src1SubRC =
8560 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8561 if (RI.isSGPRClass(Src1SubRC))
8562 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8563
8564 // First, we extract the low 32-bit and high 32-bit values from each of the
8565 // operands.
8566 MachineOperand Op0L =
8567 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8568 MachineOperand Op1L =
8569 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8570 MachineOperand Op0H =
8571 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8572 MachineOperand Op1H =
8573 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8574
8575 // The multiplication is done as follows:
8576 //
8577 // Op1H Op1L
8578 // * Op0H Op0L
8579 // --------------------
8580 // Op1H*Op0L Op1L*Op0L
8581 // + Op1H*Op0H Op1L*Op0H
8582 // -----------------------------------------
8583 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8584 //
8585 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8586 // value and that would overflow.
8587 // The low 32-bit value is Op1L*Op0L.
8588 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
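 // Rough numeric check of the decomposition (illustrative values only):
 // for Op0 = 0x0000000100000003 and Op1 = 0x0000000200000005,
 // Op1L*Op0L = 15 (low half 15, carry 0) and
 // Op1H*Op0L + Op1L*Op0H + carry = 2*3 + 5*1 + 0 = 11,
 // giving the truncated 64-bit product 0x0000000B0000000F.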
8589
8590 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8591 MachineInstr *Op1L_Op0H =
8592 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8593 .add(Op1L)
8594 .add(Op0H);
8595
8596 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8597 MachineInstr *Op1H_Op0L =
8598 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8599 .add(Op1H)
8600 .add(Op0L);
8601
8602 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8603 MachineInstr *Carry =
8604 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8605 .add(Op1L)
8606 .add(Op0L);
8607
8608 MachineInstr *LoHalf =
8609 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8610 .add(Op1L)
8611 .add(Op0L);
8612
8613 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8614 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8615 .addReg(Op1L_Op0H_Reg)
8616 .addReg(Op1H_Op0L_Reg);
8617
8618 MachineInstr *HiHalf =
8619 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8620 .addReg(AddReg)
8621 .addReg(CarryReg);
8622
8623 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8624 .addReg(DestSub0)
8625 .addImm(AMDGPU::sub0)
8626 .addReg(DestSub1)
8627 .addImm(AMDGPU::sub1);
8628
8629 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8630
8631 // Try to legalize the operands in case we need to swap the order to keep it
8632 // valid.
8633 legalizeOperands(*Op1L_Op0H, MDT);
8634 legalizeOperands(*Op1H_Op0L, MDT);
8635 legalizeOperands(*Carry, MDT);
8636 legalizeOperands(*LoHalf, MDT);
8637 legalizeOperands(*Add, MDT);
8638 legalizeOperands(*HiHalf, MDT);
8639
8640 // Move all users of this moved value.
8641 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8642}
8643
8644// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8645// multiplications.
8646void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8647 MachineInstr &Inst,
8648 MachineDominatorTree *MDT) const {
8649 MachineBasicBlock &MBB = *Inst.getParent();
8650 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8651
8652 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8653 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8654 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8655
8656 MachineOperand &Dest = Inst.getOperand(0);
8657 MachineOperand &Src0 = Inst.getOperand(1);
8658 MachineOperand &Src1 = Inst.getOperand(2);
8659 const DebugLoc &DL = Inst.getDebugLoc();
8660 MachineBasicBlock::iterator MII = Inst;
8661
8662 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8663 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8664 const TargetRegisterClass *Src0SubRC =
8665 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8666 if (RI.isSGPRClass(Src0SubRC))
8667 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8668 const TargetRegisterClass *Src1SubRC =
8669 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8670 if (RI.isSGPRClass(Src1SubRC))
8671 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8672
8673 // First, we extract the low 32-bit and high 32-bit values from each of the
8674 // operands.
8675 MachineOperand Op0L =
8676 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8677 MachineOperand Op1L =
8678 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8679
8680 unsigned Opc = Inst.getOpcode();
8681 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8682 ? AMDGPU::V_MUL_HI_U32_e64
8683 : AMDGPU::V_MUL_HI_I32_e64;
8684 MachineInstr *HiHalf =
8685 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8686
8687 MachineInstr *LoHalf =
8688 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8689 .add(Op1L)
8690 .add(Op0L);
8691
8692 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8693 .addReg(DestSub0)
8694 .addImm(AMDGPU::sub0)
8695 .addReg(DestSub1)
8696 .addImm(AMDGPU::sub1);
8697
8698 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8699
8700 // Try to legalize the operands in case we need to swap the order to keep it
8701 // valid.
8702 legalizeOperands(*HiHalf, MDT);
8703 legalizeOperands(*LoHalf, MDT);
8704
8705 // Move all users of this moved value.
8706 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8707}
8708
8709void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8710 MachineInstr &Inst, unsigned Opcode,
8711 MachineDominatorTree *MDT) const {
8712 MachineBasicBlock &MBB = *Inst.getParent();
8713 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8714
8715 MachineOperand &Dest = Inst.getOperand(0);
8716 MachineOperand &Src0 = Inst.getOperand(1);
8717 MachineOperand &Src1 = Inst.getOperand(2);
8718 DebugLoc DL = Inst.getDebugLoc();
8719
8720 MachineBasicBlock::iterator MII = Inst;
8721
8722 const MCInstrDesc &InstDesc = get(Opcode);
8723 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8724 MRI.getRegClass(Src0.getReg()) :
8725 &AMDGPU::SGPR_32RegClass;
8726
8727 const TargetRegisterClass *Src0SubRC =
8728 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8729 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8730 MRI.getRegClass(Src1.getReg()) :
8731 &AMDGPU::SGPR_32RegClass;
8732
8733 const TargetRegisterClass *Src1SubRC =
8734 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8735
8736 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8737 AMDGPU::sub0, Src0SubRC);
8738 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8739 AMDGPU::sub0, Src1SubRC);
8740 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8741 AMDGPU::sub1, Src0SubRC);
8742 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8743 AMDGPU::sub1, Src1SubRC);
8744
8745 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8746 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8747 const TargetRegisterClass *NewDestSubRC =
8748 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8749
8750 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8751 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8752 .add(SrcReg0Sub0)
8753 .add(SrcReg1Sub0);
8754
8755 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8756 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8757 .add(SrcReg0Sub1)
8758 .add(SrcReg1Sub1);
8759
8760 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8761 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8762 .addReg(DestSub0)
8763 .addImm(AMDGPU::sub0)
8764 .addReg(DestSub1)
8765 .addImm(AMDGPU::sub1);
8766
8767 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8768
8769 Worklist.insert(&LoHalf);
8770 Worklist.insert(&HiHalf);
8771
8772 // Move all users of this moved value.
8773 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8774}
8775
8776void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8777 MachineInstr &Inst,
8778 MachineDominatorTree *MDT) const {
8779 MachineBasicBlock &MBB = *Inst.getParent();
8780 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8781
8782 MachineOperand &Dest = Inst.getOperand(0);
8783 MachineOperand &Src0 = Inst.getOperand(1);
8784 MachineOperand &Src1 = Inst.getOperand(2);
8785 const DebugLoc &DL = Inst.getDebugLoc();
8786
8787 MachineBasicBlock::iterator MII = Inst;
8788
8789 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8790
8791 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8792
8793 MachineOperand* Op0;
8794 MachineOperand* Op1;
8795
8796 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8797 Op0 = &Src0;
8798 Op1 = &Src1;
8799 } else {
8800 Op0 = &Src1;
8801 Op1 = &Src0;
8802 }
8803
8804 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8805 .add(*Op0);
8806
8807 Register NewDest = MRI.createVirtualRegister(DestRC);
8808
8809 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8810 .addReg(Interm)
8811 .add(*Op1);
8812
8813 MRI.replaceRegWith(Dest.getReg(), NewDest);
8814
8815 Worklist.insert(&Xor);
8816}
8817
8818void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8819 MachineInstr &Inst) const {
8820 MachineBasicBlock &MBB = *Inst.getParent();
8821 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8822
8823 MachineBasicBlock::iterator MII = Inst;
8824 const DebugLoc &DL = Inst.getDebugLoc();
8825
8826 MachineOperand &Dest = Inst.getOperand(0);
8827 MachineOperand &Src = Inst.getOperand(1);
8828
8829 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8830 const TargetRegisterClass *SrcRC = Src.isReg() ?
8831 MRI.getRegClass(Src.getReg()) :
8832 &AMDGPU::SGPR_32RegClass;
8833
8834 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8835 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8836
8837 const TargetRegisterClass *SrcSubRC =
8838 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8839
8840 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8841 AMDGPU::sub0, SrcSubRC);
8842 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8843 AMDGPU::sub1, SrcSubRC);
8844
8845 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8846
8847 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8848
8849 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8850
8851 // We don't need to legalize operands here. src0 for either instruction can be
8852 // an SGPR, and the second input is unused or determined here.
8853 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8854}
8855
8856void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8857 MachineInstr &Inst) const {
8858 MachineBasicBlock &MBB = *Inst.getParent();
8859 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8860 MachineBasicBlock::iterator MII = Inst;
8861 const DebugLoc &DL = Inst.getDebugLoc();
8862
8863 MachineOperand &Dest = Inst.getOperand(0);
8864 uint32_t Imm = Inst.getOperand(2).getImm();
8865 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8866 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
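 // For example, a sign-extension of the low 16 bits would arrive here as
 // Imm = 0x100000, i.e. BitWidth = 16 and Offset = 0, which is the only
 // shape handled below.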
8867
8868 (void) Offset;
8869
8870 // Only sext_inreg cases handled.
8871 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8872 Offset == 0 && "Not implemented");
8873
8874 if (BitWidth < 32) {
8875 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8876 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8877 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8878
8879 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8880 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8881 .addImm(0)
8882 .addImm(BitWidth);
8883
8884 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8885 .addImm(31)
8886 .addReg(MidRegLo);
8887
8888 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8889 .addReg(MidRegLo)
8890 .addImm(AMDGPU::sub0)
8891 .addReg(MidRegHi)
8892 .addImm(AMDGPU::sub1);
8893
8894 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8895 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8896 return;
8897 }
8898
8899 MachineOperand &Src = Inst.getOperand(1);
8900 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8901 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8902
8903 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8904 .addImm(31)
8905 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8906
8907 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8908 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8909 .addImm(AMDGPU::sub0)
8910 .addReg(TmpReg)
8911 .addImm(AMDGPU::sub1);
8912
8913 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8914 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8915}
8916
8917void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8918 MachineInstr &Inst, unsigned Opcode,
8919 MachineDominatorTree *MDT) const {
8920 // (S_FLBIT_I32_B64 hi:lo) ->
8921 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8922 // (S_FF1_I32_B64 hi:lo) ->
8923 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
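 // E.g. for hi:lo = 0x00000000:0x00010000 the expected ctlz result is 47:
 // V_FFBH_U32(hi) = 0xffffffff (no bit found), uaddsat(V_FFBH_U32(lo), 32)
 // = 15 + 32 = 47, and the final umin selects 47.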
8924
8925 MachineBasicBlock &MBB = *Inst.getParent();
8926 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8927 MachineBasicBlock::iterator MII = Inst;
8928 const DebugLoc &DL = Inst.getDebugLoc();
8929
8930 MachineOperand &Dest = Inst.getOperand(0);
8931 MachineOperand &Src = Inst.getOperand(1);
8932
8933 const MCInstrDesc &InstDesc = get(Opcode);
8934
8935 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8936 unsigned OpcodeAdd =
8937 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8938
8939 const TargetRegisterClass *SrcRC =
8940 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8941 const TargetRegisterClass *SrcSubRC =
8942 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8943
8944 MachineOperand SrcRegSub0 =
8945 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8946 MachineOperand SrcRegSub1 =
8947 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8948
8949 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8950 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8951 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8952 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8953
8954 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8955
8956 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8957
8958 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8959 .addReg(IsCtlz ? MidReg1 : MidReg2)
8960 .addImm(32)
8961 .addImm(1); // enable clamp
8962
8963 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8964 .addReg(MidReg3)
8965 .addReg(IsCtlz ? MidReg2 : MidReg1);
8966
8967 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8968
8969 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8970}
8971
8972void SIInstrInfo::addUsersToMoveToVALUWorklist(
8973 Register DstReg, MachineRegisterInfo &MRI,
8974 SIInstrWorklist &Worklist) const {
8975 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
8976 MachineInstr &UseMI = *MO.getParent();
8977
8978 unsigned OpNo = 0;
8979
8980 switch (UseMI.getOpcode()) {
8981 case AMDGPU::COPY:
8982 case AMDGPU::WQM:
8983 case AMDGPU::SOFT_WQM:
8984 case AMDGPU::STRICT_WWM:
8985 case AMDGPU::STRICT_WQM:
8986 case AMDGPU::REG_SEQUENCE:
8987 case AMDGPU::PHI:
8988 case AMDGPU::INSERT_SUBREG:
8989 break;
8990 default:
8991 OpNo = MO.getOperandNo();
8992 break;
8993 }
8994
8995 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)))
8996 Worklist.insert(&UseMI);
8997 else
8998 // Legalization could change user list.
9000 }
9001}
9002
9003void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9004 MachineRegisterInfo &MRI,
9005 MachineInstr &Inst) const {
9006 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9007 MachineBasicBlock *MBB = Inst.getParent();
9008 MachineOperand &Src0 = Inst.getOperand(1);
9009 MachineOperand &Src1 = Inst.getOperand(2);
9010 const DebugLoc &DL = Inst.getDebugLoc();
9011
9012 switch (Inst.getOpcode()) {
9013 case AMDGPU::S_PACK_LL_B32_B16: {
9014 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9015 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9016
9017 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9018 // 0.
9019 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9020 .addImm(0xffff);
9021
9022 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9023 .addReg(ImmReg, RegState::Kill)
9024 .add(Src0);
9025
9026 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9027 .add(Src1)
9028 .addImm(16)
9029 .addReg(TmpReg, RegState::Kill);
9030 break;
9031 }
9032 case AMDGPU::S_PACK_LH_B32_B16: {
9033 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9034 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9035 .addImm(0xffff);
9036 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9037 .addReg(ImmReg, RegState::Kill)
9038 .add(Src0)
9039 .add(Src1);
9040 break;
9041 }
9042 case AMDGPU::S_PACK_HL_B32_B16: {
9043 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9044 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9045 .addImm(16)
9046 .add(Src0);
9047 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9048 .add(Src1)
9049 .addImm(16)
9050 .addReg(TmpReg, RegState::Kill);
9051 break;
9052 }
9053 case AMDGPU::S_PACK_HH_B32_B16: {
9054 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9055 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9056 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9057 .addImm(16)
9058 .add(Src0);
9059 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9060 .addImm(0xffff0000);
9061 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9062 .add(Src1)
9063 .addReg(ImmReg, RegState::Kill)
9064 .addReg(TmpReg, RegState::Kill);
9065 break;
9066 }
9067 default:
9068 llvm_unreachable("unhandled s_pack_* instruction");
9069 }
9070
9071 MachineOperand &Dest = Inst.getOperand(0);
9072 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9073 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9074}
9075
9076void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9077 MachineInstr &SCCDefInst,
9078 SIInstrWorklist &Worklist,
9079 Register NewCond) const {
9080
9081 // Ensure that def inst defines SCC, which is still live.
9082 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9083 !Op.isDead() && Op.getParent() == &SCCDefInst);
9084 SmallVector<MachineInstr *, 4> CopyToDelete;
9085 // This assumes that all the users of SCC are in the same block
9086 // as the SCC def.
9087 for (MachineInstr &MI : // Skip the def inst itself.
9088 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9089 SCCDefInst.getParent()->end())) {
9090 // Check if SCC is used first.
9091 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9092 if (SCCIdx != -1) {
9093 if (MI.isCopy()) {
9094 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9095 Register DestReg = MI.getOperand(0).getReg();
9096
9097 MRI.replaceRegWith(DestReg, NewCond);
9098 CopyToDelete.push_back(&MI);
9099 } else {
9100
9101 if (NewCond.isValid())
9102 MI.getOperand(SCCIdx).setReg(NewCond);
9103
9104 Worklist.insert(&MI);
9105 }
9106 }
9107 // Exit if we find another SCC def.
9108 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9109 break;
9110 }
9111 for (auto &Copy : CopyToDelete)
9112 Copy->eraseFromParent();
9113}
9114
9115// Instructions that use SCC may be converted to VALU instructions. When that
9116// happens, the SCC register is changed to VCC_LO. The instruction that defines
9117// SCC must be changed to an instruction that defines VCC. This function makes
9118// sure that the instruction that defines SCC is added to the moveToVALU
9119// worklist.
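// A sketch of the situation (not taken from a real MIR test): if an
// S_CSELECT that read SCC has been rewritten to a V_CNDMASK reading VCC_LO,
// the S_CMP that produced SCC must itself become a VALU compare writing
// VCC_LO; the backwards scan below queues that defining instruction.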
9120void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9121 SIInstrWorklist &Worklist) const {
9122 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9123 // then there is nothing to do because the defining instruction has been
9124 // converted to a VALU already. If SCC then that instruction needs to be
9125 // converted to a VALU.
9126 for (MachineInstr &MI :
9127 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9128 SCCUseInst->getParent()->rend())) {
9129 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9130 break;
9131 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9132 Worklist.insert(&MI);
9133 break;
9134 }
9135 }
9136}
9137
9138const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9139 const MachineInstr &Inst) const {
9140 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9141
9142 switch (Inst.getOpcode()) {
9143 // For target instructions, getOpRegClass just returns the virtual register
9144 // class associated with the operand, so we need to find an equivalent VGPR
9145 // register class in order to move the instruction to the VALU.
9146 case AMDGPU::COPY:
9147 case AMDGPU::PHI:
9148 case AMDGPU::REG_SEQUENCE:
9149 case AMDGPU::INSERT_SUBREG:
9150 case AMDGPU::WQM:
9151 case AMDGPU::SOFT_WQM:
9152 case AMDGPU::STRICT_WWM:
9153 case AMDGPU::STRICT_WQM: {
9154 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9155 if (RI.isAGPRClass(SrcRC)) {
9156 if (RI.isAGPRClass(NewDstRC))
9157 return nullptr;
9158
9159 switch (Inst.getOpcode()) {
9160 case AMDGPU::PHI:
9161 case AMDGPU::REG_SEQUENCE:
9162 case AMDGPU::INSERT_SUBREG:
9163 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9164 break;
9165 default:
9166 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9167 }
9168
9169 if (!NewDstRC)
9170 return nullptr;
9171 } else {
9172 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9173 return nullptr;
9174
9175 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9176 if (!NewDstRC)
9177 return nullptr;
9178 }
9179
9180 return NewDstRC;
9181 }
9182 default:
9183 return NewDstRC;
9184 }
9185}
9186
9187// Find the one SGPR operand we are allowed to use.
9188Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9189 int OpIndices[3]) const {
9190 const MCInstrDesc &Desc = MI.getDesc();
9191
9192 // Find the one SGPR operand we are allowed to use.
9193 //
9194 // First we need to consider the instruction's operand requirements before
9195 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9196 // of VCC, but we are still bound by the constant bus requirement to only use
9197 // one.
9198 //
9199 // If the operand's class is an SGPR, we can never move it.
9200
9201 Register SGPRReg = findImplicitSGPRRead(MI);
9202 if (SGPRReg)
9203 return SGPRReg;
9204
9205 Register UsedSGPRs[3] = {Register()};
9206 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9207
9208 for (unsigned i = 0; i < 3; ++i) {
9209 int Idx = OpIndices[i];
9210 if (Idx == -1)
9211 break;
9212
9213 const MachineOperand &MO = MI.getOperand(Idx);
9214 if (!MO.isReg())
9215 continue;
9216
9217 // Is this operand statically required to be an SGPR based on the operand
9218 // constraints?
9219 const TargetRegisterClass *OpRC =
9220 RI.getRegClass(Desc.operands()[Idx].RegClass);
9221 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9222 if (IsRequiredSGPR)
9223 return MO.getReg();
9224
9225 // If this could be a VGPR or an SGPR, check the dynamic register class.
9226 Register Reg = MO.getReg();
9227 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9228 if (RI.isSGPRClass(RegRC))
9229 UsedSGPRs[i] = Reg;
9230 }
9231
9232 // We don't have a required SGPR operand, so we have a bit more freedom in
9233 // selecting operands to move.
9234
9235 // Try to select the most used SGPR. If an SGPR is equal to one of the
9236 // others, we choose that.
9237 //
9238 // e.g.
9239 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9240 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9241
9242 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9243 // prefer those.
9244
9245 if (UsedSGPRs[0]) {
9246 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9247 SGPRReg = UsedSGPRs[0];
9248 }
9249
9250 if (!SGPRReg && UsedSGPRs[1]) {
9251 if (UsedSGPRs[1] == UsedSGPRs[2])
9252 SGPRReg = UsedSGPRs[1];
9253 }
9254
9255 return SGPRReg;
9256}
9257
9258MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9259 AMDGPU::OpName OperandName) const {
9260 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9261 return nullptr;
9262
9263 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9264 if (Idx == -1)
9265 return nullptr;
9266
9267 return &MI.getOperand(Idx);
9268}
9269
9270uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9271 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9272 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9273 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9274 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9275 return (Format << 44) |
9276 (1ULL << 56) | // RESOURCE_LEVEL = 1
9277 (3ULL << 60); // OOB_SELECT = 3
9278 }
9279
9280 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9281 if (ST.isAmdHsaOS()) {
9282 // Set ATC = 1. GFX9 doesn't have this bit.
9283 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9284 RsrcDataFormat |= (1ULL << 56);
9285
9286 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9287 // BTW, it disables TC L2 and therefore decreases performance.
9288 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9289 RsrcDataFormat |= (2ULL << 59);
9290 }
9291
9292 return RsrcDataFormat;
9293}
9294
9295uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9296 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9297 AMDGPU::RSRC_TID_ENABLE |
9298 0xffffffff; // Size;
9299
9300 // GFX9 doesn't have ELEMENT_SIZE.
9301 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9302 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9303 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9304 }
9305
9306 // IndexStride = 64 / 32.
9307 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9308 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9309
9310 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9311 // Clear them unless we want a huge stride.
9312 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9313 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9314 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9315
9316 return Rsrc23;
9317}
9318
9320 unsigned Opc = MI.getOpcode();
9321
9322 return isSMRD(Opc);
9323}
9324
9326 return get(Opc).mayLoad() &&
9327 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9328}
9329
9330Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9331 int &FrameIndex) const {
9332 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9333 if (!Addr || !Addr->isFI())
9334 return Register();
9335
9336 assert(!MI.memoperands_empty() &&
9337 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9338
9339 FrameIndex = Addr->getIndex();
9340 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9341}
9342
9343Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9344 int &FrameIndex) const {
9345 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9346 assert(Addr && Addr->isFI());
9347 FrameIndex = Addr->getIndex();
9348 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9349}
9350
9351Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9352 int &FrameIndex) const {
9353 if (!MI.mayLoad())
9354 return Register();
9355
9356 if (isMUBUF(MI) || isVGPRSpill(MI))
9357 return isStackAccess(MI, FrameIndex);
9358
9359 if (isSGPRSpill(MI))
9360 return isSGPRStackAccess(MI, FrameIndex);
9361
9362 return Register();
9363}
9364
9365Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9366 int &FrameIndex) const {
9367 if (!MI.mayStore())
9368 return Register();
9369
9370 if (isMUBUF(MI) || isVGPRSpill(MI))
9371 return isStackAccess(MI, FrameIndex);
9372
9373 if (isSGPRSpill(MI))
9374 return isSGPRStackAccess(MI, FrameIndex);
9375
9376 return Register();
9377}
9378
9379unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9380 unsigned Size = 0;
9381 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9382 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9383 while (++I != E && I->isInsideBundle()) {
9384 assert(!I->isBundle() && "No nested bundle!");
9385 Size += getInstSizeInBytes(*I);
9386 }
9387
9388 return Size;
9389}
9390
9391unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9392 unsigned Opc = MI.getOpcode();
9393 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9394 unsigned DescSize = Desc.getSize();
9395
9396 // If we have a definitive size, we can use it. Otherwise we need to inspect
9397 // the operands to know the size.
9398 if (isFixedSize(MI)) {
9399 unsigned Size = DescSize;
9400
9401 // If we hit the buggy offset, an extra nop will be inserted in MC so
9402 // estimate the worst case.
9403 if (MI.isBranch() && ST.hasOffset3fBug())
9404 Size += 4;
9405
9406 return Size;
9407 }
9408
9409 // Instructions may have a 32-bit literal encoded after them. Check
9410 // operands that could ever be literals.
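 // For example, a VOP2 add whose source is a non-inlinable constant such as
 // 0x3e800000 (0.25) occupies the 4-byte e32 encoding plus a 4-byte literal,
 // i.e. 8 bytes in total (assuming a 32-bit literal).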
9411 if (isVALU(MI) || isSALU(MI)) {
9412 if (isDPP(MI))
9413 return DescSize;
9414 bool HasLiteral = false;
9415 unsigned LiteralSize = 4;
9416 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9417 const MachineOperand &Op = MI.getOperand(I);
9418 const MCOperandInfo &OpInfo = Desc.operands()[I];
9419 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9420 HasLiteral = true;
9421 if (ST.has64BitLiterals()) {
9422 switch (OpInfo.OperandType) {
9423 default:
9424 break;
9426 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9427 LiteralSize = 8;
9428 break;
9430 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9431 LiteralSize = 8;
9432 break;
9433 }
9434 }
9435 break;
9436 }
9437 }
9438 return HasLiteral ? DescSize + LiteralSize : DescSize;
9439 }
9440
9441 // Check whether we have extra NSA words.
9442 if (isMIMG(MI)) {
9443 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9444 if (VAddr0Idx < 0)
9445 return 8;
9446
9447 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9448 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9449 }
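 // Working the formula above: with, say, 5 vaddr operands
 // (RSrcIdx - VAddr0Idx == 5) the size is 8 + 4 * ((5 + 2) / 4) = 12 bytes,
 // while a single vaddr stays at the base 8 bytes.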
9450
9451 switch (Opc) {
9452 case TargetOpcode::BUNDLE:
9453 return getInstBundleSize(MI);
9454 case TargetOpcode::INLINEASM:
9455 case TargetOpcode::INLINEASM_BR: {
9456 const MachineFunction *MF = MI.getParent()->getParent();
9457 const char *AsmStr = MI.getOperand(0).getSymbolName();
9458 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9459 }
9460 default:
9461 if (MI.isMetaInstruction())
9462 return 0;
9463
9464 // If this is a D16 pseudo instruction, get the correct MC code size.
9465 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9466 if (D16Info) {
9467 // Assume the d16_lo/hi variants are always the same size.
9468 unsigned LoInstOpcode = D16Info->LoOp;
9469 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9470 DescSize = Desc.getSize();
9471 }
9472
9473 return DescSize;
9474 }
9475}
9476
9478 if (!isFLAT(MI))
9479 return false;
9480
9481 if (MI.memoperands_empty())
9482 return true;
9483
9484 for (const MachineMemOperand *MMO : MI.memoperands()) {
9485 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9486 return true;
9487 }
9488 return false;
9489}
9490
9491ArrayRef<std::pair<int, const char *>>
9492SIInstrInfo::getSerializableTargetIndices() const {
9493 static const std::pair<int, const char *> TargetIndices[] = {
9494 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9495 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9496 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9497 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9498 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9499 return ArrayRef(TargetIndices);
9500}
9501
9502/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
9503/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9504ScheduleHazardRecognizer *
9505SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9506 const ScheduleDAG *DAG) const {
9507 return new GCNHazardRecognizer(DAG->MF);
9508}
9509
9510/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9511/// pass.
9512ScheduleHazardRecognizer *SIInstrInfo::CreateTargetPostRAHazardRecognizer(
9513 const MachineFunction &MF) const {
9514 return new GCNHazardRecognizer(MF);
9515}
9516
9517// Called during:
9518// - pre-RA scheduling and post-RA scheduling
9519ScheduleHazardRecognizer *
9520SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9521 const ScheduleDAGMI *DAG) const {
9522 // Borrowed from Arm Target
9523 // We would like to restrict this hazard recognizer to only
9524 // post-RA scheduling; we can tell that we're post-RA because we don't
9525 // track VRegLiveness.
9526 if (!DAG->hasVRegLiveness())
9527 return new GCNHazardRecognizer(DAG->MF);
9528 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9529}
9530
9531std::pair<unsigned, unsigned>
9532SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9533 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9534}
9535
9536ArrayRef<std::pair<unsigned, const char *>>
9537SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9538 static const std::pair<unsigned, const char *> TargetFlags[] = {
9539 {MO_GOTPCREL, "amdgpu-gotprel"},
9540 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9541 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9542 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9543 {MO_REL32_LO, "amdgpu-rel32-lo"},
9544 {MO_REL32_HI, "amdgpu-rel32-hi"},
9545 {MO_REL64, "amdgpu-rel64"},
9546 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9547 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9548 {MO_ABS64, "amdgpu-abs64"},
9549 };
9550
9551 return ArrayRef(TargetFlags);
9552}
9553
9554ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9555SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9556 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9557 {
9558 {MONoClobber, "amdgpu-noclobber"},
9559 {MOLastUse, "amdgpu-last-use"},
9560 {MOCooperative, "amdgpu-cooperative"},
9561 };
9562
9563 return ArrayRef(TargetFlags);
9564}
9565
9566unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9567 const MachineFunction &MF) const {
9568 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9569 assert(SrcReg.isVirtual());
9570 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9571 return AMDGPU::WWM_COPY;
9572
9573 return AMDGPU::COPY;
9574}
9575
9576bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9577 Register Reg) const {
9578 // We need to handle instructions which may be inserted during register
9579 // allocation to handle the prolog. The initial prolog instruction may have
9580 // been separated from the start of the block by spills and copies inserted
9581 // needed by the prolog. However, the insertions for scalar registers can
9582 // always be placed at the BB top as they are independent of the exec mask
9583 // value.
9584 const MachineFunction *MF = MI.getParent()->getParent();
9585 bool IsNullOrVectorRegister = true;
9586 if (Reg) {
9587 const MachineRegisterInfo &MRI = MF->getRegInfo();
9588 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9589 }
9590
9591 uint16_t Opcode = MI.getOpcode();
9592 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9593 return IsNullOrVectorRegister &&
9594 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9595 (Opcode == AMDGPU::IMPLICIT_DEF &&
9596 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9597 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9598 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9599}
9600
9604 const DebugLoc &DL,
9605 Register DestReg) const {
9606 if (ST.hasAddNoCarry())
9607 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9608
9609 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9610 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9611 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9612
9613 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9614 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9615}
9616
9617MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9618 MachineBasicBlock::iterator I,
9619 const DebugLoc &DL,
9620 Register DestReg,
9621 RegScavenger &RS) const {
9622 if (ST.hasAddNoCarry())
9623 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9624
9625 // If available, prefer to use vcc.
9626 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9627 ? Register(RI.getVCC())
9628 : RS.scavengeRegisterBackwards(
9629 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9630 0, /* AllowSpill */ false);
9631
9632 // TODO: Users need to deal with this.
9633 if (!UnusedCarry.isValid())
9634 return MachineInstrBuilder();
9635
9636 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9637 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9638}
9639
9640bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9641 switch (Opcode) {
9642 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9643 case AMDGPU::SI_KILL_I1_TERMINATOR:
9644 return true;
9645 default:
9646 return false;
9647 }
9648}
9649
9651 switch (Opcode) {
9652 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9653 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9654 case AMDGPU::SI_KILL_I1_PSEUDO:
9655 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9656 default:
9657 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9658 }
9659}
9660
9661bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9662 return Imm <= getMaxMUBUFImmOffset(ST);
9663}
9664
9665unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9666 // GFX12 has a 24-bit signed byte offset field; only its non-negative range is used.
9667 const unsigned OffsetBits =
9668 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9669 return (1 << OffsetBits) - 1;
9670}
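// For reference, the cap above evaluates to 4095 (12 offset bits) before
// GFX12 and to 8388607 (23 usable bits) on GFX12.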
9671
9672void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9673 if (!ST.isWave32())
9674 return;
9675
9676 if (MI.isInlineAsm())
9677 return;
9678
9679 for (auto &Op : MI.implicit_operands()) {
9680 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9681 Op.setReg(AMDGPU::VCC_LO);
9682 }
9683}
9684
9685bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9686 if (!isSMRD(MI))
9687 return false;
9688
9689 // Check that it is using a buffer resource.
9690 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9691 if (Idx == -1) // e.g. s_memtime
9692 return false;
9693
9694 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9695 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9696}
9697
9698// Given Imm, split it into the values to put into the SOffset and ImmOffset
9699// fields in an MUBUF instruction. Return false if it is not possible (due to a
9700// hardware bug needing a workaround).
9701//
9702// The required alignment ensures that individual address components remain
9703// aligned if they are aligned to begin with. It also ensures that additional
9704// offsets within the given alignment can be added to the resulting ImmOffset.
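// Two worked cases, assuming the pre-GFX12 maximum of 4095, a 4-byte
// alignment (so MaxImm = 4092), and ignoring the SI/CI workaround paths:
//   Imm = 4100 -> ImmOffset = 4092, SOffset = 8 (inline-constant path)
//   Imm = 5000 -> ImmOffset = 908, SOffset = 4092 (s_movk_i32 path)
// In both cases SOffset + ImmOffset reproduces the original Imm.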
9705bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9706 uint32_t &ImmOffset, Align Alignment) const {
9707 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9708 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9709 uint32_t Overflow = 0;
9710
9711 if (Imm > MaxImm) {
9712 if (Imm <= MaxImm + 64) {
9713 // Use an SOffset inline constant for 4..64
9714 Overflow = Imm - MaxImm;
9715 Imm = MaxImm;
9716 } else {
9717 // Try to keep the same value in SOffset for adjacent loads, so that
9718 // the corresponding register contents can be re-used.
9719 //
9720 // Load values with all low-bits (except for alignment bits) set into
9721 // SOffset, so that a larger range of values can be covered using
9722 // s_movk_i32.
9723 //
9724 // Atomic operations fail to work correctly when individual address
9725 // components are unaligned, even if their sum is aligned.
9726 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9727 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9728 Imm = Low;
9729 Overflow = High - Alignment.value();
9730 }
9731 }
9732
9733 if (Overflow > 0) {
9734 // There is a hardware bug in SI and CI which prevents address clamping in
9735 // MUBUF instructions from working correctly with SOffsets. The immediate
9736 // offset is unaffected.
9737 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9738 return false;
9739
9740 // It is not possible to set an immediate in the SOffset field on some targets.
9741 if (ST.hasRestrictedSOffset())
9742 return false;
9743 }
9744
9745 ImmOffset = Imm;
9746 SOffset = Overflow;
9747 return true;
9748}
9749
9750// Depending on the used address space and instructions, some immediate offsets
9751// are allowed and some are not.
9752// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9753// scratch instruction offsets can also be negative. On GFX12, offsets can be
9754// negative for all variants.
9755//
9756// There are several bugs related to these offsets:
9757// On gfx10.1, flat instructions that go into the global address space cannot
9758// use an offset.
9759//
9760// For scratch instructions, the address can be either an SGPR or a VGPR.
9761// The following offsets can be used, depending on the architecture (x means
9762// cannot be used):
9763// +----------------------------+------+------+
9764// | Address-Mode | SGPR | VGPR |
9765// +----------------------------+------+------+
9766// | gfx9 | | |
9767// | negative, 4-aligned offset | x | ok |
9768// | negative, unaligned offset | x | ok |
9769// +----------------------------+------+------+
9770// | gfx10 | | |
9771// | negative, 4-aligned offset | ok | ok |
9772// | negative, unaligned offset | ok | x |
9773// +----------------------------+------+------+
9774// | gfx10.3 | | |
9775// | negative, 4-aligned offset | ok | ok |
9776// | negative, unaligned offset | ok | ok |
9777// +----------------------------+------+------+
9778//
9779// This function ignores the addressing mode, so if an offset cannot be used in
9780// one addressing mode, it is considered illegal.
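// For instance, a true FLAT access with Offset = -8 is rejected before GFX12
// (negative offsets are only allowed there for global/scratch variants), and
// any offset that does not fit in getNumFlatOffsetBits(ST) bits is rejected
// regardless of the variant.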
9781bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9782 uint64_t FlatVariant) const {
9783 // TODO: Should 0 be special cased?
9784 if (!ST.hasFlatInstOffsets())
9785 return false;
9786
9787 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9788 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9789 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9790 return false;
9791
9792 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9793 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9794 (Offset % 4) != 0) {
9795 return false;
9796 }
9797
9798 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9799 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9800 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9801}
9802
9803// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
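// A sketch of the split, assuming a 13-bit signed immediate field
// (NumBits = 12) below: with negative offsets allowed,
//   COffsetVal = -5000 -> {ImmField = -904, RemainderOffset = -4096},
// and with them disallowed,
//   COffsetVal = 5000 -> {ImmField = 904, RemainderOffset = 4096},
// so ImmField + RemainderOffset always reproduces COffsetVal.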
9804std::pair<int64_t, int64_t>
9805SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9806 uint64_t FlatVariant) const {
9807 int64_t RemainderOffset = COffsetVal;
9808 int64_t ImmField = 0;
9809
9810 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9811 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9812
9813 if (AllowNegative) {
9814 // Use signed division by a power of two to truncate towards 0.
9815 int64_t D = 1LL << NumBits;
9816 RemainderOffset = (COffsetVal / D) * D;
9817 ImmField = COffsetVal - RemainderOffset;
9818
9819 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9820 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9821 (ImmField % 4) != 0) {
9822 // Make ImmField a multiple of 4
9823 RemainderOffset += ImmField % 4;
9824 ImmField -= ImmField % 4;
9825 }
9826 } else if (COffsetVal >= 0) {
9827 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9828 RemainderOffset = COffsetVal - ImmField;
9829 }
9830
9831 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9832 assert(RemainderOffset + ImmField == COffsetVal);
9833 return {ImmField, RemainderOffset};
9834}
9835
9836bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9837 if (ST.hasNegativeScratchOffsetBug() &&
9838 FlatVariant == SIInstrFlags::FlatScratch)
9839 return false;
9840
9841 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9842}
9843
9844static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9845 switch (ST.getGeneration()) {
9846 default:
9847 break;
9850 return SIEncodingFamily::SI;
9853 return SIEncodingFamily::VI;
9859 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9861 }
9862 llvm_unreachable("Unknown subtarget generation!");
9863}
9864
9865bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9866 switch(MCOp) {
9867 // These opcodes use indirect register addressing so
9868 // they need special handling by codegen (currently missing).
9869 // Therefore it is too risky to allow these opcodes
9870 // to be selected by the DPP combiner or the SDWA peephole pass.
9871 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9872 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9873 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9874 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9875 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9876 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9877 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9878 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9879 return true;
9880 default:
9881 return false;
9882 }
9883}
9884
9885#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9886 case OPCODE##_dpp: \
9887 case OPCODE##_e32: \
9888 case OPCODE##_e64: \
9889 case OPCODE##_e64_dpp: \
9890 case OPCODE##_sdwa:
9891
9892static bool isRenamedInGFX9(int Opcode) {
9893 switch (Opcode) {
9894 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9895 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9896 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9897 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9898 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9899 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9900 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9901 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9902 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9903 //
9904 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9905 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9906 case AMDGPU::V_FMA_F16_gfx9_e64:
9907 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9908 case AMDGPU::V_INTERP_P2_F16:
9909 case AMDGPU::V_MAD_F16_e64:
9910 case AMDGPU::V_MAD_U16_e64:
9911 case AMDGPU::V_MAD_I16_e64:
9912 return true;
9913 default:
9914 return false;
9915 }
9916}
9917
9918int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9919 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9920
9921 unsigned Gen = subtargetEncodingFamily(ST);
9922
9923 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9924 Gen = SIEncodingFamily::GFX9;
9925
9926 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9927 // subtarget has UnpackedD16VMem feature.
9928 // TODO: remove this when we discard GFX80 encoding.
9929 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9930 Gen = SIEncodingFamily::GFX80;
9931
9932 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9933 switch (ST.getGeneration()) {
9934 default:
9936 break;
9939 break;
9942 break;
9943 }
9944 }
9945
9946 if (isMAI(Opcode)) {
9947 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9948 if (MFMAOp != -1)
9949 Opcode = MFMAOp;
9950 }
9951
9952 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9953
9954 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
9956
9957 // -1 means that Opcode is already a native instruction.
9958 if (MCOp == -1)
9959 return Opcode;
9960
9961 if (ST.hasGFX90AInsts()) {
9962 uint16_t NMCOp = (uint16_t)-1;
9963 if (ST.hasGFX940Insts())
9965 if (NMCOp == (uint16_t)-1)
9967 if (NMCOp == (uint16_t)-1)
9969 if (NMCOp != (uint16_t)-1)
9970 MCOp = NMCOp;
9971 }
9972
9973 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9974 // no encoding in the given subtarget generation.
9975 if (MCOp == (uint16_t)-1)
9976 return -1;
9977
9978 if (isAsmOnlyOpcode(MCOp))
9979 return -1;
9980
9981 return MCOp;
9982}
9983
9984static
9986 assert(RegOpnd.isReg());
9987 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9988 getRegSubRegPair(RegOpnd);
9989}
9990
9991TargetInstrInfo::RegSubRegPair
9992llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
9993 assert(MI.isRegSequence());
9994 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9995 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9996 auto &RegOp = MI.getOperand(1 + 2 * I);
9997 return getRegOrUndef(RegOp);
9998 }
9999 return TargetInstrInfo::RegSubRegPair();
10000}
10001
10002// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10003// Following a subreg of reg:subreg isn't supported
10004static bool followSubRegDef(MachineInstr &MI,
10005 TargetInstrInfo::RegSubRegPair &RSR) {
10006 if (!RSR.SubReg)
10007 return false;
10008 switch (MI.getOpcode()) {
10009 default: break;
10010 case AMDGPU::REG_SEQUENCE:
10011 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10012 return true;
10013 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10014 case AMDGPU::INSERT_SUBREG:
10015 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10016 // inserted the subreg we're looking for
10017 RSR = getRegOrUndef(MI.getOperand(2));
10018 else { // the subreg in the rest of the reg
10019 auto R1 = getRegOrUndef(MI.getOperand(1));
10020 if (R1.SubReg) // subreg of subreg isn't supported
10021 return false;
10022 RSR.Reg = R1.Reg;
10023 }
10024 return true;
10025 }
10026 return false;
10027}
10028
10029MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10030 MachineRegisterInfo &MRI) {
10031 assert(MRI.isSSA());
10032 if (!P.Reg.isVirtual())
10033 return nullptr;
10034
10035 auto RSR = P;
10036 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10037 while (auto *MI = DefInst) {
10038 DefInst = nullptr;
10039 switch (MI->getOpcode()) {
10040 case AMDGPU::COPY:
10041 case AMDGPU::V_MOV_B32_e32: {
10042 auto &Op1 = MI->getOperand(1);
10043 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10044 if (Op1.isUndef())
10045 return nullptr;
10046 RSR = getRegSubRegPair(Op1);
10047 DefInst = MRI.getVRegDef(RSR.Reg);
10048 }
10049 break;
10050 }
10051 default:
10052 if (followSubRegDef(*MI, RSR)) {
10053 if (!RSR.Reg)
10054 return nullptr;
10055 DefInst = MRI.getVRegDef(RSR.Reg);
10056 }
10057 }
10058 if (!DefInst)
10059 return MI;
10060 }
10061 return nullptr;
10062}
10063
10064bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10065 Register VReg,
10066 const MachineInstr &DefMI,
10067 const MachineInstr &UseMI) {
10068 assert(MRI.isSSA() && "Must be run on SSA");
10069
10070 auto *TRI = MRI.getTargetRegisterInfo();
10071 auto *DefBB = DefMI.getParent();
10072
10073 // Don't bother searching between blocks, although it is possible this block
10074 // doesn't modify exec.
10075 if (UseMI.getParent() != DefBB)
10076 return true;
10077
10078 const int MaxInstScan = 20;
10079 int NumInst = 0;
10080
10081 // Stop scan at the use.
10082 auto E = UseMI.getIterator();
10083 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10084 if (I->isDebugInstr())
10085 continue;
10086
10087 if (++NumInst > MaxInstScan)
10088 return true;
10089
10090 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10091 return true;
10092 }
10093
10094 return false;
10095}
10096
10097bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10098 Register VReg,
10099 const MachineInstr &DefMI) {
10100 assert(MRI.isSSA() && "Must be run on SSA");
10101
10102 auto *TRI = MRI.getTargetRegisterInfo();
10103 auto *DefBB = DefMI.getParent();
10104
10105 const int MaxUseScan = 10;
10106 int NumUse = 0;
10107
10108 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10109 auto &UseInst = *Use.getParent();
10110 // Don't bother searching between blocks, although it is possible this block
10111 // doesn't modify exec.
10112 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10113 return true;
10114
10115 if (++NumUse > MaxUseScan)
10116 return true;
10117 }
10118
10119 if (NumUse == 0)
10120 return false;
10121
10122 const int MaxInstScan = 20;
10123 int NumInst = 0;
10124
10125 // Stop scan when we have seen all the uses.
10126 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10127 assert(I != DefBB->end());
10128
10129 if (I->isDebugInstr())
10130 continue;
10131
10132 if (++NumInst > MaxInstScan)
10133 return true;
10134
10135 for (const MachineOperand &Op : I->operands()) {
10136 // We don't check reg masks here as they're used only on calls:
10137 // 1. EXEC is only considered const within one BB
10138 // 2. Call should be a terminator instruction if present in a BB
10139
10140 if (!Op.isReg())
10141 continue;
10142
10143 Register Reg = Op.getReg();
10144 if (Op.isUse()) {
10145 if (Reg == VReg && --NumUse == 0)
10146 return false;
10147 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10148 return true;
10149 }
10150 }
10151}
10152
10153MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10154 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10155 const DebugLoc &DL, Register Src, Register Dst) const {
10156 auto Cur = MBB.begin();
10157 if (Cur != MBB.end())
10158 do {
10159 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10160 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10161 ++Cur;
10162 } while (Cur != MBB.end() && Cur != LastPHIIt);
10163
10164 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10165 Dst);
10166}
10167
10168MachineInstr *SIInstrInfo::createPHISourceCopy(
10169 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10170 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10171 if (InsPt != MBB.end() &&
10172 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10173 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10174 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10175 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10176 InsPt++;
10177 return BuildMI(MBB, InsPt, DL,
10178 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10179 .addReg(Src, 0, SrcSubReg)
10180 .addReg(AMDGPU::EXEC, RegState::Implicit);
10181 }
10182 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10183 Dst);
10184}
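// For example, when the insertion point is a SI_IF (or SI_ELSE / SI_IF_BREAK)
// that defines Src, the copy is emitted right after it as a lane-mask
// terminator move (MovTermOpc, e.g. S_MOV_B32_term on wave32 targets) with an
// implicit EXEC use, rather than as a plain COPY before the terminator.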
10185
10186bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10187
10188MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10189 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10190 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10191 VirtRegMap *VRM) const {
10192 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10193 //
10194 // %0:sreg_32 = COPY $m0
10195 //
10196 // We explicitly chose SReg_32 for the virtual register so such a copy might
10197 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10198 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10199 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10200 // TargetInstrInfo::foldMemoryOperand() is going to try.
10201 // A similar issue also exists with spilling and reloading $exec registers.
10202 //
10203 // To prevent that, constrain the %0 register class here.
10204 if (isFullCopyInstr(MI)) {
10205 Register DstReg = MI.getOperand(0).getReg();
10206 Register SrcReg = MI.getOperand(1).getReg();
10207 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10208 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10209 MachineRegisterInfo &MRI = MF.getRegInfo();
10210 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10211 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10212 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10213 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10214 return nullptr;
10215 }
10216 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10217 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10218 return nullptr;
10219 }
10220 }
10221 }
10222
10223 return nullptr;
10224}
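// For example, given a hypothetical
//   %0:sreg_32 = COPY $m0
// that is about to be spilled, %0 is re-constrained to SReg_32_XM0_XEXEC so
// that the generic folding path cannot create a direct spill or reload of
// $m0 or $exec; note that this hook itself never folds and always returns
// nullptr.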
10225
10226unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10227 const MachineInstr &MI,
10228 unsigned *PredCost) const {
10229 if (MI.isBundle()) {
10230 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
10231 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10232 unsigned Lat = 0, Count = 0;
10233 for (++I; I != E && I->isBundledWithPred(); ++I) {
10234 ++Count;
10235 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10236 }
10237 return Lat + Count - 1;
10238 }
10239
10240 return SchedModel.computeInstrLatency(&MI);
10241}
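// Worked example for the bundle case: a bundle of three instructions with
// individual latencies 4, 2 and 1 is reported as max(4, 2, 1) + 3 - 1 = 6.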
10242
10243InstructionUniformity
10244SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10245 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10246 unsigned Opcode = MI.getOpcode();
10247
10248 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10249 Register Dst = MI.getOperand(0).getReg();
10250 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10251 : MI.getOperand(1).getReg();
10252 LLT DstTy = MRI.getType(Dst);
10253 LLT SrcTy = MRI.getType(Src);
10254 unsigned DstAS = DstTy.getAddressSpace();
10255 unsigned SrcAS = SrcTy.getAddressSpace();
10256 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10257 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10258 ST.hasGloballyAddressableScratch()
10259 ? InstructionUniformity::NeverUniform
10260 : InstructionUniformity::Default;
10261 };
10262
10263 // If the target supports globally addressable scratch, the mapping from
10264 // scratch memory to the flat aperture changes, so an address space cast
10265 // is no longer uniform.
10266 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10267 return HandleAddrSpaceCast(MI);
10268
10269 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10270 auto IID = GI->getIntrinsicID();
10271 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10272 return InstructionUniformity::NeverUniform;
10273 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10274 return InstructionUniformity::AlwaysUniform;
10275
10276 switch (IID) {
10277 case Intrinsic::amdgcn_addrspacecast_nonnull:
10278 return HandleAddrSpaceCast(MI);
10279 case Intrinsic::amdgcn_if:
10280 case Intrinsic::amdgcn_else:
10281 // FIXME: Uniform if second result
10282 break;
10283 }
10284
10284
10285 return InstructionUniformity::Default;
10286 }
10287
10288 // Loads from the private and flat address spaces are divergent, because
10289 // threads can execute the load instruction with the same inputs and get
10290 // different results.
10291 //
10292 // All other loads are not divergent, because if threads issue loads with the
10293 // same arguments, they will always get the same result.
10294 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10295 Opcode == AMDGPU::G_SEXTLOAD) {
10296 if (MI.memoperands_empty())
10297 return InstructionUniformity::NeverUniform; // conservative assumption
10298
10299 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10300 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10301 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10302 })) {
10303 // At least one MMO in a non-global address space.
10304 return InstructionUniformity::NeverUniform;
10305 }
10306 return InstructionUniformity::Default;
10307 }
10308
10309 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10310 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10311 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10312 AMDGPU::isGenericAtomic(Opcode)) {
10313 return InstructionUniformity::NeverUniform;
10314 }
10315 return InstructionUniformity::Default;
10316}
10317
10318InstructionUniformity
10319SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10320
10321 if (isNeverUniform(MI))
10322 return InstructionUniformity::NeverUniform;
10323
10324 unsigned opcode = MI.getOpcode();
10325 if (opcode == AMDGPU::V_READLANE_B32 ||
10326 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10327 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10328 return InstructionUniformity::AlwaysUniform;
10329
10330 if (isCopyInstr(MI)) {
10331 const MachineOperand &srcOp = MI.getOperand(1);
10332 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10333 const TargetRegisterClass *regClass =
10334 RI.getPhysRegBaseClass(srcOp.getReg());
10335 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10336 : InstructionUniformity::NeverUniform;
10337 }
10338 return InstructionUniformity::Default;
10339 }
10340
10341 // GMIR handling
10342 if (MI.isPreISelOpcode())
10343 return getGenericInstructionUniformity(MI);
10344
10345 // Atomics are divergent because they are executed sequentially: when an
10346 // atomic operation refers to the same address in each thread, each
10347 // thread after the first sees the value written by the previous thread as
10348 // its original value.
10349
10350 if (isAtomic(MI))
10351 return InstructionUniformity::NeverUniform;
10352
10353 // Loads from the private and flat address spaces are divergent, because
10354 // threads can execute the load instruction with the same inputs and get
10355 // different results.
10356 if (isFLAT(MI) && MI.mayLoad()) {
10357 if (MI.memoperands_empty())
10358 return InstructionUniformity::NeverUniform; // conservative assumption
10359
10360 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10361 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10362 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10363 })) {
10364 // At least one MMO in a non-global address space.
10365 return InstructionUniformity::NeverUniform;
10366 }
10367
10368 return InstructionUniformity::Default;
10369 }
10370
10371 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10372 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10373
10374 // FIXME: It's conceptually broken to report this for an instruction, and not
10375 // a specific def operand. For inline asm in particular, there could be mixed
10376 // uniform and divergent results.
10377 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10378 const MachineOperand &SrcOp = MI.getOperand(I);
10379 if (!SrcOp.isReg())
10380 continue;
10381
10382 Register Reg = SrcOp.getReg();
10383 if (!Reg || !SrcOp.readsReg())
10384 continue;
10385
10386 // If RegBank is null, this is unassigned or an unallocatable special
10387 // register, which are all scalars.
10388 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10389 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10390 return InstructionUniformity::NeverUniform;
10391 }
10392
10393 // TODO: Uniformity check conditions above can be rearranged for better
10394 // readability.
10395
10396 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10397 // currently turned into no-op COPYs by SelectionDAG ISel and are
10398 // therefore no longer recognizable.
10399
10400 return InstructionUniformity::AlwaysUniform;
10401}
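// For example (register names are hypothetical): a COPY reading the physical
// register $sgpr4 is AlwaysUniform, a COPY reading $vgpr0 is NeverUniform,
// and a FLAT load whose only memory operand is in the global address space
// is reported as Default rather than NeverUniform.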
10402
10403unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10404 switch (MF.getFunction().getCallingConv()) {
10405 case CallingConv::AMDGPU_PS:
10406 return 1;
10407 case CallingConv::AMDGPU_VS:
10408 return 2;
10409 case CallingConv::AMDGPU_GS:
10410 return 3;
10411 case CallingConv::AMDGPU_HS:
10412 case CallingConv::AMDGPU_LS:
10413 case CallingConv::AMDGPU_ES: {
10414 const Function &F = MF.getFunction();
10415 F.getContext().diagnose(DiagnosticInfoUnsupported(
10416 F, "ds_ordered_count unsupported for this calling conv"));
10417 [[fallthrough]];
10418 }
10419 case CallingConv::AMDGPU_CS:
10420 case CallingConv::AMDGPU_KERNEL:
10421 case CallingConv::C:
10422 case CallingConv::Fast:
10423 default:
10424 // Assume other calling conventions are various compute callable functions
10425 return 0;
10426 }
10427}
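// Maps the shader calling convention to the small integer shader-type value
// that ds_ordered_count expects (PS = 1, VS = 2, GS = 3, compute and default
// = 0); HS/LS/ES have no defined value, hence the diagnostic above.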
10428
10429bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10430 Register &SrcReg2, int64_t &CmpMask,
10431 int64_t &CmpValue) const {
10432 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10433 return false;
10434
10435 switch (MI.getOpcode()) {
10436 default:
10437 break;
10438 case AMDGPU::S_CMP_EQ_U32:
10439 case AMDGPU::S_CMP_EQ_I32:
10440 case AMDGPU::S_CMP_LG_U32:
10441 case AMDGPU::S_CMP_LG_I32:
10442 case AMDGPU::S_CMP_LT_U32:
10443 case AMDGPU::S_CMP_LT_I32:
10444 case AMDGPU::S_CMP_GT_U32:
10445 case AMDGPU::S_CMP_GT_I32:
10446 case AMDGPU::S_CMP_LE_U32:
10447 case AMDGPU::S_CMP_LE_I32:
10448 case AMDGPU::S_CMP_GE_U32:
10449 case AMDGPU::S_CMP_GE_I32:
10450 case AMDGPU::S_CMP_EQ_U64:
10451 case AMDGPU::S_CMP_LG_U64:
10452 SrcReg = MI.getOperand(0).getReg();
10453 if (MI.getOperand(1).isReg()) {
10454 if (MI.getOperand(1).getSubReg())
10455 return false;
10456 SrcReg2 = MI.getOperand(1).getReg();
10457 CmpValue = 0;
10458 } else if (MI.getOperand(1).isImm()) {
10459 SrcReg2 = Register();
10460 CmpValue = MI.getOperand(1).getImm();
10461 } else {
10462 return false;
10463 }
10464 CmpMask = ~0;
10465 return true;
10466 case AMDGPU::S_CMPK_EQ_U32:
10467 case AMDGPU::S_CMPK_EQ_I32:
10468 case AMDGPU::S_CMPK_LG_U32:
10469 case AMDGPU::S_CMPK_LG_I32:
10470 case AMDGPU::S_CMPK_LT_U32:
10471 case AMDGPU::S_CMPK_LT_I32:
10472 case AMDGPU::S_CMPK_GT_U32:
10473 case AMDGPU::S_CMPK_GT_I32:
10474 case AMDGPU::S_CMPK_LE_U32:
10475 case AMDGPU::S_CMPK_LE_I32:
10476 case AMDGPU::S_CMPK_GE_U32:
10477 case AMDGPU::S_CMPK_GE_I32:
10478 SrcReg = MI.getOperand(0).getReg();
10479 SrcReg2 = Register();
10480 CmpValue = MI.getOperand(1).getImm();
10481 CmpMask = ~0;
10482 return true;
10483 }
10484
10485 return false;
10486}
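// For example (the register name is a placeholder), S_CMP_LG_U32 %src, 5 is
// reported as SrcReg = %src, SrcReg2 = <none>, CmpValue = 5, CmpMask = ~0;
// compares whose first operand carries a subregister are rejected.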
10487
10488bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10489 Register SrcReg2, int64_t CmpMask,
10490 int64_t CmpValue,
10491 const MachineRegisterInfo *MRI) const {
10492 if (!SrcReg || SrcReg.isPhysical())
10493 return false;
10494
10495 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10496 return false;
10497
10498 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10499 this](int64_t ExpectedValue, unsigned SrcSize,
10500 bool IsReversible, bool IsSigned) -> bool {
10501 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10502 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10503 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10504 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10505 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10506 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10507 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10508 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10509 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10510 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10511 //
10512 // Signed ge/gt are not used for the sign bit.
10513 //
10514 // If result of the AND is unused except in the compare:
10515 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10516 //
10517 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10518 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10519 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10520 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10521 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10522 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10523
10524 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10525 if (!Def || Def->getParent() != CmpInstr.getParent())
10526 return false;
10527
10528 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10529 Def->getOpcode() != AMDGPU::S_AND_B64)
10530 return false;
10531
10532 int64_t Mask;
10533 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10534 if (MO->isImm())
10535 Mask = MO->getImm();
10536 else if (!getFoldableImm(MO, Mask))
10537 return false;
10538 Mask &= maxUIntN(SrcSize);
10539 return isPowerOf2_64(Mask);
10540 };
10541
10542 MachineOperand *SrcOp = &Def->getOperand(1);
10543 if (isMask(SrcOp))
10544 SrcOp = &Def->getOperand(2);
10545 else if (isMask(&Def->getOperand(2)))
10546 SrcOp = &Def->getOperand(1);
10547 else
10548 return false;
10549
10550 // A valid Mask is required to have a single bit set, hence a non-zero and
10551 // power-of-two value. This verifies that we will not do 64-bit shift below.
10552 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10553 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10554 if (IsSigned && BitNo == SrcSize - 1)
10555 return false;
10556
10557 ExpectedValue <<= BitNo;
10558
10559 bool IsReversedCC = false;
10560 if (CmpValue != ExpectedValue) {
10561 if (!IsReversible)
10562 return false;
10563 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10564 if (!IsReversedCC)
10565 return false;
10566 }
10567
10568 Register DefReg = Def->getOperand(0).getReg();
10569 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10570 return false;
10571
10572 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10573 I != E; ++I) {
10574 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10575 I->killsRegister(AMDGPU::SCC, &RI))
10576 return false;
10577 }
10578
10579 MachineOperand *SccDef =
10580 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10581 SccDef->setIsDead(false);
10582 CmpInstr.eraseFromParent();
10583
10584 if (!MRI->use_nodbg_empty(DefReg)) {
10585 assert(!IsReversedCC);
10586 return true;
10587 }
10588
10589 // Replace an AND whose result is otherwise unused with an S_BITCMP.
10590 MachineBasicBlock *MBB = Def->getParent();
10591
10592 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10593 : AMDGPU::S_BITCMP1_B32
10594 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10595 : AMDGPU::S_BITCMP1_B64;
10596
10597 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10598 .add(*SrcOp)
10599 .addImm(BitNo);
10600 Def->eraseFromParent();
10601
10602 return true;
10603 };
10604
10605 switch (CmpInstr.getOpcode()) {
10606 default:
10607 break;
10608 case AMDGPU::S_CMP_EQ_U32:
10609 case AMDGPU::S_CMP_EQ_I32:
10610 case AMDGPU::S_CMPK_EQ_U32:
10611 case AMDGPU::S_CMPK_EQ_I32:
10612 return optimizeCmpAnd(1, 32, true, false);
10613 case AMDGPU::S_CMP_GE_U32:
10614 case AMDGPU::S_CMPK_GE_U32:
10615 return optimizeCmpAnd(1, 32, false, false);
10616 case AMDGPU::S_CMP_GE_I32:
10617 case AMDGPU::S_CMPK_GE_I32:
10618 return optimizeCmpAnd(1, 32, false, true);
10619 case AMDGPU::S_CMP_EQ_U64:
10620 return optimizeCmpAnd(1, 64, true, false);
10621 case AMDGPU::S_CMP_LG_U32:
10622 case AMDGPU::S_CMP_LG_I32:
10623 case AMDGPU::S_CMPK_LG_U32:
10624 case AMDGPU::S_CMPK_LG_I32:
10625 return optimizeCmpAnd(0, 32, true, false);
10626 case AMDGPU::S_CMP_GT_U32:
10627 case AMDGPU::S_CMPK_GT_U32:
10628 return optimizeCmpAnd(0, 32, false, false);
10629 case AMDGPU::S_CMP_GT_I32:
10630 case AMDGPU::S_CMPK_GT_I32:
10631 return optimizeCmpAnd(0, 32, false, true);
10632 case AMDGPU::S_CMP_LG_U64:
10633 return optimizeCmpAnd(0, 64, true, false);
10634 }
10635
10636 return false;
10637}
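// Illustrative transformation (virtual register names are hypothetical):
//   %1:sreg_32 = S_AND_B32 %0, 4, implicit-def $scc   ; 4 == 1 << 2
//   S_CMP_LG_U32 %1, 0, implicit-def $scc
// becomes, when %1 has no other uses,
//   S_BITCMP1_B32 %0, 2, implicit-def $scc
// with the compare erased; if %1 is still used elsewhere, only the compare
// is removed and the S_AND_B32's SCC def is marked live.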
10638
10639void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10640 AMDGPU::OpName OpName) const {
10641 if (!ST.needsAlignedVGPRs())
10642 return;
10643
10644 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10645 if (OpNo < 0)
10646 return;
10647 MachineOperand &Op = MI.getOperand(OpNo);
10648 if (getOpSize(MI, OpNo) > 4)
10649 return;
10650
10651 // Add implicit aligned super-reg to force alignment on the data operand.
10652 const DebugLoc &DL = MI.getDebugLoc();
10653 MachineBasicBlock *BB = MI.getParent();
10654 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10655 Register DataReg = Op.getReg();
10656 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10657 Register Undef = MRI.createVirtualRegister(
10658 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10659 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10660 Register NewVR =
10661 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10662 : &AMDGPU::VReg_64_Align2RegClass);
10663 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10664 .addReg(DataReg, 0, Op.getSubReg())
10665 .addImm(AMDGPU::sub0)
10666 .addReg(Undef)
10667 .addImm(AMDGPU::sub1);
10668 Op.setReg(NewVR);
10669 Op.setSubReg(AMDGPU::sub0);
10670 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10671}
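// Illustrative before/after for a 32-bit VGPR data operand (register names
// are hypothetical):
//   before:  ... %data:vgpr_32 ...
//   after:   %undef:vgpr_32 = IMPLICIT_DEF
//            %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0,
//                                                %undef, %subreg.sub1
//            ... %pair.sub0 ..., implicit %pair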
10672
10673bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10674 if (isIGLP(*MI))
10675 return false;
10676
10677 return TargetInstrInfo::isGlobalMemoryObject(MI);
10678}
10679
10680bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10681 if (!isWMMA(MI) && !isSWMMAC(MI))
10682 return false;
10683
10684 if (AMDGPU::isGFX1250(ST))
10685 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10686
10687 return true;
10688}
10689
10690bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10691 unsigned Opcode = MI.getOpcode();
10692
10693 if (AMDGPU::isGFX12Plus(ST))
10694 return isDOT(MI) || isXDLWMMA(MI);
10695
10696 if (!isMAI(MI) || isDGEMM(Opcode) ||
10697 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10698 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10699 return false;
10700
10701 if (!ST.hasGFX940Insts())
10702 return true;
10703
10704 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10705}
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
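For context, a minimal usage sketch of this accessor; the wrapper function src0IsImmediate is hypothetical, and it assumes the usual AMDGPU backend headers available inside the LLVM tree. Only getNamedOperand and the AMDGPU::OpName::src0 operand name are taken as given.
#include "SIInstrInfo.h"
using namespace llvm;
// Hypothetical helper: read the src0 operand of MI by name, if present.
// getNamedOperand returns nullptr when MI has no operand with that name.
static bool src0IsImmediate(const SIInstrInfo &TII, MachineInstr &MI,
                            int64_t &ImmVal) {
  if (MachineOperand *Src0 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (Src0->isImm()) {
      ImmVal = Src0->getImm();
      return true;
    }
  }
  return false;
}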
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were operand OpIdx of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:174
self_iterator getIterator()
Definition ilist_node.h:134
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
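A hedged standalone sketch of how such a range check can be expressed from the two documented facts (the number of offset bits and whether negative offsets are allowed, per allowNegativeFlatOffset above); the function name and parameters are illustrative and this is not the actual isLegalFLATOffset implementation, which also accounts for subtarget quirks such as the ignored MSB noted here.
#include <cstdint>
// Illustrative only: does Offset fit in NumBits, treated as a signed field
// when negative offsets are allowed and as an unsigned field otherwise?
// Assumes 0 < NumBits < 64.
static bool flatOffsetFits(int64_t Offset, unsigned NumBits,
                           bool AllowNegative) {
  if (AllowNegative) {
    int64_t Min = -(int64_t(1) << (NumBits - 1));
    int64_t Max = (int64_t(1) << (NumBits - 1)) - 1;
    return Offset >= Min && Offset <= Max;
  }
  // Pre-GFX12 style: offset must be non-negative and fit in NumBits.
  return Offset >= 0 && Offset < (int64_t(1) << NumBits);
}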
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU-specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable as an integer, i.e. not one of the special floating-point inline values.
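A standalone sketch of the rule these helpers document, assuming the commonly cited AMDGPU inline-constant set (integers -16..64, plus a handful of FP32 bit patterns, with 1/(2*pi) gated on HasInv2Pi); the hex bit patterns are assumptions, and this mirrors the documented behavior rather than quoting the actual implementation.
#include <cstdint>
// Integer inline constants: -16..64 inclusive.
static bool isInlineIntLiteralSketch(int64_t Literal) {
  return Literal >= -16 && Literal <= 64;
}
// 32-bit inline constants: the integer range above plus a small set of
// FP32 bit patterns (+/-0.5, +/-1.0, +/-2.0, +/-4.0, 1/(2*pi) with Inv2Pi).
static bool isInlineLiteral32Sketch(int32_t Literal, bool HasInv2Pi) {
  if (isInlineIntLiteralSketch(Literal))
    return true;
  switch (static_cast<uint32_t>(Literal)) {
  case 0x3f000000u: // 0.5
  case 0xbf000000u: // -0.5
  case 0x3f800000u: // 1.0
  case 0xbf800000u: // -1.0
  case 0x40000000u: // 2.0
  case 0xc0000000u: // -2.0
  case 0x40800000u: // 4.0
  case 0xc0800000u: // -4.0
    return true;
  case 0x3e22f983u: // 1/(2*pi)
    return HasInv2Pi;
  default:
    return false;
  }
}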
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:569
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:571
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:570
@ TI_CONSTDATA_START
Definition AMDGPU.h:567
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
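A minimal usage sketch of this builder interface; the emitCopy helper is hypothetical, and TargetOpcode::COPY is used only as a convenient target-independent example opcode.
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
// Illustrative: insert "DstReg = COPY SrcReg" before iterator I in MBB.
static void emitCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                     const DebugLoc &DL, const TargetInstrInfo &TII,
                     Register DstReg, Register SrcReg) {
  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg);
}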
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:626
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
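A plain-C++ sketch of the documented alignDown behavior (an illustrative re-implementation, not the LLVM source; it assumes Align is non-zero and Value is at least the normalized skew).
#include <cstdint>
// Largest value <= Value that is congruent to Skew modulo Align.
static uint64_t alignDownSketch(uint64_t Value, uint64_t Align,
                                uint64_t Skew = 0) {
  Skew %= Align; // normalize the skew into [0, Align)
  return (Value - Skew) / Align * Align + Skew;
}
// e.g. alignDownSketch(37, 8) == 32, alignDownSketch(37, 8, 3) == 35.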
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:293
Op::Description Desc
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:400
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
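For illustration, the signed and unsigned fit checks documented here and for isInt above, expressed in plain C++ (a sketch of the documented semantics, not the LLVM implementation; it only handles partial widths).
#include <cstdint>
// Does x fit in an N-bit signed field? (0 < N < 64 in this sketch)
template <unsigned N> static bool fitsSigned(int64_t x) {
  static_assert(N > 0 && N < 64, "sketch only handles partial widths");
  return x >= -(int64_t(1) << (N - 1)) && x < (int64_t(1) << (N - 1));
}
// Does x fit in an N-bit unsigned field?
template <unsigned N> static bool fitsUnsigned(uint64_t x) {
  static_assert(N > 0 && N < 64, "sketch only handles partial widths");
  return x < (uint64_t(1) << N);
}
// e.g. fitsSigned<16>(32767) is true, fitsSigned<16>(32768) is false.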
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
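Together with Hi_32 above, a quick standalone sketch of the 64-bit split these two helpers document (illustrative only).
#include <cstdint>
static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
// e.g. hi32(0x1122334455667788) == 0x11223344, lo32(0x1122334455667788) == 0x55667788.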
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
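A sketch of the documented ceiling division, assuming unsigned operands and a non-zero denominator (written to avoid the overflow of the naive (N + D - 1) / D form).
#include <cstdint>
// ceil(Numerator / Denominator) for unsigned operands, Denominator != 0.
static uint64_t divideCeilSketch(uint64_t Numerator, uint64_t Denominator) {
  return Numerator / Denominator + (Numerator % Denominator != 0);
}
// e.g. divideCeilSketch(7, 4) == 2, divideCeilSketch(8, 4) == 2.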
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:583
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
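The sign-extension and trailing-ones helpers above compose naturally; here is a standalone sketch of both, valid for bit widths strictly between 0 and 64 (illustrative, not the LLVM source).
#include <cstdint>
// N right-most bits set to 1 (0 < N < 64 in this sketch).
static uint64_t maskTrailingOnesSketch(unsigned N) {
  return (uint64_t(1) << N) - 1;
}
// Sign-extend the low B bits of x to a full 64-bit integer (0 < B < 64).
static int64_t signExtend64Sketch(uint64_t x, unsigned B) {
  uint64_t SignBit = uint64_t(1) << (B - 1);
  uint64_t Low = x & maskTrailingOnesSketch(B);
  return static_cast<int64_t>((Low ^ SignBit) - SignBit);
}
// e.g. signExtend64Sketch(0xFFF, 12) == -1, signExtend64Sketch(0x7FF, 12) == 2047.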
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.